```
├── .cursorignore
├── .github/
   ├── ISSUE_TEMPLATE/
      ├── bug_report.yaml (500 tokens)
      ├── feature_request.yaml (200 tokens)
   ├── PULL_REQUEST_TEMPLATE.md (500 tokens)
   ├── PULL_REQUEST_TEMPLATE/
      ├── pr_form.yml (600 tokens)
   ├── dependabot.yml (100 tokens)
   ├── labels.yml (400 tokens)
   ├── release-drafter.yml (200 tokens)
   ├── workflows/
      ├── codeql.yml (900 tokens)
      ├── docs.yml (200 tokens)
      ├── labeler.yml (200 tokens)
      ├── lint.yml (100 tokens)
      ├── pr-lint.yml (300 tokens)
      ├── publish-to-pypi.yml (900 tokens)
      ├── test.yml (400 tokens)
├── .gitignore (100 tokens)
├── .pre-commit-config.yaml (100 tokens)
├── LICENSE (omitted)
├── README.md (5.1k tokens)
├── babeldoc/
   ├── __init__.py
   ├── assets/
      ├── assets.py (3.4k tokens)
      ├── embedding_assets_metadata.py (4.3k tokens)
   ├── asynchronize/
      ├── __init__.py (400 tokens)
   ├── babeldoc_exception/
      ├── BabelDOCException.py
      ├── __init__.py
   ├── const.py (300 tokens)
   ├── docvision/
      ├── README.md
      ├── __init__.py
      ├── base_doclayout.py (400 tokens)
      ├── doclayout.py (1600 tokens)
      ├── rpc_doclayout.py (2.1k tokens)
      ├── rpc_doclayout2.py (2.2k tokens)
      ├── rpc_doclayout3.py (2.3k tokens)
      ├── table_detection/
         ├── rapidocr.py (2.3k tokens)
   ├── format/
      ├── __init__.py
      ├── pdf/
         ├── __init__.py
         ├── babelpdf/
            ├── encoding.py (3k tokens)
         ├── converter.py (4.4k tokens)
         ├── document_il/
            ├── __init__.py (400 tokens)
            ├── backend/
               ├── __init__.py
               ├── pdf_creater.py (8.6k tokens)
            ├── frontend/
               ├── __init__.py
               ├── il_creater.py (5.2k tokens)
            ├── il_version_1.py (3.9k tokens)
            ├── il_version_1.rnc (800 tokens)
            ├── il_version_1.rng (2.3k tokens)
            ├── il_version_1.xsd (2000 tokens)
            ├── midend/
               ├── __init__.py
               ├── add_debug_information.py (1100 tokens)
               ├── automatic_term_extractor.py (2.3k tokens)
               ├── detect_scanned_file.py (1300 tokens)
               ├── il_translator.py (7.4k tokens)
               ├── il_translator_llm_only.py (5.4k tokens)
               ├── layout_parser.py (1100 tokens)
               ├── paragraph_finder.py (7.3k tokens)
               ├── remove_descent.py (1400 tokens)
               ├── styles_and_formulas.py (6.8k tokens)
               ├── table_parser.py (1200 tokens)
               ├── typesetting.py (10.3k tokens)
            ├── utils/
               ├── __init__.py
               ├── fontmap.py (2.4k tokens)
               ├── formular_helper.py (2k tokens)
               ├── layout_helper.py (5k tokens)
               ├── mupdf_helper.py (100 tokens)
               ├── paragraph_helper.py (300 tokens)
               ├── style_helper.py (600 tokens)
               ├── zstd_helper.py (100 tokens)
            ├── xml_converter.py (400 tokens)
         ├── high_level.py (8k tokens)
         ├── pdfinterp.py (4.3k tokens)
         ├── result_merger.py (1500 tokens)
         ├── split_manager.py (400 tokens)
         ├── translation_config.py (3.4k tokens)
   ├── glossary.py (1600 tokens)
   ├── main.py (4.9k tokens)
   ├── pdfminer/
      ├── LICENSE (200 tokens)
      ├── __init__.py (100 tokens)
      ├── _saslprep.py (700 tokens)
      ├── arcfour.py (200 tokens)
      ├── ascii85.py (400 tokens)
      ├── casting.py (400 tokens)
      ├── ccitt.py (4.3k tokens)
      ├── cmap/
         ├── 78-EUC-H.pickle.gz
         ├── 78-EUC-V.pickle.gz
         ├── 78-H.pickle.gz
         ├── 78-RKSJ-H.pickle.gz
         ├── 78-RKSJ-V.pickle.gz
         ├── 78-V.pickle.gz
         ├── 78ms-RKSJ-H.pickle.gz
         ├── 78ms-RKSJ-V.pickle.gz
         ├── 83pv-RKSJ-H.pickle.gz
         ├── 83pv-RKSJ-V.pickle.gz
         ├── 90ms-RKSJ-H.pickle.gz
         ├── 90ms-RKSJ-V.pickle.gz
         ├── 90msp-RKSJ-H.pickle.gz
         ├── 90msp-RKSJ-V.pickle.gz
         ├── 90pv-RKSJ-H.pickle.gz
         ├── 90pv-RKSJ-V.pickle.gz
         ├── Add-H.pickle.gz
         ├── Add-RKSJ-H.pickle.gz
         ├── Add-RKSJ-V.pickle.gz
         ├── Add-V.pickle.gz
         ├── B5-H.pickle.gz
         ├── B5-V.pickle.gz
         ├── B5pc-H.pickle.gz
         ├── B5pc-V.pickle.gz
         ├── CNS-EUC-H.pickle.gz
         ├── CNS-EUC-V.pickle.gz
         ├── CNS1-H.pickle.gz
         ├── CNS1-V.pickle.gz
         ├── CNS2-H.pickle.gz
         ├── CNS2-V.pickle.gz
         ├── ETHK-B5-H.pickle.gz
         ├── ETHK-B5-V.pickle.gz
         ├── ETen-B5-H.pickle.gz
         ├── ETen-B5-V.pickle.gz
         ├── ETenms-B5-H.pickle.gz
         ├── ETenms-B5-V.pickle.gz
         ├── EUC-H.pickle.gz
         ├── EUC-V.pickle.gz
         ├── Ext-H.pickle.gz
         ├── Ext-RKSJ-H.pickle.gz
         ├── Ext-RKSJ-V.pickle.gz
         ├── Ext-V.pickle.gz
         ├── GB-EUC-H.pickle.gz
         ├── GB-EUC-V.pickle.gz
         ├── GB-H.pickle.gz
         ├── GB-V.pickle.gz
         ├── GBK-EUC-H.pickle.gz
         ├── GBK-EUC-V.pickle.gz
         ├── GBK2K-H.pickle.gz
         ├── GBK2K-V.pickle.gz
         ├── GBKp-EUC-H.pickle.gz
         ├── GBKp-EUC-V.pickle.gz
         ├── GBT-EUC-H.pickle.gz
         ├── GBT-EUC-V.pickle.gz
         ├── GBT-H.pickle.gz
         ├── GBT-V.pickle.gz
         ├── GBTpc-EUC-H.pickle.gz
         ├── GBTpc-EUC-V.pickle.gz
         ├── GBpc-EUC-H.pickle.gz
         ├── GBpc-EUC-V.pickle.gz
         ├── H.pickle.gz
         ├── HKdla-B5-H.pickle.gz
         ├── HKdla-B5-V.pickle.gz
         ├── HKdlb-B5-H.pickle.gz
         ├── HKdlb-B5-V.pickle.gz
         ├── HKgccs-B5-H.pickle.gz
         ├── HKgccs-B5-V.pickle.gz
         ├── HKm314-B5-H.pickle.gz
         ├── HKm314-B5-V.pickle.gz
         ├── HKm471-B5-H.pickle.gz
         ├── HKm471-B5-V.pickle.gz
         ├── HKscs-B5-H.pickle.gz
         ├── HKscs-B5-V.pickle.gz
         ├── Hankaku-H.pickle.gz
         ├── Hankaku-V.pickle.gz
         ├── Hiragana-H.pickle.gz
         ├── Hiragana-V.pickle.gz
         ├── KSC-EUC-H.pickle.gz
         ├── KSC-EUC-V.pickle.gz
         ├── KSC-H.pickle.gz
         ├── KSC-Johab-H.pickle.gz
         ├── KSC-Johab-V.pickle.gz
         ├── KSC-V.pickle.gz
         ├── KSCms-UHC-H.pickle.gz
         ├── KSCms-UHC-HW-H.pickle.gz
         ├── KSCms-UHC-HW-V.pickle.gz
         ├── KSCms-UHC-V.pickle.gz
         ├── KSCpc-EUC-H.pickle.gz
         ├── KSCpc-EUC-V.pickle.gz
         ├── Katakana-H.pickle.gz
         ├── Katakana-V.pickle.gz
         ├── NWP-H.pickle.gz
         ├── NWP-V.pickle.gz
         ├── README.txt (800 tokens)
         ├── RKSJ-H.pickle.gz
         ├── RKSJ-V.pickle.gz
         ├── Roman-H.pickle.gz
         ├── Roman-V.pickle.gz
         ├── UniCNS-UCS2-H.pickle.gz
         ├── UniCNS-UCS2-V.pickle.gz
         ├── UniCNS-UTF16-H.pickle.gz
         ├── UniCNS-UTF16-V.pickle.gz
         ├── UniCNS-UTF32-H.pickle.gz
         ├── UniCNS-UTF32-V.pickle.gz
         ├── UniCNS-UTF8-H.pickle.gz
         ├── UniCNS-UTF8-V.pickle.gz
         ├── UniGB-UCS2-H.pickle.gz
         ├── UniGB-UCS2-V.pickle.gz
         ├── UniGB-UTF16-H.pickle.gz
         ├── UniGB-UTF16-V.pickle.gz
         ├── UniGB-UTF32-H.pickle.gz
         ├── UniGB-UTF32-V.pickle.gz
         ├── UniGB-UTF8-H.pickle.gz
         ├── UniGB-UTF8-V.pickle.gz
         ├── UniJIS-UCS2-H.pickle.gz
         ├── UniJIS-UCS2-HW-H.pickle.gz
         ├── UniJIS-UCS2-HW-V.pickle.gz
         ├── UniJIS-UCS2-V.pickle.gz
         ├── UniJIS-UTF16-H.pickle.gz
         ├── UniJIS-UTF16-V.pickle.gz
         ├── UniJIS-UTF32-H.pickle.gz
         ├── UniJIS-UTF32-V.pickle.gz
         ├── UniJIS-UTF8-H.pickle.gz
         ├── UniJIS-UTF8-V.pickle.gz
         ├── UniJIS2004-UTF16-H.pickle.gz
         ├── UniJIS2004-UTF16-V.pickle.gz
         ├── UniJIS2004-UTF32-H.pickle.gz
         ├── UniJIS2004-UTF32-V.pickle.gz
         ├── UniJIS2004-UTF8-H.pickle.gz
         ├── UniJIS2004-UTF8-V.pickle.gz
         ├── UniJISX0213-UTF32-H.pickle.gz
         ├── UniJISX0213-UTF32-V.pickle.gz
         ├── UniJISX02132004-UTF32-H.pickle.gz
         ├── UniJISX02132004-UTF32-V.pickle.gz
         ├── UniKS-UCS2-H.pickle.gz
         ├── UniKS-UCS2-V.pickle.gz
         ├── UniKS-UTF16-H.pickle.gz
         ├── UniKS-UTF16-V.pickle.gz
         ├── UniKS-UTF32-H.pickle.gz
         ├── UniKS-UTF32-V.pickle.gz
         ├── UniKS-UTF8-H.pickle.gz
         ├── UniKS-UTF8-V.pickle.gz
         ├── V.pickle.gz
         ├── WP-Symbol-H.pickle.gz
         ├── WP-Symbol-V.pickle.gz
         ├── to-unicode-Adobe-CNS1.pickle.gz
         ├── to-unicode-Adobe-GB1.pickle.gz
         ├── to-unicode-Adobe-Japan1.pickle.gz
         ├── to-unicode-Adobe-Korea1.pickle.gz
      ├── cmapdb.py (3.2k tokens)
      ├── converter.py (7.3k tokens)
      ├── data_structures.py (400 tokens)
      ├── encodingdb.py (800 tokens)
      ├── fontmetrics.py (22.5k tokens)
      ├── glyphlist.py (26.2k tokens)
      ├── high_level.py (1600 tokens)
      ├── image.py (2000 tokens)
      ├── jbig2.py (2.3k tokens)
      ├── latin_enc.py (1700 tokens)
      ├── layout.py (6.8k tokens)
      ├── lzw.py (700 tokens)
      ├── pdfcolor.py (200 tokens)
      ├── pdfdevice.py (1900 tokens)
      ├── pdfdocument.py (7.7k tokens)
      ├── pdfexceptions.py (100 tokens)
      ├── pdffont.py (7.1k tokens)
      ├── pdfinterp.py (8.8k tokens)
      ├── pdfpage.py (1800 tokens)
      ├── pdfparser.py (1200 tokens)
      ├── pdftypes.py (2.5k tokens)
      ├── psexceptions.py
      ├── psparser.py (4.1k tokens)
      ├── py.typed
      ├── runlength.py (300 tokens)
      ├── settings.py
      ├── utils.py (4.2k tokens)
   ├── progress_monitor.py (2.3k tokens)
   ├── tools/
      ├── generate_font_metadata.py (800 tokens)
      ├── italic_assistance.py (2.1k tokens)
      ├── italic_recognize_tool.py (600 tokens)
   ├── translator/
      ├── __init__.py
      ├── cache.py (900 tokens)
      ├── translator.py (2.1k tokens)
   ├── utils/
      ├── __init__.py
      ├── atomic_integer.py (100 tokens)
      ├── priority_thread_pool_executor.py (1800 tokens)
├── docs/
   ├── CODE_OF_CONDUCT.md (1000 tokens)
   ├── CONTRIBUTING.md (1500 tokens)
   ├── CONTRIBUTOR_REWARD.md (600 tokens)
   ├── ImplementationDetails/
      ├── AsyncTranslate/
         ├── AsyncTranslate.md (1100 tokens)
      ├── ILTranslator/
         ├── ILTranslator.md (600 tokens)
      ├── PDFCreation/
         ├── PDFCreation.md (700 tokens)
      ├── PDFParsing/
         ├── PDFParsing.md (800 tokens)
      ├── ParagraphFinding/
         ├── ParagraphFinding.md (600 tokens)
      ├── README.md (300 tokens)
      ├── StylesAndFormulas/
         ├── StylesAndFormulas.md (600 tokens)
      ├── Typesetting/
         ├── Typesetting.md (800 tokens)
   ├── README.md (100 tokens)
   ├── deploy.sh (200 tokens)
   ├── example/
      ├── demo_glossary.csv
   ├── images/
      ├── babeldoc-banner.png
      ├── babeldoc-big-logo-darkmode-with-transparent-background.png
      ├── babeldoc-big-logo-darkmode-with-transparent-background.svg (1600 tokens)
      ├── babeldoc-big-logo-with-transparent-background.png
      ├── babeldoc-big-logo-with-transparent-background.svg (1600 tokens)
      ├── babeldoc-big-logo.png
      ├── babeldoc-contributor_reward_example.png
      ├── babeldoc-preview.gif
      ├── babeldoc-small-logo-with-transparent-background.png
      ├── babeldoc-small-logo-with-transparent-background.svg (400 tokens)
      ├── babeldoc-small-logo.png
   ├── index.md
   ├── intro-to-pdf-object.md (1300 tokens)
   ├── requirements.txt
   ├── supported_languages.md (2.5k tokens)
├── examples/
   ├── basic.xml (100 tokens)
   ├── ci/
      ├── test.pdf
   ├── code-figure.xml (200 tokens)
   ├── complex.xml (2000 tokens)
   ├── formular.xml (100 tokens)
   ├── table.xml (300 tokens)
├── mkdocs.yml (1000 tokens)
├── pyproject.toml (900 tokens)
├── tests/
   ├── test_translation_config.py (100 tokens)
├── uv.lock (omitted)
```


## /.cursorignore

```cursorignore path="/.cursorignore" 
# Project notes and templates
xnotes/

```

## /.github/ISSUE_TEMPLATE/bug_report.yaml

```yaml path="/.github/ISSUE_TEMPLATE/bug_report.yaml" 
name: "🐞 Bug Report"
description: Create a report to help us improve
labels: ['bug']
body:
  - type: checkboxes
    id: checks
    attributes:
      label: Before you submit
      options:
        - label: I have searched existing issues
          required: true
        - label: I spent at least 5 minutes investigating and preparing this report
          required: true
        - label: I confirmed this is not caused by a network issue
          required: true
        - label: I have fully read and understood the [README](https://github.com/funstory-ai/BabelDOC/blob/main/README.md)
          required: true
        - label: I am certain that this issue is with BabelDOC itself and can be reproduced through the BabelDOC cli
          required: true

  - type: markdown
    attributes:
      value: |
        Thank you for using **BabelDOC** and helping us improve it! 🙏

  - type: textarea
    id: environment
    attributes:
      label: Environment
      description: Provide your system details (required)
      value: |
        - OS:
        - Python:
        - BabelDOC:
      render: markdown
    validations:
      required: true

  - type: textarea
    id: describe
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
    validations:
      required: true

  - type: textarea
    id: reproduce
    attributes:
      label: Steps to Reproduce
      description: Help us reproduce the issue
      value: |
        1. Go to '...'
        2. Click on '...'
        3. See error
    validations:
      required: false

  - type: textarea
    id: expected
    attributes:
      label: Expected Behavior
      description: What did you expect to happen?
    validations:
      required: false

  - type: textarea
    id: logs
    attributes:
      label: Relevant Log Output or Screenshots
      description: Copy and paste any logs or attach screenshots. This will be formatted automatically.
      render: text
    validations:
      required: false

  - type: textarea
    id: pdf
    attributes:
      label: Original PDF File
      description: Upload the input PDF if applicable.
    validations:
      required: false

  - type: textarea
    id: others
    attributes:
      label: Additional Context
      description: Anything else we should know?
    validations:
      required: false

```

## /.github/ISSUE_TEMPLATE/feature_request.yaml

```yaml path="/.github/ISSUE_TEMPLATE/feature_request.yaml" 
name: "✨ Feature Request"
description: Suggest a new idea or improvement for BabelDOC
labels: ['enhancement']
body:
  - type: markdown
    attributes:
      value: |
        Thank you for helping improve **BabelDOC**! Please fill out the form below to suggest a feature.

  - type: textarea
    id: describe
    attributes:
      label: Is your feature request related to a problem?
      description: If applicable, describe what problem this feature would solve.
      placeholder: Ex. I'm always frustrated when ...
    validations:
      required: false

  - type: textarea
    id: solution
    attributes:
      label: Describe the solution you'd like
      description: What would you like to see happen?
    validations:
      required: true

  - type: textarea
    id: alternatives
    attributes:
      label: Describe alternatives you've considered
      description: Have you thought of other ways to solve this?
    validations:
      required: false

  - type: textarea
    id: additional
    attributes:
      label: Additional context
      description: Any other context, examples, or screenshots?
    validations:
      required: false

```

## /.github/PULL_REQUEST_TEMPLATE.md

### PR Title

<!-- Please fill in a concise and clear PR title below -->
[PR] <Your concise title here>

### Related Issue(s)

<!-- If this PR closes or is related to one or more issues, please list them here (e.g., #37) -->
<!-- e.g.: Closes #37, Relates to #42 -->

### Motivation and Context

<!-- Why is this change required? What problem does it solve? -->
<!-- If it fixes an open issue, please link to the issue here. -->

### Summary of Changes

<!-- What does this PR introduce or fix? Please describe concisely. -->

### PR Type

<!-- What kind of change is this? Please select one or more -->
- [ ] ✨ Enhancement
- [ ] 🐛 Bug Fix
- [ ] 📚 Documentation
- [ ] 🏗️ Refactor
- [ ] 🧪 Test
- [ ] 🧹 Chore

### Breaking Changes

<!-- Does this PR introduce any breaking changes? If so, please describe them. -->
<!-- - [ ] Yes, this PR introduces breaking changes.
<!-- - [ ] No, this PR does not introduce breaking changes. -->
<!-- Detailed description of breaking changes (if any): -->

### Contributor Checklist

- [ ] I have fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide.
- [ ] I have performed a self-review of my own code.
- [ ] My changes follow the project's code style and guidelines
- [ ] I have linked the related issue(s) in the description above (if applicable)
- [ ] I have updated relevant documentation (if applicable)
- [ ] I have added necessary tests that prove my fix is effective or that my feature works (if applicable)
- [ ] All new and existing tests passed locally with my changes
- [ ] My changes generate no new warnings or errors
- [ ] I understand that due to limited maintainer resources, only small PRs are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary.

### Testing Instructions

<!-- Please provide clear and concise step-by-step instructions on how to test your changes. -->
<!-- e.g.: -->
<!-- 1. Check out this branch. -->
<!-- 2. Run `...` to install dependencies. -->
<!-- 3. Run `...` to start the application/run the script. -->
<!-- 4. Navigate to `...` or observe `...` -->
<!-- 5. Verify that `...` (expected outcome). -->

### Screenshots (if applicable)

<!-- If your changes include UI modifications, please add screenshots or GIFs to show the before and after. -->

### Additional Notes

<!-- Is there anything else the reviewer should know? For example, any dependencies, or potential impacts. --> 

## /.github/PULL_REQUEST_TEMPLATE/pr_form.yml

```yml path="/.github/PULL_REQUEST_TEMPLATE/pr_form.yml" 
name: Pull Request
description: Submit a pull request to contribute to BabelDOC
title: "[PR] <Your concise title here>"
labels:
  - needs triage
body:
  - type: markdown
    attributes:
      value: |
        ## 👋 Thanks for contributing to **BabelDOC**!

        Please fill out this form to help us review your pull request effectively.

  - type: input
    id: issue
    attributes:
      label: Related Issue(s)
      description: If this pull request closes or is related to one or more issues, list them here (e.g., #37)
      placeholder: "#37"
    validations:
      required: false

  - type: textarea
    id: summary
    attributes:
      label: Description
      description: Describe the purpose of this pull request and what was changed.
      placeholder: |
        - What does this PR introduce or fix?
        - What is the motivation behind it?
    validations:
      required: true

  - type: dropdown
    id: pr_type
    attributes:
      label: PR Type
      description: What kind of change is this?
      multiple: true
      options:
        - enhancement
        - bug
        - documentation
        - refactor
        - test
        - chore
    validations:
      required: true

  - type: checkboxes
    id: checklist
    attributes:
      label: Contributor Checklist
      options:
        - label: I’ve fully read and understood the **[CONTRIBUTING.md](https://funstory-ai.github.io/BabelDOC/CONTRIBUTING/)** guide
          required: true
        - label: My changes follow the project’s code style and guidelines
          required: true
        - label: I’ve linked the related issue(s) in the description above
        - label: I’ve updated relevant documentation (if applicable)
        - label: I’ve added necessary tests (if applicable)
        - label: All new and existing tests passed locally
        - label: I understand that due to limited maintainer resources, only small pull requests are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary.

  - type: textarea
    id: testing
    attributes:
      label: Testing Instructions
      description: Provide step-by-step instructions on how to test your changes
      placeholder: |
        1. Run `...`
        2. Visit `...`
        3. Click `...`
        4. Verify `...`
    validations:
      required: false

  - type: textarea
    id: screenshots
    attributes:
      label: Screenshots (if applicable)
      description: If UI changes were made, please attach before/after screenshots.
    validations:
      required: false

  - type: textarea
    id: notes
    attributes:
      label: Additional Notes
      description: Anything else the reviewer should know?
    validations:
      required: false

```

## /.github/dependabot.yml

```yml path="/.github/dependabot.yml" 
version: 2
updates:
  - package-ecosystem: github-actions
    directory: "/"
    schedule:
      interval: weekly
  # - package-ecosystem: pip
  #   directory: "/.github/workflows"
  #   schedule:
  #     interval: weekly
  # - package-ecosystem: pip
  #   directory: "/docs"
  #   schedule:
  #     interval: weekly
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: weekly
    versioning-strategy: lockfile-only
    allow:
      - dependency-type: "all"
```

## /.github/labels.yml

```yml path="/.github/labels.yml" 
---
# Labels names are important as they are used by Release Drafter to decide
# regarding where to record them in changelog or if to skip them.
#
# The repository labels will be automatically configured using this file and
# the GitHub Action https://github.com/marketplace/actions/github-labeler.
- name: breaking
  description: Breaking Changes
  color: "bfd4f2"
- name: bug
  description: Something isn't working
  color: "d73a4a"
- name: build
  description: Build System and Dependencies
  color: "bfdadc"
- name: ci
  description: Continuous Integration
  color: "4a97d6"
- name: dependencies
  description: Pull requests that update a dependency file
  color: "0366d6"
- name: documentation
  description: Improvements or additions to documentation
  color: "0075ca"
- name: duplicate
  description: This issue or pull request already exists
  color: "cfd3d7"
- name: enhancement
  description: New feature or request
  color: "a2eeef"
- name: github_actions
  description: Pull requests that update Github_actions code
  color: "000000"
- name: good first issue
  description: Good for newcomers
  color: "7057ff"
- name: help wanted
  description: Extra attention is needed
  color: "008672"
- name: invalid
  description: This doesn't seem right
  color: "e4e669"
- name: performance
  description: Performance
  color: "016175"
- name: python
  description: Pull requests that update Python code
  color: "2b67c6"
- name: question
  description: Further information is requested
  color: "d876e3"
- name: refactoring
  description: Refactoring
  color: "ef67c4"
- name: removal
  description: Removals and Deprecations
  color: "9ae7ea"
- name: style
  description: Style
  color: "c120e5"
- name: testing
  description: Testing
  color: "b1fc6f"
- name: wontfix
  description: This will not be worked on
  color: "ffffff"
```

## /.github/release-drafter.yml

```yml path="/.github/release-drafter.yml" 
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
categories:
  - title: '🚀 Features'
    labels:
      - 'feature'
      - 'enhancement'
  - title: '🐛 Bug Fixes'
    labels:
      - 'fix'
      - 'bugfix'
      - 'bug'
  - title: '🧰 Maintenance'
    labels:
      - 'chore'
      - 'maintenance'
      - 'refactor'
  - title: '📝 Documentation'
    labels:
      - 'docs'
      - 'documentation'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions
version-resolver:
  major:
    labels:
      - 'major'
  minor:
    labels:
      - 'minor'
  patch:
    labels:
      - 'patch'
  default: patch
template: |
  ## Changes

  $CHANGES

  ## Contributors
  
  $CONTRIBUTORS

```

## /.github/workflows/codeql.yml

```yml path="/.github/workflows/codeql.yml" 
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL Advanced"

on:
  push:
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '36 14 * * 1'

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        - language: actions
        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v3
      with:
        category: "/language:${{matrix.language}}"

```

## /.github/workflows/docs.yml

```yml path="/.github/workflows/docs.yml" 
name: docs
on:
  push:
    branches:
      - main
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Configure Git Credentials
        run: |
          git config user.name github-actions[bot]
          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
      - name: Setup uv with Python 3.12
        uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1
        with:
          python-version: "3.12"
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 
      - uses: actions/cache@v4
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: uv sync
      - run: uv run mkdocs gh-deploy --force
```

## /.github/workflows/labeler.yml

```yml path="/.github/workflows/labeler.yml" 
name: Labeler

on:
  push:
    branches:
      - 'main'
    paths:
      - '.github/labels.yml'
      - '.github/workflows/labels.yml'
  pull_request:
    paths:
      - '.github/labels.yml'
      - '.github/workflows/labels.yml'

permissions:
  contents: read
  issues: write
  pull-requests: write

jobs:
  labeler:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Run Labeler
        uses: crazy-max/ghaction-github-labeler@24d110aa46a59976b8a7f35518cb7f14f434c916 # v5.3.0
        with:
          skip-delete: true
          dry-run: ${{ github.event_name == 'pull_request' }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          yaml-file: .github/labels.yml
          exclude: |
            help*
            *issue
```

## /.github/workflows/lint.yml

```yml path="/.github/workflows/lint.yml" 
name: Lint Code
permissions:
  contents: read
  pull-requests: write
on: [push]

jobs:
  lint:
    strategy:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Ruff
        uses: astral-sh/ruff-action@v3
      - name: AutoCorrect
        uses: huacnlee/autocorrect-action@main

```

## /.github/workflows/pr-lint.yml

```yml path="/.github/workflows/pr-lint.yml" 
name: Lint Code and Review Dog Report

on: [pull_request]
permissions:
  contents: read
  pull-requests: write
jobs:
  ruff:
    name: runner / ruff
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          
      - name: Install ruff
        run: pip install ruff
        
      - name: Install reviewdog
        uses: reviewdog/action-setup@e04ffabe3898a0af8d0fb1af00c188831c4b5893 # v1.3.2
        with:
          reviewdog_version: latest
          
      - name: Run ruff with reviewdog
        env:
          REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          ruff check . --output-format=rdjson | reviewdog -f=rdjson -reporter=github-pr-review -fail-on-error
          
  autocorrect:
    name: runner / autocorrect
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: AutoCorrect
        uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
      - name: Report ReviewDog
        if: failure()
        uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
        env:
          REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          reviewdog: true
```

## /.github/workflows/publish-to-pypi.yml

```yml path="/.github/workflows/publish-to-pypi.yml" 
name: Release

on:
  push:
    branches:
      - main
      - master

permissions:
  id-token: write
  contents: write
  pull-requests: write

jobs:
  check-repository:
    name: Check if running in main repository
    runs-on: ubuntu-latest
    outputs:
      is_main_repo: ${{ github.repository == 'funstory-ai/BabelDOC' }}
    steps:
      - run: echo "Running repository check"

  build:
    name: Build distribution 📦
    needs: check-repository
    if: needs.check-repository.outputs.is_main_repo == 'true'
    runs-on: ubuntu-latest
    outputs:
      is_release: ${{ steps.check-version.outputs.tag }}
    steps:
      - uses: actions/checkout@v4
        with:
          persist-credentials: true
          fetch-depth: 2
          token: ${{ secrets.GITHUB_TOKEN }}
          
      - name: Setup uv with Python 3.12
        uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1
        with:
          python-version: "3.12"
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true

      - name: Check if there is a parent commit
        id: check-parent-commit
        run: |
          echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT

      - name: Detect and tag new version
        id: check-version
        if: steps.check-parent-commit.outputs.sha
        uses: salsify/action-detect-and-tag-new-version@b1778166f13188a9d478e2d1198f993011ba9864 # v2.0.3
        with:
          version-command: |
            cat pyproject.toml | grep "version = " | head -n 1 | awk -F'"' '{print $2}'

      - name: Install Dependencies
        run: |
          uv sync

      - name: Bump version for developmental release
        if: "! steps.check-version.outputs.tag"
        run: |
          version=$(uv run bumpver update --patch --tag=final --dry 2>&1 | grep "New Version" | awk '{print $NF}') &&
          uv run bumpver update --set-version $version.dev$(date +%s)

      - name: Build package
        run: "uv build"

      - name: Store the distribution packages
        uses: actions/upload-artifact@v4.6.2
        with:
          name: python-package-distributions
          path: dist/

  publish-to-pypi:
    name: Publish Python 🐍 distribution 📦 to PyPI
    if: needs.build.outputs.is_release != ''
    needs:
      - check-repository
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/BabelDOC

    permissions:
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4

  publish-to-testpypi:
    name: Publish Python 🐍 distribution 📦 to TestPyPI
    if: needs.build.outputs.is_release == ''
    needs:
      - check-repository
      - build
    runs-on: ubuntu-latest
    environment:
      name: testpypi
      url: https://test.pypi.org/p/BabelDOC

    permissions:
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish distribution 📦 to TestPyPI
        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
        with:
          repository-url: https://test.pypi.org/legacy/

  post-release:
    name: Post Release Tasks
    needs:
      - check-repository
      - build
      - publish-to-pypi
      - publish-to-testpypi
    if: |
      always() && needs.check-repository.outputs.is_main_repo == 'true' && 
      (needs.publish-to-pypi.result == 'success' || needs.publish-to-testpypi.result == 'success')
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
        with:
          persist-credentials: true
          fetch-depth: 2
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Publish the release notes
        uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0
        with:
          publish: ${{ needs.build.outputs.is_release != '' }}
          tag: ${{ needs.build.outputs.is_release }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
```

## /.github/workflows/test.yml

```yml path="/.github/workflows/test.yml" 
name: Run Tests 🧪

on:
  push:
  pull_request:
    branches: ["main"]

permissions:
  contents: read
  pull-requests: read

jobs:
  test:
    name: Run Python Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v4
        with:
          persist-credentials: false
      - name: Cached Assets
        id: cache-assets
        uses: actions/cache@v4.2.0
        with:
          path: ~/.cache/babeldoc
          key: babeldoc-assets-${{ hashFiles('babeldoc/assets/embedding_assets_metadata.py') }}
      - name: Setup uv with Python ${{ matrix.python-version }}
        uses: astral-sh/setup-uv@bd01e18f51369d5a26f1651c3cb451d3417e3bba # v6.3.1
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true
          cache-dependency-glob: "uv.lock"
          activate-environment: true
      - name: Warm up cache
        run: |
          uv run babeldoc --warmup
      - name: Run tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAIAPIKEY }}
          OPENAI_BASE_URL: ${{ secrets.OPENAIAPIURL }}
          OPENAI_MODEL: ${{ secrets.OPENAIMODEL }}
        run: |
          uv run babeldoc --help
          uv run babeldoc --openai --files examples/ci/test.pdf --openai-api-key ${{ env.OPENAI_API_KEY }} --openai-base-url ${{ env.OPENAI_BASE_URL }} --openai-model ${{ env.OPENAI_MODEL }}
      - name: Generate offline assets package
        run: |
          uv run babeldoc --generate-offline-assets /tmp/offline_assets
      - name: Restore offline assets package
        run: |
          rm -rf ~/.cache/babeldoc
          uv run babeldoc --restore-offline-assets /tmp/offline_assets
      - name: Clean up
        run: |
          rm -rf /tmp/offline_assets
          rm -rf ~/.cache/babeldoc/cache.v1.db
          rm -rf ~/.cache/babeldoc/working

```

## /.gitignore

```gitignore path="/.gitignore" 
# Logs
web/logs
web/*.log
web/npm-debug.log*
web/yarn-debug.log*
web/yarn-error.log*
web/pnpm-debug.log*
web/lerna-debug.log*

web/node_modules
web/dist
web/dist-ssr
web/*.local

memray*
**/*.so
*.pdf
*.docx
*.json
**/*.pyc
.venv
.idea
*.egg-info
.DS_Store
.vscode
__pycache__
.ruff_cache
yadt.toml
examples/
/make_gif.py
/dist
.cache
.cursor/rules/_*.mdc
/.cursor
/xnotes
/docs/workflow-rules.md
babeldoc/format/txt
/profile.svg


# uv
uv.lock

```

## /.pre-commit-config.yaml

```yaml path="/.pre-commit-config.yaml" 
files: '^.*\.py$'
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.9.5
    hooks:
      # Run the linter.
      - id: ruff
        args: [ "--fix",
                "--ignore=E203,E261,E501,E741,F841" ]
      # Run the formatter.
      - id: ruff-format

```

## /README.md

<!-- # Yet Another Document Translator -->

<div align="center">
<!-- <img src="https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-banner.png" width="320px"  alt="YADT"/> -->

<br/>

<picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://s.immersivetranslate.com/assets/uploads/babeldoc-big-logo-darkmode-with-transparent-background-IKuNO1.svg" width="320px" alt="BabelDOC"/>
  <img src="https://s.immersivetranslate.com/assets/uploads/babeldoc-big-logo-with-transparent-background-2xweBr.svg" width="320px" alt="BabelDOC"/>
</picture>

<!-- <h2 id="title">BabelDOC</h2> -->

<p>
  <!-- PyPI -->
  <a href="https://pypi.org/project/BabelDOC/">
    <img src="https://img.shields.io/pypi/v/BabelDOC"></a>
  <a href="https://pepy.tech/projects/BabelDOC">
    <img src="https://static.pepy.tech/badge/BabelDOC"></a>
  <!-- <a href="https://github.com/funstory-ai/BabelDOC/pulls">
    <img src="https://img.shields.io/badge/contributions-welcome-green"></a> -->
  <!-- License -->
  <a href="./LICENSE">
    <img src="https://img.shields.io/github/license/funstory-ai/BabelDOC"></a>
  <a href="https://t.me/+Z9_SgnxmsmA5NzBl">
    <img src="https://img.shields.io/badge/Telegram-2CA5E0?style=flat-squeare&logo=telegram&logoColor=white"></a>
</p>

<a href="https://trendshift.io/repositories/13358" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13358" alt="funstory-ai%2FBabelDOC | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

</div>

PDF scientific paper translation and bilingual comparison library.

- **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) 1000 free pages per month.
- **Self-deployment**: [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next) support for BabelDOC, available for self-deployment + WebUI with more translation services.
- Provides a simple [command line interface](#getting-started).
- Provides a [Python API](#python-api).
- Mainly designed to be embedded into other programs, but can also be used directly for simple translation tasks.

> [!TIP]
>
> How to use BabelDOC in Zotero
>
> 1. Immersive Translate Pro members can use the [immersive-translate/zotero-immersivetranslate](https://github.com/immersive-translate/zotero-immersivetranslate) plugin
>
> 2. PDFMathTranslate self-deployed users can use the [guaguastandup/zotero-pdf2zh](https://github.com/guaguastandup/zotero-pdf2zh) plugin

[Supported Language](https://funstory-ai.github.io/BabelDOC/supported_languages/)

## Preview

<div align="center">
<img src="https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-preview.png" width="80%"/>
</div>

## We are hiring

See details: [EN](https://github.com/funstory-ai/jobs) | [ZH](https://github.com/funstory-ai/jobs/blob/main/README_ZH.md)

## Getting Started

### Install from PyPI

We recommend using the Tool feature of [uv](https://github.com/astral-sh/uv) to install yadt.

1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted.

2. Use the following command to install yadt:

```bash
uv tool install --python 3.12 BabelDOC

babeldoc --help
```

3. Use the `babeldoc` command. For example:

```bash
babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"  --files example.pdf

# multiple files
babeldoc --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"  --files example1.pdf --files example2.pdf
```

### Install from Source

We still recommend using [uv](https://github.com/astral-sh/uv) to manage virtual environments.

1. First, you need to refer to [uv installation](https://github.com/astral-sh/uv#installation) to install uv and set up the `PATH` environment variable as prompted.

2. Use the following command to install yadt:

```bash
# clone the project
git clone https://github.com/funstory-ai/BabelDOC

# enter the project directory
cd BabelDOC

# install dependencies and run babeldoc
uv run babeldoc --help
```

3. Use the `uv run babeldoc` command. For example:

```bash
uv run babeldoc --files example.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"

# multiple files
uv run babeldoc --files example.pdf --files example2.pdf --openai --openai-model "gpt-4o-mini" --openai-base-url "https://api.openai.com/v1" --openai-api-key "your-api-key-here"
```

> [!TIP]
> The absolute path is recommended.

## Advanced Options

> [!NOTE]
> This CLI is mainly for debugging purposes. Although end users can use this CLI to translate files, we do not provide any technical support for this purpose.
>
> End users should directly use **Online Service**: Beta version launched [Immersive Translate - BabelDOC](https://app.immersivetranslate.com/babel-doc/) 1000 free pages per month.
>
> End users who need self-deployment should use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next)
> 
> If you find that an option is not listed below, it means that this option is a debugging option for maintainers. Please do not use these options.


### Language Options

- `--lang-in`, `-li`: Source language code (default: en)
- `--lang-out`, `-lo`: Target language code (default: zh)

> [!TIP]
> Currently, this project mainly focuses on English-to-Chinese translation, and other scenarios have not been tested yet.
> 
> (2025.3.1 update): Basic English target language support has been added, primarily to minimize line breaks within words([0-9A-Za-z]+).
> 
> [HELP WANTED: Collecting word regular expressions for more languages](https://github.com/funstory-ai/BabelDOC/issues/129)

### PDF Processing Options

- `--files`: One or more file paths to input PDF documents.
- `--pages`, `-p`: Specify pages to translate (e.g., "1,2,1-,-3,3-5"). If not set, translate all pages
- `--split-short-lines`: Force split short lines into different paragraphs (may cause poor typesetting & bugs)
- `--short-line-split-factor`: Split threshold factor (default: 0.8). The actual threshold is the median length of all lines on the current page \* this factor
- `--skip-clean`: Skip PDF cleaning step
- `--dual-translate-first`: Put translated pages first in dual PDF mode (default: original pages first)
- `--disable-rich-text-translate`: Disable rich text translation (may help improve compatibility with some PDFs)
- `--enhance-compatibility`: Enable all compatibility enhancement options (equivalent to --skip-clean --dual-translate-first --disable-rich-text-translate)
- `--use-alternating-pages-dual`: Use alternating pages mode for dual PDF. When enabled, original and translated pages are arranged in alternate order. When disabled (default), original and translated pages are shown side by side on the same page.
- `--watermark-output-mode`: Control watermark output mode: 'watermarked' (default) adds watermark to translated PDF, 'no_watermark' doesn't add watermark, 'both' outputs both versions.
- `--max-pages-per-part`: Maximum number of pages per part for split translation. If not set, no splitting will be performed.
- `--no-watermark`: [DEPRECATED] Use --watermark-output-mode=no_watermark instead.
- `--translate-table-text`: Translate table text (experimental, default: False)
- `--formular-font-pattern`: Font pattern to identify formula text (default: None)
- `--formular-char-pattern`: Character pattern to identify formula text (default: None)
- `--show-char-box`: Show character bounding boxes (debug only, default: False)
- `--skip-scanned-detection`: Skip scanned document detection (default: False). When using split translation, only the first part performs detection if not skipped.
- `--ocr-workaround`: Use OCR workaround (default: False). Only suitable for documents with black text on white background. When enabled, white rectangular blocks will be added below the translation to cover the original text content, and all text will be forced to black color.
- `--auto-enable-ocr-workaround`: Enable automatic OCR workaround (default: False). If a document is detected as heavily scanned, this will attempt to enable OCR processing and skip further scan detection. See "Important Interaction Note" below for crucial details on how this interacts with `--ocr-workaround` and `--skip-scanned-detection`.
- `--primary-font-family`: Override primary font family for translated text. Choices: 'serif' for serif fonts, 'sans-serif' for sans-serif fonts, 'script' for script/italic fonts. If not specified, uses automatic font selection based on original text properties.
- `--only-include-translated-page`: Only include translated pages in the output PDF. This option is only effective when `--pages` is used. (default: False)

- `--rpc-doclayout`: RPC service host address for document layout analysis (default: None)
- `--working-dir`: Working directory for translation. If not set, use temp directory.
- `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled.
- `--save-auto-extracted-glossary`: Save automatically extracted glossary to the specified file. If not set, the glossary will not be saved.

> [!TIP]
> - Both `--skip-clean` and `--dual-translate-first` may help improve compatibility with some PDF readers
> - `--disable-rich-text-translate` can also help with compatibility by simplifying translation input
> - However, using `--skip-clean` will result in larger file sizes
> - If you encounter any compatibility issues, try using `--enhance-compatibility` first
> - Use `--max-pages-per-part` for large documents to split them into smaller parts for translation and automatically merge them back.
> - Use `--skip-scanned-detection` to speed up processing when you know your document is not a scanned PDF.
> - Use `--ocr-workaround` to fill background for scanned PDF. (Current assumption: background is pure white, text is pure black, this option will also auto enable `--skip-scanned-detection`)

### Translation Service Options

- `--qps`: QPS (Queries Per Second) limit for translation service (default: 4)
- `--ignore-cache`: Ignore translation cache and force retranslation
- `--no-dual`: Do not output bilingual PDF files
- `--no-mono`: Do not output monolingual PDF files
- `--min-text-length`: Minimum text length to translate (default: 5)
- `--openai`: Use OpenAI for translation (default: False)
- `--custom-system-prompt`: Custom system prompt for translation.
- `--add-formula-placehold-hint`: Add formula placeholder hint for translation. (Currently not recommended, it may affect translation quality, default: False)
- `--pool-max-workers`: Maximum number of worker threads for internal task processing pools. If not specified, defaults to QPS value. This parameter directly sets the worker count, replacing previous QPS-based dynamic calculations.
- `--no-auto-extract-glossary`: Disable automatic term extraction. If this flag is present, the step is skipped. Defaults to enabled.

> [!TIP]
>
> 1. Currently, only OpenAI-compatible LLM is supported. For more translator support, please use [PDFMathTranslate 2.0](https://github.com/PDFMathTranslate/PDFMathTranslate-next).
> 2. It is recommended to use models with strong compatibility with OpenAI, such as: `glm-4-flash`, `deepseek-chat`, etc.
> 3. Currently, it has not been optimized for traditional translation engines like Bing/Google, it is recommended to use LLMs.
> 4. You can use [litellm](https://github.com/BerriAI/litellm) to access multiple models.
> 5. `--custom-system-prompt`: It is mainly used to add the `/no_think` instruction of Qwen 3 in the prompt. For example: `--custom-system-prompt "/no_think You are a professional, authentic machine translation engine."`

### OpenAI Specific Options

- `--openai-model`: OpenAI model to use (default: gpt-4o-mini)
- `--openai-base-url`: Base URL for OpenAI API
- `--openai-api-key`: API key for OpenAI service

> [!TIP]
>
> 1. This tool supports any OpenAI-compatible API endpoints. Just set the correct base URL and API key. (e.g. `https://xxx.custom.xxx/v1`)
> 2. For local models like Ollama, you can use any value as the API key (e.g. `--openai-api-key a`).

### Glossary Options

- `--glossary-files`: Comma-separated paths to glossary CSV files.
  - Each CSV file should have the columns: `source`, `target`, and an optional `tgt_lng`.
  - The `source` column contains the term in the original language.
  - The `target` column contains the term in the target language.
  - The `tgt_lng` column (optional) specifies the target language for that specific entry (e.g., "zh-CN", "en-US").
    - If `tgt_lng` is provided for an entry, that entry will only be loaded and used if its (normalized) `tgt_lng` matches the (normalized) overall target language specified by `--lang-out`. Normalization involves lowercasing and replacing hyphens (`-`) with underscores (`_`).
    - If `tgt_lng` is omitted for an entry, that entry is considered applicable for any `--lang-out`.
  - The name of each glossary (used in LLM prompts) is derived from its filename (without the .csv extension).
  - During translation, the system will check the input text against the loaded glossaries. If terms from a glossary are found in the current text segment, that glossary (with the relevant terms) will be included in the prompt to the language model, along with an instruction to adhere to it.

### Output Control

- `--output`, `-o`: Output directory for translated files. If not set, use current working directory.
- `--debug`: Enable debug logging level and export detailed intermediate results in `~/.cache/yadt/working`.
- `--report-interval`: Progress report interval in seconds (default: 0.1).

### General Options

- `--warmup`: Only download and verify required assets then exit (default: False)

### Offline Assets Management

- `--generate-offline-assets`: Generate an offline assets package in the specified directory. This creates a zip file containing all required models and fonts.
- `--restore-offline-assets`: Restore an offline assets package from the specified file. This extracts models and fonts from a previously generated package.

> [!TIP]
> 
> 1. Offline assets packages are useful for environments without internet access or to speed up installation on multiple machines.
> 2. Generate a package once with `babeldoc --generate-offline-assets /path/to/output/dir` and then distribute it.
> 3. Restore the package on target machines with `babeldoc --restore-offline-assets /path/to/offline_assets_*.zip`.
> 4. The offline assets package name cannot be modified because the file list hash is encoded in the name.
> 5. If you provide a directory path to `--restore-offline-assets`, the tool will automatically look for the correct offline assets package file in that directory.
> 6. The package contains all necessary fonts and models required for document processing, ensuring consistent results across different environments.
> 7. The integrity of all assets is verified using SHA3-256 hashes during both packaging and restoration.
> 8. If you're deploying in an air-gapped environment, make sure to generate the package on a machine with internet access first.

### Configuration File

- `--config`, `-c`: Configuration file path. Use the TOML format.

Example Configuration:

```toml
[babeldoc]
# Basic settings
debug = true
lang-in = "en-US"
lang-out = "zh-CN"
qps = 10
output = "/path/to/output/dir"

# PDF processing options
split-short-lines = false
short-line-split-factor = 0.8
skip-clean = false
dual-translate-first = false
disable-rich-text-translate = false
use-alternating-pages-dual = false
watermark-output-mode = "watermarked"  # Choices: "watermarked", "no_watermark", "both"
max-pages-per-part = 50  # Automatically split the document for translation and merge it back.
only_include_translated_page = false # Only include translated pages in the output PDF. Effective only when `pages` is used.
# no-watermark = false  # DEPRECATED: Use watermark-output-mode instead
skip-scanned-detection = false  # Skip scanned document detection for faster processing
auto_extract_glossary = true # Set to false to disable automatic term extraction
formular_font_pattern = "" # Font pattern for formula text
formular_char_pattern = "" # Character pattern for formula text
show_char_box = false # Show character bounding boxes (debug)
ocr_workaround = false # Use OCR workaround for scanned PDFs
rpc_doclayout = "" # RPC service host for document layout analysis
working_dir = "" # Working directory for translation
auto_enable_ocr_workaround = false # Enable automatic OCR workaround for scanned PDFs. See docs for interaction with ocr_workaround and skip_scanned_detection.

# Translation service
openai = true
openai-model = "gpt-4o-mini"
openai-base-url = "https://api.openai.com/v1"
openai-api-key = "your-api-key-here"
pool-max-workers = 8  # Maximum worker threads for task processing (defaults to QPS value if not set)

# Glossary Options (Optional)
# glossary-files = "/path/to/glossary1.csv,/path/to/glossary2.csv"

# Output control
no-dual = false
no-mono = false
min-text-length = 5
report-interval = 0.5

# Offline assets management
# Uncomment one of these options as needed:
# generate-offline-assets = "/path/to/output/dir"
# restore-offline-assets = "/path/to/offline_assets_package.zip"
```

## Python API

> [!TIP]
>
> 1. Before pdf2zh 2.0 is released, you can temporarily use BabelDOC's Python API. However, after pdf2zh 2.0 is released, please directly use pdf2zh's Python API.
>
> 2. This project's Python API does not guarantee any compatibility. However, the Python API from pdf2zh will guarantee a certain level of compatibility.
>
> 3. We do not provide any technical support for the BabelDOC API.
>
> 4. When performing secondary development, please refer to [pdf2zh 2.0 high level](https://github.com/PDFMathTranslate/PDFMathTranslate-next/blob/main/pdf2zh_next/high_level.py) and ensure that BabelDOC runs in a subprocess.

You can refer to the example in [main.py](https://github.com/funstory-ai/yadt/blob/main/babeldoc/main.py) to use BabelDOC's Python API.

Please note:

1. Make sure call `babeldoc.format.pdf.high_level.init()` before using the API

2. The current `TranslationConfig` does not fully validate input parameters, so you need to ensure the validity of input parameters

3. For offline assets management, you can use the following functions:
   ```python
   # Generate an offline assets package
   from pathlib import Path
   import babeldoc.assets.assets
   
   # Generate package to a specific directory
   # path is optional, default is ~/.cache/babeldoc/assets/offline_assets_{hash}.zip
   babeldoc.assets.assets.generate_offline_assets_package(Path("/path/to/output/dir"))
   
   # Restore from a package file
   # path is optional, default is ~/.cache/babeldoc/assets/offline_assets_{hash}.zip
   babeldoc.assets.assets.restore_offline_assets_package(Path("/path/to/offline_assets_package.zip"))
   
   # You can also restore from a directory containing the offline assets package
   # The tool will automatically find the correct package file based on the hash
   babeldoc.assets.assets.restore_offline_assets_package(Path("/path/to/directory"))
   ```

> [!TIP]
> 
> 1. The offline assets package name cannot be modified because the file list hash is encoded in the name.
> 2. When using in production environments, it's recommended to pre-generate the assets package and include it with your application distribution.
> 3. The package verification ensures that all required assets are intact and match their expected checksums.

## Background

There are a lot projects and teams working on to make document editing and translating easier like:

- [mathpix](https://mathpix.com/)
- [Doc2X](https://doc2x.noedgeai.com/)
- [minerU](https://github.com/opendatalab/MinerU)
- [PDFMathTranslate](https://github.com/funstory-ai/yadt)

There are also some solutions to solve specific parts of the problem like:

- [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader): the read order of the text block in a pdf
- [Surya](https://github.com/surya-is/surya): the structure of the pdf

This project hopes to promote a standard pipeline and interface to solve the problem.

In fact, there are two main stages of a PDF parser or translator:

- **Parsing**: A stage of parsing means to get the structure of the pdf such as text blocks, images, tables, etc.
- **Rendering**: A stage of rendering means to render the structure into a new pdf or other format.

For a service like mathpix, it will parse the pdf into a structure may be in a XML format, and then render them using a single column reader order as [layoutreader](https://github.com/microsoft/unilm/tree/master/layoutreader) does. The bad news is that the original structure lost.

Some people will use Adobe PDF Parser because it will generate a Word document and it keeps the original structure. But it is somewhat expensive.
And you know, a pdf or word document is not a good format for reading in mobile devices.

We offer an intermediate representation of the results from parser and can be rendered into a new pdf or other format. The pipeline is also a plugin-based system which everybody can add their new model, ocr, renderer, etc.

## Roadmap

- [ ] Add line support
- [ ] Add table support
- [ ] Add cross-page/cross-column paragraph support
- [ ] More advanced typesetting features
- [ ] Outline support
- [ ] ...

Our first 1.0 version goal is to finish a translation from [PDF Reference, Version 1.7](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf) to the following language version:

- Simplified Chinese
- Traditional Chinese
- Japanese
- Spanish

And meet the following requirements:

- layout error less than 1%
- content loss less than 1%

## Version Number Explanation

This project uses a combination of [Semantic Versioning](https://semver.org/) and [Pride Versioning](https://pridever.org/). The version number format is: "0.MAJOR.MINOR".

- MAJOR: Incremented by 1 when API incompatible changes are made or when proud improvements are implemented.

- MINOR: Incremented by 1 when any API compatible changes are made.

## Known Issues

1. Parsing errors in the author and reference sections; they get merged into one paragraph after translation.
2. Lines are not supported.
3. Does not support drop caps.
4. Large pages will be skipped.

## How to Contribute

We encourage you to contribute to YADT! Please check out the [CONTRIBUTING](https://github.com/funstory-ai/yadt/blob/main/docs/CONTRIBUTING.md) guide.

Everyone interacting in YADT and its sub-projects' codebases, issue trackers, chat rooms, and mailing lists is expected to follow the YADT [Code of Conduct](https://github.com/funstory-ai/yadt/blob/main/docs/CODE_OF_CONDUCT.md).

[Immersive Translation](https://immersivetranslate.com) sponsors monthly Pro membership redemption codes for active contributors to this project, see details at: [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md)

## Acknowledgements

- [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate)
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
- [pdfminer](https://github.com/pdfminer/pdfminer.six)
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
- [Asynchronize](https://github.com/multimeric/Asynchronize/tree/master?tab=readme-ov-file)
- [PriorityThreadPoolExecutor](https://github.com/oleglpts/PriorityThreadPoolExecutor)

<h2 id="star_hist">Star History</h2>

<a href="https://star-history.com/#funstory-ai/babeldoc&Date">
 <picture>
   <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date&theme=dark" />
   <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date" />
   <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=funstory-ai/babeldoc&type=Date"/>
 </picture>
</a>

> [!WARNING]
> **Important Interaction Note for `--auto-enable-ocr-workaround`:**
>
> When `--auto-enable-ocr-workaround` is set to `true` (either via command line or config file):
>
> 1.  During the initial setup, the values for `ocr_workaround` and `skip_scanned_detection` will be forced to `false` by `TranslationConfig`, regardless of whether you also set `--ocr-workaround` or `--skip-scanned-detection` flags.
> 2.  Then, during the scanned document detection phase (`DetectScannedFile` stage):
>     *   If the document is identified as heavily scanned (e.g., >80% scanned pages) AND `auto_enable_ocr_workaround` is `true` (i.e., `translation_config.auto_enable_ocr_workaround` is true), the system will then attempt to set both `ocr_workaround` to `true` and `skip_scanned_detection` to `true`.
>
> This means that `--auto-enable-ocr-workaround` effectively gives the system control to enable OCR processing for scanned documents, potentially overriding manual settings for `--ocr-workaround` and `--skip_scanned_detection` based on its detection results. If the document is *not* detected as heavily scanned, then the initial `false` values for `ocr_workaround` and `skip_scanned_detection` (forced by `--auto-enable-ocr-workaround` at the `TranslationConfig` initialization stage) will remain in effect unless changed by other logic.

## /babeldoc/__init__.py

```py path="/babeldoc/__init__.py" 
__version__ = "0.4.15"

```

## /babeldoc/assets/assets.py

```py path="/babeldoc/assets/assets.py" 
import asyncio
import hashlib
import logging
import threading
import zipfile
from pathlib import Path

import httpx
from babeldoc.assets import embedding_assets_metadata
from babeldoc.assets.embedding_assets_metadata import DOC_LAYOUT_ONNX_MODEL_URL
from babeldoc.assets.embedding_assets_metadata import (
    DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256,
)
from babeldoc.assets.embedding_assets_metadata import EMBEDDING_FONT_METADATA
from babeldoc.assets.embedding_assets_metadata import FONT_METADATA_URL
from babeldoc.assets.embedding_assets_metadata import FONT_URL_BY_UPSTREAM
from babeldoc.assets.embedding_assets_metadata import (
    TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256,
)
from babeldoc.assets.embedding_assets_metadata import TABLE_DETECTION_RAPIDOCR_MODEL_URL
from babeldoc.assets.embedding_assets_metadata import TIKTOKEN_CACHES
from babeldoc.const import get_cache_file_path
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_exponential

logger = logging.getLogger(__name__)


class ResultContainer:
    def __init__(self):
        self.result = None

    def set_result(self, result):
        self.result = result


def run_in_another_thread(coro):
    result_container = ResultContainer()

    def _wrapper():
        result_container.set_result(asyncio.run(coro))

    thread = threading.Thread(target=_wrapper)
    thread.start()
    thread.join()
    return result_container.result


def run_coro(coro):
    return run_in_another_thread(coro)


def _retry_if_not_cancelled_and_failed(retry_state):
    """Only retry if the exception is not CancelledError and the attempt failed."""
    if retry_state.outcome.failed:
        exception = retry_state.outcome.exception()
        # Don't retry on CancelledError
        if isinstance(exception, asyncio.CancelledError):
            logger.debug("Operation was cancelled, not retrying")
            return False
        # Retry on network related errors
        if isinstance(
            exception, httpx.HTTPError | ConnectionError | ValueError | TimeoutError
        ):
            logger.warning(f"Network error occurred: {exception}, will retry")
            return True
    # Don't retry on success
    return False


def verify_file(path: Path, sha3_256: str):
    if not path.exists():
        return False
    hash_ = hashlib.sha3_256()
    with path.open("rb") as f:
        while True:
            chunk = f.read(1024 * 1024)
            if not chunk:
                break
            hash_.update(chunk)
    return hash_.hexdigest() == sha3_256


@retry(
    retry=_retry_if_not_cancelled_and_failed,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=lambda retry_state: logger.warning(
        f"Download file failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
async def download_file(
    client: httpx.AsyncClient | None = None,
    url: str = None,
    path: Path = None,
    sha3_256: str = None,
):
    if client is None:
        async with httpx.AsyncClient() as client:
            response = await client.get(url, follow_redirects=True)
    else:
        response = await client.get(url, follow_redirects=True)

    response.raise_for_status()
    with path.open("wb") as f:
        f.write(response.content)
    if not verify_file(path, sha3_256):
        path.unlink(missing_ok=True)
        raise ValueError(f"File {path} is corrupted")


@retry(
    retry=_retry_if_not_cancelled_and_failed,
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=15),
    before_sleep=lambda retry_state: logger.warning(
        f"Get font metadata failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
async def get_font_metadata(
    client: httpx.AsyncClient | None = None, upstream: str = None
):
    if upstream not in FONT_METADATA_URL:
        logger.critical(f"Invalid upstream: {upstream}")
        exit(1)

    if client is None:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                FONT_METADATA_URL[upstream], follow_redirects=True
            )
    else:
        response = await client.get(FONT_METADATA_URL[upstream], follow_redirects=True)

    response.raise_for_status()
    logger.debug(f"Get font metadata from {upstream} success")
    return upstream, response.json()


async def get_fastest_upstream_for_font(
    client: httpx.AsyncClient | None = None, exclude_upstream: list[str] = None
):
    tasks: list[asyncio.Task[tuple[str, dict]]] = []
    for upstream in FONT_METADATA_URL:
        if exclude_upstream and upstream in exclude_upstream:
            continue
        tasks.append(asyncio.create_task(get_font_metadata(client, upstream)))
    for future in asyncio.as_completed(tasks):
        try:
            result = await future
            for task in tasks:
                if not task.done():
                    task.cancel()
            return result
        except Exception as e:
            logger.exception(f"Error getting font metadata: {e}")
    logger.error("All upstreams failed")
    return None, None


async def get_fastest_upstream_for_model(client: httpx.AsyncClient | None = None):
    return await get_fastest_upstream_for_font(client, exclude_upstream=["github"])


async def get_fastest_upstream(client: httpx.AsyncClient | None = None):
    (
        fastest_upstream_for_font,
        online_font_metadata,
    ) = await get_fastest_upstream_for_font(client)
    if fastest_upstream_for_font is None:
        logger.error("Failed to get fastest upstream")
        exit(1)

    if fastest_upstream_for_font == "github":
        # since github is only store font, we need to get the fastest upstream for model
        fastest_upstream_for_model, _ = await get_fastest_upstream_for_model(client)
        if fastest_upstream_for_model is None:
            logger.error("Failed to get fastest upstream")
            exit(1)
    else:
        fastest_upstream_for_model = fastest_upstream_for_font

    return online_font_metadata, fastest_upstream_for_font, fastest_upstream_for_model


async def get_doclayout_onnx_model_path_async(client: httpx.AsyncClient | None = None):
    onnx_path = get_cache_file_path(
        "doclayout_yolo_docstructbench_imgsz1024.onnx", "models"
    )
    if verify_file(onnx_path, DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256):
        return onnx_path

    logger.info("doclayout onnx model not found or corrupted, downloading...")
    fastest_upstream, _ = await get_fastest_upstream_for_model(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        exit(1)

    url = DOC_LAYOUT_ONNX_MODEL_URL[fastest_upstream]

    await download_file(
        client, url, onnx_path, DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256
    )
    logger.info(f"Download doclayout onnx model from {fastest_upstream} success")
    return onnx_path


async def get_table_detection_rapidocr_model_path_async(
    client: httpx.AsyncClient | None = None,
):
    onnx_path = get_cache_file_path("ch_PP-OCRv4_det_infer.onnx", "models")
    if verify_file(onnx_path, TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256):
        return onnx_path

    logger.info("table detection rapidocr model not found or corrupted, downloading...")
    fastest_upstream, _ = await get_fastest_upstream_for_model(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        exit(1)

    url = TABLE_DETECTION_RAPIDOCR_MODEL_URL[fastest_upstream]

    await download_file(client, url, onnx_path, TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256)
    logger.info(
        f"Download table detection rapidocr model from {fastest_upstream} success"
    )
    return onnx_path


def get_doclayout_onnx_model_path():
    return run_coro(get_doclayout_onnx_model_path_async())


def get_table_detection_rapidocr_model_path():
    return run_coro(get_table_detection_rapidocr_model_path_async())


def get_font_url_by_name_and_upstream(font_file_name: str, upstream: str):
    if upstream not in FONT_URL_BY_UPSTREAM:
        logger.critical(f"Invalid upstream: {upstream}")
        exit(1)

    return FONT_URL_BY_UPSTREAM[upstream](font_file_name)


async def get_font_and_metadata_async(
    font_file_name: str,
    client: httpx.AsyncClient | None = None,
    fastest_upstream: str | None = None,
    font_metadata: dict | None = None,
):
    cache_file_path = get_cache_file_path(font_file_name, "fonts")
    if font_file_name in EMBEDDING_FONT_METADATA and verify_file(
        cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]["sha3_256"]
    ):
        return cache_file_path, EMBEDDING_FONT_METADATA[font_file_name]

    logger.info(f"Font {cache_file_path} not found or corrupted, downloading...")
    if fastest_upstream is None:
        fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
        if fastest_upstream is None:
            logger.critical("Failed to get fastest upstream")
            exit(1)

        if font_file_name not in font_metadata:
            logger.critical(f"Font {font_file_name} not found in {font_metadata}")
            exit(1)

        if verify_file(cache_file_path, font_metadata[font_file_name]["sha3_256"]):
            return cache_file_path, font_metadata[font_file_name]

    assert font_metadata is not None

    url = get_font_url_by_name_and_upstream(font_file_name, fastest_upstream)
    if "sha3_256" not in font_metadata[font_file_name]:
        logger.critical(f"Font {font_file_name} not found in {font_metadata}")
        exit(1)
    await download_file(
        client, url, cache_file_path, font_metadata[font_file_name]["sha3_256"]
    )
    return cache_file_path, font_metadata[font_file_name]


def get_font_and_metadata(font_file_name: str):
    return run_coro(get_font_and_metadata_async(font_file_name))


def get_font_family(lang_code: str):
    font_family = embedding_assets_metadata.get_font_family(lang_code)
    return font_family


async def download_all_fonts_async(client: httpx.AsyncClient | None = None):
    for font_file_name in EMBEDDING_FONT_METADATA:
        if not verify_file(
            get_cache_file_path(font_file_name, "fonts"),
            EMBEDDING_FONT_METADATA[font_file_name]["sha3_256"],
        ):
            break
    else:
        logger.debug("All fonts are already downloaded")
        return

    fastest_upstream, font_metadata = await get_fastest_upstream_for_font(client)
    if fastest_upstream is None:
        logger.error("Failed to get fastest upstream")
        exit(1)
    logger.info(f"Downloading fonts from {fastest_upstream}")

    font_tasks = [
        asyncio.create_task(
            get_font_and_metadata_async(
                font_file_name, client, fastest_upstream, font_metadata
            )
        )
        for font_file_name in EMBEDDING_FONT_METADATA
    ]
    await asyncio.gather(*font_tasks)


async def async_warmup():
    logger.info("Downloading all assets...")
    from tiktoken import encoding_for_model

    _ = encoding_for_model("gpt-4o")
    async with httpx.AsyncClient() as client:
        onnx_task = asyncio.create_task(get_doclayout_onnx_model_path_async(client))
        onnx_task2 = asyncio.create_task(
            get_table_detection_rapidocr_model_path_async(client)
        )
        font_tasks = asyncio.create_task(download_all_fonts_async(client))
        await asyncio.gather(onnx_task, onnx_task2, font_tasks)


def warmup():
    run_coro(async_warmup())


def generate_all_assets_file_list():
    result = {}
    result["fonts"] = []
    result["models"] = []
    result["tiktoken"] = []
    for font_file_name in EMBEDDING_FONT_METADATA:
        result["fonts"].append(
            {
                "name": font_file_name,
                "sha3_256": EMBEDDING_FONT_METADATA[font_file_name]["sha3_256"],
            }
        )
    for tiktoken_file, sha3_256 in TIKTOKEN_CACHES.items():
        result["tiktoken"].append(
            {
                "name": tiktoken_file,
                "sha3_256": sha3_256,
            }
        )
    result["models"].append(
        {
            "name": "doclayout_yolo_docstructbench_imgsz1024.onnx",
            "sha3_256": DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256,
        },
    )
    result["models"].append(
        {
            "name": "ch_PP-OCRv4_det_infer.onnx",
            "sha3_256": TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256,
        },
    )
    return result


async def generate_offline_assets_package_async(output_directory: Path | None = None):
    await async_warmup()
    logger.info("Generating offline assets package...")
    file_list = generate_all_assets_file_list()
    offline_assets_tag = get_offline_assets_tag(file_list)
    if output_directory is None:
        output_path = get_cache_file_path(
            f"offline_assets_{offline_assets_tag}.zip", "assets"
        )
    else:
        output_directory.mkdir(parents=True, exist_ok=True)
        output_path = output_directory / f"offline_assets_{offline_assets_tag}.zip"
    with zipfile.ZipFile(
        output_path, "w", compression=zipfile.ZIP_DEFLATED, compresslevel=9
    ) as zipf:
        for file_type, file_descs in file_list.items():
            # zipf.mkdir(file_type)
            for file_desc in file_descs:
                file_name = file_desc["name"]
                sha3_256 = file_desc["sha3_256"]
                file_path = get_cache_file_path(file_name, file_type)
                if not verify_file(file_path, sha3_256):
                    logger.error(f"File {file_path} is corrupted")
                    exit(1)

                with file_path.open("rb") as f:
                    zipf.writestr(f"{file_type}/{file_name}", f.read())
    logger.info(f"Offline assets package generated at {output_path}")


async def restore_offline_assets_package_async(input_path: Path | None = None):
    file_list = generate_all_assets_file_list()
    offline_assets_tag = get_offline_assets_tag(file_list)
    if input_path is None:
        input_path = get_cache_file_path(
            f"offline_assets_{offline_assets_tag}.zip", "assets"
        )
    else:
        if input_path.exists() and input_path.is_dir():
            input_path = input_path / f"offline_assets_{offline_assets_tag}.zip"
        if not input_path.exists():
            logger.critical(f"Offline assets package not found: {input_path}")
            exit(1)

        import re

        offline_assets_tag_from_input_path = re.match(
            r"offline_assets_(.*)\.zip", input_path.name
        ).group(1)
        if offline_assets_tag != offline_assets_tag_from_input_path:
            logger.critical(
                f"Offline assets tag mismatch: {offline_assets_tag} != {offline_assets_tag_from_input_path}"
            )
            exit(1)
    nothing_changed = True
    with zipfile.ZipFile(input_path, "r") as zipf:
        for file_type, file_descs in file_list.items():
            for file_desc in file_descs:
                file_name = file_desc["name"]
                file_path = get_cache_file_path(file_name, file_type)

                if verify_file(file_path, file_desc["sha3_256"]):
                    continue
                nothing_changed = False
                with zipf.open(f"{file_type}/{file_name}", "r") as f:
                    with file_path.open("wb") as f2:
                        f2.write(f.read())
                if not verify_file(file_path, file_desc["sha3_256"]):
                    logger.critical(
                        "Offline assets package is corrupted, please delete it and try again"
                    )
                    exit(1)
    if not nothing_changed:
        logger.info(f"Offline assets package restored from {input_path}")


def get_offline_assets_tag(file_list: dict | None = None):
    if file_list is None:
        file_list = generate_all_assets_file_list()
    import orjson

    # noinspection PyTypeChecker
    offline_assets_tag = hashlib.sha3_256(
        orjson.dumps(
            file_list,
            option=orjson.OPT_APPEND_NEWLINE
            | orjson.OPT_INDENT_2
            | orjson.OPT_SORT_KEYS,
        )
    ).hexdigest()
    return offline_assets_tag


def generate_offline_assets_package(output_directory: Path | None = None):
    return run_coro(generate_offline_assets_package_async(output_directory))


def restore_offline_assets_package(input_path: Path | None = None):
    return run_coro(restore_offline_assets_package_async(input_path))


if __name__ == "__main__":
    from rich.logging import RichHandler

    logging.basicConfig(level=logging.DEBUG, handlers=[RichHandler()])
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)
    # warmup()
    # generate_offline_assets_package()
    # restore_offline_assets_package(Path(
    #     '/Users/aw/.cache/babeldoc/assets/offline_assets_33971e4940e90ba0c35baacda44bbe83b214f4703a7bdb8b837de97d0383508c.zip'))
    # warmup()

```

## /babeldoc/assets/embedding_assets_metadata.py

```py path="/babeldoc/assets/embedding_assets_metadata.py" 
import itertools

DOCLAYOUT_YOLO_DOCSTRUCTBENCH_IMGSZ1024ONNX_SHA3_256 = (
    "60be061226930524958b5465c8c04af3d7c03bcb0beb66454f5da9f792e3cf2a"
)

TABLE_DETECTION_RAPIDOCR_MODEL_SHA3_256 = (
    "062f4619afe91b33147c033acadecbb53f2a7b99ac703d157b96d5b10948da5e"
)

TIKTOKEN_CACHES = {
    "fb374d419588a4632f3f557e76b4b70aebbca790": "cb04bcda5782cfbbe77f2f991d92c0ea785d9496ef1137c91dfc3c8c324528d6"
}

FONT_METADATA_URL = {
    "github": "https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/font_metadata.json",
    "huggingface": "https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true",
    "hf-mirror": "https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/font_metadata.json?download=true",
    "modelscope": "https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/font_metadata.json",
}

FONT_URL_BY_UPSTREAM = {
    "github": lambda name: f"https://raw.githubusercontent.com/funstory-ai/BabelDOC-Assets/refs/heads/main/fonts/{name}",
    "huggingface": lambda name: f"https://huggingface.co/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true",
    "hf-mirror": lambda name: f"https://hf-mirror.com/datasets/awwaawwa/BabelDOC-Assets/resolve/main/fonts/{name}?download=true",
    "modelscope": lambda name: f"https://www.modelscope.cn/datasets/awwaawwa/BabelDOCAssets/resolve/master/fonts/{name}",
}

DOC_LAYOUT_ONNX_MODEL_URL = {
    "huggingface": "https://huggingface.co/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true",
    "hf-mirror": "https://hf-mirror.com/wybxc/DocLayout-YOLO-DocStructBench-onnx/resolve/main/doclayout_yolo_docstructbench_imgsz1024.onnx?download=true",
    "modelscope": "https://www.modelscope.cn/models/AI-ModelScope/DocLayout-YOLO-DocStructBench-onnx/resolve/master/doclayout_yolo_docstructbench_imgsz1024.onnx",
}

TABLE_DETECTION_RAPIDOCR_MODEL_URL = {
    "huggingface": "https://huggingface.co/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx",
    "hf-mirror": "https://hf-mirror.com/spaces/RapidAI/RapidOCR/resolve/main/models/text_det/ch_PP-OCRv4_det_infer.onnx",
    "modelscope": "https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/master/onnx/PP-OCRv4/det/ch_PP-OCRv4_det_infer.onnx",
}

# from https://github.com/funstory-ai/BabelDOC-Assets/blob/main/font_metadata.json
EMBEDDING_FONT_METADATA = {
    "GoNotoKurrent-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "GoNotoKurrent-Bold.ttf",
        "font_name": "Go Noto Kurrent-Bold Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "000b37f592477945b27b7702dcad39f73e23e140e66ddff9847eb34f32389566",
        "size": 15303772,
    },
    "GoNotoKurrent-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "GoNotoKurrent-Regular.ttf",
        "font_name": "Go Noto Kurrent-Regular Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "4324a60d507c691e6efc97420647f4d2c2d86d9de35009d1c769861b76074ae6",
        "size": 15515760,
    },
    "KleeOne-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "KleeOne-Regular.ttf",
        "font_name": "Klee One Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "8585c29f89b322d937f83739f61ede5d84297873e1465cad9a120a208ac55ce0",
        "size": 8724704,
    },
    "LXGWWenKaiGB-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiGB-Regular.ttf",
        "font_name": "LXGW WenKai GB Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "b563a5e8d9db4cd15602a3a3700b01925e80a21f99fb88e1b763b1fb8685f8ee",
        "size": 19558756,
    },
    "LXGWWenKaiMonoTC-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -241,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiMonoTC-Regular.ttf",
        "font_name": "LXGW WenKai Mono TC Regular",
        "italic": 0,
        "monospace": 1,
        "serif": 0,
        "sha3_256": "596b278d11418d374a1cfa3a50cbfb82b31db82d3650cfacae8f94311b27fdc5",
        "size": 13115416,
    },
    "LXGWWenKaiTC-Regular.ttf": {
        "ascent": 928,
        "bold": 0,
        "descent": -256,
        "encoding_length": 2,
        "file_name": "LXGWWenKaiTC-Regular.ttf",
        "font_name": "LXGW WenKai TC Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "66ccd0ffe8e56cd585dabde8d1292c3f551b390d8ed85f81d7a844825f9c2379",
        "size": 13100328,
    },
    "MaruBuri-Regular.ttf": {
        "ascent": 800,
        "bold": 0,
        "descent": -200,
        "encoding_length": 2,
        "file_name": "MaruBuri-Regular.ttf",
        "font_name": "MaruBuri Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "abb672dde7b89e06914ce27c59159b7a2933f26207bfcc47981c67c11c41e6d1",
        "size": 3268988,
    },
    "NotoSans-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Bold.ttf",
        "font_name": "Noto Sans Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "ecd38d472c1cad07d8a5dffd2b5a0f72edcd40fff2b4e68d770da8f2ef343a82",
        "size": 630964,
    },
    "NotoSans-BoldItalic.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-BoldItalic.ttf",
        "font_name": "Noto Sans Bold Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "0b6c690a4a6b7d605b2ecbde00c7ac1a23e60feb17fa30d8b972d61ec3ff732b",
        "size": 644340,
    },
    "NotoSans-Italic.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Italic.ttf",
        "font_name": "Noto Sans Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "830652f61724c017e5a29a96225b484a2ccbd25f69a1b3f47e5f466a2dbed1ad",
        "size": 642344,
    },
    "NotoSans-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSans-Regular.ttf",
        "font_name": "Noto Sans Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "7dfe2bbf97dc04c852d1223b220b63430e6ad03b0dbb28ebe6328a20a2d45eb8",
        "size": 629024,
    },
    "NotoSerif-Bold.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Bold.ttf",
        "font_name": "Noto Serif Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "28d88d924285eadb9f9ce49f2d2b95473f89a307b226c5f6ebed87a654898312",
        "size": 506864,
    },
    "NotoSerif-BoldItalic.ttf": {
        "ascent": 1069,
        "bold": 1,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-BoldItalic.ttf",
        "font_name": "Noto Serif Bold Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "b69ee56af6351b2fb4fbce623f8e1c1f9fb19170686a9e5db2cf260b8cf24ac7",
        "size": 535724,
    },
    "NotoSerif-Italic.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Italic.ttf",
        "font_name": "Noto Serif Italic",
        "italic": 1,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "9b7773c24ab8a29e3c1c03efa4ab652d051e4c209134431953463aa946d62868",
        "size": 535340,
    },
    "NotoSerif-Regular.ttf": {
        "ascent": 1069,
        "bold": 0,
        "descent": -293,
        "encoding_length": 2,
        "file_name": "NotoSerif-Regular.ttf",
        "font_name": "Noto Serif Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "c2bbe984e65bafd3bcd38b3cb1e1344f3b7b79d6beffc7a3d883b57f8358559d",
        "size": 504932,
    },
    "SourceHanSansCN-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansCN-Bold.ttf",
        "font_name": "Source Han Sans CN Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "82314c11016a04ef03e7afd00abe0ccc8df54b922dee79abf6424f3002a31825",
        "size": 10174460,
    },
    "SourceHanSansCN-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansCN-Regular.ttf",
        "font_name": "Source Han Sans CN Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "b45a80cf3650bfc62aa014e58243c6325e182c4b0c5819e41a583c699cce9a8f",
        "size": 10397552,
    },
    "SourceHanSansHK-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansHK-Bold.ttf",
        "font_name": "Source Han Sans HK Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "3eecd57457ba9a0fbad6c794f40e7ae704c4f825091aef2ac18902ffdde50608",
        "size": 6856692,
    },
    "SourceHanSansHK-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansHK-Regular.ttf",
        "font_name": "Source Han Sans HK Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "5fe4141f9164c03616323400b2936ee4c8265314492e2b822c3a6fbfb63ffe08",
        "size": 6999792,
    },
    "SourceHanSansJP-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansJP-Bold.ttf",
        "font_name": "Source Han Sans JP Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "fb05bd84d62e8064117ee357ab6a4481e1cde931e8e984c0553c8c4b09dc3938",
        "size": 5603068,
    },
    "SourceHanSansJP-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansJP-Regular.ttf",
        "font_name": "Source Han Sans JP Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "722cfbdcc0fd83fe07a3d1b10e9e64343c924a351d02cfe8dbb6ec4c6bc38230",
        "size": 5723960,
    },
    "SourceHanSansKR-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansKR-Bold.ttf",
        "font_name": "Source Han Sans KR Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "02959eb2c1eea0786a736aeb50b6e61f2ab873cd69c659389b7511f80f734838",
        "size": 5858892,
    },
    "SourceHanSansKR-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansKR-Regular.ttf",
        "font_name": "Source Han Sans KR Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "aba70109eff718e8f796f0185f8dca38026c1661b43c195883c84577e501adf2",
        "size": 5961704,
    },
    "SourceHanSansTW-Bold.ttf": {
        "ascent": 1160,
        "bold": 1,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansTW-Bold.ttf",
        "font_name": "Source Han Sans TW Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "4a92730e644a1348e87bba7c77e9b462f257f381bd6abbeac5860d8f8306aee6",
        "size": 6883224,
    },
    "SourceHanSansTW-Regular.ttf": {
        "ascent": 1160,
        "bold": 0,
        "descent": -288,
        "encoding_length": 2,
        "file_name": "SourceHanSansTW-Regular.ttf",
        "font_name": "Source Han Sans TW Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 0,
        "sha3_256": "6129b68ff4b0814624cac7edca61fbacf8f4d79db6f4c3cfc46b1c48ea2f81ac",
        "size": 7024812,
    },
    "SourceHanSerifCN-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifCN-Bold.ttf",
        "font_name": "Source Han Serif CN Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "77816a54957616e140e25a36a41fc061ddb505a1107de4e6a65f561e5dcf8310",
        "size": 14134156,
    },
    "SourceHanSerifCN-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifCN-Regular.ttf",
        "font_name": "Source Han Serif CN Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "c8bf74da2c3b7457c9d887465b42fb6f80d3d84f361cfe5b0673a317fb1f85ad",
        "size": 14047768,
    },
    "SourceHanSerifHK-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifHK-Bold.ttf",
        "font_name": "Source Han Serif HK Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "0f81296f22846b622a26f7342433d6c5038af708a32fc4b892420c150227f4bb",
        "size": 9532580,
    },
    "SourceHanSerifHK-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifHK-Regular.ttf",
        "font_name": "Source Han Serif HK Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "d5232ec3adf4fb8604bb4779091169ec9bd9d574b513e4a75752e614193afebe",
        "size": 9467292,
    },
    "SourceHanSerifJP-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifJP-Bold.ttf",
        "font_name": "Source Han Serif JP Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "a4a8c22e8ec7bb6e66b9caaff1e12c7a52b5a4201eec3d074b35957c0126faef",
        "size": 7811832,
    },
    "SourceHanSerifJP-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifJP-Regular.ttf",
        "font_name": "Source Han Serif JP Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "3d1f9933c7f3abc8c285e317119a533e6dcfe6027d1f5f066ba71b3eb9161e9c",
        "size": 7748816,
    },
    "SourceHanSerifKR-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifKR-Bold.ttf",
        "font_name": "Source Han Serif KR Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "b071b1aecb042aa779e1198767048438dc756d0da8f90660408abb421393f5cb",
        "size": 12387920,
    },
    "SourceHanSerifKR-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifKR-Regular.ttf",
        "font_name": "Source Han Serif KR Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "a85913439f0a49024ca77c02dfede4318e503ee6b2b7d8fef01eb42435f27b61",
        "size": 12459924,
    },
    "SourceHanSerifTW-Bold.ttf": {
        "ascent": 1150,
        "bold": 1,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifTW-Bold.ttf",
        "font_name": "Source Han Serif TW Bold",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "562eea88895ab79ffefab7eabb4d322352a7b1963764c524c6d5242ca456bb6e",
        "size": 9551724,
    },
    "SourceHanSerifTW-Regular.ttf": {
        "ascent": 1150,
        "bold": 0,
        "descent": -286,
        "encoding_length": 2,
        "file_name": "SourceHanSerifTW-Regular.ttf",
        "font_name": "Source Han Serif TW Regular",
        "italic": 0,
        "monospace": 0,
        "serif": 1,
        "sha3_256": "85c1d6460b2e169b3d53ac60f6fb7a219fb99923027d78fb64b679475e2ddae4",
        "size": 9486772,
    },
}

FONT_NAMES = {v["font_name"] for v in EMBEDDING_FONT_METADATA.values()}

CN_FONT_FAMILY = {
    # 手写体
    "script": [
        "LXGWWenKaiGB-Regular.ttf",
    ],
    # 正文字体
    "normal": [
        "SourceHanSerifCN-Bold.ttf",
        "SourceHanSerifCN-Regular.ttf",
        "SourceHanSansCN-Bold.ttf",
        "SourceHanSansCN-Regular.ttf",
    ],
    # 备用字体
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

HK_FONT_FAMILY = {
    "script": ["LXGWWenKaiTC-Regular.ttf"],
    "normal": [
        "SourceHanSerifHK-Bold.ttf",
        "SourceHanSerifHK-Regular.ttf",
        "SourceHanSansHK-Bold.ttf",
        "SourceHanSansHK-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

TW_FONT_FAMILY = {
    "script": ["LXGWWenKaiTC-Regular.ttf"],
    "normal": [
        "SourceHanSerifTW-Bold.ttf",
        "SourceHanSerifTW-Regular.ttf",
        "SourceHanSansTW-Bold.ttf",
        "SourceHanSansTW-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

KR_FONT_FAMILY = {
    "script": ["MaruBuri-Regular.ttf"],
    "normal": [
        "SourceHanSerifKR-Bold.ttf",
        "SourceHanSerifKR-Regular.ttf",
        "SourceHanSansKR-Bold.ttf",
        "SourceHanSansKR-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

JP_FONT_FAMILY = {
    "script": ["KleeOne-Regular.ttf"],
    "normal": [
        "SourceHanSerifJP-Bold.ttf",
        "SourceHanSerifJP-Regular.ttf",
        "SourceHanSansJP-Bold.ttf",
        "SourceHanSansJP-Regular.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": ["SourceHanSansCN-Regular.ttf"],
}

EN_FONT_FAMILY = {
    "script": [
        "NotoSans-Italic.ttf",
        "NotoSans-BoldItalic.ttf",
        "NotoSerif-Italic.ttf",
        "NotoSerif-BoldItalic.ttf",
    ],
    "normal": [
        "NotoSerif-Regular.ttf",
        "NotoSerif-Bold.ttf",
        "NotoSans-Regular.ttf",
        "NotoSans-Bold.ttf",
    ],
    "fallback": [
        "GoNotoKurrent-Regular.ttf",
        "GoNotoKurrent-Bold.ttf",
    ],
    "base": [
        "NotoSans-Regular.ttf",
    ],
}

ALL_FONT_FAMILY = {
    "CN": CN_FONT_FAMILY,
    "TW": TW_FONT_FAMILY,
    "HK": HK_FONT_FAMILY,
    "KR": KR_FONT_FAMILY,
    "JP": JP_FONT_FAMILY,
    "EN": EN_FONT_FAMILY,
    "JA": JP_FONT_FAMILY,
}


def __add_fallback_to_font_family():
    for lang1, family1 in ALL_FONT_FAMILY.items():
        added_font = set()
        for font in itertools.chain.from_iterable(family1.values()):
            added_font.add(font)

        for lang2, family2 in ALL_FONT_FAMILY.items():
            if lang1 != lang2:
                for type_ in family1:
                    for font in family2[type_]:
                        if font not in added_font:
                            family1[type_].append(font)
                            added_font.add(font)


__add_fallback_to_font_family()


def get_font_family(lang_code: str):
    lang_code = lang_code.upper()
    if "KR" in lang_code:
        font_family = KR_FONT_FAMILY
    elif "JP" in lang_code or "JA" in lang_code:
        font_family = JP_FONT_FAMILY
    elif "HK" in lang_code:
        font_family = HK_FONT_FAMILY
    elif "TW" in lang_code:
        font_family = TW_FONT_FAMILY
    elif "EN" in lang_code:
        font_family = EN_FONT_FAMILY
    elif "CN" in lang_code:
        font_family = CN_FONT_FAMILY
    else:
        font_family = EN_FONT_FAMILY
    verify_font_family(font_family)
    return font_family


def verify_font_family(font_family: str | dict):
    if isinstance(font_family, str):
        font_family = ALL_FONT_FAMILY[font_family]
    for k in font_family:
        if k not in ["script", "normal", "fallback", "base"]:
            raise ValueError(f"Invalid font family: {font_family}")
        for font_file_name in font_family[k]:
            if font_file_name not in EMBEDDING_FONT_METADATA:
                raise ValueError(f"Invalid font file: {font_file_name}")


if __name__ == "__main__":
    for k in ALL_FONT_FAMILY:
        verify_font_family(k)

```

## /babeldoc/asynchronize/__init__.py

```py path="/babeldoc/asynchronize/__init__.py" 
import asyncio
import time


class Args:
    def __init__(self, args, kwargs):
        self.args = args
        self.kwargs = kwargs


class AsyncCallback:
    def __init__(self):
        self.queue = asyncio.Queue()
        self.finished = False
        self.loop = asyncio.get_event_loop()

    def step_callback(self, *args, **kwargs):
        # Whenever a step is called, add to the queue but don't set finished to True, so __anext__ will continue
        args = Args(args, kwargs)

        # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping:
        # https://stackoverflow.com/a/49912853/2148718
        self.loop.call_soon_threadsafe(self.queue.put_nowait, args)

        # Add a small delay to release the GIL, ensuring the event loop has time to process messages
        time.sleep(0.01)

    def finished_callback(self, *args, **kwargs):
        # Whenever a finished is called, add to the queue as with step, but also set finished to True, so __anext__
        # will terminate after processing the remaining items
        if self.finished:
            return
        self.step_callback(*args, **kwargs)
        self.finished = True

    def __await__(self):
        # Since this implements __anext__, this can return itself
        return self.queue.get().__await__()

    def __aiter__(self):
        # Since this implements __anext__, this can return itself
        return self

    async def __anext__(self):
        # Keep waiting for the queue if a) we haven't finished, or b) if the queue is still full. This lets us finish
        # processing the remaining items even after we've finished
        if self.finished and self.queue.empty():
            raise StopAsyncIteration

        result = await self.queue.get()
        return result

```

## /babeldoc/babeldoc_exception/BabelDOCException.py

```py path="/babeldoc/babeldoc_exception/BabelDOCException.py" 
class ScannedPDFError(Exception):
    def __init__(self, message):
        super().__init__(message)


class ExtractTextError(Exception):
    def __init__(self, message):
        super().__init__(message)

```

## /babeldoc/babeldoc_exception/__init__.py

```py path="/babeldoc/babeldoc_exception/__init__.py" 

```

## /babeldoc/const.py

```py path="/babeldoc/const.py" 
import os
import shutil
import subprocess
from pathlib import Path

__version__ = "0.4.15"

CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"


def get_cache_file_path(filename: str, sub_folder: str | None = None) -> Path:
    if sub_folder is not None:
        sub_folder = sub_folder.strip("/")
        sub_folder_path = CACHE_FOLDER / sub_folder
        sub_folder_path.mkdir(parents=True, exist_ok=True)
        return sub_folder_path / filename
    return CACHE_FOLDER / filename


try:
    git_path = shutil.which("git")
    if git_path is None:
        raise FileNotFoundError("git executable not found")
    two_parent = Path(__file__).resolve().parent.parent
    md_ = two_parent / "docs" / "README.md"
    if two_parent.name == "site-packages" or not md_.exists():
        raise FileNotFoundError("not in git repo")
    WATERMARK_VERSION = (
        subprocess.check_output(  # noqa: S603
            [git_path, "describe", "--always"],
            cwd=Path(__file__).resolve().parent,
        )
        .strip()
        .decode()
    )
except (OSError, FileNotFoundError, subprocess.CalledProcessError):
    WATERMARK_VERSION = f"v{__version__}"

TIKTOKEN_CACHE_FOLDER = CACHE_FOLDER / "tiktoken"
TIKTOKEN_CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
os.environ["TIKTOKEN_CACHE_DIR"] = str(TIKTOKEN_CACHE_FOLDER)

```

## /babeldoc/docvision/README.md


## /babeldoc/docvision/__init__.py

```py path="/babeldoc/docvision/__init__.py" 

```

## /babeldoc/docvision/base_doclayout.py

```py path="/babeldoc/docvision/base_doclayout.py" 
import abc
import logging
from collections.abc import Generator

import pymupdf

from babeldoc.format.pdf.document_il.il_version_1 import Page

logger = logging.getLogger(__name__)


class YoloResult:
    """Helper class to store detection results from ONNX model."""

    def __init__(self, names, boxes=None, boxes_data=None):
        if boxes is not None:
            self.boxes = boxes
        else:
            assert boxes_data is not None
            self.boxes = [YoloBox(data=d) for d in boxes_data]
        self.boxes.sort(key=lambda x: x.conf, reverse=True)
        self.names = names


class YoloBox:
    """Helper class to store detection results from ONNX model."""

    def __init__(self, data=None, xyxy=None, conf=None, cls=None):
        if data is not None:
            self.xyxy = data[:4]
            self.conf = data[-2]
            self.cls = data[-1]
            return
        assert xyxy is not None and conf is not None and cls is not None
        self.xyxy = xyxy
        self.conf = conf
        self.cls = cls


class DocLayoutModel(abc.ABC):
    @staticmethod
    def load_onnx():
        logger.info("Loading ONNX model...")
        from babeldoc.docvision.doclayout import OnnxModel

        model = OnnxModel.from_pretrained()
        return model

    @staticmethod
    def load_available():
        return DocLayoutModel.load_onnx()

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""

    @abc.abstractmethod
    def predict(self, image: bytes, imgsz: int = 1024, **kwargs) -> list[int]:
        """
        Predict the layout of a document page.

        Args:
            image: The image of the document page.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            **kwargs: Additional arguments.
        """

    @abc.abstractmethod
    def handle_document(
        self,
        pages: list[Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[tuple[Page, YoloResult], None, None]:
        """
        Handle a document.
        """

```

## /babeldoc/docvision/doclayout.py

```py path="/babeldoc/docvision/doclayout.py" 
import ast
import logging
import platform
import re
import threading
from collections.abc import Generator

import cv2
import numpy as np

from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

try:
    import onnx
    import onnxruntime
except ImportError as e:
    if "DLL load failed" in str(e):
        raise OSError(
            "Microsoft Visual C++ Redistributable is not installed. "
            "Download it at https://aka.ms/vs/17/release/vc_redist.x64.exe"
        ) from e
    raise
import pymupdf

import babeldoc.format.pdf.document_il.il_version_1
from babeldoc.assets.assets import get_doclayout_onnx_model_path

# from huggingface_hub import hf_hub_download

logger = logging.getLogger(__name__)


# 检测操作系统类型
os_name = platform.system()


class OnnxModel(DocLayoutModel):
    def __init__(self, model_path: str):
        self.model_path = model_path

        model = onnx.load(model_path)
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])
        providers = []

        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            if re.match(r"dml|cuda|cpu", provider, re.IGNORECASE):
                logger.info(f"Available Provider: {provider}")
                providers.append(provider)
        self.model = onnxruntime.InferenceSession(
            model.SerializeToString(),
            providers=providers,
        )
        self.lock = threading.Lock()

    @staticmethod
    def from_pretrained():
        pth = get_doclayout_onnx_model_path()
        return OnnxModel(pth)

    @property
    def stride(self):
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Predict the layout of document pages.

        Args:
            image: A single image or a list of images of document pages.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            batch_size: Number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            A list of YoloResult objects, one for each input image.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        total_images = len(image)
        results = []
        batch_size = 1

        # Process images in batches
        for i in range(0, total_images, batch_size):
            batch_images = image[i : i + batch_size]
            batch_size_actual = len(batch_images)

            # Calculate target size based on the maximum height in the batch
            max_height = max(img.shape[0] for img in batch_images)
            target_imgsz = 1024

            # Preprocess batch
            processed_batch = []
            orig_shapes = []
            for img in batch_images:
                orig_h, orig_w = img.shape[:2]
                orig_shapes.append((orig_h, orig_w))

                pix = self.resize_and_pad_image(img, new_shape=target_imgsz)
                pix = np.transpose(pix, (2, 0, 1))  # CHW
                pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
                processed_batch.append(pix)

            # Stack batch
            batch_input = np.stack(processed_batch, axis=0)  # BCHW
            new_h, new_w = batch_input.shape[2:]

            # Run inference
            batch_preds = self.model.run(None, {"images": batch_input})[0]

            # Process each prediction in the batch
            for j in range(batch_size_actual):
                preds = batch_preds[j]
                preds = preds[preds[..., 4] > 0.25]
                if len(preds) > 0:
                    preds[..., :4] = self.scale_boxes(
                        (new_h, new_w),
                        preds[..., :4],
                        orig_shapes[j],
                    )
                results.append(YoloResult(boxes_data=preds, names=self._names))

        return results

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.format.pdf.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        for page in pages:
            translate_config.raise_if_cancelled()
            with self.lock:
                # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
                pix = get_no_rotation_img(mupdf_doc[page.page_number])
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height,
                pix.width,
                3,
            )[:, :, ::-1]
            predict_result = self.predict(image)[0]
            save_debug_image(
                image,
                predict_result,
                page.page_number + 1,
            )
            yield page, predict_result

```

## /babeldoc/docvision/rpc_doclayout.py

```py path="/babeldoc/docvision/rpc_doclayout.py" 
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

logger = logging.getLogger(__name__)


def encode_image(image) -> bytes:
    """Read and encode image to bytes

    Args:
        image: Can be either a file path (str) or numpy array
    """
    if isinstance(image, str):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        img = cv2.imread(image)
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # logger.debug(f"Image shape: {img.shape}")
    encoded = cv2.imencode(".jpg", img)[1].tobytes()
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return encoded


@retry(
    stop=stop_after_attempt(3),  # 最多重试 3 次
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # 指数退避策略，初始 1 秒，最大 10 秒
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # 针对哪些异常重试
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    Args:
        image: Can be either a file path (str) or numpy array
        host: Service host URL
        imgsz: Image size for model input

    Returns:
        List of predictions containing bounding boxes and classes
    """
    # Prepare request data
    if not isinstance(image, list):
        image = [image]
    image_data = [encode_image(image) for image in image]
    data = {
        "image": image_data,
        "imgsz": imgsz,
    }

    # Pack data using msgpack
    packed_data = msgpack.packb(data, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request
    # logger.debug(f"Sending request to {host}/inference")
    response = httpx.post(
        f"{host}/inference",
        data=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=300,
        follow_redirects=True,
    )

    # logger.debug(f"Response status: {response.status_code}")
    # logger.debug(f"Response headers: {response.headers}")

    if response.status_code == 200:
        try:
            result = msgpack.unpackb(response.content, raw=False)
            return result
        except Exception as e:
            logger.exception(f"Failed to unpack response: {e!s}")
            raise
    else:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )


class ResultContainer:
    def __init__(self):
        self.result = YoloResult(boxes_data=np.array([]), names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size,
        ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> ResultContainer:
        """Predict the layout of document pages using RPC service."""
        if result_container is None:
            result_container = ResultContainer()
        target_imgsz = (800, 800)
        orig_h, orig_w = image.shape[:2]
        if image.shape[0] != target_imgsz[0] or image.shape[1] != target_imgsz[1]:
            image = self.resize_and_pad_image(image, new_shape=target_imgsz)
        preds = predict_layout([image], host=self.host, imgsz=800)

        if len(preds) > 0:
            for pred in preds:
                boxes = [
                    YoloBox(
                        None,
                        self.scale_boxes(
                            (800, 800), np.array(x["xyxy"]), (orig_h, orig_w)
                        ),
                        np.array(x["conf"]),
                        x["cls"],
                    )
                    for x in pred["boxes"]
                ]
                result_container.result = YoloResult(
                    boxes=boxes,
                    names={int(k): v for k, v in pred["names"].items()},
                )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service."""
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        result_containers = [ResultContainer() for _ in image]
        predict_thread = ThreadPoolExecutor(max_workers=len(image))
        for img, result_container in zip(image, result_containers, strict=True):
            predict_thread.submit(
                self.predict_image, img, self.host, result_container, 800
            )
        predict_thread.shutdown(wait=True)
        result = [result_container.result for result_container in result_containers]
        return result

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number])
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        with ThreadPoolExecutor(max_workers=16) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Test the service
    try:
        # Use a default test image if example/1.png doesn't exist
        image_path = "example/1.png"
        if not Path(image_path).exists():
            print(f"Warning: {image_path} not found.")
            print("Please provide the path to a test image:")
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:")
        print(result)
    except Exception as e:
        print(f"Error: {e!s}")

```

## /babeldoc/docvision/rpc_doclayout2.py

```py path="/babeldoc/docvision/rpc_doclayout2.py" 
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import msgpack
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

logger = logging.getLogger(__name__)


def encode_image(image) -> bytes:
    """Read and encode image to bytes

    Args:
        image: Can be either a file path (str) or numpy array
    """
    if isinstance(image, str):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        img = cv2.imread(image)
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # logger.debug(f"Image shape: {img.shape}")
    encoded = cv2.imencode(".jpg", img)[1].tobytes()
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return encoded


@retry(
    stop=stop_after_attempt(3),  # 最多重试 3 次
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # 指数退避策略，初始 1 秒，最大 10 秒
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # 针对哪些异常重试
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {retry_state.next_action.sleep} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    Args:
        image: Can be either a file path (str) or numpy array
        host: Service host URL
        imgsz: Image size for model input

    Returns:
        List of predictions containing bounding boxes and classes
    """
    # Prepare request data
    if not isinstance(image, list):
        image = [image]
    image_data = [encode_image(image) for image in image]
    data = {
        "image": image_data,
        "imgsz": imgsz,
    }

    # Pack data using msgpack
    packed_data = msgpack.packb(data, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request
    # logger.debug(f"Sending request to {host}/inference")
    response = httpx.post(
        f"{host}/inference",
        data=packed_data,
        headers={
            "Content-Type": "application/msgpack",
            "Accept": "application/msgpack",
        },
        timeout=300,
        follow_redirects=True,
    )

    # logger.debug(f"Response status: {response.status_code}")
    # logger.debug(f"Response headers: {response.headers}")

    if response.status_code == 200:
        try:
            result = msgpack.unpackb(response.content, raw=False)
            if isinstance(result, dict):
                names = {}
                for box in result["boxes"]:
                    box["xyxy"] = box["coordinate"]
                    box["conf"] = box["score"]
                    box["cls"] = box["cls_id"]
                    names[box["cls_id"]] = box["label"]
                if "names" not in result:
                    result["names"] = names
                result = [result]
            return result
        except Exception as e:
            logger.exception(f"Failed to unpack response: {e!s}")
            raise
    else:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )


class ResultContainer:
    def __init__(self):
        self.result = YoloResult(boxes_data=np.array([]), names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size,
        ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> ResultContainer:
        """Predict the layout of document pages using RPC service."""
        if result_container is None:
            result_container = ResultContainer()
        target_imgsz = (800, 800)
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)
        if image.shape[0] != target_imgsz[0] or image.shape[1] != target_imgsz[1]:
            image = self.resize_and_pad_image(image, new_shape=target_imgsz)
        preds = predict_layout([image], host=self.host, imgsz=800)

        if len(preds) > 0:
            for pred in preds:
                boxes = [
                    YoloBox(
                        None,
                        self.scale_boxes(
                            target_imgsz, np.array(x["xyxy"]), (orig_h, orig_w)
                        ),
                        np.array(x["conf"]),
                        x["cls"],
                    )
                    for x in pred["boxes"]
                ]
                result_container.result = YoloResult(
                    boxes=boxes,
                    names={int(k): v for k, v in pred["names"].items()},
                )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service."""
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        result_containers = [ResultContainer() for _ in image]
        predict_thread = ThreadPoolExecutor(max_workers=len(image))
        for img, result_container in zip(image, result_containers, strict=True):
            predict_thread.submit(
                self.predict_image, img, self.host, result_container, 800
            )
        predict_thread.shutdown(wait=True)
        result = [result_container.result for result_container in result_containers]
        return result

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number])
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        with ThreadPoolExecutor(max_workers=128) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Test the service
    try:
        # Use a default test image if example/1.png doesn't exist
        image_path = "example/1.png"
        if not Path(image_path).exists():
            print(f"Warning: {image_path} not found.")
            print("Please provide the path to a test image:")
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:")
        print(result)
    except Exception as e:
        print(f"Error: {e!s}")

```

## /babeldoc/docvision/rpc_doclayout3.py

```py path="/babeldoc/docvision/rpc_doclayout3.py" 
import json
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import httpx
import numpy as np
import pymupdf
from tenacity import retry
from tenacity import retry_if_exception_type
from tenacity import stop_after_attempt
from tenacity import wait_exponential

import babeldoc
from babeldoc.docvision.base_doclayout import DocLayoutModel
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img

logger = logging.getLogger(__name__)
DPI = 150


def encode_image(image) -> bytes:
    """Read and encode image to bytes

    Args:
        image: Can be either a file path (str) or numpy array
    """
    if isinstance(image, str):
        if not Path(image).exists():
            raise FileNotFoundError(f"Image file not found: {image}")
        img = cv2.imread(image)
        if img is None:
            raise ValueError(f"Failed to read image: {image}")
    else:
        img = image

    # logger.debug(f"Image shape: {img.shape}")
    encoded = cv2.imencode(".jpg", img)[1].tobytes()
    # logger.debug(f"Encoded image size: {len(encoded)} bytes")
    return encoded


@retry(
    stop=stop_after_attempt(3),  # 最多重试 3 次
    wait=wait_exponential(
        multiplier=1, min=1, max=10
    ),  # 指数退避策略，初始 1 秒，最大 10 秒
    retry=retry_if_exception_type((httpx.HTTPError, Exception)),  # 针对哪些异常重试
    before_sleep=lambda retry_state: logger.warning(
        f"Request failed, retrying in {getattr(retry_state.next_action, 'sleep', 'unknown')} seconds... "
        f"(Attempt {retry_state.attempt_number}/3)"
    ),
)
def predict_layout(
    image,
    host: str = "http://localhost:8000",
    _imgsz: int = 1024,
):
    """
    Predict document layout using the MOSEC service

    Args:
        image: Can be either a file path (str) or numpy array
        host: Service host URL
        imgsz: Image size for model input

    Returns:
        List of predictions containing bounding boxes and classes
    """
    # Prepare request data

    image_data = encode_image(image)

    # Pack data using msgpack
    # packed_data = msgpack.packb(data, use_bin_type=True)
    # logger.debug(f"Packed data size: {len(packed_data)} bytes")

    # Send request
    # logger.debug(f"Sending request to {host}/inference")
    response = httpx.post(
        f"{host}/analyze?min_sim=0.7&early_stop=0.99&timeout=480",
        files={"file": ("image.jpg", image_data, "image/jpeg")},
        headers={
            "Accept": "application/json",
        },
        timeout=480,
        follow_redirects=True,
    )

    # logger.debug(f"Response status: {response.status_code}")
    # logger.debug(f"Response headers: {response.headers}")
    idx = 0
    id_lookup = {}
    if response.status_code == 200:
        try:
            result = json.loads(response.text)
            useful_result = []
            if isinstance(result, dict):
                names = {}
                for box in result["boxes"]:
                    if box["ocr_match_score"] < 0.7:
                        continue

                    box["xyxy"] = box["coords"]
                    box["conf"] = box["ocr_match_score"]
                    if box["label"] not in names:
                        idx += 1
                        names[idx] = box["label"]
                        box["cls_id"] = idx
                        id_lookup[box["label"]] = idx
                    else:
                        box["cls_id"] = id_lookup[box["label"]]
                    names[box["cls_id"]] = box["label"]
                    box["cls"] = box["cls_id"]
                    useful_result.append(box)
                if "names" not in result:
                    result["names"] = names
                result["boxes"] = useful_result
                result = [result]
            return result
        except Exception as e:
            logger.exception(f"Failed to unpack response: {e!s}")
            raise
    else:
        logger.error(f"Request failed with status {response.status_code}")
        logger.error(f"Response content: {response.content}")
        raise Exception(
            f"Request failed with status {response.status_code}: {response.text}",
        )


class ResultContainer:
    def __init__(self):
        self.result = YoloResult(boxes_data=np.array([]), names=[])


class RpcDocLayoutModel(DocLayoutModel):
    """DocLayoutModel implementation that uses RPC service."""

    def __init__(self, host: str = "http://localhost:8000"):
        """Initialize RPC model with host address."""
        self.host = host
        self._stride = 32  # Default stride value
        self._names = ["text", "title", "list", "table", "figure"]
        self.lock = threading.Lock()

    @property
    def stride(self) -> int:
        """Stride of the model input."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size,
        ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size
        pad_h = new_h - resized_h
        pad_w = new_w - resized_w
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes = (boxes - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict_image(
        self,
        image,
        host: str | None = None,
        result_container: ResultContainer | None = None,
        imgsz: int = 1024,
    ) -> ResultContainer:
        """Predict the layout of document pages using RPC service."""
        if result_container is None:
            result_container = ResultContainer()
        target_imgsz = (800, 800)
        orig_h, orig_w = image.shape[:2]
        target_imgsz = (orig_h, orig_w)
        if image.shape[0] != target_imgsz[0] or image.shape[1] != target_imgsz[1]:
            image = self.resize_and_pad_image(image, new_shape=target_imgsz)
        preds = predict_layout(image, host=self.host)
        orig_h, orig_w = orig_h / DPI * 72, orig_w / DPI * 72
        if len(preds) > 0:
            for pred in preds:
                boxes = [
                    YoloBox(
                        None,
                        self.scale_boxes(
                            target_imgsz, np.array(x["xyxy"]), (orig_h, orig_w)
                        ),
                        np.array(x["conf"]),
                        x["cls"],
                    )
                    for x in pred["boxes"]
                ]
                result_container.result = YoloResult(
                    boxes=boxes,
                    names={int(k): v for k, v in pred["names"].items()},
                )
        return result_container.result

    def predict(self, image, imgsz=1024, **kwargs) -> list[YoloResult]:
        """Predict the layout of document pages using RPC service."""
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        result_containers = [ResultContainer() for _ in image]
        predict_thread = ThreadPoolExecutor(max_workers=len(image))
        for img, result_container in zip(image, result_containers, strict=True):
            predict_thread.submit(
                self.predict_image, img, self.host, result_container, 800
            )
        predict_thread.shutdown(wait=True)
        result = [result_container.result for result_container in result_containers]
        return result

    def predict_page(
        self, page, mupdf_doc: pymupdf.Document, translate_config, save_debug_image
    ):
        translate_config.raise_if_cancelled()
        with self.lock:
            # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            pix = get_no_rotation_img(mupdf_doc[page.page_number], dpi=DPI)
        image = np.frombuffer(pix.samples, np.uint8).reshape(
            pix.height,
            pix.width,
            3,
        )[:, :, ::-1]
        predict_result = self.predict_image(image, self.host, None, 800)
        save_debug_image(image, predict_result, page.page_number + 1)
        return page, predict_result

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ):
        with ThreadPoolExecutor(max_workers=4) as executor:
            yield from executor.map(
                self.predict_page,
                pages,
                (mupdf_doc for _ in range(len(pages))),
                (translate_config for _ in range(len(pages))),
                (save_debug_image for _ in range(len(pages))),
            )

    @staticmethod
    def from_host(host: str) -> "RpcDocLayoutModel":
        """Create RpcDocLayoutModel from host address."""
        return RpcDocLayoutModel(host=host)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Test the service
    try:
        # Use a default test image if example/1.png doesn't exist
        image_path = "example/1.png"
        if not Path(image_path).exists():
            print(f"Warning: {image_path} not found.")
            print("Please provide the path to a test image:")
            image_path = input("> ")

        logger.info(f"Processing image: {image_path}")
        result = predict_layout(image_path)
        print("Prediction results:")
        print(result)
    except Exception as e:
        print(f"Error: {e!s}")

```

## /babeldoc/docvision/table_detection/rapidocr.py

```py path="/babeldoc/docvision/table_detection/rapidocr.py" 
import logging
import re
import threading
from collections.abc import Generator

import cv2
import numpy as np
from babeldoc.assets.assets import get_table_detection_rapidocr_model_path
from babeldoc.docvision.base_doclayout import YoloBox
from babeldoc.docvision.base_doclayout import YoloResult
from babeldoc.format.pdf.document_il.utils.mupdf_helper import get_no_rotation_img
from rapidocr_onnxruntime import RapidOCR

try:
    import onnxruntime
except ImportError as e:
    if "DLL load failed" in str(e):
        raise OSError(
            "Microsoft Visual C++ Redistributable is not installed. "
            "Download it at https://aka.ms/vs/17/release/vc_redist.x64.exe"
        ) from e
    raise
import babeldoc.format.pdf.document_il.il_version_1
import pymupdf

logger = logging.getLogger(__name__)


def convert_to_yolo_result(predictions):
    """
    Convert RapidOCR predictions to YoloResult format.

    Args:
        predictions (list): List of predictions, where each prediction is a list of coordinates
                           in format [[x1, y1], [x2, y2], [x3, y3], [x4, y4], (text, confidence)]
                           or a numpy array of format [x1, y1, x2, y2, ...]

    Returns:
        YoloResult: Converted predictions in YoloResult format
    """
    boxes = []

    for pred in predictions:
        # Check if the prediction is in the format of 4 corner points
        if isinstance(pred, list) and len(pred) >= 5 and isinstance(pred[0], list):
            # Convert 4 corner points to xyxy format (min x, min y, max x, max y)
            points = np.array(pred[:4])
            x1, y1 = points[:, 0].min(), points[:, 1].min()
            x2, y2 = points[:, 0].max(), points[:, 1].max()
            xyxy = [x1, y1, x2, y2]
            box = YoloBox(xyxy=xyxy, conf=1.0, cls="text")
        # Check if the prediction is already in xyxy format
        elif isinstance(pred, list | np.ndarray) and len(pred) >= 4:
            if isinstance(pred, np.ndarray):
                pred = pred.tolist()
            xyxy = pred[:4]
            box = YoloBox(xyxy=xyxy, conf=1.0, cls="text")
        else:
            continue

        boxes.append(box)

    return YoloResult(names=["text"], boxes=boxes)


def create_yolo_result_from_nested_coords(nested_coords: np.ndarray, names: dict):
    boxes = []

    for quad in nested_coords.tolist():
        if len(quad) != 4:
            continue

        # Convert quad coordinates to xyxy format (min x, min y, max x, max y)
        x1, y1, x2, y2 = quad

        # Create YoloBox with confidence 1.0 and class 'text'
        box = YoloBox(
            xyxy=[float(x1), float(y1), float(x2), float(y2)], conf=np.array(1.0), cls=0
        )
        boxes.append(box)

    return YoloResult(names=names, boxes=boxes)


class RapidOCRModel:
    def __init__(self):
        self.use_cuda = False
        self.use_dml = False
        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            if re.match(r"dml", provider, re.IGNORECASE):
                self.use_dml = True
            elif re.match(r"cuda", provider, re.IGNORECASE):
                self.use_cuda = True
        self.use_dml = False  # force disable directml
        self.model = RapidOCR(
            det_model_path=get_table_detection_rapidocr_model_path(),
            det_use_cuda=self.use_cuda,
            det_use_dml=False,
        )
        self.names = {0: "table_text"}
        self.lock = threading.Lock()

    @property
    def stride(self):
        return 32

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)
        - stride: Padding alignment stride, default 32

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Predict the layout of document pages.

        Args:
            image: A single image or a list of images of document pages.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            batch_size: Number of images to process in one batch.
            **kwargs: Additional arguments.

        Returns:
            A YoloResult object containing the detected boxes.
        """
        # Handle single image input
        assert isinstance(image, np.ndarray) and len(image.shape) == 3

        # Calculate target size based on the maximum height in the batch
        target_imgsz = 1024

        orig_shape = (image.shape[0], image.shape[1])

        pix = self.resize_and_pad_image(image, new_shape=target_imgsz)
        # pix = np.transpose(pix, (2, 0, 1))  # CHW
        # pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
        input_ = pix

        new_h, new_w = input_.shape[:2]

        # Run inference
        preds = self.model(input_, use_det=True, use_cls=False, use_rec=False)

        # Process each prediction in the batch
        if len(preds) > 0:
            preds_np = np.array(preds[0])[:, [0, 2], :].reshape([-1, 4])
            preds_np[..., :4] = self.scale_boxes(
                (new_h, new_w),
                preds_np[..., :4],
                orig_shape,
            )

            # Convert predictions to YoloResult format
            return create_yolo_result_from_nested_coords(preds_np, self.names)
        else:
            # Return empty YoloResult if no predictions
            return YoloResult(names=self.names, boxes=[])

    def handle_document(
        self,
        pages: list[babeldoc.format.pdf.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.format.pdf.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        for page in pages:
            translate_config.raise_if_cancelled()
            with self.lock:
                # pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
                pix = get_no_rotation_img(mupdf_doc[page.page_number])
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height,
                pix.width,
                3,
            )[:, :, ::-1]

            table_boxes = []
            for layout in page.page_layout:
                if layout.class_name == "table":
                    table_boxes.append(layout.box)

            predict_result = self.predict(image)

            ok_boxes = []
            for box in predict_result.boxes:
                # Convert the box coordinates to float for proper comparison
                box_xyxy = [float(coord) for coord in box.xyxy]

                # Check if this box is inside any of the table boxes
                for table_box in table_boxes:
                    # Determine if box is inside or overlapping with table_box with image dimensions
                    if self._is_box_in_table(
                        box_xyxy, table_box, page, image.shape[1], image.shape[0]
                    ):
                        ok_boxes.append(box)
                        break

            yolo_result = YoloResult(names=self.names, boxes=ok_boxes)
            save_debug_image(
                image,
                yolo_result,
                page.page_number + 1,
            )
            yield page, yolo_result

    def _is_box_in_table(self, box_xyxy, table_box, page, img_width, img_height):
        """
        Check if a box from image coordinates is inside a table box from PDF coordinates.

        Args:
            box_xyxy (list): Box coordinates in image coordinate system [x1, y1, x2, y2]
            table_box (Box): Table box in PDF coordinate system
            page: The page object containing information for coordinate conversion
            img_width: Width of the image
            img_height: Height of the image

        Returns:
            bool: True if the box is inside or significantly overlapping with the table box
        """

        # Get table box coordinates in PDF coordinate system
        table_pdf_x1 = table_box.x
        table_pdf_y1 = table_box.y
        table_pdf_x2 = table_box.x2
        table_pdf_y2 = table_box.y2

        # Convert table box to image coordinates
        table_img_x1 = table_pdf_x1
        table_img_y1 = img_height - table_pdf_y2
        table_img_x2 = table_pdf_x2
        table_img_y2 = img_height - table_pdf_y1

        # Now check for overlap between the boxes
        # Calculate the area of overlap
        x_overlap = max(
            0, min(box_xyxy[2], table_img_x2) - max(box_xyxy[0], table_img_x1)
        )
        y_overlap = max(
            0, min(box_xyxy[3], table_img_y2) - max(box_xyxy[1], table_img_y1)
        )
        overlap_area = x_overlap * y_overlap

        # Calculate area of the detected box
        box_area = (box_xyxy[2] - box_xyxy[0]) * (box_xyxy[3] - box_xyxy[1])

        # If overlap area is significant relative to the box area, consider it inside
        if box_area > 0 and overlap_area / box_area > 0.5:
            return True

        return False

```

## /babeldoc/format/__init__.py

```py path="/babeldoc/format/__init__.py" 

```

## /babeldoc/format/pdf/__init__.py

```py path="/babeldoc/format/pdf/__init__.py" 

```

## /babeldoc/format/pdf/babelpdf/encoding.py

```py path="/babeldoc/format/pdf/babelpdf/encoding.py" 
adobe_standard = [
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "space",
    "exclam",
    "quotedbl",
    "numbersign",
    "dollar",
    "percent",
    "ampersand",
    "quoteright",
    "parenleft",
    "parenright",
    "asterisk",
    "plus",
    "comma",
    "hyphen",
    "period",
    "slash",
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "colon",
    "semicolon",
    "less",
    "equal",
    "greater",
    "question",
    "at",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "bracketleft",
    "backslash",
    "bracketright",
    "asciicircum",
    "underscore",
    "quoteleft",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "braceleft",
    "bar",
    "braceright",
    "asciitilde",
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "exclamdown",
    "cent",
    "sterling",
    "fraction",
    "yen",
    "florin",
    "section",
    "currency",
    "quotesingle",
    "quotedblleft",
    "guillemotleft",
    "guilsinglleft",
    "guilsinglright",
    "fi",
    "fl",
    None,
    "endash",
    "dagger",
    "daggerdbl",
    "periodcentered",
    None,
    "paragraph",
    "bullet",
    "quotesinglbase",
    "quotedblbase",
    "quotedblright",
    "guillemotright",
    "ellipsis",
    "perthousand",
    None,
    "questiondown",
    None,
    "grave",
    "acute",
    "circumflex",
    "tilde",
    "macron",
    "breve",
    "dotaccent",
    "dieresis",
    None,
    "ring",
    "cedilla",
    None,
    "hungarumlaut",
    "ogonek",
    "caron",
    "emdash",
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "AE",
    None,
    "ordfeminine",
    None,
    None,
    None,
    None,
    "Lslash",
    "Oslash",
    "OE",
    "ordmasculine",
    None,
    None,
    None,
    None,
    None,
    "ae",
    None,
    None,
    None,
    "dotlessi",
    None,
    None,
    "lslash",
    "oslash",
    "oe",
    "germandbls",
    None,
    None,
    None,
    None,
]

mac_expert = [
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "space",
    "exclamsmall",
    "Hungarumlautsmall",
    "centoldstyle",
    "dollaroldstyle",
    "dollarsuperior",
    "ampersandsmall",
    "Acutesmall",
    "parenleftsuperior",
    "parenrightsuperior",
    "twodotenleader",
    "onedotenleader",
    "comma",
    "hyphen",
    "period",
    "fraction",
    "zerooldstyle",
    "oneoldstyle",
    "twooldstyle",
    "threeoldstyle",
    "fouroldstyle",
    "fiveoldstyle",
    "sixoldstyle",
    "sevenoldstyle",
    "eightoldstyle",
    "nineoldstyle",
    "colon",
    "semicolon",
    None,
    "threequartersemdash",
    None,
    "questionsmall",
    None,
    None,
    None,
    None,
    "Ethsmall",
    None,
    None,
    "onequarter",
    "onehalf",
    "threequarters",
    "oneeighth",
    "threeeighths",
    "fiveeighths",
    "seveneighths",
    "onethird",
    "twothirds",
    None,
    None,
    None,
    None,
    None,
    None,
    "ff",
    "fi",
    "fl",
    "ffi",
    "ffl",
    "parenleftinferior",
    None,
    "parenrightinferior",
    "Circumflexsmall",
    "hypheninferior",
    "Gravesmall",
    "Asmall",
    "Bsmall",
    "Csmall",
    "Dsmall",
    "Esmall",
    "Fsmall",
    "Gsmall",
    "Hsmall",
    "Ismall",
    "Jsmall",
    "Ksmall",
    "Lsmall",
    "Msmall",
    "Nsmall",
    "Osmall",
    "Psmall",
    "Qsmall",
    "Rsmall",
    "Ssmall",
    "Tsmall",
    "Usmall",
    "Vsmall",
    "Wsmall",
    "Xsmall",
    "Ysmall",
    "Zsmall",
    "colonmonetary",
    "onefitted",
    "rupiah",
    "Tildesmall",
    None,
    None,
    "asuperior",
    "centsuperior",
    None,
    None,
    None,
    None,
    "Aacutesmall",
    "Agravesmall",
    "Acircumflexsmall",
    "Adieresissmall",
    "Atildesmall",
    "Aringsmall",
    "Ccedillasmall",
    "Eacutesmall",
    "Egravesmall",
    "Ecircumflexsmall",
    "Edieresissmall",
    "Iacutesmall",
    "Igravesmall",
    "Icircumflexsmall",
    "Idieresissmall",
    "Ntildesmall",
    "Oacutesmall",
    "Ogravesmall",
    "Ocircumflexsmall",
    "Odieresissmall",
    "Otildesmall",
    "Uacutesmall",
    "Ugravesmall",
    "Ucircumflexsmall",
    "Udieresissmall",
    None,
    "eightsuperior",
    "fourinferior",
    "threeinferior",
    "sixinferior",
    "eightinferior",
    "seveninferior",
    "Scaronsmall",
    None,
    "centinferior",
    "twoinferior",
    None,
    "Dieresissmall",
    None,
    "Caronsmall",
    "osuperior",
    "fiveinferior",
    None,
    "commainferior",
    "periodinferior",
    "Yacutesmall",
    None,
    "dollarinferior",
    None,
    None,
    "Thornsmall",
    None,
    "nineinferior",
    "zeroinferior",
    "Zcaronsmall",
    "AEsmall",
    "Oslashsmall",
    "questiondownsmall",
    "oneinferior",
    "Lslashsmall",
    None,
    None,
    None,
    None,
    None,
    None,
    "Cedillasmall",
    None,
    None,
    None,
    None,
    None,
    "OEsmall",
    "figuredash",
    "hyphensuperior",
    None,
    None,
    None,
    None,
    "exclamdownsmall",
    None,
    "Ydieresissmall",
    None,
    "onesuperior",
    "twosuperior",
    "threesuperior",
    "foursuperior",
    "fivesuperior",
    "sixsuperior",
    "sevensuperior",
    "ninesuperior",
    "zerosuperior",
    None,
    "esuperior",
    "rsuperior",
    "tsuperior",
    None,
    None,
    "isuperior",
    "ssuperior",
    "dsuperior",
    None,
    None,
    None,
    None,
    None,
    "lsuperior",
    "Ogoneksmall",
    "Brevesmall",
    "Macronsmall",
    "bsuperior",
    "nsuperior",
    "msuperior",
    "commasuperior",
    "periodsuperior",
    "Dotaccentsmall",
    "Ringsmall",
    None,
    None,
    None,
    None,
]

mac_roman = [
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "space",
    "exclamsmall",
    "Hungarumlautsmall",
    "centoldstyle",
    "dollaroldstyle",
    "dollarsuperior",
    "ampersandsmall",
    "Acutesmall",
    "parenleftsuperior",
    "parenrightsuperior",
    "twodotenleader",
    "onedotenleader",
    "comma",
    "hyphen",
    "period",
    "fraction",
    "zerooldstyle",
    "oneoldstyle",
    "twooldstyle",
    "threeoldstyle",
    "fouroldstyle",
    "fiveoldstyle",
    "sixoldstyle",
    "sevenoldstyle",
    "eightoldstyle",
    "nineoldstyle",
    "colon",
    "semicolon",
    None,
    "threequartersemdash",
    None,
    "questionsmall",
    None,
    None,
    None,
    None,
    "Ethsmall",
    None,
    None,
    "onequarter",
    "onehalf",
    "threequarters",
    "oneeighth",
    "threeeighths",
    "fiveeighths",
    "seveneighths",
    "onethird",
    "twothirds",
    None,
    None,
    None,
    None,
    None,
    None,
    "ff",
    "fi",
    "fl",
    "ffi",
    "ffl",
    "parenleftinferior",
    None,
    "parenrightinferior",
    "Circumflexsmall",
    "hypheninferior",
    "Gravesmall",
    "Asmall",
    "Bsmall",
    "Csmall",
    "Dsmall",
    "Esmall",
    "Fsmall",
    "Gsmall",
    "Hsmall",
    "Ismall",
    "Jsmall",
    "Ksmall",
    "Lsmall",
    "Msmall",
    "Nsmall",
    "Osmall",
    "Psmall",
    "Qsmall",
    "Rsmall",
    "Ssmall",
    "Tsmall",
    "Usmall",
    "Vsmall",
    "Wsmall",
    "Xsmall",
    "Ysmall",
    "Zsmall",
    "colonmonetary",
    "onefitted",
    "rupiah",
    "Tildesmall",
    None,
    None,
    "asuperior",
    "centsuperior",
    None,
    None,
    None,
    None,
    "Aacutesmall",
    "Agravesmall",
    "Acircumflexsmall",
    "Adieresissmall",
    "Atildesmall",
    "Aringsmall",
    "Ccedillasmall",
    "Eacutesmall",
    "Egravesmall",
    "Ecircumflexsmall",
    "Edieresissmall",
    "Iacutesmall",
    "Igravesmall",
    "Icircumflexsmall",
    "Idieresissmall",
    "Ntildesmall",
    "Oacutesmall",
    "Ogravesmall",
    "Ocircumflexsmall",
    "Odieresissmall",
    "Otildesmall",
    "Uacutesmall",
    "Ugravesmall",
    "Ucircumflexsmall",
    "Udieresissmall",
    None,
    "eightsuperior",
    "fourinferior",
    "threeinferior",
    "sixinferior",
    "eightinferior",
    "seveninferior",
    "Scaronsmall",
    None,
    "centinferior",
    "twoinferior",
    None,
    "Dieresissmall",
    None,
    "Caronsmall",
    "osuperior",
    "fiveinferior",
    None,
    "commainferior",
    "periodinferior",
    "Yacutesmall",
    None,
    "dollarinferior",
    None,
    None,
    "Thornsmall",
    None,
    "nineinferior",
    "zeroinferior",
    "Zcaronsmall",
    "AEsmall",
    "Oslashsmall",
    "questiondownsmall",
    "oneinferior",
    "Lslashsmall",
    None,
    None,
    None,
    None,
    None,
    None,
    "Cedillasmall",
    None,
    None,
    None,
    None,
    None,
    "OEsmall",
    "figuredash",
    "hyphensuperior",
    None,
    None,
    None,
    None,
    "exclamdownsmall",
    None,
    "Ydieresissmall",
    None,
    "onesuperior",
    "twosuperior",
    "threesuperior",
    "foursuperior",
    "fivesuperior",
    "sixsuperior",
    "sevensuperior",
    "ninesuperior",
    "zerosuperior",
    None,
    "esuperior",
    "rsuperior",
    "tsuperior",
    None,
    None,
    "isuperior",
    "ssuperior",
    "dsuperior",
    None,
    None,
    None,
    None,
    None,
    "lsuperior",
    "Ogoneksmall",
    "Brevesmall",
    "Macronsmall",
    "bsuperior",
    "nsuperior",
    "msuperior",
    "commasuperior",
    "periodsuperior",
    "Dotaccentsmall",
    "Ringsmall",
    None,
    None,
    None,
    None,
]

win_ansi = [
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    "space",
    "exclam",
    "quotedbl",
    "numbersign",
    "dollar",
    "percent",
    "ampersand",
    "quotesingle",
    "parenleft",
    "parenright",
    "asterisk",
    "plus",
    "comma",
    "hyphen",
    "period",
    "slash",
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "colon",
    "semicolon",
    "less",
    "equal",
    "greater",
    "question",
    "at",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "bracketleft",
    "backslash",
    "bracketright",
    "asciicircum",
    "underscore",
    "grave",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "braceleft",
    "bar",
    "braceright",
    "asciitilde",
    "bullet",
    "Euro",
    "bullet",
    "quotesinglbase",
    "florin",
    "quotedblbase",
    "ellipsis",
    "dagger",
    "daggerdbl",
    "circumflex",
    "perthousand",
    "Scaron",
    "guilsinglleft",
    "OE",
    "bullet",
    "Zcaron",
    "bullet",
    "bullet",
    "quoteleft",
    "quoteright",
    "quotedblleft",
    "quotedblright",
    "bullet",
    "endash",
    "emdash",
    "tilde",
    "trademark",
    "scaron",
    "guilsinglright",
    "oe",
    "bullet",
    "zcaron",
    "Ydieresis",
    "space",
    "exclamdown",
    "cent",
    "sterling",
    "currency",
    "yen",
    "brokenbar",
    "section",
    "dieresis",
    "copyright",
    "ordfeminine",
    "guillemotleft",
    "logicalnot",
    "hyphen",
    "registered",
    "macron",
    "degree",
    "plusminus",
    "twosuperior",
    "threesuperior",
    "acute",
    "mu",
    "paragraph",
    "periodcentered",
    "cedilla",
    "onesuperior",
    "ordmasculine",
    "guillemotright",
    "onequarter",
    "onehalf",
    "threequarters",
    "questiondown",
    "Agrave",
    "Aacute",
    "Acircumflex",
    "Atilde",
    "Adieresis",
    "Aring",
    "AE",
    "Ccedilla",
    "Egrave",
    "Eacute",
    "Ecircumflex",
    "Edieresis",
    "Igrave",
    "Iacute",
    "Icircumflex",
    "Idieresis",
    "Eth",
    "Ntilde",
    "Ograve",
    "Oacute",
    "Ocircumflex",
    "Otilde",
    "Odieresis",
    "multiply",
    "Oslash",
    "Ugrave",
    "Uacute",
    "Ucircumflex",
    "Udieresis",
    "Yacute",
    "Thorn",
    "germandbls",
    "agrave",
    "aacute",
    "acircumflex",
    "atilde",
    "adieresis",
    "aring",
    "ae",
    "ccedilla",
    "egrave",
    "eacute",
    "ecircumflex",
    "edieresis",
    "igrave",
    "iacute",
    "icircumflex",
    "idieresis",
    "eth",
    "ntilde",
    "ograve",
    "oacute",
    "ocircumflex",
    "otilde",
    "odieresis",
    "divide",
    "oslash",
    "ugrave",
    "uacute",
    "ucircumflex",
    "udieresis",
    "yacute",
    "thorn",
    "ydieresis",
]


def get_type1_encoding(name):
    match name:
        case "StandardEncoding":
            return adobe_standard
        case "MacRomanEncoding":
            return mac_roman
        case "WinAnsiEncoding":
            return win_ansi
        case "MacExpertEncoding":
            return mac_expert

```

## /babeldoc/format/pdf/converter.py

```py path="/babeldoc/format/pdf/converter.py" 
import logging
import re
import unicodedata

import numpy as np
from pymupdf import Font

from babeldoc.format.pdf.document_il.frontend.il_creater import ILCreater
from babeldoc.pdfminer.converter import PDFConverter
from babeldoc.pdfminer.layout import LTChar
from babeldoc.pdfminer.layout import LTComponent
from babeldoc.pdfminer.layout import LTFigure
from babeldoc.pdfminer.layout import LTLine
from babeldoc.pdfminer.layout import LTPage
from babeldoc.pdfminer.layout import LTText
from babeldoc.pdfminer.pdfcolor import PDFColorSpace
from babeldoc.pdfminer.pdffont import PDFCIDFont
from babeldoc.pdfminer.pdffont import PDFFont
from babeldoc.pdfminer.pdffont import PDFUnicodeNotDefined
from babeldoc.pdfminer.pdfinterp import PDFGraphicState
from babeldoc.pdfminer.pdfinterp import PDFResourceManager
from babeldoc.pdfminer.utils import Matrix
from babeldoc.pdfminer.utils import apply_matrix_pt
from babeldoc.pdfminer.utils import bbox2str
from babeldoc.pdfminer.utils import matrix2str
from babeldoc.pdfminer.utils import mult_matrix

log = logging.getLogger(__name__)


class PDFConverterEx(PDFConverter):
    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        il_creater: ILCreater | None = None,
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        # 重载替换 cropbox
        (x0, y0, x1, y1) = page.cropbox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.il_creater.on_page_media_box(
            mediabox[0],
            mediabox[1],
            mediabox[2],
            mediabox[3],
        )
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        # 重载返回指令流
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        # 重载设置 pageid
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        # 重载返回指令流
        fig = self.cur_item
        if not isinstance(self.cur_item, LTFigure):
            raise ValueError(f"Unexpected item type: {type(self.cur_item)}")
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        # 重载设置 cid 和 font
        try:
            text = font.to_unichr(cid)
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        if not hasattr(font, "xobj_id"):
            log.debug(
                f"Font {font.fontname} does not have xobj_id attribute.",
            )
            font_id = "UNKNOW"
        else:
            font_id = self.il_creater.current_page_font_name_id_map.get(
                font.xobj_id, None
            )

        item = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.il_creater.xobj_id,
            font_id,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack 插入原字符编码
        item.font = font  # hack 插入原字符字体
        return item.adv


class AWLTChar(LTChar):
    """Actual letter in the text as a Unicode string."""

    def __init__(
        self,
        matrix: Matrix,
        font: PDFFont,
        fontsize: float,
        scaling: float,
        rise: float,
        text: str,
        textwidth: float,
        textdisp: float | tuple[float | None, float],
        ncs: PDFColorSpace,
        graphicstate: PDFGraphicState,
        xobj_id: int,
        font_id: str,
    ) -> None:
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.ncs = ncs
        self.graphicstate = graphicstate
        self.xobj_id = xobj_id
        self.adv = textwidth * fontsize * scaling
        self.aw_font_id = font_id
        # compute the boundary rectangle.
        if font.is_vertical():
            # vertical
            assert isinstance(textdisp, tuple)
            (vx, vy) = textdisp
            if vx is None:
                vx = fontsize * 0.5
            else:
                vx = vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            bbox_lower_left = (-vx, vy + rise + self.adv)
            bbox_upper_right = (-vx + fontsize, vy + rise)
        else:
            # horizontal
            descent = font.get_descent() * fontsize
            bbox_lower_left = (0, descent + rise)
            bbox_upper_right = (self.adv, descent + rise + fontsize)
        (a, b, c, d, e, f) = self.matrix
        self.upright = a * d * scaling > 0 and b * c <= 0
        (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
        (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
        if x1 < x0:
            (x0, x1) = (x1, x0)
        if y1 < y0:
            (y0, y1) = (y1, y0)
        LTComponent.__init__(self, (x0, y0, x1, y1))
        if font.is_vertical() or matrix[0] == 0:
            self.size = self.width
        else:
            self.size = self.height
        return

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

    def get_text(self) -> str:
        return self._text


class Paragraph:
    def __init__(self, y, x, x0, x1, size, brk):
        self.y: float = y  # 初始纵坐标
        self.x: float = x  # 初始横坐标
        self.x0: float = x0  # 左边界
        self.x1: float = x1  # 右边界
        self.size: float = size  # 字体大小
        self.brk: bool = brk  # 换行标记


# fmt: off
class TranslateConverter(PDFConverterEx):
    def __init__(
        self,
        rsrcmgr,
        vfont: str | None = None,
        vchar: str | None = None,
        thread: int = 0,
        layout: dict | None = None,
        lang_in: str = "",  # 保留参数但添加未使用标记
        _lang_out: str = "",  # 改为未使用参数
        _service: str = "",  # 改为未使用参数
        resfont: str = "",
        noto: Font | None = None,
        envs: dict | None = None,
        _prompt: list | None = None,  # 改为未使用参数
        il_creater: ILCreater | None = None,
    ):
        layout = layout or {}
        super().__init__(rsrcmgr, il_creater)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto

    def receive_layout(self, ltpage: LTPage):
        # 段落
        sstk: list[str] = []            # 段落文字栈
        pstk: list[Paragraph] = []      # 段落属性栈
        vbkt: int = 0                   # 段落公式括号计数
        # 公式组
        vstk: list[LTChar] = []         # 公式符号组
        vlstk: list[LTLine] = []        # 公式线条组
        vfix: float = 0                 # 公式纵向偏移
        # 公式组栈
        var: list[list[LTChar]] = []    # 公式符号组栈
        varl: list[list[LTLine]] = []   # 公式线条组栈
        varf: list[float] = []          # 公式纵向偏移栈
        vlen: list[float] = []          # 公式宽度栈
        # 全局
        lstk: list[LTLine] = []         # 全局线条栈
        xt: LTChar = None               # 上一个字符
        xt_cls: int = -1                # 上一个字符所属段落，保证无论第一个字符属于哪个类别都可以触发新段落
        vmax: float = ltpage.width / 4  # 行内公式最大宽度
        ops: str = ""                   # 渲染结果

        def vflag(font: str, char: str):    # 匹配公式（和角标）字体
            if isinstance(font, bytes):     # 不一定能 decode，直接转 str
                font = str(font)
            font = font.split("+")[-1]      # 字体名截断
            if re.match(r"\(cid:", char):
                return True
            # 基于字体名规则的判定
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(                                            # latex 字体
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # 基于字符集规则的判定
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "                                     # 非空格
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # 文字修饰符、数学符号、分隔符号
                        or ord(char[0]) in range(0x370, 0x400)          # 希腊字母
                    )
                ):
                    return True
            return False

        ############################################################
        # A. 原文档解析
        for child in ltpage:
            if isinstance(child, LTChar):
                try:
                    self.il_creater.on_lt_char(child)
                except Exception:
                    log.exception(
                        'Error processing LTChar',
                    )
                continue
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
                h, w = layout.shape
                # 读取当前字符在 layout 中的类别
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                # 锚定文档中 bullet 的位置
                if child.get_text() == "•":
                    cls = 0
                # 判定当前字符是否属于公式
                if (                                                                                        # 判定当前字符是否属于公式
                    cls == 0                                                                                # 1. 类别为保留区域
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. 角标字体，有 0.76 的角标和 0.799 的大写，这里用 0.79 取中，同时考虑首字母放大的情况
                    or vflag(child.fontname, child.get_text())                                              # 3. 公式字体
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. 垂直字体
                ):
                    cur_v = True
                # 判定括号组是否属于公式
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (                                                        # 判定当前公式是否结束
                    not cur_v                                               # 1. 当前字符不属于公式
                    or cls != xt_cls                                        # 2. 当前字符与前一个字符不属于同一段落
                    # or (abs(child.x0 - xt.x0) > vmax and cls != 0)        # 3. 段落内换行，可能是一长串斜体的段落，也可能是段内分式换行，这里设个阈值进行区分
                    # 禁止纯公式（代码）段落换行，直到文字开始再重开文字段落，保证只存在两种情况
                    # A. 纯公式（代码）段落（锚定绝对位置）sstk[-1]=="" -> sstk[-1]=="{v*}"
                    # B. 文字开头段落（排版相对位置）sstk[-1]!=""
                    or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax)    # 因为 cls==xt_cls==0 一定有 sstk[-1]==""，所以这里不需要再判定 cls!=0
                ):
                    if vstk:
                        if (                                                # 根据公式右侧的文字修正公式的纵向偏移
                            not cur_v                                       # 1. 当前字符不属于公式
                            and cls == xt_cls                               # 2. 当前字符与前一个字符属于同一段落
                            and child.x0 > max([vch.x0 for vch in vstk])    # 3. 当前字符在公式右侧
                        ):
                            vfix = vstk[0].y0 - child.y0
                        if sstk[-1] == "":
                            xt_cls = -1 # 禁止纯公式段落（sstk[-1]=="{v*}"）的后续连接，但是要考虑新字符和后续字符的连接，所以这里修改的是上个字符的类别
                        sstk[-1] += f"{{v{len(var)}}}"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # 当前字符不属于公式或当前字符是公式的第一个字符
                if not vstk:
                    if cls == xt_cls:               # 当前字符与前一个字符属于同一段落
                        if child.x0 > xt.x1 + 1:    # 添加行内空格
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:      # 添加换行空格并标记原文段落存在换行
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:                           # 根据当前字符构建一个新的段落
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
                if not cur_v:                                               # 文字入栈
                    if (                                                    # 根据当前字符修正段落属性
                        child.size > pstk[-1].size / 0.79                   # 1. 当前字符显著比段落字体大
                        or len(sstk[-1].strip()) == 1                       # 2. 当前字符为段落第二个文字（考虑首字母放大的情况）
                    ) and child.get_text() != " ":                          # 3. 当前字符不是空格
                        pstk[-1].y -= child.size - pstk[-1].size            # 修正段落初始纵坐标，假设两个不同大小字符的上边界对齐
                        pstk[-1].size = child.size
                    sstk[-1] += child.get_text()
                else:                                                       # 公式入栈
                    if (                                                    # 根据公式左侧的文字修正公式的纵向偏移
                        not vstk                                            # 1. 当前字符是公式的第一个字符
                        and cls == xt_cls                                   # 2. 当前字符与前一个字符属于同一段落
                        and child.x0 > xt.x0                                # 3. 前一个字符在公式左侧
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # 更新段落边界，因为段落内换行之后可能是公式开头，所以要在外边处理
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # 更新上一个字符
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):
                # 图表
                self.il_creater.on_pdf_figure(child)
                pass
            elif isinstance(child, LTLine):     # 线条
                continue
                layout = self.layout[ltpage.pageid]
                # ltpage.height 可能是 fig 里面的高度，这里统一用 layout.shape
                h, w = layout.shape
                # 读取当前线条在 layout 中的类别
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:      # 公式线条
                    vlstk.append(child)
                else:                           # 全局线条
                    lstk.append(child)
            else:
                pass
        return
        # 处理结尾
        if vstk:    # 公式出栈
            sstk[-1] += f"{{v{len(var)}}}"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for var_id, v in enumerate(var):  # 计算公式宽度
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. 段落翻译
        log.debug("\n==========[SSTACK]==========\n")

        news = sstk.copy()

        ############################################################
        # C. 新文档排版
        def raw_string(fcur: str, cstk: str):  # 编码字符串
            if fcur == 'noto':
                return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # 判断编码长度
                return "".join([f"{ord(c):04x}" for c in cstk])
            else:
                return "".join([f"{ord(c):02x}" for c in cstk])

        _x, _y = 0, 0
        for para_id, new in enumerate(news):
            x: float = pstk[para_id].x           # 段落初始横坐标
            y: float = pstk[para_id].y           # 段落初始纵坐标
            x0: float = pstk[para_id].x0         # 段落左边界
            x1: float = pstk[para_id].x1         # 段落右边界
            size: float = pstk[para_id].size     # 段落字体大小
            brk: bool = pstk[para_id].brk        # 段落换行标记
            cstk: str = ""                  # 当前文字栈
            fcur: str = None                # 当前字体 ID
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
                )  # 匹配 {vn} 公式标记
                mod = 0  # 文字修饰符
                if vy_regex:  # 加载公式
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception as e:
                        log.debug("Skipping formula placeholder due to: %s", e)
                        continue  # 翻译器可能会自动补个越界的公式标记
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # 文字修饰符
                        mod = var[vid][-1].width
                else:  # 加载文字
                    ch = new[ptr]
                    fcur_ = None
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # 默认拉丁字体
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # 默认非拉丁字体
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (                                # 输出文字缓冲区
                    fcur_ != fcur                   # 1. 字体更新
                    or vy_regex                     # 2. 插入公式
                    or x + adv > x1 + 0.1 * size    # 3. 到达右边界（可能一整行都被符号化，这里需要考虑浮点误差）
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # 到达右边界且原文段落存在换行
                    x = x0
                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                    # y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # 小语种大多适配 1.1
                    y -= size * 1.4
                if vy_regex:  # 插入公式
                    fix = 0
                    if fcur is not None:  # 段落内公式修正纵向偏移
                        fix = varf[vid]
                    for vch in var[vid]:  # 排版公式字符
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm <{raw_string(self.fontid[vch.font], vc)}> TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # 排版公式线条
                        if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # 插入文字缓冲区
                    if not cstk:  # 单行开头
                        tx = x
                        if x == x0 and ch == " ":  # 消除段落换行空格
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod # 文字修饰符
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # 处理结尾
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm <{raw_string(fcur, cstk)}> TJ "
        for l in lstk:  # 排版全局线条
            if l.linewidth < 5:  # hack 有的文档会用粗线条当图片背景
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops

```

## /babeldoc/format/pdf/document_il/__init__.py

```py path="/babeldoc/format/pdf/document_il/__init__.py" 
from babeldoc.format.pdf.document_il.il_version_1 import BaseOperations
from babeldoc.format.pdf.document_il.il_version_1 import Box
from babeldoc.format.pdf.document_il.il_version_1 import Cropbox
from babeldoc.format.pdf.document_il.il_version_1 import Document
from babeldoc.format.pdf.document_il.il_version_1 import GraphicState
from babeldoc.format.pdf.document_il.il_version_1 import Mediabox
from babeldoc.format.pdf.document_il.il_version_1 import Page
from babeldoc.format.pdf.document_il.il_version_1 import PageLayout
from babeldoc.format.pdf.document_il.il_version_1 import PdfCharacter
from babeldoc.format.pdf.document_il.il_version_1 import PdfFigure
from babeldoc.format.pdf.document_il.il_version_1 import PdfFont
from babeldoc.format.pdf.document_il.il_version_1 import PdfFontCharBoundingBox
from babeldoc.format.pdf.document_il.il_version_1 import PdfFormula
from babeldoc.format.pdf.document_il.il_version_1 import PdfLine
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraph
from babeldoc.format.pdf.document_il.il_version_1 import PdfParagraphComposition
from babeldoc.format.pdf.document_il.il_version_1 import PdfRectangle
from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleCharacters
from babeldoc.format.pdf.document_il.il_version_1 import PdfSameStyleUnicodeCharacters
from babeldoc.format.pdf.document_il.il_version_1 import PdfStyle
from babeldoc.format.pdf.document_il.il_version_1 import PdfXobject
from babeldoc.format.pdf.document_il.il_version_1 import VisualBbox

__all__ = [
    "BaseOperations",
    "Box",
    "Cropbox",
    "Document",
    "GraphicState",
    "Mediabox",
    "Page",
    "PageLayout",
    "PdfCharacter",
    "PdfFigure",
    "PdfFont",
    "PdfFontCharBoundingBox",
    "PdfFormula",
    "PdfLine",
    "PdfParagraph",
    "PdfParagraphComposition",
    "PdfRectangle",
    "PdfSameStyleCharacters",
    "PdfSameStyleUnicodeCharacters",
    "PdfStyle",
    "PdfXobject",
    "VisualBbox",
]

```

## /babeldoc/format/pdf/document_il/backend/__init__.py

```py path="/babeldoc/format/pdf/document_il/backend/__init__.py" 

```

## /babeldoc/format/pdf/document_il/backend/pdf_creater.py

```py path="/babeldoc/format/pdf/document_il/backend/pdf_creater.py" 
import io
import itertools
import logging
import os
import re
import time
import unicodedata
from multiprocessing import Process
from pathlib import Path

import freetype
import pymupdf
from bitstring import BitStream

from babeldoc.assets.embedding_assets_metadata import FONT_NAMES
from babeldoc.format.pdf.document_il import il_version_1
from babeldoc.format.pdf.document_il.utils.fontmap import FontMapper
from babeldoc.format.pdf.document_il.utils.zstd_helper import zstd_decompress
from babeldoc.format.pdf.translation_config import TranslateResult
from babeldoc.format.pdf.translation_config import TranslationConfig
from babeldoc.format.pdf.translation_config import WatermarkOutputMode

logger = logging.getLogger(__name__)

SUBSET_FONT_STAGE_NAME = "Subset font"
SAVE_PDF_STAGE_NAME = "Save PDF"


def to_int(src):
    return int(re.search(r"\d+", src).group(0))


def parse_mapping(text):
    mapping = []
    for x in re.finditer(rb"<(?P<num>[a-fA-F0-9]+)>", text):
        mapping.append(int(x.group("num"), 16))
    return mapping


def apply_normalization(cmap, gid, code):
    need = False
    if 0x2F00 <= code <= 0x2FD5:  # Kangxi Radicals
        need = True
    if 0xF900 <= code <= 0xFAFF:  # CJK Compatibility Ideographs
        need = True
    if need:
        norm = unicodedata.normalize("NFD", chr(code))
        cmap[gid] = ord(norm)
    else:
        cmap[gid] = code


def batched(iterable, n, *, strict=False):
    # batched('ABCDEFG', 3) → ABC DEF G
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(itertools.islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError("batched(): incomplete batch")
        yield batch


def update_tounicode_cmap_pair(cmap, data):
    for start, stop, value in batched(data, 3):
        for gid in range(start, stop + 1):
            code = value + gid - start
            apply_normalization(cmap, gid, code)


def update_tounicode_cmap_code(cmap, data):
    for gid, code in batched(data, 2):
        apply_normalization(cmap, gid, code)


def parse_tounicode_cmap(data):
    cmap = {}
    for x in re.finditer(
        rb"\s+beginbfrange\s*(?P<r>(<[0-9a-fA-F]+>\s*)+)endbfrange\s+", data
    ):
        update_tounicode_cmap_pair(cmap, parse_mapping(x.group("r")))
    for x in re.finditer(
        rb"\s+beginbfchar\s*(?P<c>(<[0-9a-fA-F]+>\s*)+)endbfchar", data
    ):
        update_tounicode_cmap_code(cmap, parse_mapping(x.group("c")))
    return cmap


def parse_truetype_data(data):
    glyph_in_use = []
    face = freetype.Face(io.BytesIO(data))
    for i in range(face.num_glyphs):
        face.load_glyph(i)
        if face.glyph.outline.contours:
            glyph_in_use.append(i)
    return glyph_in_use


TOUNICODE_HEAD = """\
/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo <</Registry(Adobe)/Ordering(UCS)/Supplement 0>> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange"""
TOUNICODE_TAIL = """\
endcmap
CMapName currentdict /CMap defineresource pop
end
end"""


def make_tounicode(cmap, used):
    short = []
    for x in used:
        if x in cmap:
            short.append((x, cmap[x]))
    line = [TOUNICODE_HEAD]
    for block in batched(short, 100):
        line.append(f"{len(block)} beginbfchar")
        for glyph, code in block:
            if code < 0x10000:
                line.append(f"<{glyph:04x}><{code:04x}>")
            else:
                code -= 0x10000
                high = 0xD800 + (code >> 10)
                low = 0xDC00 + (code & 0b1111111111)
                line.append(f"<{glyph:04x}><{high:04x}{low:04x}>")
        line.append("endbfchar")
    line.append(TOUNICODE_TAIL)
    return "\n".join(line)


def reproduce_one_font(doc, index):
    m = doc.xref_get_key(index, "ToUnicode")
    f = doc.xref_get_key(index, "DescendantFonts")
    if m[0] == "xref" and f[0] == "array":
        mi = to_int(m[1])
        fi = to_int(f[1])
        ff = doc.xref_get_key(fi, "FontDescriptor/FontFile2")
        ms = doc.xref_stream(mi)
        fs = doc.xref_stream(to_int(ff[1]))
        cmap = parse_tounicode_cmap(ms)
        used = parse_truetype_data(fs)
        text = make_tounicode(cmap, used)
        doc.update_stream(mi, bytes(text, "U8"))


def reproduce_cmap(doc):
    assert doc
    font_set = set()
    for page in doc:
        font_list = page.get_fonts()
        for font in font_list:
            if font[1] == "ttf" and font[3] in FONT_NAMES and ".ttf" in font[4]:
                font_set.add(font)
    for font in font_set:
        reproduce_one_font(doc, font[0])
    return doc


def _subset_fonts_process(pdf_path, output_path):
    """Function to run in subprocess for font subsetting.

    Args:
        pdf_path: Path to the PDF file to subset
        output_path: Path where to save the result
    """
    try:
        pdf = pymupdf.open(pdf_path)
        pdf.subset_fonts(fallback=False)
        pdf.save(output_path)
        # 返回 0 表示成功
        os._exit(0)
    except Exception as e:
        logger.error(f"Error in font subsetting subprocess: {e}")
        # 返回 1 表示失败
        os._exit(1)


def _save_pdf_clean_process(
    pdf_path,
    output_path,
    garbage=1,
    deflate=True,
    clean=True,
    deflate_fonts=True,
    linear=False,
):
    """Function to run in subprocess for saving PDF with clean=True which can be time-consuming.

    Args:
        pdf_path: Path to the PDF file to save
        output_path: Path where to save the result
        garbage: Garbage collection level (0, 1, 2, 3, 4)
        deflate: Whether to deflate the PDF
        clean: Whether to clean the PDF
        deflate_fonts: Whether to deflate fonts
        linear: Whether to linearize the PDF
    """
    try:
        pdf = pymupdf.open(pdf_path)
        pdf.save(
            output_path,
            garbage=garbage,
            deflate=deflate,
            clean=clean,
            deflate_fonts=deflate_fonts,
            linear=linear,
        )
        # 返回 0 表示成功
        os._exit(0)
    except Exception as e:
        logger.error(f"Error in save PDF with clean=True subprocess: {e}")
        # 返回 1 表示失败
        os._exit(1)


class PDFCreater:
    stage_name = "Generate drawing instructions"

    def __init__(
        self,
        original_pdf_path: str,
        document: il_version_1.Document,
        translation_config: TranslationConfig,
        mediabox_data: dict,
    ):
        self.original_pdf_path = original_pdf_path
        self.docs = document
        self.font_path = translation_config.font
        self.font_mapper = FontMapper(translation_config)
        self.translation_config = translation_config
        self.mediabox_data = mediabox_data

    def render_graphic_state(
        self,
        draw_op: BitStream,
        graphic_state: il_version_1.GraphicState,
    ):
        if graphic_state is None:
            return
        # if graphic_state.stroking_color_space_name:
        #     draw_op.append(
        #         f"/{graphic_state.stroking_color_space_name} CS \n".encode()
        #     )
        # if graphic_state.non_stroking_color_space_name:
        #     draw_op.append(
        #         f"/{graphic_state.non_stroking_color_space_name}"
        #         f" cs \n".encode()
        #     )
        # if graphic_state.ncolor is not None:
        #     if len(graphic_state.ncolor) == 1:
        #         draw_op.append(f"{graphic_state.ncolor[0]} g \n".encode())
        #     elif len(graphic_state.ncolor) == 3:
        #         draw_op.append(
        #             f"{' '.join((str(x) for x in graphic_state.ncolor))} sc \n".encode()
        #         )
        # if graphic_state.scolor is not None:
        #     if len(graphic_state.scolor) == 1:
        #         draw_op.append(f"{graphic_state.scolor[0]} G \n".encode())
        #     elif len(graphic_state.scolor) == 3:
        #         draw_op.append(
        #             f"{' '.join((str(x) for x in graphic_state.scolor))} SC \n".encode()
        #         )

        if graphic_state.passthrough_per_char_instruction:
            draw_op.append(
                f"{graphic_state.passthrough_per_char_instruction} \n".encode(),
            )

    def render_paragraph_to_char(
        self,
        paragraph: il_version_1.PdfParagraph,
    ) -> list[il_version_1.PdfCharacter]:
        chars = []
        for composition in paragraph.pdf_paragraph_composition:
            if not isinstance(composition.pdf_character, il_version_1.PdfCharacter):
                logger.error(
                    f"Unknown composition type. "
                    f"This type only appears in the IL "
                    f"after the translation is completed."
                    f"During pdf rendering, this type is not supported."
                    f"Composition: {composition}. "
                    f"Paragraph: {paragraph}. ",
                )
                continue
            chars.append(composition.pdf_character)
        if not chars and paragraph.unicode and paragraph.debug_id:
            logger.error(
                f"Unable to export paragraphs that have "
                f"not yet been formatted: {paragraph}",
            )
            return chars
        return chars

    def get_available_font_list(self, pdf, page):
        page_xref_id = pdf[page.page_number].xref
        return self.get_xobj_available_fonts(page_xref_id, pdf)

    def get_xobj_available_fonts(self, page_xref_id, pdf):
        try:
            resources_type, r_id = pdf.xref_get_key(page_xref_id, "Resources")
            if resources_type == "xref":
                resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
                r_id = pdf.xref_object(int(resource_xref_id))
                resources_type = "dict"
            if resources_type == "dict":
                xref_id = re.search("/Font (\\d+) 0 R", r_id)
                if xref_id is not None:
                    xref_id = xref_id.group(1)
                    font_dict = pdf.xref_object(int(xref_id))
                else:
                    search = re.search("/Font *<<(.+?)>>", r_id.replace("\n", " "))
                    if search is None:
                        # Have resources but no fonts
                        return set()
                    font_dict = search.group(1)
            else:
                r_id = int(r_id.split(" ")[0])
                _, font_dict = pdf.xref_get_key(r_id, "Font")
            fonts = re.findall("/([^ ]+?) ", font_dict)
            return set(fonts)
        except Exception:
            return set()

    def _render_rectangle(
        self,
        draw_op: BitStream,
        rectangle: il_version_1.PdfRectangle,
        line_width: float = 0.4,
    ):
        """Draw a rectangle in PDF for visualization purposes.

        Args:
            draw_op: BitStream to append PDF drawing operations
            rectangle: Rectangle object containing position information
            line_width: Line width
        """
        x1 = rectangle.box.x
        y1 = rectangle.box.y
        x2 = rectangle.box.x2
        y2 = rectangle.box.y2
        width = x2 - x1
        height = y2 - y1
        # Save graphics state
        draw_op.append(b"q ")

        # Set green color for debug visibility
        draw_op.append(
            rectangle.graphic_state.passthrough_per_char_instruction.encode(),
        )  # Green stroke
        if rectangle.line_width is not None:
            line_width = rectangle.line_width
        if line_width > 0:
            draw_op.append(f" {line_width} w ".encode())  # Line width
        draw_op.append(f"{x1} {y1} {width} {height} re ".encode())
        if rectangle.fill_background:
            draw_op.append(b" f ")
        else:
            draw_op.append(b" S ")

        # Restore graphics state
        draw_op.append(b"Q\n")

    def create_side_by_side_dual_pdf(
        self,
        original_pdf: pymupdf.Document,
        translated_pdf: pymupdf.Document,
        dual_out_path: str,
        translation_config: TranslationConfig,
    ) -> pymupdf.Document:
        """Create a dual PDF with side-by-side pages (original and translation).

        Args:
            original_pdf: Original PDF document
            translated_pdf: Translated PDF document
            dual_out_path: Output path for the dual PDF
            translation_config: Translation configuration

        Returns:
            The created dual PDF document
        """
        # Create a new PDF for side-by-side pages
        dual = pymupdf.open()
        page_count = min(original_pdf.page_count, translated_pdf.page_count)

        for page_id in range(page_count):
            # Get pages from both PDFs
            orig_page = original_pdf[page_id]
            trans_page = translated_pdf[page_id]
            rotate_angle = orig_page.rotation
            total_width = orig_page.rect.width + trans_page.rect.width
            max_height = max(orig_page.rect.height, trans_page.rect.height)
            left_width = (
                orig_page.rect.width
                if not translation_config.dual_translate_first
                else trans_page.rect.width
            )

            orig_page.set_rotation(0)
            trans_page.set_rotation(0)

            # Create new page with combined width
            dual_page = dual.new_page(width=total_width, height=max_height)

            # Define rectangles for left and right sides
            rect_left = pymupdf.Rect(0, 0, left_width, max_height)
            rect_right = pymupdf.Rect(left_width, 0, total_width, max_height)

            # Show pages according to dual_translate_first setting
            if translation_config.dual_translate_first:
                # Show translated page on left and original on right
                rect_left, rect_right = rect_right, rect_left
            try:
                # Show original page on left and translated on right (default)
                dual_page.show_pdf_page(
                    rect_left,
                    original_pdf,
                    page_id,
                    keep_proportion=True,
                    rotate=-rotate_angle,
                )
            except Exception as e:
                logger.warning(
                    f"Failed to show original page on left and translated on right (default). "
                    f"Page ID: {page_id}. "
                    f"Original PDF: {self.original_pdf_path}. "
                    f"Translated PDF: {translation_config.input_file}. ",
                    exc_info=e,
                )
            try:
                dual_page.show_pdf_page(
                    rect_right,
                    translated_pdf,
                    page_id,
                    keep_proportion=True,
                    rotate=-rotate_angle,
                )
            except Exception as e:
                logger.warning(
                    f"Failed to show translated page on left and original on right. "
                    f"Page ID: {page_id}. "
                    f"Original PDF: {self.original_pdf_path}. "
                    f"Translated PDF: {translation_config.input_file}. ",
                    exc_info=e,
                )
        return dual

    def create_alternating_pages_dual_pdf(
        self,
        original_pdf_path: str,
        translated_pdf: pymupdf.Document,
        translation_config: TranslationConfig,
    ) -> pymupdf.Document:
        """Create a dual PDF with alternating pages (original and translation).

        Args:
            original_pdf_path: Path to the original PDF
            translated_pdf: Translated PDF document
            translation_config: Translation configuration

        Returns:
            The created dual PDF document
        """
        # Open the original PDF and insert translated PDF
        dual = pymupdf.open(original_pdf_path)
        dual.insert_file(translated_pdf)

        # Rearrange pages to alternate between original and translated
        page_count = translated_pdf.page_count
        for page_id in range(page_count):
            if translation_config.dual_translate_first:
                dual.move_page(page_count + page_id, page_id * 2)
            else:
                dual.move_page(page_count + page_id, page_id * 2 + 1)

        return dual

    def write_debug_info(
        self,
        pdf: pymupdf.Document,
        translation_config: TranslationConfig,
    ):
        self.font_mapper.add_font(pdf, self.docs)

        for page in self.docs.page:
            _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
            resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
            base_op = pdf.xref_stream(int(resource_xref_id))
            translation_config.raise_if_cancelled()
            xobj_available_fonts = {}
            xobj_draw_ops = {}
            xobj_encoding_length_map = {}
            available_font_list = self.get_available_font_list(pdf, page)

            page_encoding_length_map = {
                f.font_id: f.encoding_length for f in page.pdf_font
            }
            page_op = BitStream()
            # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
            page_op.append(b"q ")
            if base_op is not None:
                page_op.append(base_op)
            page_op.append(b" Q ")
            page_op.append(
                f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode(),
            )
            # 收集所有字符
            chars = []
            # 首先添加页面级别的字符
            if page.pdf_character:
                chars.extend(page.pdf_character)
            # 然后添加段落中的字符
            for paragraph in page.pdf_paragraph:
                chars.extend(self.render_paragraph_to_char(paragraph))

            # 渲染所有字符
            for char in chars:
                if not getattr(char, "debug_info", False):
                    continue
                if char.char_unicode == "\n":
                    continue
                if char.pdf_character_id is None:
                    # dummy char
                    continue
                char_size = char.pdf_style.font_size
                font_id = char.pdf_style.font_id

                if font_id not in available_font_list:
                    continue
                draw_op = page_op
                encoding_length_map = page_encoding_length_map

                draw_op.append(b"q ")
                self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
                if char.vertical:
                    draw_op.append(
                        f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode(),
                    )
                else:
                    draw_op.append(
                        f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode(),
                    )

                encoding_length = encoding_length_map[font_id]
                # pdf32000-2008 page14:
                # As hexadecimal data enclosed in angle brackets < >
                # see 7.3.4.3, "Hexadecimal Strings."
                draw_op.append(
                    f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode(),
                )

                draw_op.append(b" Tj ET Q \n")
            for rect in page.pdf_rectangle:
                if not rect.debug_info:
                    continue
                self._render_rectangle(page_op, rect)
            draw_op = page_op
            # Since this is a draw instruction container,
            # no additional information is needed
            pdf.update_stream(int(resource_xref_id), draw_op.tobytes())
        translation_config.raise_if_cancelled()

        # 使用子进程进行字体子集化
        if not translation_config.skip_clean:
            pdf = self.subset_fonts_in_subprocess(pdf, translation_config, tag="debug")
        return pdf

    @staticmethod
    def subset_fonts_in_subprocess(
        pdf: pymupdf.Document, translation_config: TranslationConfig, tag: str
    ) -> pymupdf.Document:
        """Run font subsetting in a subprocess with timeout.

        Args:
            pdf: The PDF document object
            translation_config: Translation configuration

        Returns:
            Path to the PDF with subsetted fonts, or original path if subsetting failed or timed out
        """
        original_pdf = pdf
        # Create temporary file paths
        temp_input = str(
            translation_config.get_working_file_path(f"temp_subset_input_{tag}.pdf")
        )
        temp_output = str(
            translation_config.get_working_file_path(f"temp_subset_output_{tag}.pdf")
        )

        # Save PDF to temporary file without subsetting
        pdf.save(temp_input)

        # Create and start subprocess
        process = Process(target=_subset_fonts_process, args=(temp_input, temp_output))
        process.start()

        # Wait for subprocess with timeout (1 minute)
        timeout = 60  # 1 minutes in seconds
        start_time = time.time()

        while process.is_alive():
            if time.time() - start_time > timeout:
                logger.warning(
                    f"Font subsetting timeout after {timeout} seconds, terminating subprocess"
                )
                process.terminate()
                try:
                    process.join(5)  # Give it 5 seconds to clean up
                    if process.is_alive():
                        logger.warning("Subprocess did not terminate, killing it")
                        process.kill()
                        process.terminate()
                        process.kill()
                        process.terminate()
                        process.kill()
                        process.terminate()
                except Exception as e:
                    logger.error(f"Error terminating font subsetting process: {e}")

                return original_pdf

            time.sleep(0.5)  # Check every half second

        # Process completed, check exit code
        exit_code = process.exitcode
        success = exit_code == 0

        # Check if subsetting was successful
        if (
            success
            and Path(temp_output).exists()
            and Path(temp_output).stat().st_size > 0
        ):
            logger.info("Font subsetting completed successfully")
            return pymupdf.open(temp_output)
        else:
            logger.warning(
                f"Font subsetting failed with exit code {exit_code} or produced empty file"
            )
            return original_pdf

    @staticmethod
    def save_pdf_with_timeout(
        pdf: pymupdf.Document,
        output_path: str,
        translation_config: TranslationConfig,
        garbage: int = 1,
        deflate: bool = True,
        clean: bool = True,
        deflate_fonts: bool = True,
        linear: bool = False,
        timeout: int = 120,
        tag: str = "",
    ) -> bool:
        """Save a PDF document with a timeout for the clean=True operation.

        Args:
            pdf: The PDF document object
            output_path: Path where to save the PDF
            translation_config: Translation configuration
            garbage: Garbage collection level (0, 1, 2, 3, 4)
            deflate: Whether to deflate the PDF
            clean: Whether to clean the PDF
            deflate_fonts: Whether to deflate fonts
            linear: Whether to linearize the PDF
            timeout: Timeout in seconds (default: 2 minutes)

        Returns:
            True if saved with clean=True successfully, False if fallback to clean=False was used
        """
        # Create temporary file paths
        temp_input = str(
            translation_config.get_working_file_path(f"temp_save_input_{tag}.pdf")
        )
        temp_output = str(
            translation_config.get_working_file_path(f"temp_save_output_{tag}.pdf")
        )

        # Save PDF to temporary file first
        pdf.save(temp_input)

        # Try to save with clean=True in a subprocess
        process = Process(
            target=_save_pdf_clean_process,
            args=(
                temp_input,
                temp_output,
                garbage,
                deflate,
                clean,
                deflate_fonts,
                linear,
            ),
        )
        process.start()

        # Wait for subprocess with timeout
        start_time = time.time()

        while process.is_alive():
            if time.time() - start_time > timeout:
                logger.warning(
                    f"PDF save with clean={clean} timeout after {timeout} seconds, terminating subprocess"
                )
                process.terminate()
                try:
                    process.join(5)  # Give it 5 seconds to clean up
                    if process.is_alive():
                        logger.warning("Subprocess did not terminate, killing it")
                        process.kill()
                        process.terminate()
                        process.kill()
                        process.terminate()
                        process.kill()
                        process.terminate()
                except Exception as e:
                    logger.error(f"Error terminating PDF save process: {e}")

                # Fallback to save without clean parameter
                logger.info("Falling back to save with clean=False")
                try:
                    pdf.save(
                        output_path,
                        garbage=garbage,
                        deflate=deflate,
                        clean=False,
                        deflate_fonts=deflate_fonts,
                        linear=linear,
                    )
                    return False
                except Exception as e:
                    logger.error(f"Error in fallback save: {e}")
                    # Last resort: basic save
                    pdf.save(output_path)
                    return False

            time.sleep(0.5)  # Check every half second

        # Process completed, check exit code
        exit_code = process.exitcode
        success = exit_code == 0

        # Check if save was successful
        if (
            success
            and Path(temp_output).exists()
            and Path(temp_output).stat().st_size > 0
        ):
            logger.info(f"PDF save with clean={clean} completed successfully")
            # Copy the successfully created file to the target path
            try:
                import shutil

                shutil.copy2(temp_output, output_path)
                return True
            except Exception as e:
                logger.error(f"Error copying saved PDF: {e}")
                pdf.save(output_path)  # Fallback to direct save
                return False
            finally:
                Path(temp_input).unlink()
                Path(temp_output).unlink()
        else:
            logger.warning(
                f"PDF save with clean={clean} failed with exit code {exit_code} or produced empty file"
            )
            # Fallback to save without clean parameter
            try:
                pdf.save(
                    output_path,
                    garbage=garbage,
                    deflate=deflate,
                    clean=False,
                    deflate_fonts=deflate_fonts,
                    linear=linear,
                )
            except Exception as e:
                logger.error(f"Error in fallback save: {e}")
                # Last resort: basic save
                pdf.save(output_path)

            return False

    def restore_media_box(self, doc: pymupdf.Document, mediabox_data: dict) -> None:
        for xref, page_box_data in mediabox_data.items():
            for name, box in page_box_data.items():
                try:
                    doc.xref_set_key(xref, name, box)
                except Exception:
                    logger.debug(f"Error restoring media box {name} from PDF")

    def write(
        self, translation_config: TranslationConfig, check_font_exists: bool = False
    ) -> TranslateResult:
        try:
            basename = Path(translation_config.input_file).stem
            debug_suffix = ".debug" if translation_config.debug else ""
            if (
                translation_config.watermark_output_mode
                != WatermarkOutputMode.Watermarked
            ):
                debug_suffix += ".no_watermark"
            mono_out_path = translation_config.get_output_file_path(
                f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf",
            )
            pdf = pymupdf.open(self.original_pdf_path)
            self.font_mapper.add_font(pdf, self.docs)
            with self.translation_config.progress_monitor.stage_start(
                self.stage_name,
                len(self.docs.page),
            ) as pbar:
                for page in self.docs.page:
                    translation_config.raise_if_cancelled()
                    xobj_available_fonts = {}
                    xobj_draw_ops = {}
                    xobj_encoding_length_map = {}
                    available_font_list = self.get_available_font_list(pdf, page)
                    page_encoding_length_map = {
                        f.font_id: f.encoding_length for f in page.pdf_font
                    }
                    all_encoding_length_map = page_encoding_length_map.copy()

                    for xobj in page.pdf_xobject:
                        xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
                        try:
                            xobj_available_fonts[xobj.xobj_id].update(
                                self.get_xobj_available_fonts(xobj.xref_id, pdf),
                            )
                        except Exception:
                            pass
                        xobj_encoding_length_map[xobj.xobj_id] = {
                            f.font_id: f.encoding_length for f in xobj.pdf_font
                        }
                        all_encoding_length_map.update(
                            xobj_encoding_length_map[xobj.xobj_id]
                        )
                        xobj_encoding_length_map[xobj.xobj_id].update(
                            page_encoding_length_map
                        )
                        xobj_op = BitStream()
                        base_op = xobj.base_operations.value
                        base_op = zstd_decompress(base_op)
                        xobj_op.append(base_op.encode())
                        xobj_draw_ops[xobj.xobj_id] = xobj_op

                    page_op = BitStream()
                    # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
                    # page_op.append(b"q ")
                    base_op = page.base_operations.value
                    base_op = zstd_decompress(base_op)
                    page_op.append(base_op.encode())
                    page_op.append(b" \n")
                    # page_op.append(b" Q ")
                    # page_op.append(
                    #     f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode(),
                    # )
                    # 收集所有字符
                    chars = []
                    # 首先添加页面级别的字符
                    if page.pdf_character:
                        chars.extend(page.pdf_character)
                    # 然后添加段落中的字符
                    for paragraph in page.pdf_paragraph:
                        chars.extend(self.render_paragraph_to_char(paragraph))
                    for rect in page.pdf_rectangle:
                        if (
                            translation_config.ocr_workaround
                            and not rect.debug_info
                            and rect.fill_background
                        ):
                            if rect.xobj_id in xobj_available_fonts:
                                draw_op = xobj_draw_ops[rect.xobj_id]
                            else:
                                draw_op = page_op
                            self._render_rectangle(draw_op, rect, line_width=0.1)
                    # 渲染所有字符
                    for char in chars:
                        if char.char_unicode == "\n":
                            continue
                        if char.pdf_character_id is None:
                            # dummy char
                            continue
                        char_size = char.pdf_style.font_size
                        font_id = char.pdf_style.font_id
                        if char.xobj_id in xobj_available_fonts:
                            if (
                                check_font_exists
                                and font_id not in xobj_available_fonts[char.xobj_id]
                            ):
                                continue
                            draw_op = xobj_draw_ops[char.xobj_id]
                            encoding_length_map = xobj_encoding_length_map[char.xobj_id]
                        else:
                            if check_font_exists and font_id not in available_font_list:
                                continue
                            draw_op = page_op
                            encoding_length_map = page_encoding_length_map

                        draw_op.append(b"q ")
                        self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
                        if char.vertical:
                            draw_op.append(
                                f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode(),
                            )
                        else:
                            draw_op.append(
                                f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode(),
                            )

                        encoding_length = encoding_length_map.get(font_id, None)
                        if encoding_length is None:
                            if font_id in all_encoding_length_map:
                                encoding_length = all_encoding_length_map[font_id]
                            else:
                                logger.debug(
                                    f"Font {font_id} not found in encoding length map for page {page.page_number}"
                                )
                                continue
                        # pdf32000-2008 page14:
                        # As hexadecimal data enclosed in angle brackets < >
                        # see 7.3.4.3, "Hexadecimal Strings."
                        draw_op.append(
                            f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode(),
                        )

                        draw_op.append(b" Tj ET Q \n")
                    for xobj in page.pdf_xobject:
                        draw_op = xobj_draw_ops[xobj.xobj_id]
                        try:
                            pdf.update_stream(xobj.xref_id, draw_op.tobytes())
                        except Exception:
                            logger.warning(
                                f"update xref {xobj.xref_id} stream fail, continue"
                            )
                        # pdf.update_stream(xobj.xref_id, b'')
                    for rect in page.pdf_rectangle:
                        if translation_config.debug and rect.debug_info:
                            self._render_rectangle(page_op, rect)

                    draw_op = page_op
                    op_container = pdf.get_new_xref()
                    # Since this is a draw instruction container,
                    # no additional information is needed
                    pdf.update_object(op_container, "<<>>")
                    pdf.update_stream(op_container, draw_op.tobytes())
                    pdf[page.page_number].set_contents(op_container)
                    pbar.advance()
            translation_config.raise_if_cancelled()
            gc_level = 1
            if self.translation_config.ocr_workaround:
                gc_level = 4
            with self.translation_config.progress_monitor.stage_start(
                SUBSET_FONT_STAGE_NAME,
                1,
            ) as pbar:
                if not translation_config.skip_clean:
                    pdf = self.subset_fonts_in_subprocess(
                        pdf, translation_config, tag="mono"
                    )

                pbar.advance()
            try:
                self.restore_media_box(pdf, self.mediabox_data)
            except Exception:
                logger.exception("restore media box failed")

            if translation_config.only_include_translated_page:
                total_page = set(range(0, len(pdf)))

                pages_to_translate = {
                    page.page_number
                    for page in self.docs.page
                    if self.translation_config.should_translate_page(
                        page.page_number + 1
                    )
                }

                should_removed_page = list(total_page - pages_to_translate)

                pdf.delete_pages(should_removed_page)

            with self.translation_config.progress_monitor.stage_start(
                SAVE_PDF_STAGE_NAME,
                2,
            ) as pbar:
                if not translation_config.no_mono:
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        pdf.save(
                            f"{mono_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                    translation_config.raise_if_cancelled()
                    self.save_pdf_with_timeout(
                        pdf,
                        mono_out_path,
                        translation_config,
                        garbage=gc_level,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=False,
                        tag="mono",
                    )
                pbar.advance()
                dual_out_path = None
                if not translation_config.no_dual:
                    dual_out_path = translation_config.get_output_file_path(
                        f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf",
                    )
                    translation_config.raise_if_cancelled()
                    original_pdf = pymupdf.open(self.original_pdf_path)

                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        try:
                            original_pdf = self.write_debug_info(
                                original_pdf, translation_config
                            )
                        except Exception:
                            logger.warning(
                                "Failed to write debug info to dual PDF",
                                exc_info=True,
                            )

                    if (
                        self.translation_config.only_include_translated_page
                        and should_removed_page
                    ):
                        original_pdf.delete_pages(should_removed_page)
                    translated_pdf = pdf

                    # Choose between alternating pages and side-by-side format
                    # Default to side-by-side if not specified
                    use_alternating_pages = (
                        translation_config.use_alternating_pages_dual
                    )

                    if use_alternating_pages:
                        # Create a dual PDF with alternating pages (original and translation)
                        dual = self.create_alternating_pages_dual_pdf(
                            self.original_pdf_path,
                            translated_pdf,
                            translation_config,
                        )
                    else:
                        # Create a dual PDF with side-by-side pages (original and translation)
                        dual = self.create_side_by_side_dual_pdf(
                            original_pdf,
                            translated_pdf,
                            dual_out_path,
                            translation_config,
                        )

                    self.save_pdf_with_timeout(
                        dual,
                        dual_out_path,
                        translation_config,
                        garbage=gc_level,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=False,
                        tag="dual",
                    )
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        dual.save(
                            f"{dual_out_path}.decompressed.pdf",
                            expand=True,
                            pretty=True,
                        )
                pbar.advance()
            if self.translation_config.no_mono:
                mono_out_path = None
            if self.translation_config.no_dual:
                dual_out_path = None
            auto_extracted_glossary_path = None
            if (
                self.translation_config.save_auto_extracted_glossary
                and self.translation_config.shared_context_cross_split_part.auto_extracted_glossary
            ):
                auto_extracted_glossary_path = self.translation_config.get_output_file_path(
                    f"{basename}{debug_suffix}.{translation_config.lang_out}.glossary.csv"
                )
                with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
                    logger.info(
                        f"save auto extracted glossary to {auto_extracted_glossary_path}"
                    )
                    f.write(
                        self.translation_config.shared_context_cross_split_part.auto_extracted_glossary.to_csv()
                    )

            return TranslateResult(
                mono_out_path, dual_out_path, auto_extracted_glossary_path
            )
        except Exception:
            logger.exception(
                "Failed to create PDF: %s",
                translation_config.input_file,
            )
            if not check_font_exists:
                return self.write(translation_config, True)
            raise

```

## /babeldoc/format/pdf/document_il/frontend/__init__.py

```py path="/babeldoc/format/pdf/document_il/frontend/__init__.py" 

```

## /babeldoc/format/pdf/document_il/il_version_1.rnc

```rnc path="/babeldoc/format/pdf/document_il/il_version_1.rnc" 
start = Document
Document =
  element document {
    Page+,
    attribute totalPages { xsd:int }
  }
Page =
  element page {
    element mediabox { Box },
    element cropbox { Box },
    PDFXobject*,
    PageLayout*,
    PDFRectangle*,
    PDFFont*,
    PDFParagraph*,
    PDFFigure*,
    PDFCharacter*,
    attribute pageNumber { xsd:int },
    attribute Unit { xsd:string },
    element baseOperations { xsd:string }
  }
Box =
  element box {
    # from (x,y) to (x2,y2)
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float }
  }
PDFXrefId = xsd:int
PDFFont =
  element pdfFont {
    attribute name { xsd:string },
    attribute fontId { xsd:string },
    attribute xrefId { PDFXrefId },
    attribute encodingLength { xsd:int },
    attribute bold { xsd:boolean }?,
    attribute italic { xsd:boolean }?,
    attribute monospace { xsd:boolean }?,
    attribute serif { xsd:boolean }?,
    attribute ascent { xsd:float }?,
    attribute descent { xsd:float }?,
    PDFFontCharBoundingBox*
  }
PDFFontCharBoundingBox =
  element pdfFontCharBoundingBox {
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float },
    attribute char_id { xsd:int }
  }
PDFXobject =
  element pdfXobject {
    attribute xobjId { xsd:int },
    attribute xrefId { PDFXrefId },
    Box,
    PDFFont*,
    element baseOperations { xsd:string }
  }
PDFCharacter =
  element pdfCharacter {
    attribute vertical { xsd:boolean }?,
    attribute scale { xsd:float }?,
    attribute pdfCharacterId { xsd:int }?,
    attribute char_unicode { xsd:string },
    attribute advance { xsd:float }?,
    # xobject nesting depth
    attribute xobjId { xsd:int }?,
    attribute debug_info { xsd:boolean }?,
    attribute formula_layout_id { xsd:int }?,
    PDFStyle,
    Box,
    element visual_bbox { Box }?
  }
PageLayout =
  element pageLayout {
    attribute id { xsd:int },
    attribute conf { xsd:float },
    attribute class_name { xsd:string },
    Box
  }
GraphicState =
  element graphicState {
    attribute linewidth { xsd:float }?,
    attribute dash {
      list { xsd:float+ }
    }?,
    attribute flatness { xsd:float }?,
    attribute intent { xsd:string }?,
    attribute linecap { xsd:int }?,
    attribute linejoin { xsd:int }?,
    attribute miterlimit { xsd:float }?,
    attribute ncolor {
      list { xsd:float+ }
    }?,
    attribute scolor {
      list { xsd:float+ }
    }?,
    attribute strokingColorSpaceName { xsd:string }?,
    attribute nonStrokingColorSpaceName { xsd:string }?,
    attribute passthroughPerCharInstruction { xsd:string }?
  }
PDFStyle =
  element pdfStyle {
    attribute font_id { xsd:string },
    attribute font_size { xsd:float },
    GraphicState
  }
PDFParagraph =
  element pdfParagraph {
    attribute xobjId { xsd:int }?,
    attribute unicode { xsd:string },
    attribute scale { xsd:float }?,
    attribute optimal_scale { xsd:float }?,
    attribute vertical { xsd:boolean }?,
    attribute FirstLineIndent { xsd:boolean }?,
    attribute debug_id { xsd:string }?,
    attribute layout_label { xsd:string }?,
    attribute layout_id { xsd:int }?,
    Box,
    PDFStyle,
    PDFParagraphComposition*
  }
PDFParagraphComposition =
  element pdfParagraphComposition {
    PDFLine
    | PDFFormula
    | PDFSameStyleCharacters
    | PDFCharacter
    | PDFSameStyleUnicodeCharacters
  }
PDFLine = element pdfLine { Box, PDFCharacter+ }
PDFSameStyleCharacters =
  element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }
PDFSameStyleUnicodeCharacters =
  element pdfSameStyleUnicodeCharacters {
    PDFStyle?,
    attribute unicode { xsd:string },
    attribute debug_info { xsd:boolean }?
  }
PDFFormula =
  element pdfFormula {
    Box,
    PDFCharacter+,
    attribute x_offset { xsd:float },
    attribute y_offset { xsd:float },
    attribute x_advance { xsd:float }?
  }
PDFFigure = element pdfFigure { Box }
PDFRectangle =
  element pdfRectangle {
    Box,
    GraphicState,
    attribute debug_info { xsd:boolean }?,
    attribute fill_background { xsd:boolean }?,
    attribute xobjId { xsd:int }?,
    attribute lineWidth { xsd:float }?
  }

```

## /babeldoc/format/pdf/document_il/il_version_1.xsd

```xsd path="/babeldoc/format/pdf/document_il/il_version_1.xsd" 
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
  <xs:element name="document">
    <xs:complexType>
      <xs:sequence>
        <xs:element maxOccurs="unbounded" ref="page"/>
      </xs:sequence>
      <xs:attribute name="totalPages" use="required" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="page">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="mediabox"/>
        <xs:element ref="cropbox"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
        <xs:element ref="baseOperations"/>
      </xs:sequence>
      <xs:attribute name="pageNumber" use="required" type="xs:int"/>
      <xs:attribute name="Unit" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="mediabox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="cropbox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="baseOperations" type="xs:string"/>
  <xs:element name="box">
    <xs:complexType>
      <xs:attribute name="x" use="required" type="xs:float"/>
      <xs:attribute name="y" use="required" type="xs:float"/>
      <xs:attribute name="x2" use="required" type="xs:float"/>
      <xs:attribute name="y2" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:simpleType name="PDFXrefId">
    <xs:restriction base="xs:int"/>
  </xs:simpleType>
  <xs:element name="pdfFont">
    <xs:complexType>
      <xs:sequence>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFontCharBoundingBox"/>
      </xs:sequence>
      <xs:attribute name="name" use="required" type="xs:string"/>
      <xs:attribute name="fontId" use="required" type="xs:string"/>
      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
      <xs:attribute name="encodingLength" use="required" type="xs:int"/>
      <xs:attribute name="bold" type="xs:boolean"/>
      <xs:attribute name="italic" type="xs:boolean"/>
      <xs:attribute name="monospace" type="xs:boolean"/>
      <xs:attribute name="serif" type="xs:boolean"/>
      <xs:attribute name="ascent" type="xs:float"/>
      <xs:attribute name="descent" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFontCharBoundingBox">
    <xs:complexType>
      <xs:attribute name="x" use="required" type="xs:float"/>
      <xs:attribute name="y" use="required" type="xs:float"/>
      <xs:attribute name="x2" use="required" type="xs:float"/>
      <xs:attribute name="y2" use="required" type="xs:float"/>
      <xs:attribute name="char_id" use="required" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfXobject">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
        <xs:element ref="baseOperations"/>
      </xs:sequence>
      <xs:attribute name="xobjId" use="required" type="xs:int"/>
      <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfCharacter">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="pdfStyle"/>
        <xs:element ref="box"/>
        <xs:element minOccurs="0" ref="visual_bbox"/>
      </xs:sequence>
      <xs:attribute name="vertical" type="xs:boolean"/>
      <xs:attribute name="scale" type="xs:float"/>
      <xs:attribute name="pdfCharacterId" type="xs:int"/>
      <xs:attribute name="char_unicode" use="required" type="xs:string"/>
      <xs:attribute name="advance" type="xs:float"/>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="debug_info" type="xs:boolean"/>
      <xs:attribute name="formula_layout_id" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="visual_bbox">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pageLayout">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
      <xs:attribute name="id" use="required" type="xs:int"/>
      <xs:attribute name="conf" use="required" type="xs:float"/>
      <xs:attribute name="class_name" use="required" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="graphicState">
    <xs:complexType>
      <xs:attribute name="linewidth" type="xs:float"/>
      <xs:attribute name="dash">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list itemType="xs:float"/>
            </xs:simpleType>
            <xs:minLength value="1"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="flatness" type="xs:float"/>
      <xs:attribute name="intent" type="xs:string"/>
      <xs:attribute name="linecap" type="xs:int"/>
      <xs:attribute name="linejoin" type="xs:int"/>
      <xs:attribute name="miterlimit" type="xs:float"/>
      <xs:attribute name="ncolor">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list itemType="xs:float"/>
            </xs:simpleType>
            <xs:minLength value="1"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="scolor">
        <xs:simpleType>
          <xs:restriction>
            <xs:simpleType>
              <xs:list itemType="xs:float"/>
            </xs:simpleType>
            <xs:minLength value="1"/>
          </xs:restriction>
        </xs:simpleType>
      </xs:attribute>
      <xs:attribute name="strokingColorSpaceName" type="xs:string"/>
      <xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
      <xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfStyle">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="graphicState"/>
      </xs:sequence>
      <xs:attribute name="font_id" use="required" type="xs:string"/>
      <xs:attribute name="font_size" use="required" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfParagraph">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="pdfStyle"/>
        <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
      </xs:sequence>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="unicode" use="required" type="xs:string"/>
      <xs:attribute name="scale" type="xs:float"/>
      <xs:attribute name="optimal_scale" type="xs:float"/>
      <xs:attribute name="vertical" type="xs:boolean"/>
      <xs:attribute name="FirstLineIndent" type="xs:boolean"/>
      <xs:attribute name="debug_id" type="xs:string"/>
      <xs:attribute name="layout_label" type="xs:string"/>
      <xs:attribute name="layout_id" type="xs:int"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfParagraphComposition">
    <xs:complexType>
      <xs:choice>
        <xs:element ref="pdfLine"/>
        <xs:element ref="pdfFormula"/>
        <xs:element ref="pdfSameStyleCharacters"/>
        <xs:element ref="pdfCharacter"/>
        <xs:element ref="pdfSameStyleUnicodeCharacters"/>
      </xs:choice>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfLine">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfSameStyleCharacters">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="pdfStyle"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfSameStyleUnicodeCharacters">
    <xs:complexType>
      <xs:sequence>
        <xs:element minOccurs="0" ref="pdfStyle"/>
      </xs:sequence>
      <xs:attribute name="unicode" use="required" type="xs:string"/>
      <xs:attribute name="debug_info" type="xs:boolean"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFormula">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
      </xs:sequence>
      <xs:attribute name="x_offset" use="required" type="xs:float"/>
      <xs:attribute name="y_offset" use="required" type="xs:float"/>
      <xs:attribute name="x_advance" type="xs:float"/>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfFigure">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>
  <xs:element name="pdfRectangle">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="box"/>
        <xs:element ref="graphicState"/>
      </xs:sequence>
      <xs:attribute name="debug_info" type="xs:boolean"/>
      <xs:attribute name="fill_background" type="xs:boolean"/>
      <xs:attribute name="xobjId" type="xs:int"/>
      <xs:attribute name="lineWidth" type="xs:float"/>
    </xs:complexType>
  </xs:element>
</xs:schema>

```

## /babeldoc/format/pdf/document_il/midend/__init__.py

```py path="/babeldoc/format/pdf/document_il/midend/__init__.py" 

```

## /babeldoc/format/pdf/document_il/utils/__init__.py

```py path="/babeldoc/format/pdf/document_il/utils/__init__.py" 

```

## /babeldoc/format/pdf/document_il/utils/mupdf_helper.py

```py path="/babeldoc/format/pdf/document_il/utils/mupdf_helper.py" 
import pymupdf


def get_no_rotation_img(page: pymupdf.Page, dpi: int = 72) -> pymupdf.Pixmap:
    # return page.get_pixmap(dpi=72)
    original_rotation = page.rotation
    page.set_rotation(0)
    pix = page.get_pixmap(dpi=dpi)
    page.set_rotation(original_rotation)
    return pix

```

## /babeldoc/pdfminer/__init__.py

```py path="/babeldoc/pdfminer/__init__.py" 
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version

try:
    __version__ = version("pdfminer.six")
except PackageNotFoundError:
    # package is not installed, return default
    __version__ = "0.0"

if __name__ == "__main__":
    print(__version__)

```

## /babeldoc/pdfminer/py.typed

```typed path="/babeldoc/pdfminer/py.typed" 

```


The content has been capped at 50000 tokens. The user could consider applying other filters to refine the result. The better and more specific the context, the better the LLM can follow instructions. If the context seems verbose, the user can refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.