```
├── .devcontainer/
│   └── devcontainer.json
├── .dockerignore
├── .gitattributes
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── pre-commit.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
└── packages/
    ├── markitdown-mcp/
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── pyproject.toml
    │   ├── src/
    │   │   └── markitdown_mcp/
    │   │       ├── __about__.py
    │   │       ├── __init__.py
    │   │       ├── __main__.py
    │   │       └── py.typed
    │   └── tests/
    │       └── __init__.py
    ├── markitdown-sample-plugin/
    │   ├── README.md
    │   ├── pyproject.toml
    │   ├── src/
    │   │   └── markitdown_sample_plugin/
    │   │       ├── __about__.py
    │   │       ├── __init__.py
    │   │       ├── _plugin.py
    │   │       └── py.typed
    │   └── tests/
    │       ├── __init__.py
    │       ├── test_files/
    │       │   └── test.rtf
    │       └── test_sample_plugin.py
    └── markitdown/
        ├── README.md
        ├── ThirdPartyNotices.md
        ├── pyproject.toml
        └── src/
            └── markitdown/
                ├── __about__.py
                ├── __init__.py
                ├── __main__.py
                ├── _base_converter.py
                ├── _exceptions.py
                ├── _markitdown.py
                ├── _stream_info.py
                ├── _uri_utils.py
                ├── converter_utils/
                │   ├── __init__.py
                │   ├── docx/
                │   │   └── __init__.py
                │   └── math/
                │       ├── __init__.py
                │       ├── latex_dict.py
                │       ├── omml.py
                │       └── pre_process.py
                └── converters/
                    ├── __init__.py
                    ├── _audio_converter.py
                    ├── _bing_serp_converter.py
                    ├── _csv_converter.py
                    ├── _doc_intel_converter.py
                    ├── _docx_converter.py
                    ├── _epub_converter.py
                    ├── _exiftool.py
                    ├── _html_converter.py
                    ├── _image_converter.py
                    ├── _ipynb_converter.py
                    ├── _llm_caption.py
                    ├── _markdownify.py
                    ├── _outlook_msg_converter.py
                    ├── _pdf_converter.py
                    ├── _plain_text_converter.py
                    ├── _pptx_converter.py
                    └── _rss_converter.py
```

## /.devcontainer/devcontainer.json

```json path="/.devcontainer/devcontainer.json"
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
  "name": "Existing Dockerfile",
  "build": {
    // Sets the run context to one level up instead of the .devcontainer folder.
    "context": "..",
    // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
    "dockerfile": "../Dockerfile",
    "args": {
      "INSTALL_GIT": "true"
    }
  },

  // Features to add to the dev container. More info: https://containers.dev/features.
  // "features": {},
  "features": {
    "ghcr.io/devcontainers-extra/features/hatch:2": {}
  },

  // Use 'forwardPorts' to make a list of ports inside the container available locally.
  // "forwardPorts": [],

  // Uncomment the next line to run commands after the container is created.
  // "postCreateCommand": "cat /etc/os-release",

  // Configure tool-specific properties.
  // "customizations": {},

  // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
"remoteUser": "root" } ``` ## /.dockerignore ```dockerignore path="/.dockerignore" * !packages/ ``` ## /.gitattributes ```gitattributes path="/.gitattributes" packages/markitdown/tests/test_files/** linguist-vendored packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored ``` ## /.github/dependabot.yml ```yml path="/.github/dependabot.yml" version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" ``` ## /.github/workflows/pre-commit.yml ```yml path="/.github/workflows/pre-commit.yml" name: pre-commit on: [pull_request] jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.x" - name: Install pre-commit run: | pip install pre-commit pre-commit install --install-hooks - name: Run pre-commit run: pre-commit run --all-files ``` ## /.github/workflows/tests.yml ```yml path="/.github/workflows/tests.yml" name: tests on: [pull_request] jobs: tests: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: | 3.10 3.11 3.12 - name: Install Hatch run: pipx install hatch - name: Run tests run: cd packages/markitdown; hatch test ``` ## /.gitignore ```gitignore path="/.gitignore" .vscode # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/latest/usage/project/#working-with-version-control .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. 
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/.DS_Store
.DS_Store
```

## /.pre-commit-config.yaml

```yaml path="/.pre-commit-config.yaml"
repos:
  - repo: https://github.com/psf/black
    rev: 23.7.0 # Use the latest version of Black
    hooks:
      - id: black
```

## /CODE_OF_CONDUCT.md

# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

## /Dockerfile

``` path="/Dockerfile"
FROM python:3.13-slim-bullseye

ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ENV FFMPEG_PATH=/usr/bin/ffmpeg

# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    exiftool

ARG INSTALL_GIT=false
RUN if [ "$INSTALL_GIT" = "true" ]; then \
    apt-get install -y --no-install-recommends \
    git; \
    fi

# Cleanup
RUN rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . /app

RUN pip --no-cache-dir install \
    /app/packages/markitdown[all] \
    /app/packages/markitdown-sample-plugin

# Default USERID and GROUPID
ARG USERID=nobody
ARG GROUPID=nogroup

USER $USERID:$GROUPID

ENTRYPOINT [ "markitdown" ]
```

## /LICENSE

``` path="/LICENSE"
MIT License

Copyright (c) Microsoft Corporation.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.
```

## /README.md

# MarkItDown

[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown) [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

> [!TIP]
> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.

> [!IMPORTANT]
> Breaking changes between 0.0.1 and 0.1.0:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to keep backward-compatible behavior.
> * `convert_stream()` now requires a binary file-like object (e.g., a file opened in binary mode, or an `io.BytesIO` object). This is a breaking change from the previous version, which also accepted text file-like objects such as `io.StringIO` (see the sketch below).
> * The `DocumentConverter` class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore.* If you are the maintainer of a plugin or a custom `DocumentConverter`, you likely need to update your code. Otherwise, if you are only using the `MarkItDown` class or the CLI (as in these examples), you should not need to change anything.
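To illustrate the stream-related change, here is a minimal sketch of the post-0.1.0 stream API (the file name is just a placeholder; `convert_stream()` accepts any binary file-like object):

```python
import io

from markitdown import MarkItDown

md = MarkItDown(enable_plugins=False)

# Streams must now be binary; e.g., open files in "rb" mode...
with open("test.pdf", "rb") as fh:
    print(md.convert_stream(fh).text_content)

# ...or wrap raw bytes in io.BytesIO. Text wrappers such as
# io.StringIO are no longer accepted.
with open("test.pdf", "rb") as fh:
    buffer = io.BytesIO(fh.read())
print(md.convert_stream(buffer).text_content)
```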
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.

At present, MarkItDown supports:

- PDF
- PowerPoint
- Word
- Excel
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
- HTML
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- YouTube URLs
- EPUBs
- ... and more!

## Why Markdown?

Markdown is extremely close to plain text, with minimal markup or formatting, but still provides a way to represent important document structure. Mainstream LLMs, such as OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their responses unprompted. This suggests that they have been trained on vast amounts of Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions are also highly token-efficient.

## Installation

To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from source:

```bash
git clone git@github.com:microsoft/markitdown.git
cd markitdown
pip install -e 'packages/markitdown[all]'
```

## Usage

### Command-Line

```bash
markitdown path-to-file.pdf > document.md
```

Or use `-o` to specify the output file:

```bash
markitdown path-to-file.pdf -o document.md
```

You can also pipe content:

```bash
cat path-to-file.pdf | markitdown
```

### Optional Dependencies

MarkItDown has optional dependencies for enabling support for various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:

```bash
pip install 'markitdown[pdf, docx, pptx]'
```

will install only the dependencies for PDF, DOCX, and PPTX files.

At the moment, the following optional dependencies are available:

* `[all]` Installs all optional dependencies
* `[pptx]` Installs dependencies for PowerPoint files
* `[docx]` Installs dependencies for Word files
* `[xlsx]` Installs dependencies for Excel files
* `[xls]` Installs dependencies for older Excel files
* `[pdf]` Installs dependencies for PDF files
* `[outlook]` Installs dependencies for Outlook messages
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription

### Plugins

MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:

```bash
markitdown --list-plugins
```

To enable plugins use:

```bash
markitdown --use-plugins path-to-file.pdf
```

To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`.

### Azure Document Intelligence

To use Microsoft Document Intelligence for conversion:

```bash
markitdown path-to-file.pdf -o document.md -d -e ""
```

More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).

### Python API

Basic usage in Python:

```python
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
result = md.convert("test.xlsx")
print(result.text_content)
```

Document Intelligence conversion in Python:

```python
from markitdown import MarkItDown

md = MarkItDown(docintel_endpoint="")
result = md.convert("test.pdf")
print(result.text_content)
```

To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:

```python
from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI()
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
```

### Docker

```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

### How to Contribute

You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions, and you are welcome to contribute in any way you like.
|            | All                                                           | Especially Needs Help from Community                                                                                                        |
| ---------- | ------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22)   |
| **PRs**    | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)                |
### Running Tests and Checks

- Navigate to the MarkItDown package:

  ```sh
  cd packages/markitdown
  ```

- Install `hatch` in your environment and run tests:

  ```sh
  pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
  hatch shell
  hatch test
  ```

  (Alternative) Use the Devcontainer, which has all the dependencies installed:

  ```sh
  # Reopen the project in Devcontainer and run:
  hatch test
  ```

- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`

### Contributing 3rd-party Plugins

You can also contribute by creating and sharing 3rd party plugins. See `packages/markitdown-sample-plugin` for more details.

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos is subject to those third parties' policies.

## /SECURITY.md

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.**

Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).

If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).

## /SUPPORT.md

# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.

*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*

# Support

## How to file issues and get help

This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue.

For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER CHANNEL. WHERE WILL YOU HELP PEOPLE?**.

## Microsoft Support Policy

Support for this **PROJECT or PRODUCT** is limited to the resources listed above.

## /packages/markitdown-mcp/Dockerfile

``` path="/packages/markitdown-mcp/Dockerfile"
FROM python:3.13-slim-bullseye

ENV DEBIAN_FRONTEND=noninteractive
ENV EXIFTOOL_PATH=/usr/bin/exiftool
ENV FFMPEG_PATH=/usr/bin/ffmpeg

# Runtime dependency
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    exiftool

# Cleanup
RUN rm -rf /var/lib/apt/lists/*

COPY . /app
RUN pip --no-cache-dir install /app

WORKDIR /workdir

# Default USERID and GROUPID
ARG USERID=nobody
ARG GROUPID=nogroup

USER $USERID:$GROUPID

ENTRYPOINT [ "markitdown-mcp" ]
```

## /packages/markitdown-mcp/README.md

# MarkItDown-MCP

[![PyPI](https://img.shields.io/pypi/v/markitdown-mcp.svg)](https://pypi.org/project/markitdown-mcp/) ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-mcp) [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

The `markitdown-mcp` package provides a lightweight STDIO and SSE MCP server for calling MarkItDown.

It exposes one tool: `convert_to_markdown(uri)`, where `uri` can be any `http:`, `https:`, `file:`, or `data:` URI.

## Installation

To install the package, use pip:

```bash
pip install markitdown-mcp
```

## Usage

To run the MCP server using STDIO (the default), use the following command:

```bash
markitdown-mcp
```

To run the MCP server using SSE, use the following command:

```bash
markitdown-mcp --sse --host 127.0.0.1 --port 3001
```
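Once the server is running, any MCP client can invoke the tool. As a rough sketch, here is a STDIO client using the official `mcp` Python SDK (the client calls shown are the SDK's standard API, not part of `markitdown-mcp` itself, and the URI is just an example):

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Launch markitdown-mcp as a subprocess and talk to it over STDIO.
    params = StdioServerParameters(command="markitdown-mcp")
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Call the single exposed tool with an example URI.
            result = await session.call_tool(
                "convert_to_markdown", {"uri": "https://example.com/"}
            )
            print(result.content)


asyncio.run(main())
```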
## Running in Docker

To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile:

```bash
docker build -t markitdown-mcp:latest .
```

And run it using:

```bash
docker run -it --rm markitdown-mcp:latest
```

This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run:

```bash
docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest
```

Once mounted, all files under `/home/user/data` will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`.

## Accessing from Claude Desktop

It is recommended to use the Docker image when running the MCP server for Claude Desktop.

Follow [these instructions](https://modelcontextprotocol.io/quickstart/user#for-claude-desktop-users) to access Claude's `claude_desktop_config.json` file.

Edit it to include the following JSON entry:

```json
{
  "mcpServers": {
    "markitdown": {
      "command": "docker",
      "args": ["run", "--rm", "-i", "markitdown-mcp:latest"]
    }
  }
}
```

If you want to mount a directory, adjust it accordingly:

```json
{
  "mcpServers": {
    "markitdown": {
      "command": "docker",
      "args": [
        "run",
        "--rm",
        "-i",
        "-v",
        "/home/user/data:/workdir",
        "markitdown-mcp:latest"
      ]
    }
  }
}
```

## Debugging

To debug the MCP server you can use the `mcpinspector` tool.

```bash
npx @modelcontextprotocol/inspector
```

You can then connect to the inspector through the specified host and port (e.g., `http://localhost:5173/`).

If using STDIO:
* select `STDIO` as the transport type,
* input `markitdown-mcp` as the command, and
* click `Connect`

If using SSE:
* select `SSE` as the transport type,
* input `http://127.0.0.1:3001/sse` as the URL, and
* click `Connect`

Finally:
* click the `Tools` tab,
* click `List Tools`,
* click `convert_to_markdown`, and
* run the tool on any valid URI.

## Security Considerations

The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE mode, it is recommended to run the server bound to `localhost` (the default).

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos is subject to those third parties' policies.

## /packages/markitdown-mcp/pyproject.toml

```toml path="/packages/markitdown-mcp/pyproject.toml"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "markitdown-mcp"
dynamic = ["version"]
description = 'An MCP server for the "markitdown" library.'
readme = "README.md" requires-python = ">=3.10" license = "MIT" keywords = [] authors = [ { name = "Adam Fourney", email = "adamfo@microsoft.com" }, ] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ "mcp~=1.5.0", "markitdown[all]>=0.1.1,<0.2.0", ] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" Issues = "https://github.com/microsoft/markitdown/issues" Source = "https://github.com/microsoft/markitdown" [tool.hatch.version] path = "src/markitdown_mcp/__about__.py" [project.scripts] markitdown-mcp = "markitdown_mcp.__main__:main" [tool.hatch.envs.types] extra-dependencies = [ "mypy>=1.0.0", ] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}" [tool.coverage.run] source_pkgs = ["markitdown-mcp", "tests"] branch = true parallel = true omit = [ "src/markitdown_mcp/__about__.py", ] [tool.coverage.paths] markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"] tests = ["tests", "*/markitdown-mcp/tests"] [tool.coverage.report] exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] [tool.hatch.build.targets.sdist] only-include = ["src/markitdown_mcp"] ``` ## /packages/markitdown-mcp/src/markitdown_mcp/__about__.py ```py path="/packages/markitdown-mcp/src/markitdown_mcp/__about__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT __version__ = "0.0.1a3" ``` ## /packages/markitdown-mcp/src/markitdown_mcp/__init__.py ```py path="/packages/markitdown-mcp/src/markitdown_mcp/__init__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT from .__about__ import __version__ __all__ = [ "__version__", ] ``` ## /packages/markitdown-mcp/src/markitdown_mcp/__main__.py ```py path="/packages/markitdown-mcp/src/markitdown_mcp/__main__.py" import sys from typing import Any from mcp.server.fastmcp import FastMCP from starlette.applications import Starlette from mcp.server.sse import SseServerTransport from starlette.requests import Request from starlette.routing import Mount, Route from mcp.server import Server from markitdown import MarkItDown import uvicorn # Initialize FastMCP server for MarkItDown (SSE) mcp = FastMCP("markitdown") @mcp.tool() async def convert_to_markdown(uri: str) -> str: """Convert a resource described by an http:, https:, file: or data: URI to markdown""" return MarkItDown().convert_uri(uri).markdown def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette: sse = SseServerTransport("/messages/") async def handle_sse(request: Request) -> None: async with sse.connect_sse( request.scope, request.receive, request._send, ) as (read_stream, write_stream): await mcp_server.run( read_stream, write_stream, mcp_server.create_initialization_options(), ) return Starlette( debug=debug, routes=[ Route("/sse", endpoint=handle_sse), Mount("/messages/", app=sse.handle_post_message), ], ) # Main entry point def main(): import argparse mcp_server = mcp._mcp_server parser = argparse.ArgumentParser(description="Run MCP SSE-based MarkItDown server") parser.add_argument( "--sse", action="store_true", help="Run the server with SSE transport 
    parser.add_argument(
        "--host", default=None, help="Host to bind to (default: 127.0.0.1)"
    )
    parser.add_argument(
        "--port", type=int, default=None, help="Port to listen on (default: 3001)"
    )
    args = parser.parse_args()

    if not args.sse and (args.host or args.port):
        parser.error("Host and port arguments are only valid when using SSE transport.")
        sys.exit(1)

    if args.sse:
        starlette_app = create_starlette_app(mcp_server, debug=True)
        uvicorn.run(
            starlette_app,
            host=args.host if args.host else "127.0.0.1",
            port=args.port if args.port else 3001,
        )
    else:
        mcp.run()


if __name__ == "__main__":
    main()
```

## /packages/markitdown-mcp/src/markitdown_mcp/py.typed

```typed path="/packages/markitdown-mcp/src/markitdown_mcp/py.typed"
```

## /packages/markitdown-mcp/tests/__init__.py

```py path="/packages/markitdown-mcp/tests/__init__.py"
# SPDX-FileCopyrightText: 2024-present Adam Fourney
#
# SPDX-License-Identifier: MIT
```

## /packages/markitdown-sample-plugin/README.md

# MarkItDown Sample Plugin

[![PyPI](https://img.shields.io/pypi/v/markitdown-sample-plugin.svg)](https://pypi.org/project/markitdown-sample-plugin/) ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-sample-plugin) [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows.

First, implement your custom DocumentConverter:

```python
from typing import BinaryIO, Any
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo


class RtfConverter(DocumentConverter):
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        super().__init__(priority=priority)

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        # Implement logic to check if the file stream is an RTF file
        # ...
        raise NotImplementedError()

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        # Implement logic to convert the file stream to Markdown
        # ...
        raise NotImplementedError()
```

Next, make sure your package implements and exports the following:

```python
# The version of the plugin interface that this plugin uses.
# The only supported version is 1 for now.
__plugin_interface_version__ = 1


# The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Called during construction of MarkItDown instances to register converters provided by plugins.
    """
    # Simply create and attach an RtfConverter instance
    markitdown.register_converter(RtfConverter())
```

Finally, create an entrypoint in the `pyproject.toml` file:

```toml
[project.entry-points."markitdown.plugin"]
sample_plugin = "markitdown_sample_plugin"
```

Here, `sample_plugin` can be any key, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
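As background, this entry-point mechanism is standard Python packaging. The following is an illustrative sketch of how a host can discover plugins registered under the `markitdown.plugin` group (it mirrors, but is not copied from, MarkItDown's own plugin loading in `_markitdown.py`):

```python
from importlib.metadata import entry_points

# Find every installed package that declared an entry point in the
# "markitdown.plugin" group, import it, and inspect it.
for entry_point in entry_points(group="markitdown.plugin"):
    plugin = entry_point.load()  # imports e.g. markitdown_sample_plugin
    print(entry_point.name, "->", plugin.__plugin_interface_version__)
    # A host like MarkItDown would then call:
    # plugin.register_converters(markitdown_instance)
```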
## Installation

To use the plugin with MarkItDown, it must be installed. To install the plugin from the current directory use:

```bash
pip install -e .
```

Once the plugin package is installed, verify that it is available to MarkItDown by running:

```bash
markitdown --list-plugins
```

To use the plugin for a conversion, use the `--use-plugins` flag. For example, to convert an RTF file:

```bash
markitdown --use-plugins path-to-file.rtf
```

In Python, plugins can be enabled as follows:

```python
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=True)
result = md.convert("path-to-file.rtf")
print(result.text_content)
```

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos is subject to those third parties' policies.

## /packages/markitdown-sample-plugin/pyproject.toml

```toml path="/packages/markitdown-sample-plugin/pyproject.toml"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "markitdown-sample-plugin"
dynamic = ["version"]
description = 'A sample plugin for the "markitdown" library.'
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
keywords = []
authors = [
  { name = "Adam Fourney", email = "adamfo@microsoft.com" },
]
classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
  "markitdown>=0.1.0a1",
  "striprtf",
]

[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"
Source = "https://github.com/microsoft/markitdown"

[tool.hatch.version]
path = "src/markitdown_sample_plugin/__about__.py"

# IMPORTANT: MarkItDown will look for this entry point to find the plugin.
[project.entry-points."markitdown.plugin"]
sample_plugin = "markitdown_sample_plugin"

[tool.hatch.envs.types]
extra-dependencies = [
  "mypy>=1.0.0",
]

[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown_sample_plugin tests}"

[tool.coverage.run]
source_pkgs = ["markitdown-sample-plugin", "tests"]
branch = true
parallel = true
omit = [
  "src/markitdown_sample_plugin/__about__.py",
]

[tool.coverage.paths]
markitdown-sample-plugin = ["src/markitdown_sample_plugin", "*/markitdown-sample-plugin/src/markitdown_sample_plugin"]
tests = ["tests", "*/markitdown-sample-plugin/tests"]

[tool.coverage.report]
exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
]

[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown_sample_plugin"]
```

## /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py

```py path="/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py"
# SPDX-FileCopyrightText: 2024-present Adam Fourney
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a1"
```

## /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__init__.py

```py path="/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__init__.py"
# SPDX-FileCopyrightText: 2024-present Adam Fourney
#
# SPDX-License-Identifier: MIT
from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
from .__about__ import __version__

__all__ = [
    "__version__",
    "__plugin_interface_version__",
    "register_converters",
    "RtfConverter",
]
```

## /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py

```py path="/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py"
import locale
from typing import BinaryIO, Any

from striprtf.striprtf import rtf_to_text

from markitdown import (
    MarkItDown,
    DocumentConverter,
    DocumentConverterResult,
    StreamInfo,
)

__plugin_interface_version__ = (
    1  # The version of the plugin interface that this plugin uses
)

ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/rtf",
    "application/rtf",
]

ACCEPTED_FILE_EXTENSIONS = [".rtf"]


def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Called during construction of MarkItDown instances to register converters provided by plugins.
    """
    # Simply create and attach an RtfConverter instance
    markitdown.register_converter(RtfConverter())


class RtfConverter(DocumentConverter):
    """
    Converts an RTF file to Markdown in the simplest possible way.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        # Read the file stream into a str using the provided charset encoding,
        # or fall back to the system default
        encoding = stream_info.charset or locale.getpreferredencoding()
        stream_data = file_stream.read().decode(encoding)

        # Return the result
        return DocumentConverterResult(
            title=None,
            markdown=rtf_to_text(stream_data),
        )
```
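The package's `tests/test_sample_plugin.py` is not reproduced in this dump. A minimal sketch of what such a test might look like, assuming it is run from the package root so that the `test.rtf` fixture below resolves:

```python
import io

from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter


def test_rtf_converter_accepts() -> None:
    # The converter should accept streams flagged as RTF by extension or mimetype.
    converter = RtfConverter()
    assert converter.accepts(io.BytesIO(b""), StreamInfo(extension=".rtf"))
    assert converter.accepts(io.BytesIO(b""), StreamInfo(mimetype="application/rtf"))


def test_rtf_conversion() -> None:
    # With plugins enabled, MarkItDown should route .rtf files to RtfConverter.
    md = MarkItDown(enable_plugins=True)
    result = md.convert("tests/test_files/test.rtf")
    assert "Sample RTF File" in result.text_content
```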
""" def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, ) -> DocumentConverterResult: # Read the file stream into an str using hte provided charset encoding, or using the system default encoding = stream_info.charset or locale.getpreferredencoding() stream_data = file_stream.read().decode(encoding) # Return the result return DocumentConverterResult( title=None, markdown=rtf_to_text(stream_data), ) ``` ## /packages/markitdown-sample-plugin/src/markitdown_sample_plugin/py.typed ```typed path="/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/py.typed" ``` ## /packages/markitdown-sample-plugin/tests/__init__.py ```py path="/packages/markitdown-sample-plugin/tests/__init__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT ``` ## /packages/markitdown-sample-plugin/tests/test_files/test.rtf ```rtf path="/packages/markitdown-sample-plugin/tests/test_files/test.rtf" {\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff31507\deff0\stshfdbch31506\stshfloch31506\stshfhich31506\stshfbi31507\deflang1033\deflangfe1033\themelang1033\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;} {\f42\fbidi \fswiss\fcharset0\fprq2 Aptos Display;}{\f43\fbidi \fswiss\fcharset0\fprq2 Aptos;}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;} {\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \fswiss\fcharset0\fprq2 Aptos Display;}{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;} {\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2 Aptos;} {\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f51\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f52\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} {\f54\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f55\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f56\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f57\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} {\f58\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f59\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f391\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f392\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;} {\f394\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f395\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f398\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f399\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);} {\f471\fbidi \fswiss\fcharset238\fprq2 Aptos Display CE;}{\f472\fbidi \fswiss\fcharset204\fprq2 Aptos Display Cyr;}{\f474\fbidi \fswiss\fcharset161\fprq2 Aptos Display Greek;}{\f475\fbidi \fswiss\fcharset162\fprq2 Aptos Display Tur;} {\f478\fbidi 
\fswiss\fcharset186\fprq2 Aptos Display Baltic;}{\f479\fbidi \fswiss\fcharset163\fprq2 Aptos Display (Vietnamese);}{\f481\fbidi \fswiss\fcharset238\fprq2 Aptos CE;}{\f482\fbidi \fswiss\fcharset204\fprq2 Aptos Cyr;} {\f484\fbidi \fswiss\fcharset161\fprq2 Aptos Greek;}{\f485\fbidi \fswiss\fcharset162\fprq2 Aptos Tur;}{\f488\fbidi \fswiss\fcharset186\fprq2 Aptos Baltic;}{\f489\fbidi \fswiss\fcharset163\fprq2 Aptos (Vietnamese);} {\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;} {\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} {\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;} {\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;} {\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;} {\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \fswiss\fcharset238\fprq2 Aptos Display CE;}{\fhimajor\f31529\fbidi \fswiss\fcharset204\fprq2 Aptos Display Cyr;} {\fhimajor\f31531\fbidi \fswiss\fcharset161\fprq2 Aptos Display Greek;}{\fhimajor\f31532\fbidi \fswiss\fcharset162\fprq2 Aptos Display Tur;}{\fhimajor\f31535\fbidi \fswiss\fcharset186\fprq2 Aptos Display Baltic;} {\fhimajor\f31536\fbidi \fswiss\fcharset163\fprq2 Aptos Display (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} {\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);} {\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);} {\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;} {\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);} {\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;} {\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;} {\fdbminor\f31563\fbidi 
\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;} {\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Aptos CE;}{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Aptos Cyr;} {\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Aptos Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Aptos Tur;}{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Aptos Baltic;} {\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Aptos (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;} {\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);} {\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}} {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0; \red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;\red0\green0\blue0;\red0\green0\blue0;\caccentone\ctint255\cshade191\red15\green71\blue97; \ctextone\ctint166\cshade255\red89\green89\blue89;\ctextone\ctint216\cshade255\red39\green39\blue39;\ctextone\ctint191\cshade255\red64\green64\blue64;}{\*\defchp \f31506\fs24\kerning2 }{\*\defpap \ql \li0\ri0\sa160\sl278\slmult1 \widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\sb360\sa80\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs40\alang1025 \ltrch\fcs0 \fs40\cf19\lang1033\langfe1033\kerning2\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat \spriority9 \styrsid15678446 heading 1;}{\s2\ql \li0\ri0\sb160\sa80\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel1\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs32\alang1025 \ltrch\fcs0 \fs32\cf19\lang1033\langfe1033\kerning2\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink16 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 2;}{\s3\ql \li0\ri0\sb160\sa80\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel2\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs28\alang1025 \ltrch\fcs0 \fs28\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink17 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 3;}{\s4\ql \li0\ri0\sb80\sa40\sl278\slmult1 
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel3\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0 \i\fs24\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink18 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 4;}{\s5\ql \li0\ri0\sb80\sa40\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel4\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0 \fs24\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink19 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 5;}{\s6\ql \li0\ri0\sb40\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel5\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0 \i\fs24\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink20 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 6;}{\s7\ql \li0\ri0\sb40\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel6\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0 \fs24\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink21 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 7;}{\s8\ql \li0\ri0\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel7\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0 \i\fs24\cf21\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink22 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 8;}{\s9\ql \li0\ri0\sl278\slmult1 \keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel8\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0 \fs24\cf21\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink23 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 9;}{\*\cs10 \additive \ssemihidden \sunhideused \spriority1 Default Paragraph Font;}{\* \ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv \ql \li0\ri0\sa160\sl278\slmult1 \widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \snext11 \ssemihidden \sunhideused Normal Table;}{\*\cs15 \additive \rtlch\fcs1 \af31503\afs40 \ltrch\fcs0 \fs40\cf19\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink1 \spriority9 \styrsid15678446 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \af31503\afs32 \ltrch\fcs0 \fs32\cf19\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink2 \ssemihidden \spriority9 \styrsid15678446 Heading 2 Char;}{\*\cs17 \additive \rtlch\fcs1 \af31503\afs28 \ltrch\fcs0 \fs28\cf19\dbch\af31501 \sbasedon10 \slink3 \ssemihidden \spriority9 \styrsid15678446 Heading 3 Char;}{\*\cs18 \additive \rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf19\dbch\af31501 \sbasedon10 \slink4 \ssemihidden \spriority9 
\styrsid15678446 Heading 4 Char;}{\*\cs19 \additive \rtlch\fcs1 \af31503 \ltrch\fcs0 \cf19\dbch\af31501 \sbasedon10 \slink5 \ssemihidden \spriority9 \styrsid15678446 Heading 5 Char;}{\*\cs20 \additive \rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf20\dbch\af31501 \sbasedon10 \slink6 \ssemihidden \spriority9 \styrsid15678446 Heading 6 Char;}{\*\cs21 \additive \rtlch\fcs1 \af31503 \ltrch\fcs0 \cf20\dbch\af31501 \sbasedon10 \slink7 \ssemihidden \spriority9 \styrsid15678446 Heading 7 Char;}{\*\cs22 \additive \rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf21\dbch\af31501 \sbasedon10 \slink8 \ssemihidden \spriority9 \styrsid15678446 Heading 8 Char;}{\*\cs23 \additive \rtlch\fcs1 \af31503 \ltrch\fcs0 \cf21\dbch\af31501 \sbasedon10 \slink9 \ssemihidden \spriority9 \styrsid15678446 Heading 9 Char;}{\s24\ql \li0\ri0\sa80\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\contextualspace \rtlch\fcs1 \af31503\afs56\alang1025 \ltrch\fcs0 \fs56\expnd-2\expndtw-10\lang1033\langfe1033\kerning28\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink25 \sqformat \spriority10 \styrsid15678446 Title;}{\*\cs25 \additive \rtlch\fcs1 \af31503\afs56 \ltrch\fcs0 \fs56\expnd-2\expndtw-10\kerning28\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink24 \spriority10 \styrsid15678446 Title Char;}{\s26\ql \li0\ri0\sa160\sl278\slmult1 \widctlpar\wrapdefault\aspalpha\aspnum\faauto\ilvl1\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs28\alang1025 \ltrch\fcs0 \fs28\expnd3\expndtw15\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink27 \sqformat \spriority11 \styrsid15678446 Subtitle;}{\*\cs27 \additive \rtlch\fcs1 \af31503\afs28 \ltrch\fcs0 \fs28\expnd3\expndtw15\cf20\dbch\af31501 \sbasedon10 \slink26 \spriority11 \styrsid15678446 Subtitle Char;}{ \s28\qc \li0\ri0\sb160\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31507\afs24\alang1025 \ltrch\fcs0 \i\f31506\fs24\cf22\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink29 \sqformat \spriority29 \styrsid15678446 Quote;}{\*\cs29 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf22 \sbasedon10 \slink28 \spriority29 \styrsid15678446 Quote Char;}{\s30\ql \li720\ri0\sa160\sl278\slmult1 \widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin720\itap0\contextualspace \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext30 \sqformat \spriority34 \styrsid15678446 List Paragraph;}{\*\cs31 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf19 \sbasedon10 \sqformat \spriority21 \styrsid15678446 Intense Emphasis;}{\s32\qc \li864\ri864\sb360\sa360\sl278\slmult1 \widctlpar\brdrt\brdrs\brdrw10\brsp200\brdrcf19 \brdrb\brdrs\brdrw10\brsp200\brdrcf19 \wrapdefault\aspalpha\aspnum\faauto\adjustright\rin864\lin864\itap0 \rtlch\fcs1 \ai\af31507\afs24\alang1025 \ltrch\fcs0 \i\f31506\fs24\cf19\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink33 \sqformat \spriority30 \styrsid15678446 Intense Quote;}{\*\cs33 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf19 \sbasedon10 \slink32 \spriority30 \styrsid15678446 Intense Quote Char;}{\*\cs34 \additive \rtlch\fcs1 \ab\af0 \ltrch\fcs0 \b\scaps\expnd1\expndtw5\cf19 \sbasedon10 \sqformat \spriority32 \styrsid15678446 Intense Reference;}}{\*\rsidtbl \rsid3543682 
\rsid6316520\rsid7364952\rsid8278432\rsid9589131\rsid10298217\rsid15678446\rsid15953651}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\author Adam Fourney} {\operator Adam Fourney}{\creatim\yr2025\mo2\dy9\hr22\min56}{\revtim\yr2025\mo2\dy9\hr22\min58}{\version1}{\edmins2}{\nofpages1}{\nofwords17}{\nofchars98}{\nofcharsws114}{\vern115}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/office/word/2003/wordm l}}\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0\ltrsect \widowctrl\ftnbj\aenddoc\trackmoves0\trackformatting1\donotembedsysfont1\relyonvml0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen \expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1440\dgvorigin1440\dghshow1\dgvshow1 \jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct \asianbrkrule\rsidroot15678446\newtblstyruls\nogrowautofit\usenormstyforlist\noindnmbrts\felnbrelev\nocxsptable\indrlsweleven\noafcnsttbl\afelev\utinl\hwelev\spltpgpar\notcvasp\notbrkcnstfrctbl\notvatxbx\krnprsnet\cachedcolbal \nouicompat \fet0 {\*\wgrffmtfilter 2450}\nofeaturethrottle1\ilfomacatclnup0\ltrpar \sectd \ltrsect\linex0\endnhere\sectlinegrid360\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}} \pard\plain \ltrpar\s24\ql \li0\ri0\sa80\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid15678446\contextualspace \rtlch\fcs1 \af31503\afs56\alang1025 \ltrch\fcs0 \fs56\expnd-2\expndtw-10\lang1033\langfe1033\kerning28\loch\af31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af31503 \ltrch\fcs0 \insrsid15678446 \hich\af31502\dbch\af31501\loch\f31502 This is a \hich\af31502\dbch\af31501\loch\f31502 S\hich\af31502\dbch\af31501\loch\f31502 ample RT\hich\af31502\dbch\af31501\loch\f31502 F \hich\af31502\dbch\af31501\loch\f31502 File}{\rtlch\fcs1 \af31503 \ltrch\fcs0 \insrsid8278432 \par }\pard\plain \ltrpar\ql \li0\ri0\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 { \rtlch\fcs1 \af31507 \ltrch\fcs0 \insrsid15678446 \par It is included to test if the MarkItDown sample plugin can correctly convert RTF files. 
\par }{\*\themedata 504b030414000600080000002100e9de0fbfff0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb4ec3301045f748fc83e52d4a 9cb2400825e982c78ec7a27cc0c8992416c9d8b2a755fbf74cd25442a820166c2cd933f79e3be372bd1f07b5c3989ca74aaff2422b24eb1b475da5df374fd9ad 5689811a183c61a50f98f4babebc2837878049899a52a57be670674cb23d8e90721f90a4d2fa3802cb35762680fd800ecd7551dc18eb899138e3c943d7e503b6 b01d583deee5f99824e290b4ba3f364eac4a430883b3c092d4eca8f946c916422ecab927f52ea42b89a1cd59c254f919b0e85e6535d135a8de20f20b8c12c3b0 0c895fcf6720192de6bf3b9e89ecdbd6596cbcdd8eb28e7c365ecc4ec1ff1460f53fe813d3cc7f5b7f020000ffff0300504b030414000600080000002100a5d6 a7e7c0000000360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4f c7060abb0884a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b6309512 0f88d94fbc52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462 a1a82fe353bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f746865 6d652f7468656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b 4b0d592c9c070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b 4757e8d3f729e245eb2b260a0238fd010000ffff0300504b030414000600080000002100d3d1e707f007000012220000160000007468656d652f7468656d652f 7468656d65312e786d6cec5a4b8fdbc811be07c87f20789745ea414903cb0b3d3d6bcfd8034b76b0c796d812dbd36413ecd6cc080b0381f794cb020b6c825c02 e496431064812c90452ef931066c249b1f91ea2645754b2dcf030662043373215b5f557f5d555d556cf2e1175731752e70c6094bbaaeffc0731d9ccc59489265 d77d391d57daaec3054a42445982bbee1a73f78b47bffcc5437424221c6307e4137e84ba6e24447a54adf2390c23fe80a53881df162c8b9180db6c590d337409 7a635aad795e508d11495c274131a87dbe58903976a652a5fb68a37c44e136115c0ecc693691aab121a1b0e1b92f117ccd0734732e10edba304fc82ea7f84ab8 0e455cc00f5dd7537f6ef5d1c32a3a2a84a83820abc98dd55f21570884e7353567b69c95937aa35abbe197fa15808a7ddca82dff4b7d0a80e6735869ce45d7e9 3703af5d2bb01a28bfb4e8eeb4fcba89d7f4d7f738fb9da05f6b18fa1528d7dfd8c37be3ce68d834f00a94e39b7bf89e57eb77ea065e81727cb0876f8c7aadda c8c02b50444972be8f0e5aed7650a04bc882d1632bbc13045e6b58c0b728888632bae4140b968843b116a3d72c1b03400229122471c43ac50b348728eea58271 6748784ad1da755294300ec35ecdf721f41a5eadfc571647471869d2921730e17b43928fc3e7194945d77d025a5d0df2fea79fdebdfdf1dddbbfbffbe69b776f ffea9c906524725586dc314a96badccf7ffaee3f7ff8b5f3efbffdf1e7ef7f6bc7731dffe12fbff9f08f7f7e4c3d6cb5ad29deffee870f3ffef0fef7dffeebcf df5bb4f73234d3e1531263ee3cc397ce0b16c30295294cfe7896dd4e621a21a24bf49225470992b358f48f4464a09fad1145165c1f9b767c9541aab1011faf5e 1b842751b612c4a2f169141bc053c6689f65562b3c957369669eae92a57df26ca5e35e2074619b7b8012c3cba3550a3996d8540e226cd03ca328116889132c1c f91b3bc7d8b2baaf0831ec7a4ae619e36c219caf88d347c46a92299919d1b4153a2631f8656d2308fe366c73facae9336a5bf5105f9848d81b885ac84f3135cc f818ad048a6d2aa728a6bac14f90886c2427eb6caee3465c80a7979832671462ce6d32cf3358afe6f4a708b29bd5eda7741d9bc84c90739bce13c4988e1cb2f3 4184e2d4869d9024d2b15ff2730851e49c3161839f327387c87bf0034a0ebafb15c186bbafcf062f21cbe994b601227f5965165f3ec6cc88dfc99a2e10b6a59a 5e161b29b697116b74f4574b23b44f30a6e81285183b2fbfb430e8b3d4b0f996f49308b2ca31b605d61364c6aabc4f30875e493637fb79f2847023642778c90e 
f0395def249e354a62941dd2fc0cbcaedb7c34cb60335a283ca7f3731df88c400f08f16235ca730e3ab4e03ea8f52c42460193f7dc1eafebccf0df4df618eccb d7068d1bec4b90c1b79681c4aecb7cd43653448d09b6013345c439b1a55b1031dcbf1591c55589adac720b73d36edd00dd91d1f4c424b9a603fadf743e9640fc 343d8f5db191b06ed9ed1c4a28c73b3dce21dc6e67336059483effc6668856c919865ab29fb5eefb9afbbec6fdbfef6b0eede7fb6ee650cf71dfcdb8d065dc77 33c501cba7e966b60d0cf436f290213fec51473ff1c1939f05a17422d6149f7075f8c3e199261cc3a09453a79eb83c094c23b894650e263070cb0c29192763e2 5744449308a57042e4bb52c99217aa97dc4919878323356cd52df174159fb2303ff054274c5e5e593912db71af09474ff9381c56891c1db48a41c94f9daa025f c576a90e5b3704a4ec6d4868939924ea1612adcde03524e4d9d9a761d1b1b0684bf51b57ed9902a8955e81876e071ed5bb6eb32109c149399f43831e4a3fe5ae de785739f3537afa90318d0880c3c57c2570345f7aba23b91e5c9e5c5d1e6a37f0b4414239250f2b9384b28c6af078048fc24574cad19bd0b8adaf3b5b971af4 a429d47c10df5b1aadf6c758dcd5d720b79b1b68a2670a9a38975d37a8372164e628edba0b383886cb3885d8e1f2b90bd125bc7d998b2cdff077c92c69c6c510 f12837b84a3ab97b622270e65012775db9fcd20d3451394471f36b90103e5b721d482b9f1b3970bae964bc58e0b9d0ddae8d484be7b790e1f35c61fd5589df1d 2c25d90adc3d89c24b674657d90b0421d66cf9d28021e1f0fec0cfad191278215626b26dfced14a622f9eb6fa4540ce5e388a6112a2a8a9ecc73b8aa27251d75 57da40bb2bd60c06d54c5214c2d9521658dda846352d4b57cee160d5bd5e485a4e4b9adb9a6964155935ed59cc98615306766c79b722afb1da9818729a5ee1f3 d4bd9b723b9b5cb7d3279455020c5edaef6ea55fa3b69dcca02619efa76199b38b51b3766c16780db59b14092deb071bb53b762b6b84753a18bc53e507b9dda8 85a1c5a6af5496566fcef597db6cf61a92c710badc15cd5f77d304ee6454f2f42c53be9db1705d5c529e279adce7b22795489abcc00b8784579b7eb2746fbe3d f257ae7ed10c28b41493b5ab14b4367ba6608197a2f986bd8d7029a16686d6bb1456c78ab67e575c6d28cb561df0ca843c5f3598b6b0145ced5b118ec83304ad ed44357679ee05da57a2c82f70e5ac32d275bff69abdc6a0d61c54bc76735469d41b5ea5ddecd52bbd66b3ee8f9abe37ecd7de003d11c57e33fff4610c6f82e8 baf800428def7d04116f5e763d98b3b8cad4470e55e57df511845f3bfc110438126805b571a7dee907954ebd37ae3486fd76a53308fa956130680dc7c341b3dd 19bf719d0b056ef4ea8346306a57027f30a834024fd26f772aad46add66bb47aed51a3f7a6703fac3ccfc1852dc07c8ad7a3ff020000ffff0300504b03041400 06000800000021000dd1909fb60000001b010000270000007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c7384 8f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4350d363f2451eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16d b8ec8e4052164e89d93b64b060828e6f37ed1567914b284d262452282e3198720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017c c524bd62107bd5001996509affb3fd381a89672f1f165dfe514173d9850528a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d001400060008 0000002100e9de0fbfff0000001c0200001300000000000000000000000000000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600 080000002100a5d6a7e7c0000000360100000b00000000000000000000000000300100005f72656c732f2e72656c73504b01022d00140006000800000021006b 799616830000008a0000001c00000000000000000000000000190200007468656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d0014 000600080000002100d3d1e707f0070000122200001600000000000000000000000000d60200007468656d652f7468656d652f7468656d65312e786d6c504b01 022d00140006000800000021000dd1909fb60000001b0100002700000000000000000000000000fa0a00007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d010000f50b00000000} {\*\colorschememapping 
3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d 617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169 6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363 656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e} {\*\latentstyles\lsdstimax376\lsdlockeddef0\lsdsemihiddendef0\lsdunhideuseddef0\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1; \lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4; \lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7; \lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 1; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 5; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 6;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 9; \lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 1;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 2;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 3; \lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 4;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 5;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 6; \lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 7;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 8;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 9;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal Indent; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footnote text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 header;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footer; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index heading;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority35 \lsdlocked0 caption;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 table of figures; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 envelope address;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 envelope return;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footnote reference;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation reference; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 line number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 page number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 endnote reference;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 endnote text; \lsdsemihidden1 
\lsdunhideused1 \lsdlocked0 table of authorities;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 macro;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 toa heading;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 3; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 3; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 3; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 5;\lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Closing; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Signature;\lsdsemihidden1 \lsdunhideused1 \lsdpriority1 \lsdlocked0 Default Paragraph Font;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 4; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Message Header;\lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Salutation; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Date;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text First Indent;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text First Indent 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Note Heading; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent 3; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Block Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Hyperlink;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 FollowedHyperlink;\lsdqformat1 \lsdpriority22 \lsdlocked0 Strong; \lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Document Map;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Plain Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 E-mail Signature; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Top of Form;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Bottom of Form;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal (Web);\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Acronym; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Address;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Cite;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Code;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Definition; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Keyboard;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Preformatted;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Sample;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Typewriter; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Variable;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal 
Table;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation subject;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 No List; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 1; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 3; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 6; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 6; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Contemporary;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Elegant;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Professional; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Subtle 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Subtle 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 2; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Balloon Text;\lsdpriority39 \lsdlocked0 Table Grid;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Theme;\lsdsemihidden1 \lsdlocked0 Placeholder Text; \lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdpriority60 \lsdlocked0 Light Shading;\lsdpriority61 \lsdlocked0 Light List;\lsdpriority62 \lsdlocked0 Light Grid;\lsdpriority63 \lsdlocked0 Medium Shading 1;\lsdpriority64 \lsdlocked0 Medium Shading 2; \lsdpriority65 \lsdlocked0 Medium List 1;\lsdpriority66 \lsdlocked0 Medium List 2;\lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdpriority69 \lsdlocked0 Medium Grid 3;\lsdpriority70 \lsdlocked0 Dark List; \lsdpriority71 \lsdlocked0 Colorful Shading;\lsdpriority72 \lsdlocked0 
Colorful List;\lsdpriority73 \lsdlocked0 Colorful Grid;\lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdpriority61 \lsdlocked0 Light List Accent 1; \lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdsemihidden1 \lsdlocked0 Revision; \lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph;\lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 1;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1; \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1;\lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdpriority72 \lsdlocked0 Colorful List Accent 1; \lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdpriority61 \lsdlocked0 Light List Accent 2;\lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2; \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2; \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdpriority70 \lsdlocked0 Dark List Accent 2;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 2; \lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdpriority62 \lsdlocked0 Light Grid Accent 3;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3; \lsdpriority65 \lsdlocked0 Medium List 1 Accent 3;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3; \lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 3;\lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdpriority60 \lsdlocked0 Light Shading Accent 4; \lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdpriority62 \lsdlocked0 Light Grid Accent 4;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 4; \lsdpriority66 \lsdlocked0 Medium List 2 Accent 4;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4;\lsdpriority70 \lsdlocked0 Dark List Accent 4; \lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdpriority72 \lsdlocked0 Colorful List Accent 4;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdpriority61 \lsdlocked0 Light List Accent 5; \lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 5; \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 
5;\lsdpriority70 \lsdlocked0 Dark List Accent 5;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 5; \lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 5;\lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdpriority62 \lsdlocked0 Light Grid Accent 6; \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 6;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 6; \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 6; \lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis;\lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis; \lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference;\lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdsemihidden1 \lsdunhideused1 \lsdpriority37 \lsdlocked0 Bibliography; \lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;\lsdpriority41 \lsdlocked0 Plain Table 1;\lsdpriority42 \lsdlocked0 Plain Table 2;\lsdpriority43 \lsdlocked0 Plain Table 3;\lsdpriority44 \lsdlocked0 Plain Table 4; \lsdpriority45 \lsdlocked0 Plain Table 5;\lsdpriority40 \lsdlocked0 Grid Table Light;\lsdpriority46 \lsdlocked0 Grid Table 1 Light;\lsdpriority47 \lsdlocked0 Grid Table 2;\lsdpriority48 \lsdlocked0 Grid Table 3;\lsdpriority49 \lsdlocked0 Grid Table 4; \lsdpriority50 \lsdlocked0 Grid Table 5 Dark;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 1;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 1; \lsdpriority48 \lsdlocked0 Grid Table 3 Accent 1;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 1;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 1;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 1; \lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 1;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 2;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 2;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 2; \lsdpriority49 \lsdlocked0 Grid Table 4 Accent 2;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 2;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 2;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 2; \lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 3;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 3;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 3;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 3; \lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 3;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 3;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 3;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 4; \lsdpriority47 \lsdlocked0 Grid Table 2 Accent 4;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 4;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 4;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 4; \lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 4;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 4;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 5;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 5; \lsdpriority48 \lsdlocked0 Grid Table 3 Accent 
5;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 5;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 5;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 5; \lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 5;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 6;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 6;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 6; \lsdpriority49 \lsdlocked0 Grid Table 4 Accent 6;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 6;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 6;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 6; \lsdpriority46 \lsdlocked0 List Table 1 Light;\lsdpriority47 \lsdlocked0 List Table 2;\lsdpriority48 \lsdlocked0 List Table 3;\lsdpriority49 \lsdlocked0 List Table 4;\lsdpriority50 \lsdlocked0 List Table 5 Dark; \lsdpriority51 \lsdlocked0 List Table 6 Colorful;\lsdpriority52 \lsdlocked0 List Table 7 Colorful;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 1;\lsdpriority47 \lsdlocked0 List Table 2 Accent 1;\lsdpriority48 \lsdlocked0 List Table 3 Accent 1; \lsdpriority49 \lsdlocked0 List Table 4 Accent 1;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 1;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 1;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 1; \lsdpriority46 \lsdlocked0 List Table 1 Light Accent 2;\lsdpriority47 \lsdlocked0 List Table 2 Accent 2;\lsdpriority48 \lsdlocked0 List Table 3 Accent 2;\lsdpriority49 \lsdlocked0 List Table 4 Accent 2; \lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 2;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 2;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 2;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 3; \lsdpriority47 \lsdlocked0 List Table 2 Accent 3;\lsdpriority48 \lsdlocked0 List Table 3 Accent 3;\lsdpriority49 \lsdlocked0 List Table 4 Accent 3;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 3; \lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 3;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 3;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 4;\lsdpriority47 \lsdlocked0 List Table 2 Accent 4; \lsdpriority48 \lsdlocked0 List Table 3 Accent 4;\lsdpriority49 \lsdlocked0 List Table 4 Accent 4;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 4;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 4; \lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 4;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 5;\lsdpriority47 \lsdlocked0 List Table 2 Accent 5;\lsdpriority48 \lsdlocked0 List Table 3 Accent 5; \lsdpriority49 \lsdlocked0 List Table 4 Accent 5;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 5;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 5;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 5; \lsdpriority46 \lsdlocked0 List Table 1 Light Accent 6;\lsdpriority47 \lsdlocked0 List Table 2 Accent 6;\lsdpriority48 \lsdlocked0 List Table 3 Accent 6;\lsdpriority49 \lsdlocked0 List Table 4 Accent 6; \lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 6;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 6;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 6;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Mention; \lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Smart Hyperlink;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Hashtag;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Unresolved Mention;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Smart Link;}}{\*\datastore 01050000 02000000180000004d73786d6c322e534158584d4c5265616465722e362e3000000000000000000000060000 
d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffff0c6ad98892f1d411a65f0040963251e5000000000000000000000000f0af 5b31897bdb01feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000 00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000105000000000000}} ``` ## /packages/markitdown-sample-plugin/tests/test_sample_plugin.py ```py path="/packages/markitdown-sample-plugin/tests/test_sample_plugin.py" #!/usr/bin/env python3 -m pytest import os import pytest from markitdown import MarkItDown, StreamInfo from markitdown_sample_plugin import RtfConverter TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") RTF_TEST_STRINGS = { "This is a Sample RTF File", "It is included to test if the 
MarkItDown sample plugin can correctly convert RTF files.", } def test_converter() -> None: """Tests the RTF converter directly.""" with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream: converter = RtfConverter() result = converter.convert( file_stream=file_stream, stream_info=StreamInfo( mimetype="text/rtf", extension=".rtf", filename="test.rtf" ), ) for test_string in RTF_TEST_STRINGS: assert test_string in result.text_content def test_markitdown() -> None: """Tests that MarkItDown correctly loads the plugin.""" md = MarkItDown(enable_plugins=True) result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf")) for test_string in RTF_TEST_STRINGS: assert test_string in result.text_content if __name__ == "__main__": """Runs this file's tests from the command line.""" test_converter() test_markitdown() print("All tests passed.") ``` ## /packages/markitdown/README.md # MarkItDown > [!IMPORTANT] > MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.). > > For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub. ## Installation From PyPI: ```bash pip install markitdown[all] ``` From source: ```bash git clone git@github.com:microsoft/markitdown.git cd markitdown pip install -e packages/markitdown[all] ``` ## Usage ### Command-Line ```bash markitdown path-to-file.pdf > document.md ``` ### Python API ```python from markitdown import MarkItDown md = MarkItDown() result = md.convert("test.xlsx") print(result.text_content) ``` ### More Information For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub. ## Trademarks This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos is subject to those third parties' policies. ## /packages/markitdown/ThirdPartyNotices.md # THIRD-PARTY SOFTWARE NOTICES AND INFORMATION **Do Not Translate or Localize** This project incorporates components from the projects listed below. The original copyright notices and the licenses under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise. 1. dwml (https://github.com/xiilei/dwml) dwml NOTICES AND INFORMATION BEGIN HERE ----------------------------------------- NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including placeholders for the copyright owner and year. NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented. The following section summarizes these changes. The full details are available in the MarkItDown source code repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160). This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications (they live under `packages/markitdown/src/markitdown/converter_utils/docx/math`). However, we have reformatted the code with the `black` code formatter.
From the `tests/docx.py` file, we used only the `DOCXML_ROOT` XML namespaces; the rest of the file is not used. ----------------------------------------- Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ----------------------------------------- END OF dwml NOTICES AND INFORMATION ## /packages/markitdown/pyproject.toml ```toml path="/packages/markitdown/pyproject.toml" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [project] name = "markitdown" dynamic = ["version"] description = 'Utility tool for converting various files to Markdown' readme = "README.md" requires-python = ">=3.10" license = "MIT" keywords = [] authors = [ { name = "Adam Fourney", email = "adamfo@microsoft.com" }, ] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ "beautifulsoup4", "requests", "markdownify", "magika~=0.6.1", "charset-normalizer", ] [project.optional-dependencies] all = [ "python-pptx", "mammoth", "pandas", "openpyxl", "xlrd", "lxml", "pdfminer.six", "olefile", "pydub", "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", "azure-identity" ] pptx = ["python-pptx"] docx = ["mammoth", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six"] outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"] [project.urls] Documentation = "https://github.com/microsoft/markitdown#readme" Issues = "https://github.com/microsoft/markitdown/issues" Source = "https://github.com/microsoft/markitdown" [tool.hatch.version] path = "src/markitdown/__about__.py" [project.scripts] markitdown = "markitdown.__main__:main" [tool.hatch.envs.default] features = ["all"] [tool.hatch.envs.hatch-test] features = ["all"] extra-dependencies = [ "openai", ] [tool.hatch.envs.types] features = ["all"] extra-dependencies = [ "openai", "mypy>=1.0.0", ] [tool.hatch.envs.types.scripts] check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}" [tool.coverage.run] source_pkgs = ["markitdown", "tests"] branch = true parallel = true omit = [ "src/markitdown/__about__.py", ] [tool.coverage.paths] markitdown = ["src/markitdown", "*/markitdown/src/markitdown"] tests = ["tests", "*/markitdown/tests"] [tool.coverage.report] exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] [tool.hatch.build.targets.sdist] only-include = ["src/markitdown"] ``` ## /packages/markitdown/src/markitdown/__about__.py ```py path="/packages/markitdown/src/markitdown/__about__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT __version__ = "0.1.1" ``` ## /packages/markitdown/src/markitdown/__init__.py ```py path="/packages/markitdown/src/markitdown/__init__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT from .__about__ import __version__ from ._markitdown import ( MarkItDown, 
PRIORITY_SPECIFIC_FILE_FORMAT, PRIORITY_GENERIC_FILE_FORMAT, ) from ._base_converter import DocumentConverterResult, DocumentConverter from ._stream_info import StreamInfo from ._exceptions import ( MarkItDownException, MissingDependencyException, FailedConversionAttempt, FileConversionException, UnsupportedFormatException, ) __all__ = [ "__version__", "MarkItDown", "DocumentConverter", "DocumentConverterResult", "MarkItDownException", "MissingDependencyException", "FailedConversionAttempt", "FileConversionException", "UnsupportedFormatException", "StreamInfo", "PRIORITY_SPECIFIC_FILE_FORMAT", "PRIORITY_GENERIC_FILE_FORMAT", ] ``` ## /packages/markitdown/src/markitdown/__main__.py ```py path="/packages/markitdown/src/markitdown/__main__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT import argparse import sys import codecs import locale from textwrap import dedent from importlib.metadata import entry_points from .__about__ import __version__ from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", prog="markitdown", formatter_class=argparse.RawDescriptionHelpFormatter, usage=dedent( """ SYNTAX: markitdown If FILENAME is empty, markitdown reads from stdin. EXAMPLE: markitdown example.pdf OR cat example.pdf | markitdown OR markitdown < example.pdf OR to save to a file use markitdown example.pdf -o example.md OR markitdown example.pdf > example.md """ ).strip(), ) parser.add_argument( "-v", "--version", action="version", version=f"%(prog)s {__version__}", help="show the version number and exit", ) parser.add_argument( "-o", "--output", help="Output file name. If not provided, output is written to stdout.", ) parser.add_argument( "-x", "--extension", help="Provide a hint about the file extension (e.g., when reading from stdin).", ) parser.add_argument( "-m", "--mime-type", help="Provide a hint about the file's MIME type.", ) parser.add_argument( "-c", "--charset", help="Provide a hint about the file's charset (e.g., UTF-8).", ) parser.add_argument( "-d", "--use-docintel", action="store_true", help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", ) parser.add_argument( "-e", "--endpoint", type=str, help="Document Intelligence Endpoint. Required if using Document Intelligence.", ) parser.add_argument( "-p", "--use-plugins", action="store_true", help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.", ) parser.add_argument( "--list-plugins", action="store_true", help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugins option.", ) parser.add_argument( "--keep-data-uris", action="store_true", help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) parser.add_argument("filename", nargs="?") args = parser.parse_args() # Parse the extension hint extension_hint = args.extension if extension_hint is not None: extension_hint = extension_hint.strip().lower() if len(extension_hint) > 0: if not extension_hint.startswith("."): extension_hint = "."
+ extension_hint else: extension_hint = None # Parse the mime type mime_type_hint = args.mime_type if mime_type_hint is not None: mime_type_hint = mime_type_hint.strip() if len(mime_type_hint) > 0: if mime_type_hint.count("/") != 1: _exit_with_error(f"Invalid MIME type: {mime_type_hint}") else: mime_type_hint = None # Parse the charset charset_hint = args.charset if charset_hint is not None: charset_hint = charset_hint.strip() if len(charset_hint) > 0: try: charset_hint = codecs.lookup(charset_hint).name except LookupError: _exit_with_error(f"Invalid charset: {charset_hint}") else: charset_hint = None stream_info = None if ( extension_hint is not None or mime_type_hint is not None or charset_hint is not None ): stream_info = StreamInfo( extension=extension_hint, mimetype=mime_type_hint, charset=charset_hint ) if args.list_plugins: # List installed plugins, then exit print("Installed MarkItDown 3rd-party Plugins:\n") plugin_entry_points = list(entry_points(group="markitdown.plugin")) if len(plugin_entry_points) == 0: print(" * No 3rd-party plugins installed.") print( "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n" ) else: for entry_point in plugin_entry_points: print(f" * {entry_point.name:<16}\t(package: {entry_point.value})") print( "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n" ) sys.exit(0) if args.use_docintel: if args.endpoint is None: _exit_with_error( "Document Intelligence Endpoint is required when using Document Intelligence." ) elif args.filename is None: _exit_with_error("Filename is required when using Document Intelligence.") markitdown = MarkItDown( enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint ) else: markitdown = MarkItDown(enable_plugins=args.use_plugins) if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, ) else: result = markitdown.convert( args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris ) _handle_output(args, result) def _handle_output(args, result: DocumentConverterResult): """Handle output to stdout or file""" if args.output: with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) else: # Handle stdout encoding errors more gracefully print( result.markdown.encode(sys.stdout.encoding, errors="replace").decode( sys.stdout.encoding ) ) def _exit_with_error(message: str): print(message) sys.exit(1) if __name__ == "__main__": main() ``` ## /packages/markitdown/src/markitdown/_base_converter.py ```py path="/packages/markitdown/src/markitdown/_base_converter.py" import os import tempfile from warnings import warn from typing import Any, Union, BinaryIO, Optional, List from ._stream_info import StreamInfo class DocumentConverterResult: """The result of converting a document to Markdown.""" def __init__( self, markdown: str, *, title: Optional[str] = None, ): """ Initialize the DocumentConverterResult. The only required parameter is the converted Markdown text. The title, and any other metadata that may be added in the future, are optional. Parameters: - markdown: The converted Markdown text. - title: Optional title of the document. """ self.markdown = markdown self.title = title @property def text_content(self) -> str: """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__.""" return self.markdown @text_content.setter def text_content(self, markdown: str): """Soft-deprecated alias for `markdown`. 
New code should migrate to using `markdown` or __str__.""" self.markdown = markdown def __str__(self) -> str: """Return the converted Markdown text.""" return self.markdown class DocumentConverter: """Abstract superclass of all DocumentConverters.""" def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: """ Return a quick determination of whether the converter should attempt converting the document. This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to make a determination (e.g., special converters for Wikipedia, YouTube, etc.). Finally, it is conceivable that the `stream_info.filename` might be used in cases where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc.). NOTE: The method signature is designed to match that of the convert() method. This provides some assurance that, if accepts() returns True, the convert() method will also be able to handle the document. IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final determination. Read operations inevitably advance the position in file_stream. In these cases, the position MUST be reset before returning. This is because the convert() method may be called immediately after accepts(), and will expect the file_stream to be at the original position. E.g., cur_pos = file_stream.tell() # Save the current position data = file_stream.read(100) # ... peek at the first 100 bytes, etc. file_stream.seek(cur_pos) # Reset the position to the original position Parameters: - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.) - kwargs: Additional keyword arguments for the converter. Returns: - bool: True if the converter can handle the document, False otherwise. """ raise NotImplementedError( f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if it can handle the document." ) def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: """ Convert a document to Markdown text. Parameters: - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods. - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.) - kwargs: Additional keyword arguments for the converter. Returns: - DocumentConverterResult: The result of the conversion, which includes the title and markdown content. Raises: - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason. - MissingDependencyException: If the converter requires a dependency that is not installed. """ raise NotImplementedError("Subclasses must implement this method") ``` ## /packages/markitdown/src/markitdown/_exceptions.py ```py path="/packages/markitdown/src/markitdown/_exceptions.py" from typing import Optional, List, Any MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown.
For example: * pip install markitdown[{feature}] * pip install markitdown[all] * pip install markitdown[{feature}, ...] * etc.""" class MarkItDownException(Exception): """ Base exception class for MarkItDown. """ pass class MissingDependencyException(MarkItDownException): """ Converters shipped with MarkItDown may depend on optional dependencies. This exception is thrown when a converter's convert() method is called, but the required dependency is not installed. This is not necessarily a fatal error, as the converter will simply be skipped (an error will bubble up only if no other suitable converter is found). Error messages should clearly indicate which dependency is missing. """ pass class UnsupportedFormatException(MarkItDownException): """ Thrown when no suitable converter was found for the given file. """ pass class FailedConversionAttempt(object): """ Represents a single attempt to convert a file. """ def __init__(self, converter: Any, exc_info: Optional[tuple] = None): self.converter = converter self.exc_info = exc_info class FileConversionException(MarkItDownException): """ Thrown when a suitable converter was found, but the conversion process fails for any reason. """ def __init__( self, message: Optional[str] = None, attempts: Optional[List[FailedConversionAttempt]] = None, ): self.attempts = attempts if message is None: if attempts is None: message = "File conversion failed." else: message = f"File conversion failed after {len(attempts)} attempts:\n" for attempt in attempts: if attempt.exc_info is None: message += f" - {type(attempt.converter).__name__} provided no execution info.\n" else: message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n" super().__init__(message) ``` ## /packages/markitdown/src/markitdown/_markitdown.py ```py path="/packages/markitdown/src/markitdown/_markitdown.py" import copy import mimetypes import os import re import sys import shutil import tempfile import warnings import traceback import io from dataclasses import dataclass from importlib.metadata import entry_points from typing import Any, List, Dict, Optional, Union, BinaryIO from pathlib import Path from urllib.parse import urlparse from warnings import warn import requests import magika import charset_normalizer import codecs from ._stream_info import StreamInfo from ._uri_utils import parse_data_uri, file_uri_to_path from .converters import ( PlainTextConverter, HtmlConverter, RssConverter, WikipediaConverter, YouTubeConverter, IpynbConverter, BingSerpConverter, PdfConverter, DocxConverter, XlsxConverter, XlsConverter, PptxConverter, ImageConverter, AudioConverter, OutlookMsgConverter, ZipConverter, EpubConverter, DocumentIntelligenceConverter, CsvConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult from ._exceptions import ( FileConversionException, UnsupportedFormatException, FailedConversionAttempt, ) # Lower priority values are tried first. PRIORITY_SPECIFIC_FILE_FORMAT = ( 0.0 # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia ) PRIORITY_GENERIC_FILE_FORMAT = ( 10.0 # Near catch-all converters for mimetypes like text/*, etc. ) _plugins: Union[None, List[Any]] = None # If None, plugins have not been loaded yet.
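# Plugins are discovered via the "markitdown.plugin" entry point group (see _load_plugins below), and each loaded plugin is expected to expose register_converters(markitdown, **kwargs). As an illustrative sketch only (the distribution and module names here are hypothetical), a plugin's pyproject.toml would declare: [project.entry-points."markitdown.plugin"] sample = "markitdown_sample_plugin"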
def _load_plugins() -> Union[None, List[Any]]: """Lazy load plugins, exiting early if already loaded.""" global _plugins # Skip if we've already loaded plugins if _plugins is not None: return _plugins # Load plugins _plugins = [] for entry_point in entry_points(group="markitdown.plugin"): try: _plugins.append(entry_point.load()) except Exception: tb = traceback.format_exc() warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}") return _plugins @dataclass(kw_only=True, frozen=True) class ConverterRegistration: """A registration of a converter with its priority and other metadata.""" converter: DocumentConverter priority: float class MarkItDown: """(In preview) An extremely simple text-based document reader, suitable for LLM use. This reader will convert common file types or webpages to Markdown.""" def __init__( self, *, enable_builtins: Union[None, bool] = None, enable_plugins: Union[None, bool] = None, **kwargs, ): self._builtins_enabled = False self._plugins_enabled = False requests_session = kwargs.get("requests_session") if requests_session is None: self._requests_session = requests.Session() else: self._requests_session = requests_session self._magika = magika.Magika() # TODO - remove these (see enable_builtins) self._llm_client: Any = None self._llm_model: Union[str, None] = None self._exiftool_path: Union[str, None] = None self._style_map: Union[str, None] = None # Register the converters self._converters: List[ConverterRegistration] = [] if ( enable_builtins is None or enable_builtins ): # Default to True when not specified self.enable_builtins(**kwargs) if enable_plugins: self.enable_plugins(**kwargs) def enable_builtins(self, **kwargs) -> None: """ Enable and register built-in converters. Built-in converters are enabled by default. This method should only be called once, if built-ins were initially disabled. """ if not self._builtins_enabled: # TODO: Move these into converter constructors self._llm_client = kwargs.get("llm_client") self._llm_model = kwargs.get("llm_model") self._exiftool_path = kwargs.get("exiftool_path") self._style_map = kwargs.get("style_map") if self._exiftool_path is None: self._exiftool_path = os.getenv("EXIFTOOL_PATH") # Still none?
Check well-known paths if self._exiftool_path is None: candidate = shutil.which("exiftool") if candidate: candidate = os.path.abspath(candidate) if any( d == os.path.dirname(candidate) for d in [ "/usr/bin", "/usr/local/bin", "/opt", "/opt/bin", "/opt/local/bin", "/opt/homebrew/bin", "C:\\Windows\\System32", "C:\\Program Files", "C:\\Program Files (x86)", ] ): self._exiftool_path = candidate # Register converters for successful browsing operations # Later registrations are tried first / take higher priority than earlier registrations # To this end, the most specific converters should appear below the most generic converters self.register_converter( PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT ) self.register_converter( ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT ) self.register_converter( HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT ) self.register_converter(RssConverter()) self.register_converter(WikipediaConverter()) self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) self.register_converter(DocxConverter()) self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) self.register_converter(AudioConverter()) self.register_converter(ImageConverter()) self.register_converter(IpynbConverter()) self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") if docintel_endpoint is not None: docintel_args: Dict[str, Any] = {} docintel_args["endpoint"] = docintel_endpoint docintel_credential = kwargs.get("docintel_credential") if docintel_credential is not None: docintel_args["credential"] = docintel_credential docintel_types = kwargs.get("docintel_file_types") if docintel_types is not None: docintel_args["file_types"] = docintel_types self.register_converter( DocumentIntelligenceConverter(**docintel_args), ) self._builtins_enabled = True else: warn("Built-in converters are already enabled.", RuntimeWarning) def enable_plugins(self, **kwargs) -> None: """ Enable and register converters provided by plugins. Plugins are disabled by default. This method should only be called once, if plugins were initially disabled. """ if not self._plugins_enabled: # Load plugins plugins = _load_plugins() assert plugins is not None for plugin in plugins: try: plugin.register_converters(self, **kwargs) except Exception: tb = traceback.format_exc() warn(f"Plugin '{plugin}' failed to register converters:\n{tb}") self._plugins_enabled = True else: warn("Plugin converters are already enabled.", RuntimeWarning) def convert( self, source: Union[str, requests.Response, Path, BinaryIO], *, stream_info: Optional[StreamInfo] = None, **kwargs: Any, ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: - source: can be a path (str or Path), url, or a requests.Response object - stream_info: optional stream info to use for the conversion.
If None, infer from source - kwargs: additional arguments to pass to the converter """ # Local path or url if isinstance(source, str): if ( source.startswith("http:") or source.startswith("https:") or source.startswith("file:") or source.startswith("data:") ): # Rename the url argument to mock_url # (Deprecated -- use stream_info) _kwargs = {k: v for k, v in kwargs.items()} if "url" in _kwargs: _kwargs["mock_url"] = _kwargs["url"] del _kwargs["url"] return self.convert_uri(source, stream_info=stream_info, **_kwargs) else: return self.convert_local(source, stream_info=stream_info, **kwargs) # Path object elif isinstance(source, Path): return self.convert_local(source, stream_info=stream_info, **kwargs) # Request response elif isinstance(source, requests.Response): return self.convert_response(source, stream_info=stream_info, **kwargs) # Binary stream elif ( hasattr(source, "read") and callable(source.read) and not isinstance(source, io.TextIOBase) ): return self.convert_stream(source, stream_info=stream_info, **kwargs) else: raise TypeError( f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO." ) def convert_local( self, path: Union[str, Path], *, stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info **kwargs: Any, ) -> DocumentConverterResult: if isinstance(path, Path): path = str(path) # Build a base StreamInfo object from which to start guesses base_guess = StreamInfo( local_path=path, extension=os.path.splitext(path)[1], filename=os.path.basename(path), ) # Extend the base_guess with any additional info from the arguments if stream_info is not None: base_guess = base_guess.copy_and_update(stream_info) if file_extension is not None: # Deprecated -- use stream_info base_guess = base_guess.copy_and_update(extension=file_extension) if url is not None: # Deprecated -- use stream_info base_guess = base_guess.copy_and_update(url=url) with open(path, "rb") as fh: guesses = self._get_stream_info_guesses( file_stream=fh, base_guess=base_guess ) return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs) def convert_stream( self, stream: BinaryIO, *, stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info **kwargs: Any, ) -> DocumentConverterResult: guesses: List[StreamInfo] = [] # Do we have anything on which to base a guess? base_guess = None if stream_info is not None or file_extension is not None or url is not None: # Start with a non-Null base guess if stream_info is None: base_guess = StreamInfo() else: base_guess = stream_info if file_extension is not None: # Deprecated -- use stream_info assert base_guess is not None # for mypy base_guess = base_guess.copy_and_update(extension=file_extension) if url is not None: # Deprecated -- use stream_info assert base_guess is not None # for mypy base_guess = base_guess.copy_and_update(url=url) # Check if we have a seekable stream. If not, load the entire stream into memory. 
if not stream.seekable(): buffer = io.BytesIO() while True: chunk = stream.read(4096) if not chunk: break buffer.write(chunk) buffer.seek(0) stream = buffer # Add guesses based on stream content guesses = self._get_stream_info_guesses( file_stream=stream, base_guess=base_guess or StreamInfo() ) return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) def convert_url( self, url: str, *, stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, mock_url: Optional[str] = None, **kwargs: Any, ) -> DocumentConverterResult: """Alias for convert_uri()""" # convert_url will likely be deprecated in the future in favor of convert_uri return self.convert_uri( url, stream_info=stream_info, file_extension=file_extension, mock_url=mock_url, **kwargs, ) def convert_uri( self, uri: str, *, stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info mock_url: Optional[ str ] = None, # Mock the request as if it came from a different URL **kwargs: Any, ) -> DocumentConverterResult: uri = uri.strip() # File URIs if uri.startswith("file:"): netloc, path = file_uri_to_path(uri) if netloc and netloc != "localhost": raise ValueError( f"Unsupported file URI: {uri}. Netloc must be empty or localhost." ) return self.convert_local( path, stream_info=stream_info, file_extension=file_extension, url=mock_url, **kwargs, ) # Data URIs elif uri.startswith("data:"): mimetype, attributes, data = parse_data_uri(uri) base_guess = StreamInfo( mimetype=mimetype, charset=attributes.get("charset"), ) if stream_info is not None: base_guess = base_guess.copy_and_update(stream_info) return self.convert_stream( io.BytesIO(data), stream_info=base_guess, file_extension=file_extension, url=mock_url, **kwargs, ) # HTTP/HTTPS URIs elif uri.startswith("http:") or uri.startswith("https:"): response = self._requests_session.get(uri, stream=True) response.raise_for_status() return self.convert_response( response, stream_info=stream_info, file_extension=file_extension, url=mock_url, **kwargs, ) else: raise ValueError( f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:" ) def convert_response( self, response: requests.Response, *, stream_info: Optional[StreamInfo] = None, file_extension: Optional[str] = None, # Deprecated -- use stream_info url: Optional[str] = None, # Deprecated -- use stream_info **kwargs: Any, ) -> DocumentConverterResult: # If there is a content-type header, get the mimetype and charset (if present) mimetype: Optional[str] = None charset: Optional[str] = None if "content-type" in response.headers: parts = response.headers["content-type"].split(";") mimetype = parts.pop(0).strip() for part in parts: if part.strip().startswith("charset="): _charset = part.split("=")[1].strip() if len(_charset) > 0: charset = _charset # If there is a content-disposition header, get the filename and possibly the extension filename: Optional[str] = None extension: Optional[str] = None if "content-disposition" in response.headers: m = re.search(r"filename=([^;]+)", response.headers["content-disposition"]) if m: filename = m.group(1).strip("\"'") _, _extension = os.path.splitext(filename) if len(_extension) > 0: extension = _extension # If there is still no filename, try to read it from the url if filename is None: parsed_url = urlparse(response.url) _, _extension = os.path.splitext(parsed_url.path) if len(_extension) > 0: # Looks like this might be a file! 
filename = os.path.basename(parsed_url.path) extension = _extension # Create an initial guess from all this information base_guess = StreamInfo( mimetype=mimetype, charset=charset, filename=filename, extension=extension, url=response.url, ) # Update with any additional info from the arguments if stream_info is not None: base_guess = base_guess.copy_and_update(stream_info) if file_extension is not None: # Deprecated -- use stream_info base_guess = base_guess.copy_and_update(extension=file_extension) if url is not None: # Deprecated -- use stream_info base_guess = base_guess.copy_and_update(url=url) # Read into BytesIO buffer = io.BytesIO() for chunk in response.iter_content(chunk_size=512): buffer.write(chunk) buffer.seek(0) # Convert guesses = self._get_stream_info_guesses( file_stream=buffer, base_guess=base_guess ) return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs) def _convert( self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs ) -> DocumentConverterResult: res: Union[None, DocumentConverterResult] = None # Keep track of which converters throw exceptions failed_attempts: List[FailedConversionAttempt] = [] # Create a copy of the page_converters list, sorted by priority. # We do this with each call to _convert because the priority of converters may change between calls. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. sorted_registrations = sorted(self._converters, key=lambda x: x.priority) # Remember the initial stream position so that we can return to it cur_pos = file_stream.tell() for stream_info in stream_info_guesses + [StreamInfo()]: for converter_registration in sorted_registrations: converter = converter_registration.converter # Sanity check -- make sure the cur_pos is still the same assert ( cur_pos == file_stream.tell() ), "File stream position should NOT change between guess iterations" _kwargs = {k: v for k, v in kwargs.items()} # Copy any additional global options if "llm_client" not in _kwargs and self._llm_client is not None: _kwargs["llm_client"] = self._llm_client if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model if "style_map" not in _kwargs and self._style_map is not None: _kwargs["style_map"] = self._style_map if "exiftool_path" not in _kwargs and self._exiftool_path is not None: _kwargs["exiftool_path"] = self._exiftool_path # Add the list of converters for nested processing _kwargs["_parent_converters"] = self._converters # Add legacy kwargs if stream_info is not None: if stream_info.extension is not None: _kwargs["file_extension"] = stream_info.extension if stream_info.url is not None: _kwargs["url"] = stream_info.url # Check if the converter will accept the file, and if so, try to convert it _accepts = False try: _accepts = converter.accepts(file_stream, stream_info, **_kwargs) except NotImplementedError: pass # accepts() should not have changed the file stream position assert ( cur_pos == file_stream.tell() ), f"{type(converter).__name__}.accepts() should NOT change the file_stream position" # Attempt the conversion if _accepts: try: res = converter.convert(file_stream, stream_info, **_kwargs) except Exception: failed_attempts.append( FailedConversionAttempt( converter=converter, exc_info=sys.exc_info() ) ) finally: file_stream.seek(cur_pos) if res is not None: # Normalize the content res.text_content = "\n".join( [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] ) res.text_content =
re.sub(r"\n{3,}", "\n\n", res.text_content) return res # If we got this far without success, report any exceptions if len(failed_attempts) > 0: raise FileConversionException(attempts=failed_attempts) # Nothing can handle it! raise UnsupportedFormatException( f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported." ) def register_page_converter(self, converter: DocumentConverter) -> None: """DEPRECATED: User register_converter instead.""" warn( "register_page_converter is deprecated. Use register_converter instead.", DeprecationWarning, ) self.register_converter(converter) def register_converter( self, converter: DocumentConverter, *, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT, ) -> None: """ Register a DocumentConverter with a given priority. Priorities work as follows: By default, most converters get priority DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception is the PlainTextConverter, HtmlConverter, and ZipConverter, which get priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), with lower values being tried first (i.e., higher priority). Just prior to conversion, the converters are sorted by priority, using a stable sort. This means that converters with the same priority will remain in the same order, with the most recently registered converters appearing first. We have tight control over the order of built-in converters, but plugins can register converters in any order. The registration's priority field reasserts some control over the order of converters. Plugins can register converters with any priority, to appear before or after the built-ins. For example, a plugin with priority 9 will run before the PlainTextConverter, but after the built-in converters. """ self._converters.insert( 0, ConverterRegistration(converter=converter, priority=priority) ) def _get_stream_info_guesses( self, file_stream: BinaryIO, base_guess: StreamInfo ) -> List[StreamInfo]: """ Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika). """ guesses: List[StreamInfo] = [] # Enhance the base guess with information based on the extension or mimetype enhanced_guess = base_guess.copy_and_update() # If there's an extension and no mimetype, try to guess the mimetype if base_guess.mimetype is None and base_guess.extension is not None: _m, _ = mimetypes.guess_type( "placeholder" + base_guess.extension, strict=False ) if _m is not None: enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m) # If there's a mimetype and no extension, try to guess the extension if base_guess.mimetype is not None and base_guess.extension is None: _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False) if len(_e) > 0: enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0]) # Call magika to guess from the stream cur_pos = file_stream.tell() try: result = self._magika.identify_stream(file_stream) if result.status == "ok" and result.prediction.output.label != "unknown": # If it's text, also guess the charset charset = None if result.prediction.output.is_text: # Read the first 4k to guess the charset file_stream.seek(cur_pos) stream_page = file_stream.read(4096) charset_result = charset_normalizer.from_bytes(stream_page).best() if charset_result is not None: charset = self._normalize_charset(charset_result.encoding) # Normalize the first extension listed guessed_extension = None if len(result.prediction.output.extensions) > 0: guessed_extension = "." 
+ result.prediction.output.extensions[0] # Determine if the guess is compatible with the base guess compatible = True if ( base_guess.mimetype is not None and base_guess.mimetype != result.prediction.output.mime_type ): compatible = False if ( base_guess.extension is not None and base_guess.extension.lstrip(".") not in result.prediction.output.extensions ): compatible = False if ( base_guess.charset is not None and self._normalize_charset(base_guess.charset) != charset ): compatible = False if compatible: # Add the compatible base guess guesses.append( StreamInfo( mimetype=base_guess.mimetype or result.prediction.output.mime_type, extension=base_guess.extension or guessed_extension, charset=base_guess.charset or charset, filename=base_guess.filename, local_path=base_guess.local_path, url=base_guess.url, ) ) else: # The magika guess was incompatible with the base guess, so add both guesses guesses.append(enhanced_guess) guesses.append( StreamInfo( mimetype=result.prediction.output.mime_type, extension=guessed_extension, charset=charset, filename=base_guess.filename, local_path=base_guess.local_path, url=base_guess.url, ) ) else: # There were no other guesses, so just add the base guess guesses.append(enhanced_guess) finally: file_stream.seek(cur_pos) return guesses def _normalize_charset(self, charset: str | None) -> str | None: """ Normalize a charset string to a canonical form. """ if charset is None: return None try: return codecs.lookup(charset).name except LookupError: return charset ``` ## /packages/markitdown/src/markitdown/_stream_info.py ```py path="/packages/markitdown/src/markitdown/_stream_info.py" from dataclasses import dataclass, asdict from typing import Optional @dataclass(kw_only=True, frozen=True) class StreamInfo: """The StreamInfo class is used to store information about a file stream. All fields can be None, and will depend on how the stream was opened. """ mimetype: Optional[str] = None extension: Optional[str] = None charset: Optional[str] = None filename: Optional[ str ] = None # From local path, url, or Content-Disposition header local_path: Optional[str] = None # If read from disk url: Optional[str] = None # If read from url def copy_and_update(self, *args, **kwargs): """Copy the StreamInfo object and update it with the given StreamInfo instance and/or other keyword arguments.""" new_info = asdict(self) for si in args: assert isinstance(si, StreamInfo) new_info.update({k: v for k, v in asdict(si).items() if v is not None}) if len(kwargs) > 0: new_info.update(kwargs) return StreamInfo(**new_info) ``` ## /packages/markitdown/src/markitdown/_uri_utils.py ```py path="/packages/markitdown/src/markitdown/_uri_utils.py" import base64 import os from typing import Tuple, Dict from urllib.request import url2pathname from urllib.parse import urlparse, unquote_to_bytes def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]: """Convert a file URI to a local file path""" parsed = urlparse(file_uri) if parsed.scheme != "file": raise ValueError(f"Not a file URL: {file_uri}") netloc = parsed.netloc if parsed.netloc else None path = os.path.abspath(url2pathname(parsed.path)) return netloc, path def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]: if not uri.startswith("data:"): raise ValueError("Not a data URI") header, _, data = uri.partition(",") if not _: raise ValueError("Malformed data URI, missing ',' separator") meta = header[5:] # Strip 'data:' parts = meta.split(";") is_base64 = False # Ends with base64? 
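# Illustrative examples (not from this codebase): "data:text/plain;charset=utf-8,Hello%20world" carries a percent-encoded payload, while "data:image/png;base64,iVBORw0..." carries a base64 payload and ends with the "base64" attribute checked below.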
if parts[-1] == "base64": parts.pop() is_base64 = True mime_type = None # Normally this would default to text/plain but we won't assume if len(parts) and len(parts[0]) > 0: # First part is the mime type mime_type = parts.pop(0) attributes: Dict[str, str] = {} for part in parts: # Handle key=value pairs in the middle if "=" in part: key, value = part.split("=", 1) attributes[key] = value elif len(part) > 0: attributes[part] = "" content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data) return mime_type, attributes, content ``` ## /packages/markitdown/src/markitdown/converter_utils/__init__.py ```py path="/packages/markitdown/src/markitdown/converter_utils/__init__.py" ``` ## /packages/markitdown/src/markitdown/converter_utils/docx/__init__.py ```py path="/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py" ``` ## /packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py ```py path="/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py" ``` ## /packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py ```py path="/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py" # -*- coding: utf-8 -*- """ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py On 25/03/2025 """ from __future__ import unicode_literals CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") BLANK = "" BACKSLASH = "\\" ALN = "&" CHR = { # Unicode : Latex Math Symbols # Top accents "\u0300": "\\grave{{{0}}}", "\u0301": "\\acute{{{0}}}", "\u0302": "\\hat{{{0}}}", "\u0303": "\\tilde{{{0}}}", "\u0304": "\\bar{{{0}}}", "\u0305": "\\overbar{{{0}}}", "\u0306": "\\breve{{{0}}}", "\u0307": "\\dot{{{0}}}", "\u0308": "\\ddot{{{0}}}", "\u0309": "\\ovhook{{{0}}}", "\u030a": "\\ocirc{{{0}}}", "\u030c": "\\check{{{0}}}", "\u0310": "\\candra{{{0}}}", "\u0312": "\\oturnedcomma{{{0}}}", "\u0315": "\\ocommatopright{{{0}}}", "\u031a": "\\droang{{{0}}}", "\u0338": "\\not{{{0}}}", "\u20d0": "\\leftharpoonaccent{{{0}}}", "\u20d1": "\\rightharpoonaccent{{{0}}}", "\u20d2": "\\vertoverlay{{{0}}}", "\u20d6": "\\overleftarrow{{{0}}}", "\u20d7": "\\vec{{{0}}}", "\u20db": "\\dddot{{{0}}}", "\u20dc": "\\ddddot{{{0}}}", "\u20e1": "\\overleftrightarrow{{{0}}}", "\u20e7": "\\annuity{{{0}}}", "\u20e9": "\\widebridgeabove{{{0}}}", "\u20f0": "\\asteraccent{{{0}}}", # Bottom accents "\u0330": "\\wideutilde{{{0}}}", "\u0331": "\\underbar{{{0}}}", "\u20e8": "\\threeunderdot{{{0}}}", "\u20ec": "\\underrightharpoondown{{{0}}}", "\u20ed": "\\underleftharpoondown{{{0}}}", "\u20ee": "\\underleftarrow{{{0}}}", "\u20ef": "\\underrightarrow{{{0}}}", # Over | group "\u23b4": "\\overbracket{{{0}}}", "\u23dc": "\\overparen{{{0}}}", "\u23de": "\\overbrace{{{0}}}", # Under | group "\u23b5": "\\underbracket{{{0}}}", "\u23dd": "\\underparen{{{0}}}", "\u23df": "\\underbrace{{{0}}}", } CHR_BO = { # Big operators "\u2140": "\\Bbbsum", "\u220f": "\\prod", "\u2210": "\\coprod", "\u2211": "\\sum", "\u222b": "\\int", "\u22c0": "\\bigwedge", "\u22c1": "\\bigvee", "\u22c2": "\\bigcap", "\u22c3": "\\bigcup", "\u2a00": "\\bigodot", "\u2a01": "\\bigoplus", "\u2a02": "\\bigotimes", } T = { "\u2192": "\\rightarrow ", # Greek letters "\U0001d6fc": "\\alpha ", "\U0001d6fd": "\\beta ", "\U0001d6fe": "\\gamma ", "\U0001d6ff": "\\delta ", "\U0001d700": "\\epsilon ", "\U0001d701": "\\zeta ", "\U0001d702": "\\eta ", "\U0001d703": "\\theta ", "\U0001d704": "\\iota ", "\U0001d705": "\\kappa ", "\U0001d706": "\\lambda ", "\U0001d707": "\\mu ", "\U0001d708": "\\nu ",
"\U0001d709": "\\xi ", "\U0001d70a": "\\omicron ", "\U0001d70b": "\\pi ", "\U0001d70c": "\\rho ", "\U0001d70d": "\\varsigma ", "\U0001d70e": "\\sigma ", "\U0001d70f": "\\ta ", "\U0001d710": "\\upsilon ", "\U0001d711": "\\phi ", "\U0001d712": "\\chi ", "\U0001d713": "\\psi ", "\U0001d714": "\\omega ", "\U0001d715": "\\partial ", "\U0001d716": "\\varepsilon ", "\U0001d717": "\\vartheta ", "\U0001d718": "\\varkappa ", "\U0001d719": "\\varphi ", "\U0001d71a": "\\varrho ", "\U0001d71b": "\\varpi ", # Relation symbols "\u2190": "\\leftarrow ", "\u2191": "\\uparrow ", "\u2192": "\\rightarrow ", "\u2193": "\\downright ", "\u2194": "\\leftrightarrow ", "\u2195": "\\updownarrow ", "\u2196": "\\nwarrow ", "\u2197": "\\nearrow ", "\u2198": "\\searrow ", "\u2199": "\\swarrow ", "\u22ee": "\\vdots ", "\u22ef": "\\cdots ", "\u22f0": "\\adots ", "\u22f1": "\\ddots ", "\u2260": "\\ne ", "\u2264": "\\leq ", "\u2265": "\\geq ", "\u2266": "\\leqq ", "\u2267": "\\geqq ", "\u2268": "\\lneqq ", "\u2269": "\\gneqq ", "\u226a": "\\ll ", "\u226b": "\\gg ", "\u2208": "\\in ", "\u2209": "\\notin ", "\u220b": "\\ni ", "\u220c": "\\nni ", # Ordinary symbols "\u221e": "\\infty ", # Binary relations "\u00b1": "\\pm ", "\u2213": "\\mp ", # Italic, Latin, uppercase "\U0001d434": "A", "\U0001d435": "B", "\U0001d436": "C", "\U0001d437": "D", "\U0001d438": "E", "\U0001d439": "F", "\U0001d43a": "G", "\U0001d43b": "H", "\U0001d43c": "I", "\U0001d43d": "J", "\U0001d43e": "K", "\U0001d43f": "L", "\U0001d440": "M", "\U0001d441": "N", "\U0001d442": "O", "\U0001d443": "P", "\U0001d444": "Q", "\U0001d445": "R", "\U0001d446": "S", "\U0001d447": "T", "\U0001d448": "U", "\U0001d449": "V", "\U0001d44a": "W", "\U0001d44b": "X", "\U0001d44c": "Y", "\U0001d44d": "Z", # Italic, Latin, lowercase "\U0001d44e": "a", "\U0001d44f": "b", "\U0001d450": "c", "\U0001d451": "d", "\U0001d452": "e", "\U0001d453": "f", "\U0001d454": "g", "\U0001d456": "i", "\U0001d457": "j", "\U0001d458": "k", "\U0001d459": "l", "\U0001d45a": "m", "\U0001d45b": "n", "\U0001d45c": "o", "\U0001d45d": "p", "\U0001d45e": "q", "\U0001d45f": "r", "\U0001d460": "s", "\U0001d461": "t", "\U0001d462": "u", "\U0001d463": "v", "\U0001d464": "w", "\U0001d465": "x", "\U0001d466": "y", "\U0001d467": "z", } FUNC = { "sin": "\\sin({fe})", "cos": "\\cos({fe})", "tan": "\\tan({fe})", "arcsin": "\\arcsin({fe})", "arccos": "\\arccos({fe})", "arctan": "\\arctan({fe})", "arccot": "\\arccot({fe})", "sinh": "\\sinh({fe})", "cosh": "\\cosh({fe})", "tanh": "\\tanh({fe})", "coth": "\\coth({fe})", "sec": "\\sec({fe})", "csc": "\\csc({fe})", } FUNC_PLACE = "{fe}" BRK = "\\\\" CHR_DEFAULT = { "ACC_VAL": "\\hat{{{0}}}", } POS = { "top": "\\overline{{{0}}}", # not sure "bot": "\\underline{{{0}}}", } POS_DEFAULT = { "BAR_VAL": "\\overline{{{0}}}", } SUB = "_{{{0}}}" SUP = "^{{{0}}}" F = { "bar": "\\frac{{{num}}}{{{den}}}", "skw": r"^{{{num}}}/_{{{den}}}", "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", "lin": "{{{num}}}/{{{den}}}", } F_DEFAULT = "\\frac{{{num}}}{{{den}}}" D = "\\left{left}{text}\\right{right}" D_DEFAULT = { "left": "(", "right": ")", "null": ".", } RAD = "\\sqrt[{deg}]{{{text}}}" RAD_DEFAULT = "\\sqrt{{{text}}}" ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}" LIM_FUNC = { "lim": "\\lim_{{{lim}}}", "max": "\\max_{{{lim}}}", "min": "\\min_{{{lim}}}", } LIM_TO = ("\\rightarrow", "\\to") LIM_UPP = "\\overset{{{lim}}}{{{text}}}" M = "\\begin{{matrix}}{text}\\end{{matrix}}" ``` ## /packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py ```py 
path="/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py" # -*- coding: utf-8 -*- """ Office Math Markup Language (OMML) Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py On 25/03/2025 """ import xml.etree.ElementTree as ET from .latex_dict import ( CHARS, CHR, CHR_BO, CHR_DEFAULT, POS, POS_DEFAULT, SUB, SUP, F, F_DEFAULT, T, FUNC, D, D_DEFAULT, RAD, RAD_DEFAULT, ARR, LIM_FUNC, LIM_TO, LIM_UPP, M, BRK, BLANK, BACKSLASH, ALN, FUNC_PLACE, ) OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" def load(stream): tree = ET.parse(stream) for omath in tree.findall(OMML_NS + "oMath"): yield oMath2Latex(omath) def load_string(string): root = ET.fromstring(string) for omath in root.findall(OMML_NS + "oMath"): yield oMath2Latex(omath) def escape_latex(strs): last = None new_chr = [] strs = strs.replace(r"\\", "\\") for c in strs: if (c in CHARS) and (last != BACKSLASH): new_chr.append(BACKSLASH + c) else: new_chr.append(c) last = c return BLANK.join(new_chr) def get_val(key, default=None, store=CHR): if key is not None: return key if not store else store.get(key, key) else: return default class Tag2Method(object): def call_method(self, elm, stag=None): getmethod = self.tag2meth.get if stag is None: stag = elm.tag.replace(OMML_NS, "") method = getmethod(stag) if method: return method(self, elm) else: return None def process_children_list(self, elm, include=None): """ process children of the elm,return iterable """ for _e in list(elm): if OMML_NS not in _e.tag: continue stag = _e.tag.replace(OMML_NS, "") if include and (stag not in include): continue t = self.call_method(_e, stag=stag) if t is None: t = self.process_unknow(_e, stag) if t is None: continue yield (stag, t, _e) def process_children_dict(self, elm, include=None): """ process children of the elm,return dict """ latex_chars = dict() for stag, t, e in self.process_children_list(elm, include): latex_chars[stag] = t return latex_chars def process_children(self, elm, include=None): """ process children of the elm,return string """ return BLANK.join( ( t if not isinstance(t, Tag2Method) else str(t) for stag, t, e in self.process_children_list(elm, include) ) ) def process_unknow(self, elm, stag): return None class Pr(Tag2Method): text = "" __val_tags = ("chr", "pos", "begChr", "endChr", "type") __innerdict = None # can't use the __dict__ """ common properties of element""" def __init__(self, elm): self.__innerdict = {} self.text = self.process_children(elm) def __str__(self): return self.text def __unicode__(self): return self.__str__(self) def __getattr__(self, name): return self.__innerdict.get(name, None) def do_brk(self, elm): self.__innerdict["brk"] = BRK return BRK def do_common(self, elm): stag = elm.tag.replace(OMML_NS, "") if stag in self.__val_tags: t = elm.get("{0}val".format(OMML_NS)) self.__innerdict[stag] = t return None tag2meth = { "brk": do_brk, "chr": do_common, "pos": do_common, "begChr": do_common, "endChr": do_common, "type": do_common, } class oMath2Latex(Tag2Method): """ Convert oMath element of omml to latex """ _t_dict = T __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") def __init__(self, element): self._latex = self.process_children(element) def __str__(self): return self.latex def __unicode__(self): return self.__str__(self) def process_unknow(self, elm, stag): if stag in self.__direct_tags: return self.process_children(elm) elif stag[-2:] == "Pr": return Pr(elm) else: return None @property def latex(self): return self._latex def 
do_acc(self, elm): """ the accent function """ c_dict = self.process_children_dict(elm) latex_s = get_val( c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR ) return latex_s.format(c_dict["e"]) def do_bar(self, elm): """ the bar function """ c_dict = self.process_children_dict(elm) pr = c_dict["barPr"] latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) return pr.text + latex_s.format(c_dict["e"]) def do_d(self, elm): """ the delimiter object """ c_dict = self.process_children_dict(elm) pr = c_dict["dPr"] null = D_DEFAULT.get("null") s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) return pr.text + D.format( left=null if not s_val else escape_latex(s_val), text=c_dict["e"], right=null if not e_val else escape_latex(e_val), ) def do_spre(self, elm): """ the Pre-Sub-Superscript object -- not supported yet """ pass def do_sub(self, elm): text = self.process_children(elm) return SUB.format(text) def do_sup(self, elm): text = self.process_children(elm) return SUP.format(text) def do_f(self, elm): """ the fraction object """ c_dict = self.process_children_dict(elm) pr = c_dict["fPr"] latex_s = get_val(pr.type, default=F_DEFAULT, store=F) return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den")) def do_func(self, elm): """ the Function-Apply object (Examples: sin, cos) """ c_dict = self.process_children_dict(elm) func_name = c_dict.get("fName") return func_name.replace(FUNC_PLACE, c_dict.get("e")) def do_fname(self, elm): """ the function name """ latex_chars = [] for stag, t, e in self.process_children_list(elm): if stag == "r": if FUNC.get(t): latex_chars.append(FUNC[t]) else: raise NotImplementedError("Unsupported function %s" % t) else: latex_chars.append(t) t = BLANK.join(latex_chars) return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this def do_groupchr(self, elm): """ the Group-Character object """ c_dict = self.process_children_dict(elm) pr = c_dict["groupChrPr"] latex_s = get_val(pr.chr) return pr.text + latex_s.format(c_dict["e"]) def do_rad(self, elm): """ the radical object """ c_dict = self.process_children_dict(elm) text = c_dict.get("e") deg_text = c_dict.get("deg") if deg_text: return RAD.format(deg=deg_text, text=text) else: return RAD_DEFAULT.format(text=text) def do_eqarr(self, elm): """ the Array object """ return ARR.format( text=BRK.join( [t for stag, t, e in self.process_children_list(elm, include=("e",))] ) ) def do_limlow(self, elm): """ the Lower-Limit object """ t_dict = self.process_children_dict(elm, include=("e", "lim")) latex_s = LIM_FUNC.get(t_dict["e"]) if not latex_s: raise NotImplementedError("Unsupported lim %s" % t_dict["e"]) else: return latex_s.format(lim=t_dict.get("lim")) def do_limupp(self, elm): """ the Upper-Limit object """ t_dict = self.process_children_dict(elm, include=("e", "lim")) return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) def do_lim(self, elm): """ the lower limit of the limLow object and the upper limit of the limUpp function """ return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) def do_m(self, elm): """ the Matrix object """ rows = [] for stag, t, e in self.process_children_list(elm): if stag == "mPr": pass elif stag == "mr": rows.append(t) return M.format(text=BRK.join(rows)) def do_mr(self, elm): """ a single row of the matrix m """ return ALN.join( [t for stag, t, e in self.process_children_list(elm, include=("e",))] ) def do_nary(self, elm): """ the n-ary object
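(e.g., summations, products, and integrals; the operator glyph is looked up in CHR_BO via the naryPr/chr property)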
""" res = [] bo = "" for stag, t, e in self.process_children_list(elm): if stag == "naryPr": bo = get_val(t.chr, store=CHR_BO) else: res.append(t) return bo + BLANK.join(res) def do_r(self, elm): """ Get text from 'r' element,And try convert them to latex symbols @todo text style support , (sty) @todo \text (latex pure text support) """ _str = [] for s in elm.findtext("./{0}t".format(OMML_NS)): # s = s if isinstance(s,unicode) else unicode(s,'utf-8') _str.append(self._t_dict.get(s, s)) return escape_latex(BLANK.join(_str)) tag2meth = { "acc": do_acc, "r": do_r, "bar": do_bar, "sub": do_sub, "sup": do_sup, "f": do_f, "func": do_func, "fName": do_fname, "groupChr": do_groupchr, "d": do_d, "rad": do_rad, "eqArr": do_eqarr, "limLow": do_limlow, "limUpp": do_limupp, "lim": do_lim, "m": do_m, "mr": do_mr, "nary": do_nary, } ``` ## /packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py ```py path="/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py" import zipfile from io import BytesIO from typing import BinaryIO from xml.etree import ElementTree as ET from bs4 import BeautifulSoup, Tag from .math.omml import OMML_NS, oMath2Latex MATH_ROOT_TEMPLATE = "".join( ( "', "{0}", ) ) def _convert_omath_to_latex(tag: Tag) -> str: """ Converts an OMML (Office Math Markup Language) tag to LaTeX format. Args: tag (Tag): A BeautifulSoup Tag object representing the OMML element. Returns: str: The LaTeX representation of the OMML element. """ # Format the tag into a complete XML document string math_root = ET.fromstring(MATH_ROOT_TEMPLATE.format(str(tag))) # Find the 'oMath' element within the XML document math_element = math_root.find(OMML_NS + "oMath") # Convert the 'oMath' element to LaTeX using the oMath2Latex function latex = oMath2Latex(math_element).latex return latex def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag: """ Creates a replacement tag for an OMML (Office Math Markup Language) element. Args: tag (Tag): A BeautifulSoup Tag object representing the "oMath" element. block (bool, optional): If True, the LaTeX will be wrapped in double dollar signs for block mode. Defaults to False. Returns: Tag: A BeautifulSoup Tag object representing the replacement element. """ t_tag = Tag(name="w:t") t_tag.string = ( f"$${_convert_omath_to_latex(tag)}$$" if block else f"${_convert_omath_to_latex(tag)}$" ) r_tag = Tag(name="w:r") r_tag.append(t_tag) return r_tag def _replace_equations(tag: Tag): """ Replaces OMML (Office Math Markup Language) elements with their LaTeX equivalents. Args: tag (Tag): A BeautifulSoup Tag object representing the OMML element. Could be either "oMathPara" or "oMath". Raises: ValueError: If the tag is not supported. """ if tag.name == "oMathPara": # Create a new paragraph tag p_tag = Tag(name="w:p") # Replace each 'oMath' child tag with its LaTeX equivalent as block equations for child_tag in tag.find_all("oMath"): p_tag.append(_get_omath_tag_replacement(child_tag, block=True)) # Replace the original 'oMathPara' tag with the new paragraph tag tag.replace_with(p_tag) elif tag.name == "oMath": # Replace the 'oMath' tag with its LaTeX equivalent as inline equation tag.replace_with(_get_omath_tag_replacement(tag, block=False)) else: raise ValueError(f"Not supported tag: {tag.name}") def _pre_process_math(content: bytes) -> bytes: """ Pre-processes the math content in a DOCX -> XML file by converting OMML (Office Math Markup Language) elements to LaTeX. This preprocessed content can be directly replaced in the DOCX file -> XMLs. 
Args: content (bytes): The XML content of the DOCX file as bytes. Returns: bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes. """ soup = BeautifulSoup(content.decode(), features="xml") for tag in soup.find_all("oMathPara"): _replace_equations(tag) for tag in soup.find_all("oMath"): _replace_equations(tag) return str(soup).encode() def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: """ Pre-processes a DOCX file with provided steps. The process works by unzipping the DOCX file in memory, transforming specific XML files (such as converting OMML elements to LaTeX), and then zipping everything back into a DOCX file without writing to disk. Args: input_docx (BinaryIO): A binary input stream representing the DOCX file. Returns: BinaryIO: A binary output stream representing the processed DOCX file. """ output_docx = BytesIO() # The files that need to be pre-processed from .docx pre_process_enable_files = [ "word/document.xml", "word/footnotes.xml", "word/endnotes.xml", ] with zipfile.ZipFile(input_docx, mode="r") as zip_input: files = {name: zip_input.read(name) for name in zip_input.namelist()} with zipfile.ZipFile(output_docx, mode="w") as zip_output: zip_output.comment = zip_input.comment for name, content in files.items(): if name in pre_process_enable_files: try: # Pre-process the content updated_content = _pre_process_math(content) # In the future, if there are more pre-processing steps, they can be added here zip_output.writestr(name, updated_content) except: # If there is an error in processing the content, write the original content zip_output.writestr(name, content) else: zip_output.writestr(name, content) output_docx.seek(0) return output_docx ``` ## /packages/markitdown/src/markitdown/converters/__init__.py ```py path="/packages/markitdown/src/markitdown/converters/__init__.py" # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT from ._plain_text_converter import PlainTextConverter from ._html_converter import HtmlConverter from ._rss_converter import RssConverter from ._wikipedia_converter import WikipediaConverter from ._youtube_converter import YouTubeConverter from ._ipynb_converter import IpynbConverter from ._bing_serp_converter import BingSerpConverter from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import ( DocumentIntelligenceConverter, DocumentIntelligenceFileType, ) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter __all__ = [ "PlainTextConverter", "HtmlConverter", "RssConverter", "WikipediaConverter", "YouTubeConverter", "IpynbConverter", "BingSerpConverter", "PdfConverter", "DocxConverter", "XlsxConverter", "XlsConverter", "PptxConverter", "ImageConverter", "AudioConverter", "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", ] ``` ## /packages/markitdown/src/markitdown/converters/_audio_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_audio_converter.py" import io from typing import Any, BinaryIO, Optional from ._exiftool import exiftool_metadata from ._transcribe_audio import transcribe_audio from 
.._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException ACCEPTED_MIME_TYPE_PREFIXES = [ "audio/x-wav", "audio/mpeg", "video/mp4", ] ACCEPTED_FILE_EXTENSIONS = [ ".wav", ".mp3", ".m4a", ".mp4", ] class AudioConverter(DocumentConverter): """ Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: md_content = "" # Add metadata metadata = exiftool_metadata( file_stream, exiftool_path=kwargs.get("exiftool_path") ) if metadata: for f in [ "Title", "Artist", "Author", "Band", "Album", "Genre", "Track", "DateTimeOriginal", "CreateDate", # "Duration", -- Wrong values when read from memory "NumChannels", "SampleRate", "AvgBytesPerSec", "BitsPerSample", ]: if f in metadata: md_content += f"{f}: {metadata[f]}\n" # Figure out the audio format for transcription if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav": audio_format = "wav" elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg": audio_format = "mp3" elif ( stream_info.extension in [".mp4", ".m4a"] or stream_info.mimetype == "video/mp4" ): audio_format = "mp4" else: audio_format = None # Transcribe if audio_format: try: transcript = transcribe_audio(file_stream, audio_format=audio_format) if transcript: md_content += "\n\n### Audio Transcript:\n" + transcript except MissingDependencyException: pass # Return the result return DocumentConverterResult(markdown=md_content.strip()) ``` ## /packages/markitdown/src/markitdown/converters/_bing_serp_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py" import io import re import base64 import binascii from urllib.parse import parse_qs, urlparse from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify ACCEPTED_MIME_TYPE_PREFIXES = [ "text/html", "application/xhtml", ] ACCEPTED_FILE_EXTENSIONS = [ ".html", ".htm", ] class BingSerpConverter(DocumentConverter): """ Handle Bing results pages (only the organic search results). NOTE: It is better to use the Bing API """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: """ Make sure we're dealing with HTML content *from* Bing. 
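For example (illustrative), a URL such as https://www.bing.com/search?q=markitdown passes the URL check below, provided the content is also HTML.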
""" url = stream_info.url or "" mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if not re.search(r"^https://www\.bing\.com/search\?q=", url): # Not a Bing SERP URL return False if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True # Not HTML content return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: assert stream_info.url is not None # Parse the query parameters parsed_params = parse_qs(urlparse(stream_info.url).query) query = parsed_params.get("q", [""])[0] # Parse the stream encoding = "utf-8" if stream_info.charset is None else stream_info.charset soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Clean up some formatting for tptt in soup.find_all(class_="tptt"): if hasattr(tptt, "string") and tptt.string: tptt.string += " " for slug in soup.find_all(class_="algoSlug_icon"): slug.extract() # Parse the algorithmic results _markdownify = _CustomMarkdownify(**kwargs) results = list() for result in soup.find_all(class_="b_algo"): if not hasattr(result, "find_all"): continue # Rewrite redirect urls for a in result.find_all("a", href=True): parsed_href = urlparse(a["href"]) qs = parse_qs(parsed_href.query) # The destination is contained in the u parameter, # but appears to be base64 encoded, with some prefix if "u" in qs: u = ( qs["u"][0][2:].strip() + "==" ) # Python 3 doesn't care about extra padding try: # RFC 4648 / Base64URL" variant, which uses "-" and "_" a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8") except UnicodeDecodeError: pass except binascii.Error: pass # Convert to markdown md_result = _markdownify.convert_soup(result).strip() lines = [line.strip() for line in re.split(r"\n+", md_result)] results.append("\n".join([line for line in lines if len(line) > 0])) webpage_text = ( f"## A Bing search for '{query}' found the following results:\n\n" + "\n\n".join(results) ) return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, ) ``` ## /packages/markitdown/src/markitdown/converters/_csv_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_csv_converter.py" import sys import csv import io from typing import BinaryIO, Any from charset_normalizer import from_bytes from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ "text/csv", "application/csv", ] ACCEPTED_FILE_EXTENSIONS = [".csv"] class CsvConverter(DocumentConverter): """ Converts CSV files to Markdown tables. 
""" def __init__(self): super().__init__() def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Read the file content if stream_info.charset: content = file_stream.read().decode(stream_info.charset) else: content = str(from_bytes(file_stream.read()).best()) # Parse CSV content reader = csv.reader(io.StringIO(content)) rows = list(reader) if not rows: return DocumentConverterResult(markdown="") # Create markdown table markdown_table = [] # Add header row markdown_table.append("| " + " | ".join(rows[0]) + " |") # Add separator row markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |") # Add data rows for row in rows[1:]: # Make sure row has the same number of columns as header while len(row) < len(rows[0]): row.append("") # Truncate if row has more columns than header row = row[: len(rows[0])] markdown_table.append("| " + " | ".join(row) + " |") result = "\n".join(markdown_table) return DocumentConverterResult(markdown=result) ``` ## /packages/markitdown/src/markitdown/converters/_doc_intel_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py" import sys import re import os from typing import BinaryIO, Any, List, Optional, Union from enum import Enum from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None try: from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import ( AnalyzeDocumentRequest, AnalyzeResult, DocumentAnalysisFeature, ) from azure.core.credentials import AzureKeyCredential, TokenCredential from azure.identity import DefaultAzureCredential except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() # Define these types for type hinting when the package is not available class AzureKeyCredential: pass class TokenCredential: pass class DocumentIntelligenceClient: pass class AnalyzeDocumentRequest: pass class AnalyzeResult: pass class DocumentAnalysisFeature: pass class DefaultAzureCredential: pass # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # This constant is a temporary fix until the bug is resolved. 
CONTENT_FORMAT = "markdown" class DocumentIntelligenceFileType(str, Enum): """Enum of file types supported by the Document Intelligence Converter.""" # No OCR DOCX = "docx" PPTX = "pptx" XLSX = "xlsx" HTML = "html" # OCR PDF = "pdf" JPEG = "jpeg" PNG = "png" BMP = "bmp" TIFF = "tiff" def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]: """Get the MIME type prefixes for the given file types.""" prefixes: List[str] = [] for type_ in types: if type_ == DocumentIntelligenceFileType.DOCX: prefixes.append( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) elif type_ == DocumentIntelligenceFileType.PPTX: prefixes.append( "application/vnd.openxmlformats-officedocument.presentationml" ) elif type_ == DocumentIntelligenceFileType.XLSX: prefixes.append( "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) elif type_ == DocumentIntelligenceFileType.PDF: prefixes.append("application/pdf") prefixes.append("application/x-pdf") elif type_ == DocumentIntelligenceFileType.JPEG: prefixes.append("image/jpeg") elif type_ == DocumentIntelligenceFileType.PNG: prefixes.append("image/png") elif type_ == DocumentIntelligenceFileType.BMP: prefixes.append("image/bmp") elif type_ == DocumentIntelligenceFileType.TIFF: prefixes.append("image/tiff") return prefixes def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]: """Get the file extensions for the given file types.""" extensions: List[str] = [] for type_ in types: if type_ == DocumentIntelligenceFileType.DOCX: extensions.append(".docx") elif type_ == DocumentIntelligenceFileType.PPTX: extensions.append(".pptx") elif type_ == DocumentIntelligenceFileType.XLSX: extensions.append(".xlsx") elif type_ == DocumentIntelligenceFileType.PDF: extensions.append(".pdf") elif type_ == DocumentIntelligenceFileType.JPEG: extensions.append(".jpg") extensions.append(".jpeg") elif type_ == DocumentIntelligenceFileType.PNG: extensions.append(".png") elif type_ == DocumentIntelligenceFileType.BMP: extensions.append(".bmp") elif type_ == DocumentIntelligenceFileType.TIFF: extensions.append(".tiff") return extensions class DocumentIntelligenceConverter(DocumentConverter): """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" def __init__( self, *, endpoint: str, api_version: str = "2024-07-31-preview", credential: AzureKeyCredential | TokenCredential | None = None, file_types: List[DocumentIntelligenceFileType] = [ DocumentIntelligenceFileType.DOCX, DocumentIntelligenceFileType.PPTX, DocumentIntelligenceFileType.XLSX, DocumentIntelligenceFileType.PDF, DocumentIntelligenceFileType.JPEG, DocumentIntelligenceFileType.PNG, DocumentIntelligenceFileType.BMP, DocumentIntelligenceFileType.TIFF, ], ): """ Initialize the DocumentIntelligenceConverter. Args: endpoint (str): The endpoint for the Document Intelligence service. api_version (str): The API version to use. Defaults to "2024-07-31-preview". credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types. """ super().__init__() self._file_types = file_types # Raise an error if the dependencies are not available. # This is different than other converters since this one isn't even instantiated # unless explicitly requested. 
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        if credential is None:
            if os.environ.get("AZURE_API_KEY") is None:
                credential = DefaultAzureCredential()
            else:
                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])

        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=credential,
        )

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in _get_file_extensions(self._file_types):
            return True

        for prefix in _get_mime_type_prefixes(self._file_types):
            if mimetype.startswith(prefix):
                return True

        return False

    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
        """
        Helper needed to determine which analysis features to use.
        Certain document analysis features are not available for office filetypes
        (.xlsx, .pptx, .html, .docx).
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Types that don't support OCR
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]

        if extension in _get_file_extensions(no_ocr_types):
            return []

        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []

        return [
            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
        ]

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
            features=self._analysis_features(stream_info),
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()

        # Remove HTML comments from the markdown content generated by Doc Intelligence
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(markdown=markdown_text)
```

## /packages/markitdown/src/markitdown/converters/_docx_converter.py

```py path="/packages/markitdown/src/markitdown/converters/_docx_converter.py"
import sys

from typing import BinaryIO, Any

from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import mammoth
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()


ACCEPTED_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", ] ACCEPTED_FILE_EXTENSIONS = [".docx"] class DocxConverter(HtmlConverter): """ Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ def __init__(self): super().__init__() self._html_converter = HtmlConverter() def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( converter=type(self).__name__, extension=".docx", feature="docx", ) ) from _dependency_exc_info[ 1 ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] ) style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, **kwargs, ) ``` ## /packages/markitdown/src/markitdown/converters/_epub_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_epub_converter.py" import os import zipfile import xml.dom.minidom as minidom from typing import BinaryIO, Any, Dict, List from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ "application/epub", "application/epub+zip", "application/x-epub+zip", ] ACCEPTED_FILE_EXTENSIONS = [".epub"] MIME_TYPE_MAPPING = { ".html": "text/html", ".xhtml": "application/xhtml+xml", } class EpubConverter(HtmlConverter): """ Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
""" def __init__(self): super().__init__() self._html_converter = HtmlConverter() def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: with zipfile.ZipFile(file_stream, "r") as z: # Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file.""" # Locate content.opf container_dom = minidom.parse(z.open("META-INF/container.xml")) opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute( "full-path" ) # Parse content.opf opf_dom = minidom.parse(z.open(opf_path)) metadata: Dict[str, Any] = { "title": self._get_text_from_node(opf_dom, "dc:title"), "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"), "language": self._get_text_from_node(opf_dom, "dc:language"), "publisher": self._get_text_from_node(opf_dom, "dc:publisher"), "date": self._get_text_from_node(opf_dom, "dc:date"), "description": self._get_text_from_node(opf_dom, "dc:description"), "identifier": self._get_text_from_node(opf_dom, "dc:identifier"), } # Extract manifest items (ID → href mapping) manifest = { item.getAttribute("id"): item.getAttribute("href") for item in opf_dom.getElementsByTagName("item") } # Extract spine order (ID refs) spine_items = opf_dom.getElementsByTagName("itemref") spine_order = [item.getAttribute("idref") for item in spine_items] # Convert spine order to actual file paths base_path = "/".join( opf_path.split("/")[:-1] ) # Get base directory of content.opf spine = [ f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id] for item_id in spine_order if item_id in manifest ] # Extract and convert the content markdown_content: List[str] = [] for file in spine: if file in z.namelist(): with z.open(file) as f: filename = os.path.basename(file) extension = os.path.splitext(filename)[1].lower() mimetype = MIME_TYPE_MAPPING.get(extension) converted_content = self._html_converter.convert( f, StreamInfo( mimetype=mimetype, extension=extension, filename=filename, ), ) markdown_content.append(converted_content.markdown.strip()) # Format and add the metadata metadata_markdown = [] for key, value in metadata.items(): if isinstance(value, list): value = ", ".join(value) if value: metadata_markdown.append(f"**{key.capitalize()}:** {value}") markdown_content.insert(0, "\n".join(metadata_markdown)) return DocumentConverterResult( markdown="\n\n".join(markdown_content), title=metadata["title"] ) def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None: """Convenience function to extract a single occurrence of a tag (e.g., title).""" texts = self._get_all_texts_from_nodes(dom, tag_name) if len(texts) > 0: return texts[0] else: return None def _get_all_texts_from_nodes( self, dom: minidom.Document, tag_name: str ) -> List[str]: """Helper function to extract all occurrences of a tag (e.g., multiple authors).""" texts: List[str] = [] for node in dom.getElementsByTagName(tag_name): if node.firstChild and hasattr(node.firstChild, "nodeValue"): texts.append(node.firstChild.nodeValue.strip()) return texts ``` ## 
/packages/markitdown/src/markitdown/converters/_exiftool.py ```py path="/packages/markitdown/src/markitdown/converters/_exiftool.py" import json import subprocess import locale import sys import shutil import os import warnings from typing import BinaryIO, Any, Union def exiftool_metadata( file_stream: BinaryIO, *, exiftool_path: Union[str, None], ) -> Any: # Need a better type for json data # Nothing to do if not exiftool_path: return {} # Run exiftool cur_pos = file_stream.tell() try: output = subprocess.run( [exiftool_path, "-json", "-"], input=file_stream.read(), capture_output=True, text=False, ).stdout return json.loads( output.decode(locale.getpreferredencoding(False)), )[0] finally: file_stream.seek(cur_pos) ``` ## /packages/markitdown/src/markitdown/converters/_html_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_html_converter.py" import io from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from ._markdownify import _CustomMarkdownify ACCEPTED_MIME_TYPE_PREFIXES = [ "text/html", "application/xhtml", ] ACCEPTED_FILE_EXTENSIONS = [ ".html", ".htm", ] class HtmlConverter(DocumentConverter): """Anything with content type text/html""" def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Parse the stream encoding = "utf-8" if stream_info.charset is None else stream_info.charset soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) # Remove javascript and style blocks for script in soup(["script", "style"]): script.extract() # Print only the main content body_elm = soup.find("body") webpage_text = "" if body_elm: webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) else: webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) assert isinstance(webpage_text, str) # remove leading and trailing \n webpage_text = webpage_text.strip() return DocumentConverterResult( markdown=webpage_text, title=None if soup.title is None else soup.title.string, ) def convert_string( self, html_content: str, *, url: Optional[str] = None, **kwargs ) -> DocumentConverterResult: """ Non-standard convenience method to convert a string to markdown. Given that many converters produce HTML as intermediate output, this allows for easy conversion of HTML to markdown. 
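
        Illustrative example (assumed output, not an exact transcript):

            result = HtmlConverter().convert_string("<h1>Hello</h1><p>World</p>")
            # result.markdown is roughly "# Hello\n\nWorld"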
""" return self.convert( file_stream=io.BytesIO(html_content.encode("utf-8")), stream_info=StreamInfo( mimetype="text/html", extension=".html", charset="utf-8", url=url, ), **kwargs, ) ``` ## /packages/markitdown/src/markitdown/converters/_image_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_image_converter.py" from typing import BinaryIO, Any, Union import base64 import mimetypes from ._exiftool import exiftool_metadata from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo ACCEPTED_MIME_TYPE_PREFIXES = [ "image/jpeg", "image/png", ] ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"] class ImageConverter(DocumentConverter): """ Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured). """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: md_content = "" # Add metadata metadata = exiftool_metadata( file_stream, exiftool_path=kwargs.get("exiftool_path") ) if metadata: for f in [ "ImageSize", "Title", "Caption", "Description", "Keywords", "Artist", "Author", "DateTimeOriginal", "CreateDate", "GPSPosition", ]: if f in metadata: md_content += f"{f}: {metadata[f]}\n" # Try describing the image with GPT llm_client = kwargs.get("llm_client") llm_model = kwargs.get("llm_model") if llm_client is not None and llm_model is not None: llm_description = self._get_llm_description( file_stream, stream_info, client=llm_client, model=llm_model, prompt=kwargs.get("llm_prompt"), ) if llm_description is not None: md_content += "\n# Description:\n" + llm_description.strip() + "\n" return DocumentConverterResult( markdown=md_content, ) def _get_llm_description( self, file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None, ) -> Union[None, str]: if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." 
# Get the content type content_type = stream_info.mimetype if not content_type: content_type, _ = mimetypes.guess_type( "_dummy" + (stream_info.extension or "") ) if not content_type: content_type = "application/octet-stream" # Convert to base64 cur_pos = file_stream.tell() try: base64_image = base64.b64encode(file_stream.read()).decode("utf-8") except Exception as e: return None finally: file_stream.seek(cur_pos) # Prepare the data-uri data_uri = f"data:{content_type};base64,{base64_image}" # Prepare the OpenAI API request messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": { "url": data_uri, }, }, ], } ] # Call the OpenAI API response = client.chat.completions.create(model=model, messages=messages) return response.choices[0].message.content ``` ## /packages/markitdown/src/markitdown/converters/_ipynb_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_ipynb_converter.py" from typing import BinaryIO, Any import json from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import FileConversionException from .._stream_info import StreamInfo CANDIDATE_MIME_TYPE_PREFIXES = [ "application/json", ] ACCEPTED_FILE_EXTENSIONS = [".ipynb"] class IpynbConverter(DocumentConverter): """Converts Jupyter Notebook (.ipynb) files to Markdown.""" def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in CANDIDATE_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): # Read further to see if it's a notebook cur_pos = file_stream.tell() try: encoding = stream_info.charset or "utf-8" notebook_content = file_stream.read().decode(encoding) return ( "nbformat" in notebook_content and "nbformat_minor" in notebook_content ) finally: file_stream.seek(cur_pos) return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Parse and convert the notebook result = None encoding = stream_info.charset or "utf-8" notebook_content = file_stream.read().decode(encoding=encoding) return self._convert(json.loads(notebook_content)) def _convert(self, notebook_content: dict) -> DocumentConverterResult: """Helper function that converts notebook JSON content to Markdown.""" try: md_output = [] title = None for cell in notebook_content.get("cells", []): cell_type = cell.get("cell_type", "") source_lines = cell.get("source", []) if cell_type == "markdown": md_output.append("".join(source_lines)) # Extract the first # heading as title if not already found if title is None: for line in source_lines: if line.startswith("# "): title = line.lstrip("# ").strip() break elif cell_type == "code": # Code cells are wrapped in Markdown code blocks md_output.append(f"\`\`\`python\n{''.join(source_lines)}\n\`\`\`") elif cell_type == "raw": md_output.append(f"\`\`\`\n{''.join(source_lines)}\n\`\`\`") md_text = "\n\n".join(md_output) # Check for title in notebook metadata title = notebook_content.get("metadata", {}).get("title", title) return DocumentConverterResult( markdown=md_text, title=title, ) except Exception as e: raise FileConversionException( f"Error converting .ipynb file: {str(e)}" ) from e ``` ## /packages/markitdown/src/markitdown/converters/_llm_caption.py ```py 
path="/packages/markitdown/src/markitdown/converters/_llm_caption.py" from typing import BinaryIO, Any, Union import base64 import mimetypes from .._stream_info import StreamInfo def llm_caption( file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None ) -> Union[None, str]: if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." # Get the content type content_type = stream_info.mimetype if not content_type: content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or "")) if not content_type: content_type = "application/octet-stream" # Convert to base64 cur_pos = file_stream.tell() try: base64_image = base64.b64encode(file_stream.read()).decode("utf-8") except Exception as e: return None finally: file_stream.seek(cur_pos) # Prepare the data-uri data_uri = f"data:{content_type};base64,{base64_image}" # Prepare the OpenAI API request messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": { "url": data_uri, }, }, ], } ] # Call the OpenAI API response = client.chat.completions.create(model=model, messages=messages) return response.choices[0].message.content ``` ## /packages/markitdown/src/markitdown/converters/_markdownify.py ```py path="/packages/markitdown/src/markitdown/converters/_markdownify.py" import re import markdownify from typing import Any, Optional from urllib.parse import quote, unquote, urlparse, urlunparse class _CustomMarkdownify(markdownify.MarkdownConverter): """ A custom version of markdownify's MarkdownConverter. Changes include: - Altering the default heading style to use '#', '##', etc. - Removing javascript hyperlinks. - Truncating images with large data:uri sources. - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax """ def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if necessary super().__init__(**options) def convert_hn( self, n: int, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs, ) -> str: """Same as usual, but be sure to start with a new line""" if not convert_as_inline: if not re.search(r"^\n", text): return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore def convert_a( self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs, ): """Same as usual converter, but removes Javascript links and escapes URIs.""" prefix, suffix, text = markdownify.chomp(text) # type: ignore if not text: return "" if el.find_parent("pre") is not None: return text href = el.get("href") title = el.get("title") # Escape URIs and skip non-http or file schemes if href: try: parsed_url = urlparse(href) # type: ignore if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore return "%s%s%s" % (prefix, text, suffix) href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) # For the replacement see #29: text nodes underscores are escaped if ( self.options["autolinks"] and text.replace(r"\_", "_") == href and not title and not self.options["default_title"] ): # Shortcut syntax return "<%s>" % href if self.options["default_title"] and not title: title 
= href title_part = ' "%s"' % title.replace('"', r"\"") if title else "" return ( "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text ) def convert_img( self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs, ) -> str: """Same as usual converter, but removes data URIs""" alt = el.attrs.get("alt", None) or "" src = el.attrs.get("src", None) or "" title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" if ( convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"] ): return alt # Remove dataURIs if src.startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." return "![%s](%s%s)" % (alt, src, title_part) def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore ``` ## /packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py" import sys from typing import Any, Union, BinaryIO from .._stream_info import StreamInfo from .._base_converter import DocumentConverter, DocumentConverterResult from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None olefile = None try: import olefile # type: ignore[no-redef] except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() ACCEPTED_MIME_TYPE_PREFIXES = [ "application/vnd.ms-outlook", ] ACCEPTED_FILE_EXTENSIONS = [".msg"] class OutlookMsgConverter(DocumentConverter): """Converts Outlook .msg files to markdown by extracting email metadata and content. 
    Uses the olefile package to parse the .msg file structure and extract:
    - Email headers (From, To, Subject)
    - Email body content
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Check the extension and mimetype
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        # Brute force, check if we have an OLE file
        cur_pos = file_stream.tell()
        try:
            if olefile and not olefile.isOleFile(file_stream):
                return False
        finally:
            file_stream.seek(cur_pos)

        # Brute force, check if it's an Outlook file
        try:
            if olefile is not None:
                msg = olefile.OleFileIO(file_stream)
                toc = "\n".join([str(stream) for stream in msg.listdir()])
                return (
                    "__properties_version1.0" in toc
                    and "__recip_version1.0_#00000000" in toc
                )
        except Exception:
            pass
        finally:
            file_stream.seek(cur_pos)

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".msg",
                    feature="outlook",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        assert (
            olefile is not None
        )  # If we made it this far, olefile should be available

        msg = olefile.OleFileIO(file_stream)

        # Extract email metadata
        md_content = "# Email Message\n\n"

        # Get headers
        headers = {
            "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
            "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
            "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
        }

        # Add headers to markdown
        for key, value in headers.items():
            if value:
                md_content += f"**{key}:** {value}\n"

        md_content += "\n## Content\n\n"

        # Get email body
        body = self._get_stream_data(msg, "__substg1.0_1000001F")
        if body:
            md_content += body

        msg.close()

        return DocumentConverterResult(
            markdown=md_content.strip(),
            title=headers.get("Subject"),
        )

    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file."""
        assert olefile is not None
        assert isinstance(
            msg, olefile.OleFileIO
        )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)

        try:
            if msg.exists(stream_path):
                data = msg.openstream(stream_path).read()
                # Try UTF-16 first (common for .msg files)
                try:
                    return data.decode("utf-16-le").strip()
                except UnicodeDecodeError:
                    # Fall back to UTF-8
                    try:
                        return data.decode("utf-8").strip()
                    except UnicodeDecodeError:
                        # Last resort - ignore errors
                        return data.decode("utf-8", errors="ignore").strip()
        except Exception:
            pass
        return None
```

## /packages/markitdown/src/markitdown/converters/_pdf_converter.py

```py path="/packages/markitdown/src/markitdown/converters/_pdf_converter.py"
import sys
import io

from typing import BinaryIO, Any

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import pdfminer
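    # pdfminer's high-level API (used by convert() below) performs layout
    # analysis and text extraction in a single call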
import pdfminer.high_level except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() ACCEPTED_MIME_TYPE_PREFIXES = [ "application/pdf", "application/x-pdf", ] ACCEPTED_FILE_EXTENSIONS = [".pdf"] class PdfConverter(DocumentConverter): """ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( converter=type(self).__name__, extension=".pdf", feature="pdf", ) ) from _dependency_exc_info[ 1 ].with_traceback( # type: ignore[union-attr] _dependency_exc_info[2] ) assert isinstance(file_stream, io.IOBase) # for mypy return DocumentConverterResult( markdown=pdfminer.high_level.extract_text(file_stream), ) ``` ## /packages/markitdown/src/markitdown/converters/_plain_text_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_plain_text_converter.py" import sys from typing import BinaryIO, Any from charset_normalizer import from_bytes from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo # Try loading optional (but in this case, required) dependencies # Save reporting of any exceptions for later _dependency_exc_info = None try: import mammoth except ImportError: # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() ACCEPTED_MIME_TYPE_PREFIXES = [ "text/", "application/json", "application/markdown", ] ACCEPTED_FILE_EXTENSIONS = [ ".txt", ".text", ".md", ".markdown", ".json", ".jsonl", ] class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() # If we have a charset, we can safely assume it's text # With Magika in the earlier stages, this handles most cases if stream_info.charset is not None: return True # Otherwise, check the mimetype and extension if extension in ACCEPTED_FILE_EXTENSIONS: return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix): return True return False def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: if stream_info.charset: text_content = file_stream.read().decode(stream_info.charset) else: text_content = str(from_bytes(file_stream.read()).best()) return DocumentConverterResult(markdown=text_content) ``` ## /packages/markitdown/src/markitdown/converters/_pptx_converter.py ```py path="/packages/markitdown/src/markitdown/converters/_pptx_converter.py" import sys import base64 import os import io import re import html from typing import BinaryIO, Any from operator import attrgetter from ._html_converter import HtmlConverter from ._llm_caption import 
llm_caption
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
    import pptx
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()


ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.presentationml",
]

ACCEPTED_FILE_EXTENSIONS = [".pptx"]


class PptxConverter(DocumentConverter):
    """
    Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        # Check the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pptx",
                    feature="pptx",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion
        presentation = pptx.Presentation(file_stream)
        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1
            # Mark each slide with an HTML comment so slide boundaries survive conversion
            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title = slide.shapes.title

            def get_shape_content(shape, **kwargs):
                nonlocal md_content
                # Pictures
                if self._is_picture(shape):
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                    llm_description = ""
                    alt_text = ""

                    # Potentially generate a description using an LLM
                    llm_client = kwargs.get("llm_client")
                    llm_model = kwargs.get("llm_model")
                    if llm_client is not None and llm_model is not None:
                        # Prepare a file_stream and stream_info for the image data
                        image_filename = shape.image.filename
                        image_extension = None
                        if image_filename:
                            image_extension = os.path.splitext(image_filename)[1]
                        image_stream_info = StreamInfo(
                            mimetype=shape.image.content_type,
                            extension=image_extension,
                            filename=image_filename,
                        )
                        image_stream = io.BytesIO(shape.image.blob)

                        # Caption the image
                        try:
                            llm_description = llm_caption(
                                image_stream,
                                image_stream_info,
                                client=llm_client,
                                model=llm_model,
                                prompt=kwargs.get("llm_prompt"),
                            )
                        except Exception:
                            # Unable to generate a description
                            pass

                    # Also grab any description embedded in the deck
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                    except Exception:
                        # Unable to get alt text
                        pass

                    # Prepare the alt, escaping any special characters
                    alt_text = "\n".join([llm_description, alt_text]) or shape.name
                    alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                    alt_text = re.sub(r"\s+", " ", alt_text).strip()

                    # If keep_data_uris is True, use base64 encoding for images
                    if kwargs.get("keep_data_uris", False):
                        blob = shape.image.blob
                        content_type = shape.image.content_type or "image/png"
                        b64_string = base64.b64encode(blob).decode("utf-8")
                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
                    else:
                        # A placeholder name
                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
                        md_content += "\n![" + alt_text + "](" + filename + ")\n"

                # Tables
                if self._is_table(shape):
                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)

                # Charts
                if shape.has_chart:
                    md_content += self._convert_chart_to_markdown(shape.chart)

                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

                # Group Shapes
                if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
                    sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
                    for subshape in sorted_shapes:
                        get_shape_content(subshape, **kwargs)

            sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
            for shape in sorted_shapes:
                get_shape_content(shape, **kwargs)

            md_content = md_content.strip()

            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(markdown=md_content.strip())

    def _is_picture(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def _is_table(self, shape):
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
            return True
        return False

    def _convert_table_to_markdown(self, table, **kwargs):
        # Write the table as HTML, then convert it to Markdown
        html_table = "<html><body><table>"
        first_row = True
        for row in table.rows:
            html_table += "<tr>"
            for cell in row.cells:
                if first_row:
                    html_table += "<th>" + html.escape(cell.text) + "</th>"
                else:
                    html_table += "<td>" + html.escape(cell.text) + "</td>"
            html_table += "</tr>"
            first_row = False
        html_table += "</table></body></html>"

        return (
            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
            + "\n"
        )

    def _convert_chart_to_markdown(self, chart):
        try:
            md = "\n\n### Chart"
            if chart.has_title:
                md += f": {chart.chart_title.text_frame.text}"
            md += "\n\n"
            data = []
            category_names = [c.label for c in chart.plots[0].categories]
            series_names = [s.name for s in chart.series]
            data.append(["Category"] + series_names)

            for idx, category in enumerate(category_names):
                row = [category]
                for series in chart.series:
                    row.append(series.values[idx])
                data.append(row)

            markdown_table = []
            for row in data:
                markdown_table.append("| " + " | ".join(map(str, row)) + " |")
            header = markdown_table[0]
            separator = "|" + "|".join(["---"] * len(data[0])) + "|"
            return md + "\n".join([header, separator] + markdown_table[1:])
        except ValueError as e:
            # Handle the specific error for unsupported chart types
            if "unsupported plot type" in str(e):
                return "\n\n[unsupported chart]\n\n"
        except Exception:
            # Catch any other exceptions that might occur
            return "\n\n[unsupported chart]\n\n"
```