Compare commits

..

1 Commit

Author SHA1 Message Date
dependabot[bot] 3d3da11ffe Bump actions/setup-python from 5 to 6
Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5 to 6.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/v5...v6)

---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-version: '6'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-08 15:37:33 +00:00
7 changed files with 16 additions and 51 deletions
+1 -1
View File
@@ -7,7 +7,7 @@ jobs:
steps:
- uses: actions/checkout@v5
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: "3.x"
+1 -1
View File
@@ -6,7 +6,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/setup-python@v5
- uses: actions/setup-python@v6
with:
python-version: |
3.10
+3 -3
View File
@@ -36,12 +36,12 @@ dependencies = [
[project.optional-dependencies]
all = [
"python-pptx",
"mammoth~=1.11.0",
"mammoth~=1.10.0",
"pandas",
"openpyxl",
"xlrd",
"lxml",
"pdfminer.six>=20251107",
"pdfminer.six",
"olefile",
"pydub",
"SpeechRecognition",
@@ -50,7 +50,7 @@ all = [
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth~=1.11.0", "lxml"]
docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.4"
__version__ = "0.1.3"
@@ -15,6 +15,13 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
_dependency_exc_info = None
try:
import mammoth
import mammoth.docx.files
def mammoth_files_open(self, uri):
    """Stand-in for ``mammoth.docx.files.Files.open`` that never reads ``uri``.

    Emits a warning that r:link processing is disabled and hands back an
    empty in-memory byte stream in place of the linked resource.
    """
    # The requested URI is deliberately ignored; only the warning is emitted.
    warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
    empty_stream = io.BytesIO(b"")
    return empty_stream
mammoth.docx.files.Files.open = mammoth_files_open
except ImportError:
# Preserve the error and stack trace for later
Binary file not shown.
+3 -45
View File
@@ -288,47 +288,6 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content
def test_doc_rlink() -> None:
    """Regression test for CVE-2025-11849 (DOCX r:link resources).

    Writes a marker file at ``<root>/tmp/test_rlink.txt``, converts
    ``rlink.docx`` with ``keep_data_uris=True`` and asserts that the base64
    encoding of the marker does not appear in the converted text.
    Presumably ``rlink.docx`` links to that marker path; confirm against
    the fixture.

    The test is skipped when the ``tmp`` directory is missing.

    Raises:
        ValueError: if the marker file already exists with other content.
    """
    # Test for: CVE-2025-11849
    markitdown = MarkItDown()

    # Document with rlink
    docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")

    # Directory containing the target rlink file
    rlink_tmp_dir = os.path.abspath(os.sep + "tmp")

    # Ensure the tmp directory exists. pytest.skip() raises, so no explicit
    # return is needed after it.
    if not os.path.isdir(rlink_tmp_dir):
        pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")

    rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
    rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
    b64_prefix = (
        "ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk"  # base64 prefix of rlink_content
    )

    if os.path.exists(rlink_file_path):
        # Reuse an existing marker only if it holds exactly the expected
        # content; otherwise refuse to touch a file this test does not own.
        with open(rlink_file_path, "r", encoding="utf-8") as f:
            existing_content = f.read()
        if existing_content != rlink_content:
            raise ValueError(
                f"Existing {rlink_file_path} content does not match expected content."
            )
    else:
        with open(rlink_file_path, "w", encoding="utf-8") as f:
            f.write(rlink_content)

    try:
        result = markitdown.convert(docx_file, keep_data_uris=True).text_content
        assert (
            b64_prefix not in result
        )  # Make sure the target file was NOT embedded in the output
    finally:
        # Always clean up the marker, whether it was created here or reused.
        os.remove(rlink_file_path)
@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
@@ -342,9 +301,9 @@ def test_markitdown_remote() -> None:
assert test_string in result.text_content
# Youtube
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content
result = markitdown.convert(YOUTUBE_TEST_URL)
for test_string in YOUTUBE_TEST_STRINGS:
assert test_string in result.text_content
@pytest.mark.skipif(
@@ -493,7 +452,6 @@ if __name__ == "__main__":
test_markitdown_remote,
test_speech_transcription,
test_exceptions,
test_doc_rlink,
test_markitdown_exiftool,
test_markitdown_llm_parameters,
test_markitdown_llm,