Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3d3da11ffe |
@@ -7,7 +7,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v5
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: "3.x"
|
python-version: "3.x"
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v5
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: |
|
python-version: |
|
||||||
3.10
|
3.10
|
||||||
|
|||||||
@@ -36,12 +36,12 @@ dependencies = [
|
|||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
all = [
|
all = [
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"mammoth~=1.11.0",
|
"mammoth~=1.10.0",
|
||||||
"pandas",
|
"pandas",
|
||||||
"openpyxl",
|
"openpyxl",
|
||||||
"xlrd",
|
"xlrd",
|
||||||
"lxml",
|
"lxml",
|
||||||
"pdfminer.six>=20251107",
|
"pdfminer.six",
|
||||||
"olefile",
|
"olefile",
|
||||||
"pydub",
|
"pydub",
|
||||||
"SpeechRecognition",
|
"SpeechRecognition",
|
||||||
@@ -50,7 +50,7 @@ all = [
|
|||||||
"azure-identity"
|
"azure-identity"
|
||||||
]
|
]
|
||||||
pptx = ["python-pptx"]
|
pptx = ["python-pptx"]
|
||||||
docx = ["mammoth~=1.11.0", "lxml"]
|
docx = ["mammoth", "lxml"]
|
||||||
xlsx = ["pandas", "openpyxl"]
|
xlsx = ["pandas", "openpyxl"]
|
||||||
xls = ["pandas", "xlrd"]
|
xls = ["pandas", "xlrd"]
|
||||||
pdf = ["pdfminer.six"]
|
pdf = ["pdfminer.six"]
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.1.4"
|
__version__ = "0.1.3"
|
||||||
|
|||||||
@@ -15,6 +15,13 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
import mammoth
|
import mammoth
|
||||||
|
import mammoth.docx.files
|
||||||
|
|
||||||
|
def mammoth_files_open(self, uri):
|
||||||
|
warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
|
||||||
|
return io.BytesIO(b"")
|
||||||
|
|
||||||
|
mammoth.docx.files.Files.open = mammoth_files_open
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
|
|||||||
BIN
Binary file not shown.
@@ -288,47 +288,6 @@ def test_input_as_strings() -> None:
|
|||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
def test_doc_rlink() -> None:
|
|
||||||
# Test for: CVE-2025-11849
|
|
||||||
markitdown = MarkItDown()
|
|
||||||
|
|
||||||
# Document with rlink
|
|
||||||
docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")
|
|
||||||
|
|
||||||
# Directory containing the target rlink file
|
|
||||||
rlink_tmp_dir = os.path.abspath(os.sep + "tmp")
|
|
||||||
|
|
||||||
# Ensure the tmp directory exists
|
|
||||||
if not os.path.exists(rlink_tmp_dir):
|
|
||||||
pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")
|
|
||||||
return
|
|
||||||
|
|
||||||
rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
|
|
||||||
rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
|
|
||||||
b64_prefix = (
|
|
||||||
"ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk" # base64 prefix of rlink_content
|
|
||||||
)
|
|
||||||
|
|
||||||
if os.path.exists(rlink_file_path):
|
|
||||||
with open(rlink_file_path, "r", encoding="utf-8") as f:
|
|
||||||
existing_content = f.read()
|
|
||||||
if existing_content != rlink_content:
|
|
||||||
raise ValueError(
|
|
||||||
f"Existing {rlink_file_path} content does not match expected content."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
with open(rlink_file_path, "w", encoding="utf-8") as f:
|
|
||||||
f.write(rlink_content)
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = markitdown.convert(docx_file, keep_data_uris=True).text_content
|
|
||||||
assert (
|
|
||||||
b64_prefix not in result
|
|
||||||
) # Make sure the target file was NOT embedded in the output
|
|
||||||
finally:
|
|
||||||
os.remove(rlink_file_path)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
reason="do not run tests that query external urls",
|
reason="do not run tests that query external urls",
|
||||||
@@ -342,9 +301,9 @@ def test_markitdown_remote() -> None:
|
|||||||
assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
# Youtube
|
# Youtube
|
||||||
# result = markitdown.convert(YOUTUBE_TEST_URL)
|
result = markitdown.convert(YOUTUBE_TEST_URL)
|
||||||
# for test_string in YOUTUBE_TEST_STRINGS:
|
for test_string in YOUTUBE_TEST_STRINGS:
|
||||||
# assert test_string in result.text_content
|
assert test_string in result.text_content
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
@@ -493,7 +452,6 @@ if __name__ == "__main__":
|
|||||||
test_markitdown_remote,
|
test_markitdown_remote,
|
||||||
test_speech_transcription,
|
test_speech_transcription,
|
||||||
test_exceptions,
|
test_exceptions,
|
||||||
test_doc_rlink,
|
|
||||||
test_markitdown_exiftool,
|
test_markitdown_exiftool,
|
||||||
test_markitdown_llm_parameters,
|
test_markitdown_llm_parameters,
|
||||||
test_markitdown_llm,
|
test_markitdown_llm,
|
||||||
|
|||||||
Reference in New Issue
Block a user