fix: handle deeply nested HTML that triggers RecursionError (#1644)

* fix: handle deeply nested HTML that triggers RecursionError (#1636)

Large HTML files with deep DOM nesting (e.g., SEC EDGAR filings) cause
markdownify's recursive DOM traversal to exceed Python's default
recursion limit (1000). Previously this RecursionError was caught by
the top-level _convert() dispatcher, which then fell through to
PlainTextConverter — silently returning the raw HTML as 'markdown'
with no warning.

This fix catches RecursionError in HtmlConverter.convert() and falls
back to BeautifulSoup's iterative get_text() method, which handles
arbitrary nesting depths. A warning is emitted so callers know the
output is plain text rather than full markdown.

Root cause chain:
1. HtmlConverter.convert() calls markdownify.convert_soup() (recursive)
2. Deeply nested HTML (>~400 levels) triggers RecursionError
3. _convert() catches all Exceptions, stores in failed_attempts
4. PlainTextConverter.accepts() matches text/html via 'text/' prefix
5. PlainTextConverter.convert() returns raw HTML bytes as text
6. Caller receives 'markdown' that is actually unconverted HTML

* refactor: address review feedback on RecursionError fallback

- Move 'import warnings' to module top level (was inside except block)
- Make test environment-independent by temporarily lowering
  sys.setrecursionlimit(200) instead of relying on depth=500 being
  sufficient on all platforms; original limit restored in finally block
- Add strict=True keyword argument to opt out of the plain-text
  fallback and let RecursionError propagate to the caller

* test: use result.markdown instead of deprecated result.text_content

---------

Co-authored-by: jigangz <jigangz@github.com>
This commit is contained in:
jigangz
2026-04-15 15:26:44 -07:00
committed by GitHub
parent 63cbbd9de6
commit 604bba13da
2 changed files with 74 additions and 4 deletions
@@ -1,4 +1,5 @@
import io import io
import warnings
from typing import Any, BinaryIO, Optional from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -44,6 +45,10 @@ class HtmlConverter(DocumentConverter):
stream_info: StreamInfo, stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter **kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult: ) -> DocumentConverterResult:
# Pop our own keyword before forwarding the rest to markdownify.
# strict=True raises RecursionError instead of falling back to plain text.
strict: bool = kwargs.pop("strict", False)
# Parse the stream # Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
@@ -55,10 +60,25 @@ class HtmlConverter(DocumentConverter):
# Print only the main content # Print only the main content
body_elm = soup.find("body") body_elm = soup.find("body")
webpage_text = "" webpage_text = ""
if body_elm: try:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) if body_elm:
else: webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
except RecursionError:
if strict:
raise
# Large or deeply-nested HTML can exceed Python's recursion limit
# during markdownify's recursive DOM traversal. Fall back to
# BeautifulSoup's iterative get_text() so the caller still gets
# usable plain-text content instead of raw HTML.
warnings.warn(
"HTML document is too deeply nested for markdown conversion "
"(RecursionError). Falling back to plain-text extraction.",
stacklevel=2,
)
target = body_elm if body_elm else soup
webpage_text = target.get_text("\n", strip=True)
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
@@ -288,6 +288,56 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content assert "# Test" in result.text_content
def test_deeply_nested_html_fallback() -> None:
    """Deeply nested HTML must degrade to plain text, not raw HTML (issue #1636).

    The interpreter's recursion limit is lowered while the conversion runs so
    that a RecursionError is provoked regardless of the host's default limit,
    keeping the test deterministic across platforms and CI setups. The
    previous limit is always restored afterwards.
    """
    import sys
    import warnings

    converter = MarkItDown()

    # Remember the current limit so the finally block can put it back.
    saved_limit = sys.getrecursionlimit()
    reduced_limit = 200  # well below markdownify's traversal depth for 500 levels

    # One paragraph wrapped in 500 nested <div> elements.
    nesting = 500
    document = "".join(
        (
            "<html><body>",
            '<div style="margin-left:10px">' * nesting,
            "<p>Deep content with <b>bold text</b></p>",
            "</div>" * nesting,
            "</body></html>",
        )
    )
    payload = io.BytesIO(document.encode("utf-8"))

    try:
        sys.setrecursionlimit(reduced_limit)
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, even ones already shown once in this process.
            warnings.simplefilter("always")
            result = converter.convert_stream(payload, file_extension=".html")

            # The fallback path must announce itself with a warning.
            assert any("deeply nested" in str(item.message) for item in caught)
    finally:
        sys.setrecursionlimit(saved_limit)

    # The text content survives; the HTML tags do not.
    markdown = result.markdown
    for expected in ("Deep content", "bold text"):
        assert expected in markdown
    for tag in ("<div", "<p>"):
        assert tag not in markdown
def test_doc_rlink() -> None: def test_doc_rlink() -> None:
# Test for: CVE-2025-11849 # Test for: CVE-2025-11849
markitdown = MarkItDown() markitdown = MarkItDown()