Files
WeKnora/docreader/tests/test_web_parser.py
wizardchen ef1047bf67 feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level
settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing.
Optional odl-hybrid sidecar stays local-build only and is excluded from
default dev-start and full profiles.
2026-06-03 12:29:13 +08:00

43 lines
1.3 KiB
Python

import unittest
from docreader.parser.web_parser import (
build_visible_text_fallback,
extract_markdown_from_html,
)
class TestWebParserHelpers(unittest.TestCase):
    """Exercise the two helper functions imported from ``web_parser``.

    Covers ``extract_markdown_from_html`` and ``build_visible_text_fallback``
    with blank, minimal and representative inputs.
    """

    def test_extract_markdown_empty_html(self):
        """An empty string and a single-space string are expected to give None."""
        for blank in ("", " "):
            self.assertIsNone(extract_markdown_from_html(blank))

    def test_extract_markdown_article_html(self):
        """A small page with an <article> is expected to keep its heading text."""
        page = (
            "\n"
            "<html><head><title>Demo</title></head><body>\n"
            "<article><h1>Hello</h1>"
            "<p>World paragraph with enough text for extraction.</p></article>\n"
            "</body></html>\n"
        )
        markdown = extract_markdown_from_html(page)
        self.assertIsNotNone(markdown)
        self.assertIn("Hello", markdown)

    def test_build_fallback_too_short(self):
        """Very short or empty visible text is expected to produce no fallback."""
        for snippet in ("short", ""):
            self.assertIsNone(build_visible_text_fallback(snippet))

    def test_build_fallback_with_title(self):
        """With a page title, the result should open with a ``# <title>`` line."""
        body = "A" * 60
        result = build_visible_text_fallback(body, page_title="WeKnora")
        self.assertIsNotNone(result)
        self.assertTrue(result.startswith("# WeKnora"))
        self.assertIn(body, result)

    def test_build_fallback_without_title(self):
        """With an empty page title, the text is expected back unchanged."""
        body = "B" * 60
        self.assertEqual(build_visible_text_fallback(body, page_title=""), body)
# Allow running this test module directly as a script; when the module is
# imported (e.g. by a test runner) the guard keeps the suite from auto-running.
if __name__ == "__main__":
    unittest.main()