Files
WeKnora/docreader/parser/registry.py
wizardchen ef1047bf67 feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level
settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing.
Optional odl-hybrid sidecar stays local-build only and is excluded from
default dev-start and full profiles.
2026-06-03 12:29:13 +08:00

173 lines
5.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
from docreader.parser.base_parser import BaseParser
from docreader.parser.doc_parser import DocParser
from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.opendataloader_parser import (
OpenDataLoaderParser,
opendataloader_available,
)
from docreader.parser.pdf_parser import PDFParser
logger = logging.getLogger(__name__)

# Name of the engine every lookup falls back to when the requested engine
# cannot handle a file type.
BUILTIN_ENGINE = "builtin"


class ParserEngineRegistry:
    """Registry for parser engines.

    Each engine maps file extensions to parser classes. When a requested
    engine doesn't support a file type, the registry falls back to the
    builtin engine automatically.

    File extensions are stored and looked up in a normalized form (lower
    case, no surrounding whitespace, no leading dot), so ``"PDF"``,
    ``".pdf"`` and ``"pdf"`` all refer to the same entry.
    """

    def __init__(self):
        # engine name -> {normalized file extension -> parser class}
        self._engines: Dict[str, Dict[str, Type["BaseParser"]]] = {}
        # engine name -> human-readable description
        self._descriptions: Dict[str, str] = {}
        # engine name -> callable returning ``(available, reason)``
        self._check_available: Dict[str, Callable[..., Tuple[bool, str]]] = {}
        # engine name -> message reported when the engine is unavailable and
        # the availability check supplied no reason of its own
        self._unavailable_hint: Dict[str, str] = {}

    @staticmethod
    def _normalize_file_type(file_type: Optional[str]) -> str:
        """Return *file_type* lower-cased, trimmed, and without leading dots."""
        return (file_type or "").strip().lstrip(".").lower()

    def register(
        self,
        name: str,
        file_types: Dict[str, Type["BaseParser"]],
        description: str = "",
        check_available: Optional[Callable[..., Tuple[bool, str]]] = None,
        unavailable_hint: str = "",
    ):
        """Register (or replace) an engine.

        Args:
            name: engine identifier.
            file_types: mapping of file extension to parser class. Keys are
                normalized, and the mapping is copied so later mutation by
                the caller does not affect the registry.
            description: human-readable description of the engine.
            check_available: optional callable invoked by ``list_engines``
                with the tenant overrides; it returns ``(available, reason)``.
            unavailable_hint: message reported when the engine is unavailable
                and the check gave no reason.
        """
        normalized = {
            self._normalize_file_type(ext): parser_cls
            for ext, parser_cls in file_types.items()
        }
        self._engines[name] = normalized
        self._descriptions[name] = description
        self._unavailable_hint[name] = unavailable_hint
        if check_available is not None:
            self._check_available[name] = check_available
        else:
            # Re-registering without a check must not keep a stale one around.
            self._check_available.pop(name, None)
        logger.info(
            "Registered parser engine '%s' with file types: %s",
            name,
            ", ".join(normalized),
        )

    def get_parser_class(self, engine: str, file_type: str) -> Type["BaseParser"]:
        """Resolve parser class for the given engine and file type.

        Falls back to the builtin engine when the requested engine is unknown
        or doesn't support the file type.

        Raises:
            ValueError: if neither the requested engine nor the builtin
                engine supports the file type.
        """
        ft = self._normalize_file_type(file_type)

        engine_parsers = self._engines.get(engine) if engine else None
        if engine_parsers is not None:
            parser_cls = engine_parsers.get(ft)
            if parser_cls is not None:
                logger.info("Using engine '%s' for file type '%s'", engine, ft)
                return parser_cls
            logger.info(
                "Engine '%s' does not support '%s', falling back to builtin",
                engine,
                ft,
            )

        parser_cls = self._engines.get(BUILTIN_ENGINE, {}).get(ft)
        if parser_cls is not None:
            return parser_cls
        raise ValueError(f"Unsupported file type: {file_type}")

    def list_engines(
        self, overrides: Optional[Dict[str, str]] = None
    ) -> List[Dict[str, Any]]:
        """Return metadata for all registered engines, including availability.

        Args:
            overrides: tenant-level config overrides (e.g. mineru_endpoint,
                mineru_api_key) forwarded to each engine's check_available
                function.

        Returns:
            One dict per engine with the keys ``name``, ``description``,
            ``file_types`` (sorted), ``available`` and ``unavailable_reason``.
        """
        result: List[Dict[str, Any]] = []
        for name, parsers in self._engines.items():
            available = True
            unavailable_reason = ""
            check = self._check_available.get(name)
            if check is not None:
                try:
                    available, unavailable_reason = check(overrides)
                except Exception as exc:
                    # A failing probe must not break the listing; report the
                    # engine as unavailable and keep the failure visible.
                    logger.warning(
                        "Availability check for engine '%s' failed: %s",
                        name,
                        exc,
                    )
                    available = False
                    unavailable_reason = str(exc)
            if not available and not unavailable_reason:
                # An empty hint falls through to the generic default as well.
                unavailable_reason = self._unavailable_hint.get(name) or "不可用"
            result.append(
                {
                    "name": name,
                    "description": self._descriptions.get(name, ""),
                    "file_types": sorted(parsers),
                    "available": available,
                    "unavailable_reason": unavailable_reason,
                }
            )
        return result

    def get_engine_names(self) -> List[str]:
        """Return the names of all registered engines in registration order."""
        return list(self._engines)
def _build_default_registry() -> ParserEngineRegistry:
    """Build a registry pre-populated with every engine known to this module.

    Three engines are registered, in this order: the builtin engine,
    ``markitdown`` and ``opendataloader`` (the latter with an availability
    check).
    """
    image_extensions = ("jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp")
    builtin_parsers = {
        "docx": Docx2Parser,
        "doc": DocParser,
        "pdf": PDFParser,
        "md": MarkdownParser,
        "markdown": MarkdownParser,
        "xlsx": ExcelParser,
        "xls": ExcelParser,
    }
    # Every image extension shares the same parser; appended after the
    # document types so the key order stays the same.
    builtin_parsers.update(dict.fromkeys(image_extensions, ImageParser))

    markitdown_extensions = (
        "md",
        "markdown",
        "pdf",
        "docx",
        "doc",
        "pptx",
        "ppt",
        "xlsx",
        "xls",
        "csv",
    )
    markitdown_parsers = dict.fromkeys(markitdown_extensions, MarkitdownParser)

    default_registry = ParserEngineRegistry()
    default_registry.register(
        BUILTIN_ENGINE,
        builtin_parsers,
        description="内置解析引擎",
    )
    default_registry.register(
        "markitdown",
        markitdown_parsers,
        description="MarkItDown 解析引擎（微软 MarkItDown 库）",
    )
    default_registry.register(
        "opendataloader",
        {"pdf": OpenDataLoaderParser},
        description="OpenDataLoader PDF版面分析需 Java 11+",
        check_available=opendataloader_available,
        unavailable_hint="请安装 opendataloader-pdf 与 Java 11+",
    )

    # NOTE: Engine listing is managed by the Go-side engine registry
    # (docparser.ListAllEngines). The Python list_engines method is kept for
    # backward compatibility with the gRPC ListEngines RPC, but the Go app
    # no longer calls it. MinerU engines are handled natively by Go.
    return default_registry
# Module-level default registry, built once when this module is imported.
registry = _build_default_registry()