feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements

Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level
settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing.
Optional odl-hybrid sidecar stays local-build only and is excluded from
default dev-start and full profiles.
This commit is contained in:
wizardchen
2026-06-03 12:00:09 +08:00
committed by lyingbug
parent 7b1bb1054f
commit ef1047bf67
50 changed files with 4352 additions and 304 deletions

View File

@@ -562,6 +562,39 @@ DOCREADER_TRANSPORT=grpc
# 渲染页图的最大长边像素(防止超大页面 PDF 渲染出 100+MP 图、撑爆 gRPC 消息上限)
# 调小可进一步减小图片体积；过小会影响 OCR 识别（密集中文建议 >=1600）
# DOCREADER_PDF_RENDER_MAX_EDGE=2000
# Layout text: insert spaces when glyph gaps exceed this × median char width (default 0.4).
# DOCREADER_PDF_WORD_GAP_WIDTH_RATIO=0.4
# Native PDF layout: drop narrow margin columns (arXiv sidebar) below this page-width ratio (default 0.12).
# DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO=0.12
# DOCREADER_PDF_MIN_HEADING_LINE_CHARS=8
# Remove U+FFFE/soft-hyphen artifacts; strip vector chart axis text; render chart areas as JPEG.
# DOCREADER_PDF_SANITIZE_TEXT=true
# DOCREADER_PDF_STRIP_CHART_DEBRIS=true
# DOCREADER_PDF_RENDER_VECTOR_FIGURES=true
# OpenDataLoader PDF：在知识库 parser_engine_rules 中指定 engine: opendataloader。
# 需 Java 11+；docreader 镜像已包含 openjdk-17-jre-headless。
# DOCREADER_ODL_MAX_WORKERS=1
# fast 模式（默认）：DOCREADER_ODL_HYBRID=off
# hybrid 需另起服务；镜像/模型较大，默认 pull/up 与 --full 均不含 odl-hybrid。
# 默认 --no-ocr（不做 EasyOCR）。
# 开发：make dev-start DEV_ARGS=--odl-hybrid（本地 build）
# 生产（docker-compose.yml，需 DOCREADER_ODL_HYBRID=docling-fast 等）：
# docker compose --profile odl-hybrid up -d --build odl-hybrid
# 该镜像未发布到 Docker Hub（本地 tag: weknora-odl-hybrid:local）；make pull-images 不会拉取，只能按需 build。
# 修改 Dockerfile.odl-hybrid 后需重建：docker compose --profile odl-hybrid build --no-cache odl-hybrid
# ODL_HYBRID_EXTRA_ARGS=--no-ocr
# 扫描件不要用 hybrid OCR；请用 builtin 扫描渲染 + Go OCR，或 MinerU；若坚持 hybrid OCR：
# ODL_HYBRID_EXTRA_ARGS=--force-ocr
# DOCREADER_ODL_HYBRID=docling-fast
# DOCREADER_ODL_HYBRID_URL=http://odl-hybrid:5002
# 开发环境 hybrid：make dev-start DEV_ARGS=--odl-hybrid
# 仅用 fast 模式(不需 odl-hybrid 容器)时请保持 DOCREADER_ODL_HYBRID=off。
# ODL_HYBRID_PORT=5002
# ODL_HYBRID_STARTUP_WAIT_SEC=180
# DOCREADER_ODL_HYBRID_MODE=auto
# DOCREADER_ODL_HYBRID_FALLBACK=false
# DOCREADER_ODL_MARKDOWN_WITH_HTML=false
# VLM视觉模型单次 HTTP 请求的整体超时时间(秒)。
# 扫描件整页 OCR全文+版式抽取)在慢端点上很容易超过默认值,
@@ -648,7 +681,9 @@ DOCREADER_TRANSPORT=grpc
# --- Async pipeline tuning (optional) -----------------------------------------
# Worker pool size for the asynq server. Default 16 — raise it on machines
# that handle many concurrent uploads (default Go runtime.NumCPU() under-
# provisions for the I/O-bound document pipeline).
# provisions for the I/O-bound document pipeline). Can also be set in the
# management UI under Settings → System settings (asynq.concurrency);
# UI changes require a process restart.
# WEKNORA_ASYNQ_CONCURRENCY=16
# Read/write timeout (in milliseconds) the asynq client uses against Redis.

View File

@@ -50,6 +50,7 @@ help:
@echo ""
@echo "开发模式(推荐):"
@echo " dev-start 启动开发环境基础设施(仅启动依赖服务)"
@echo " 可选: make dev-start DEV_ARGS=--odl-hybrid"
@echo " dev-stop 停止开发环境"
@echo " dev-restart 重启开发环境"
@echo " dev-logs 查看开发环境日志"
@@ -310,7 +311,7 @@ show-platform:
# Development mode commands
dev-start:
./scripts/dev.sh start
./scripts/dev.sh start $(DEV_ARGS)
dev-stop:
./scripts/dev.sh stop

View File

@@ -248,8 +248,13 @@ services:
- docreader-tmp-dev:/tmp/docreader
environment:
- DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
- MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
- DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
- DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
- DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
- DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
- DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
- DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
- DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
- DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
- DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
@@ -272,6 +277,27 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
# OpenDataLoader hybrid backend (optional). Enable profile "odl-hybrid" and set
# DOCREADER_ODL_HYBRID=docling-fast on docreader. Default --no-ocr (no EasyOCR).
# Local build only — not published to Docker Hub.
odl-hybrid:
build:
context: .
dockerfile: docker/Dockerfile.odl-hybrid
image: weknora-odl-hybrid:local
container_name: WeKnora-odl-hybrid
profiles:
- odl-hybrid
ports:
- "${ODL_HYBRID_PORT:-5002}:5002"
environment:
# Default --no-ocr (digital PDFs). Scanned PDFs: use builtin OCR / MinerU, or
# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (needs EasyOCR + libGL in image).
- ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
networks:
- WeKnora-network-dev
restart: unless-stopped
jaeger:
image: jaegertracing/all-in-one:latest
container_name: WeKnora-jaeger-dev

View File

@@ -241,6 +241,12 @@ services:
environment:
- DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
- MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
- DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
- DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
- DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
- DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
- DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
- DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
- DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
- DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
- DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
@@ -250,13 +256,6 @@ services:
- GRPC_TLS_KEY=${GRPC_TLS_KEY:-}
- GRPC_TLS_CA=${GRPC_TLS_CA:-}
- GRPC_AUTH_TOKEN=${GRPC_AUTH_TOKEN:-}
- OBS_ENDPOINT=${OBS_ENDPOINT:-}
- OBS_REGION=${OBS_REGION:-}
- OBS_ACCESS_KEY=${OBS_ACCESS_KEY:-}
- OBS_SECRET_KEY=${OBS_SECRET_KEY:-}
- OBS_BUCKET_NAME=${OBS_BUCKET_NAME:-}
- OBS_PATH_PREFIX=${OBS_PATH_PREFIX:-}
- OBS_PROXY_DOMAIN=${OBS_PROXY_DOMAIN:-}
healthcheck:
test: ["CMD", "grpc_health_probe", "-addr=localhost:50051"]
interval: 30s
@@ -269,6 +268,24 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
# OpenDataLoader hybrid backend (optional). Default --no-ocr (no EasyOCR); the image still installs libGL for OpenCV.
# Local build only — not published to Docker Hub; use --profile odl-hybrid --build.
odl-hybrid:
build:
context: .
dockerfile: docker/Dockerfile.odl-hybrid
image: weknora-odl-hybrid:local
container_name: WeKnora-odl-hybrid
profiles:
- odl-hybrid
expose:
- "5002"
environment:
- ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
networks:
- WeKnora-network
restart: unless-stopped
# 修改的PostgreSQL配置
postgres:
image: paradedb/paradedb:v0.22.2-pg17

View File

@@ -94,6 +94,7 @@ RUN apt-get update && apt-get install -y \
libjpeg62-turbo \
wget \
gnupg \
openjdk-17-jre-headless \
libgl1 \
libglib2.0-0 \
antiword \

View File

@@ -0,0 +1,29 @@
# OpenDataLoader PDF hybrid backend (Docling). Pre-install deps so the
# container listens on :5002 immediately instead of pip install on every start.
#
# Default --no-ocr: digital PDFs already have a text layer; Docling layout/table
# still runs without EasyOCR (avoids the heavy OCR stack; libGL is still installed below for OpenCV).
# For scanned PDFs use builtin docreader OCR, MinerU, or override with
# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (requires extra system/Python deps).
FROM python:3.10.18-bookworm
# Docling table/layout models import cv2 (OpenCV), which needs libGL at runtime
# even when hybrid runs with --no-ocr.
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
libgl1 \
libglib2.0-0 \
libgomp1 \
libsm6 \
&& rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir "opendataloader-pdf[hybrid]>=2.4.7"
EXPOSE 5002
ENV ODL_HYBRID_EXTRA_ARGS="--no-ocr"
HEALTHCHECK --interval=30s --timeout=10s --retries=5 --start-period=120s \
CMD curl -f http://localhost:5002/health || exit 1
CMD ["bash", "-c", "exec opendataloader-pdf-hybrid --host 0.0.0.0 --port 5002 ${ODL_HYBRID_EXTRA_ARGS}"]

View File

@@ -55,6 +55,12 @@ class DocReaderConfig:
# Parser
docx_max_pages: int
markitdown_max_workers: int
odl_max_workers: int
odl_hybrid: str
odl_hybrid_url: str
odl_hybrid_mode: str
odl_hybrid_fallback: bool
odl_markdown_with_html: bool
pdf_render_max_workers: int
pdf_render_parallelism: int
pdf_render_dpi: int
@@ -81,6 +87,17 @@ def load_config() -> DocReaderConfig:
grpc_port = _get_int(["DOCREADER_GRPC_PORT", "PORT"], 50051)
docx_max_pages = _get_int(["DOCREADER_DOCX_MAX_PAGES"], 0)
markitdown_max_workers = _get_int(["DOCREADER_MARKITDOWN_MAX_WORKERS"], 1)
odl_max_workers = _get_int(["DOCREADER_ODL_MAX_WORKERS"], 1)
odl_hybrid = _get_str(["DOCREADER_ODL_HYBRID"], "off")
odl_hybrid_url = _get_str(
["DOCREADER_ODL_HYBRID_URL"],
"http://127.0.0.1:5002",
)
odl_hybrid_mode = _get_str(["DOCREADER_ODL_HYBRID_MODE"], "auto")
odl_hybrid_fallback = _get_bool(["DOCREADER_ODL_HYBRID_FALLBACK"], False)
odl_markdown_with_html = _get_bool(
["DOCREADER_ODL_MARKDOWN_WITH_HTML"], False
)
pdf_render_max_workers = _get_int(["DOCREADER_PDF_RENDER_MAX_WORKERS"], 1)
# Intra-document render parallelism: how many worker processes render the
# scanned pages of a SINGLE PDF in parallel. pdfium is not thread-safe, so
@@ -117,6 +134,12 @@ def load_config() -> DocReaderConfig:
grpc_port=grpc_port,
docx_max_pages=docx_max_pages,
markitdown_max_workers=markitdown_max_workers,
odl_max_workers=odl_max_workers,
odl_hybrid=odl_hybrid,
odl_hybrid_url=odl_hybrid_url,
odl_hybrid_mode=odl_hybrid_mode,
odl_hybrid_fallback=odl_hybrid_fallback,
odl_markdown_with_html=odl_markdown_with_html,
pdf_render_max_workers=pdf_render_max_workers,
pdf_render_parallelism=pdf_render_parallelism,
pdf_render_dpi=pdf_render_dpi,
@@ -139,6 +162,12 @@ def dump_config(mask_secrets: bool = True) -> Dict[str, Any]:
"DOCREADER_GRPC_PORT": cfg.grpc_port,
"DOCREADER_DOCX_MAX_PAGES": cfg.docx_max_pages,
"DOCREADER_MARKITDOWN_MAX_WORKERS": cfg.markitdown_max_workers,
"DOCREADER_ODL_MAX_WORKERS": cfg.odl_max_workers,
"DOCREADER_ODL_HYBRID": cfg.odl_hybrid,
"DOCREADER_ODL_HYBRID_URL": cfg.odl_hybrid_url,
"DOCREADER_ODL_HYBRID_MODE": cfg.odl_hybrid_mode,
"DOCREADER_ODL_HYBRID_FALLBACK": cfg.odl_hybrid_fallback,
"DOCREADER_ODL_MARKDOWN_WITH_HTML": cfg.odl_markdown_with_html,
"DOCREADER_PDF_RENDER_MAX_WORKERS": cfg.pdf_render_max_workers,
"DOCREADER_PDF_RENDER_PARALLELISM": cfg.pdf_render_parallelism,
"DOCREADER_PDF_RENDER_DPI": cfg.pdf_render_dpi,

View File

@@ -0,0 +1,149 @@
"""LibreOffice helpers for normalizing legacy or unusual Excel uploads."""
from __future__ import annotations
import logging
import os
import subprocess
import tempfile
import time
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# Leading magic bytes used as a last-resort sniff when pandas cannot inspect the payload.
_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # OLE2 compound-file signature (legacy .xls)
_ZIP_MAGIC = b"PK\x03\x04"  # ZIP local-file header (OOXML / ODF containers)


def detect_excel_format(content: bytes) -> str | None:
    """Return the pandas/excel format id for ``content``.

    Args:
        content: Raw bytes of the uploaded spreadsheet.

    Returns:
        One of ``"xlsx"``, ``"xls"``, ``"xlsb"``, ``"ods"``, or ``None`` when
        the payload is empty or matches no known signature.

    pandas' inspector opens ZIP payloads and can raise (for example
    ``zipfile.BadZipFile`` on a truncated archive). Such failures are treated
    as "unknown" so the magic-byte fallbacks below still get a chance to run.
    """
    if not content:
        return None

    try:
        # Private pandas helper; guarded because it may move between versions.
        from pandas.io.excel._base import inspect_excel_format

        ext = inspect_excel_format(content_or_path=content)
    except Exception as exc:  # best-effort probe: fall back to magic bytes
        logging.getLogger(__name__).debug(
            "pandas could not inspect spreadsheet bytes: %s", exc
        )
        ext = None

    if ext in ("xlsx", "xls", "xlsb", "ods"):
        return ext
    if ext == "zip":
        # A ZIP without a recognised workbook part: treat as XLSX.
        return "xlsx"
    if content.startswith(_ZIP_MAGIC):
        return "xlsx"
    if content.startswith(_XLS_MAGIC):
        return "xls"
    return None
def engine_for_format(ext: str | None) -> str:
    """Map a detected spreadsheet format id to a pandas ``engine`` name.

    Args:
        ext: Format id as returned by ``detect_excel_format`` (may be ``None``).

    Returns:
        ``"xlrd"`` for xls, ``"openpyxl"`` for xlsx, ``"pyxlsb"`` for xlsb,
        ``"odf"`` for ods, and ``"openpyxl"`` for anything else.
    """
    # openpyxl cannot read binary .xlsb workbooks; pandas reads them via pyxlsb.
    engines = {
        "xls": "xlrd",
        "xlsx": "openpyxl",
        "xlsb": "pyxlsb",
        "ods": "odf",
    }
    return engines.get(ext, "openpyxl")
def convert_excel_to_xlsx_bytes(content: bytes, suffix: str = ".xlsx") -> bytes | None:
    """Convert arbitrary spreadsheet bytes to XLSX using LibreOffice, if available.

    Args:
        content: Raw bytes of the source spreadsheet.
        suffix: File extension (with leading dot) given to the temporary input
            file; LibreOffice uses it as a hint for the import filter.

    Returns:
        The converted XLSX bytes, or ``None`` when LibreOffice is missing, the
        process cannot be run or times out, or no ``.xlsx`` output appears
        after three attempts.
    """
    soffice = find_soffice()
    if not soffice:
        return None

    # Keep only the final path component so a suffix containing separators
    # cannot place the input file outside the temporary directory.
    src_name = os.path.basename(f"input{suffix}") or "input"

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        # The output goes to its own directory: with suffix ".xlsx" the input
        # would otherwise share a name with the expected output and be
        # mistaken for a successful conversion.
        with tempfile.TemporaryDirectory() as temp_dir, \
                tempfile.TemporaryDirectory() as out_dir, \
                tempfile.TemporaryDirectory() as profile_dir:
            src = os.path.join(temp_dir, src_name)
            with open(src, "wb") as handle:
                handle.write(content)

            # A throwaway profile per run avoids sharing LibreOffice user state.
            user_installation = Path(profile_dir).as_uri()
            cmd = [
                soffice,
                "--headless",
                f"-env:UserInstallation={user_installation}",
                "--convert-to",
                "xlsx",
                "--outdir",
                out_dir,
                src,
            ]
            try:
                result = subprocess.run(cmd, capture_output=True, timeout=120)
            except (OSError, subprocess.TimeoutExpired) as exc:
                logger.warning("LibreOffice convert failed to start: %s", exc)
                return None

            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")
                logger.warning(
                    "LibreOffice convert failed (attempt %s/%s): %s",
                    attempt,
                    max_attempts,
                    stderr,
                )
                if attempt < max_attempts:
                    time.sleep(0.5 * attempt)
                    continue
                return None

            for name in sorted(os.listdir(out_dir)):
                if name.endswith(".xlsx"):
                    with open(os.path.join(out_dir, name), "rb") as handle:
                        converted = handle.read()
                    logger.info(
                        "Converted spreadsheet via LibreOffice (%s -> xlsx, %d bytes)",
                        suffix,
                        len(converted),
                    )
                    return converted

        # Exit code 0 but no output file: back off briefly and retry.
        if attempt < max_attempts:
            time.sleep(0.5 * attempt)
    return None
def normalize_excel_bytes(content: bytes, file_type: str | None = None) -> bytes:
    """Return spreadsheet bytes that ``detect_excel_format`` recognises.

    Recognised input is returned untouched. Otherwise the payload is passed to
    ``convert_excel_to_xlsx_bytes`` once per candidate suffix (the caller's
    ``file_type`` first, then ``.xlsx``, ``.xls``, ``.et``, ``.csv``) until a
    conversion yields recognisable output.

    Raises:
        ValueError: If no candidate suffix produces a recognised workbook.
    """
    if detect_excel_format(content) is not None:
        return content

    hinted = [f".{file_type.lstrip('.')}"] if file_type else []
    # dict.fromkeys drops duplicates while keeping first-seen order.
    ordered_suffixes = dict.fromkeys(hinted + [".xlsx", ".xls", ".et", ".csv"])

    for candidate in ordered_suffixes:
        result = convert_excel_to_xlsx_bytes(content, suffix=candidate)
        if result and detect_excel_format(result) is not None:
            return result

    raise ValueError(
        "Unrecognized Excel file format; the file may be corrupt, encrypted, "
        "or not a spreadsheet"
    )
def find_soffice() -> Optional[str]:
    """Locate the LibreOffice ``soffice`` executable.

    Returns:
        The first existing path among a list of well-known install locations,
        otherwise whatever ``shutil.which("soffice")`` finds on ``PATH``, or
        ``None`` when neither yields a result.
    """
    # Local import: shutil is not among this module's top-level imports.
    import shutil

    known_locations = (
        "/usr/bin/soffice",
        "/usr/lib/libreoffice/program/soffice",
        "/opt/libreoffice25.2/program/soffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
        "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
    )
    for path in known_locations:
        if os.path.exists(path):
            return path

    # shutil.which is portable; spawning an external `which` raises
    # FileNotFoundError on systems that do not ship that binary.
    return shutil.which("soffice")

View File

@@ -13,6 +13,14 @@ import pandas as pd
from docreader.models.document import Chunk, Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.excel_convert import (
convert_excel_to_xlsx_bytes,
detect_excel_format,
engine_for_format,
normalize_excel_bytes,
)
from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
from docreader.parser.xlsx_repair import repair_xlsx_bytes
logger = logging.getLogger(__name__)
@@ -60,13 +68,11 @@ class ExcelParser(BaseParser):
text: List[str] = []
start, end = 0, 0
# Load Excel file from bytes into pandas ExcelFile object
excel_file = pd.ExcelFile(BytesIO(content))
excel_file = _open_excel_file(content, file_type=self.file_type)
# Process each sheet in the Excel file
for excel_sheet_name in excel_file.sheet_names:
# Parse the sheet into a DataFrame
df = excel_file.parse(sheet_name=excel_sheet_name)
df = _read_sheet_dataframe(excel_file, excel_sheet_name)
# Remove rows where all values are NaN (completely empty rows)
df.dropna(how="all", inplace=True)
@@ -97,6 +103,90 @@ class ExcelParser(BaseParser):
return Document(content="".join(text), chunks=chunks)
def _read_sheet_dataframe(excel_file: pd.ExcelFile, sheet_name: str) -> pd.DataFrame:
    """Load one worksheet as a DataFrame with predictable column labels.

    Workbooks opened with the ``openpyxl`` engine are always read without a
    header row and labelled A, B, C, .... Other engines use the first row as
    the header, unless the result is empty or contains pandas' ``Unnamed:``
    placeholder columns, in which case the sheet is re-read headerless and
    labelled with column letters as well.
    """
    from openpyxl.utils import get_column_letter

    def _headerless() -> pd.DataFrame:
        # Row 1 stays as data; columns get spreadsheet-style letters.
        frame = excel_file.parse(sheet_name=sheet_name, header=None)
        frame.columns = [
            get_column_letter(position)
            for position in range(1, len(frame.columns) + 1)
        ]
        return frame

    if excel_file.engine == "openpyxl":
        return _headerless()

    frame = excel_file.parse(sheet_name=sheet_name, header=0)
    needs_letters = frame.empty or any(
        str(label).startswith("Unnamed:") for label in frame.columns
    )
    return _headerless() if needs_letters else frame
def _prepare_xlsx_bytes(data: bytes) -> bytes:
    """Run the XLSX pre-processing steps and return the resulting bytes.

    ``repair_xlsx_bytes`` is applied first; its result replaces ``data`` only
    when it is not ``None``. The outcome is then passed through
    ``fill_merged_cells_xlsx``.
    """
    fixed = repair_xlsx_bytes(data)
    return fill_merged_cells_xlsx(data if fixed is None else fixed)
def _open_excel_file(content: bytes, file_type: str | None = None) -> pd.ExcelFile:
    """Open an Excel workbook with explicit engine selection and fallbacks.

    Args:
        content: Raw bytes of the uploaded workbook.
        file_type: Optional extension hint forwarded to ``normalize_excel_bytes``.

    Returns:
        A ``pandas.ExcelFile`` opened with the engine chosen for the detected format.

    Raises:
        ValueError: If the format cannot be determined even after conversion,
            or the required pandas engine is not installed.
        KeyError: Re-raised from the reader when it is not a repairable
            ``sharedStrings.xml`` problem, or when one repair already failed.
    """
    undetermined = (
        "Excel file format cannot be determined, you must specify an engine manually."
    )
    data = content
    converted_via_soffice = False
    # Bounds the repair-and-retry path so a workbook that keeps raising the
    # same KeyError after a repair cannot loop forever.
    shared_strings_repaired = False

    while True:
        ext = detect_excel_format(data)
        if ext is None:
            if converted_via_soffice:
                raise ValueError(undetermined)
            try:
                data = normalize_excel_bytes(data, file_type=file_type)
            except ValueError as exc:
                raise ValueError(undetermined) from exc
            converted_via_soffice = True
            continue

        if ext == "ods":
            # Prefer an XLSX conversion; fall through to the ods engine if it fails.
            converted = convert_excel_to_xlsx_bytes(data, suffix=".ods")
            if converted:
                data = converted
                continue

        engine = engine_for_format(ext)
        if ext == "xlsx":
            data = _prepare_xlsx_bytes(data)
            engine = "openpyxl"

        try:
            return pd.ExcelFile(BytesIO(data), engine=engine)
        except ImportError as exc:
            raise ValueError(
                f"Excel engine {engine!r} is not available for .{ext} files"
            ) from exc
        except KeyError as exc:
            if (
                "sharedStrings.xml" not in str(exc)
                or engine != "openpyxl"
                or shared_strings_repaired
            ):
                raise
            repaired = repair_xlsx_bytes(data)
            if repaired is None:
                raise
            shared_strings_repaired = True
            logger.info("Repaired XLSX sharedStrings packaging before parse")
            data = _prepare_xlsx_bytes(repaired)
            continue
        except ValueError as exc:
            if converted_via_soffice or "cannot be determined" not in str(exc):
                raise
            # Restart from the original upload; a failure here propagates as-is.
            data = normalize_excel_bytes(content, file_type=file_type)
            converted_via_soffice = True
            continue
if __name__ == "__main__":
# Example usage: Parse an Excel file and display results
logging.basicConfig(level=logging.DEBUG)

View File

@@ -1,109 +0,0 @@
"""Optional PDF engine backed by LiteParse (LlamaIndex, MIT).
LiteParse is a fast Rust/PDFium text extractor that performs spatial reading-order
reconstruction natively (multi-column aware) and is considerably faster than the
Python text path. It is exposed as a *selectable* engine (``liteparse``) rather
than replacing the builtin engine, so users can opt in per knowledge base.
Scope/limitations (documented intentionally):
* Text-first engine: it returns reading-order plain text, not figures. Scanned
pages carry no text layer, so for image-dominated PDFs we fall back to the
builtin scanned renderer (page -> JPEG, OCR'd by the Go App) to stay robust.
* docreader never runs OCR itself; OCR/VLM remain Go-side responsibilities.
"""
import logging
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
logger = logging.getLogger(__name__)
# If the extracted text averages fewer characters per page than this, the PDF is
# treated as scanned/image-dominated and routed to the builtin image renderer.
_MIN_CHARS_PER_PAGE = 20
# If at least this fraction of sampled pages are image-dominated, the PDF is
# scanned (even when it carries a garbled OCR text layer) and is routed to the
# builtin image renderer rather than trusting the low-quality text.
_SCANNED_PAGE_FRACTION = 0.5
def liteparse_available(_overrides=None):
    """Report whether the ``liteparse`` package can be imported.

    Returns:
        ``(True, "")`` on success, otherwise ``(False, reason)`` where
        ``reason`` embeds the import error. ``_overrides`` is accepted but unused.
    """
    try:
        import liteparse  # noqa: F401
    except Exception as import_error:  # pragma: no cover - depends on install
        status = (False, f"liteparse 未安装: {import_error}")
    else:
        status = (True, "")
    return status
class LiteParseParser(BaseParser):
    """Parse a PDF with LiteParse, falling back to scanned rendering when empty."""

    def _render_as_scanned(self, content: bytes) -> Document:
        """Delegate to the builtin ``PDFScannedParser`` for this file."""
        from docreader.parser.pdf_parser import PDFScannedParser

        return PDFScannedParser(
            file_name=self.file_name, file_type=self.file_type
        ).parse_into_text(content)

    def parse_into_text(self, content: bytes) -> Document:
        """Extract text from ``content``.

        The builtin scanned renderer is used when the estimated scanned-page
        fraction reaches ``_SCANNED_PAGE_FRACTION``, or when LiteParse yields
        fewer than ``_MIN_CHARS_PER_PAGE`` characters per page on average.
        Otherwise a text-only ``Document`` (no images) is returned.
        """
        import liteparse
        from docreader.parser.pdf_parser import estimate_scanned_fraction

        # Image-dominated PDFs (incl. ones with a garbled OCR text layer) carry
        # no trustworthy text; render them as images instead.
        try:
            scanned_frac = estimate_scanned_fraction(content)
        except Exception as exc:
            # Best effort: keep going as a text PDF, but leave a trace.
            logger.warning(
                "LiteParseParser: scanned-page estimate failed for %s: %s",
                self.file_name,
                exc,
            )
            scanned_frac = 0.0

        if scanned_frac >= _SCANNED_PAGE_FRACTION:
            logger.info(
                "LiteParseParser: %s is image-dominated (%.0f%% scanned pages); "
                "using builtin scanned renderer",
                self.file_name,
                scanned_frac * 100,
            )
            return self._render_as_scanned(content)

        engine = liteparse.LiteParse(ocr_enabled=False, quiet=True)
        result = engine.parse(content)
        page_count = int(result.num_pages)

        page_texts = [
            (getattr(result.get_page(i), "text", "") or "").strip()
            for i in range(page_count)
        ]
        doc_text = (getattr(result, "text", "") or "").strip()
        if not doc_text:
            # No document-level text: stitch the non-empty pages together.
            doc_text = "\n\n".join(t for t in page_texts if t)

        # Little to no text overall: treat as scanned and defer to the renderer.
        if page_count and len(doc_text) < _MIN_CHARS_PER_PAGE * page_count:
            logger.info(
                "LiteParseParser: %s looks scanned (%d chars / %d pages); "
                "falling back to builtin scanned renderer",
                self.file_name,
                len(doc_text),
                page_count,
            )
            return self._render_as_scanned(content)

        logger.info(
            "LiteParseParser: %s -> %d pages, content_len=%d",
            self.file_name,
            page_count,
            len(doc_text),
        )
        return Document(
            content=doc_text,
            images={},
            metadata={
                "page_count": page_count,
                "image_source_type": "pdf_text_layer",
                "parser_engine": "liteparse",
            },
        )

View File

@@ -18,6 +18,8 @@ import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
@@ -58,6 +60,71 @@ class MarkdownTableUtil:
re.MULTILINE,
)
@staticmethod
def _split_row_cells(row_line: str) -> List[str]:
"""Split a markdown table row into cells, preserving empty cells."""
inner = row_line.strip()
if not inner.startswith("|"):
return []
parts = inner.split("|")
if parts and parts[0].strip() == "":
parts = parts[1:]
if parts and parts[-1].strip() == "":
parts = parts[:-1]
return [part.strip() for part in parts]
@staticmethod
def _is_table_row(line: str) -> bool:
stripped = line.strip()
return stripped.startswith("|") and "|" in stripped[1:]
@classmethod
def _is_separator_row(cls, line: str) -> bool:
    """Tell whether every cell of ``line`` matches ``_SEPARATOR_CELL``.

    Rows with no cells are never separators.
    """
    cells = cls._split_row_cells(line)
    if not cells:
        return False
    for cell in cells:
        if not _SEPARATOR_CELL.match(cell):
            return False
    return True
@classmethod
def _is_empty_row(cls, line: str) -> bool:
cells = cls._split_row_cells(line)
return bool(cells) and all(cell == "" for cell in cells)
@classmethod
def _separator_row_for(cls, header_line: str) -> str:
cells = cls._split_row_cells(header_line)
return "| " + " | ".join("---" for _ in cells) + " |"
@classmethod
def _normalize_table_block(cls, block: List[str]) -> List[str]:
    """Fix MarkItDown-style tables: drop bogus prefix rows, ensure a delimiter row.

    Leading all-empty rows and one following separator row are removed from
    ``block`` in place. If at least two rows remain and the second is not a
    separator, a new list with a generated separator after the first row is
    returned; otherwise the trimmed ``block`` itself is returned.
    """
    while block and cls._is_empty_row(block[0]):
        del block[0]
    if block and cls._is_separator_row(block[0]):
        del block[0]

    # A delimiter row is only inserted when there is a row to follow it.
    has_second_row = len(block) >= 2
    if has_second_row and not cls._is_separator_row(block[1]):
        header, *rest = block
        return [header, cls._separator_row_for(header), *rest]
    return block
def normalize_spurious_table_prefixes(self, content: str) -> str:
    """Normalize every run of consecutive table rows found in ``content``.

    Lines are split on ``\\n``. Each maximal run of lines accepted by
    ``_is_table_row`` is passed through ``_normalize_table_block``; all other
    lines are copied through unchanged.
    """
    result: List[str] = []
    pending: List[str] = []

    for line in content.split("\n"):
        if self._is_table_row(line):
            pending.append(line)
            continue
        # A non-table line ends the current run, if any.
        if pending:
            result.extend(self._normalize_table_block(pending))
            pending = []
        result.append(line)

    # Flush a run that reaches the end of the content.
    if pending:
        result.extend(self._normalize_table_block(pending))
    return "\n".join(result)
def format_table(self, content: str) -> str:
"""Format all Markdown tables in the content.
@@ -70,8 +137,7 @@ class MarkdownTableUtil:
def process_align(match: Match[str]) -> str:
"""Process alignment row to standardize format."""
# Split by | and remove empty strings
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
columns = self._split_row_cells(match.group(0))
processed = []
for col in columns:
@@ -87,8 +153,7 @@ class MarkdownTableUtil:
def process_line(match: Match[str]) -> str:
"""Process regular table row to standardize format."""
# Split by | and remove empty strings
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
columns = self._split_row_cells(match.group(0))
# Preserve original indentation
prefix = match.group(1)
@@ -99,8 +164,7 @@ class MarkdownTableUtil:
formatted_content = self.line_pattern.sub(process_line, formatted_content)
# Then format alignment rows (must be done after to avoid conflicts)
formatted_content = self.align_pattern.sub(process_align, formatted_content)
return formatted_content
return self.normalize_spurious_table_prefixes(formatted_content)
@staticmethod
def _self_test():

View File

@@ -9,6 +9,11 @@ from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
from docreader.parser.concurrency import parser_worker_limit
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.ppt_convert import normalize_ppt_bytes
from docreader.parser.pptx_media import (
attach_pptx_media_to_markdown,
markdown_needs_pptx_media_attach,
)
logger = logging.getLogger(__name__)
@@ -32,16 +37,41 @@ class StdMarkitdownParser(BaseParser):
Uses self.file_type (inherited from BaseParser) to hint the stream format.
"""
ext = self.file_type
if ext and not ext.startswith('.'):
ext = '.' + ext
ft = (ext or "").lstrip(".").lower()
pptx_bytes: bytes | None = None
if ft in ("ppt", "pptx"):
content, ext = normalize_ppt_bytes(content, ft)
pptx_bytes = content
ft = "pptx"
elif ext and not ext.startswith("."):
ext = "." + ext
with parser_worker_limit("markitdown", CONFIG.markitdown_max_workers):
result = self.markitdown.convert(
result = self._convert_markitdown(content, ext, keep_data_uris=True)
if result is None:
logger.warning(
"MarkItDown failed with embedded images for %s; retrying without data URIs",
ft or ext,
)
result = self._convert_markitdown(content, ext, keep_data_uris=False)
text = result.text_content
images: dict[str, str] = {}
if pptx_bytes is not None and markdown_needs_pptx_media_attach(text):
text, images = attach_pptx_media_to_markdown(text, pptx_bytes)
return Document(content=text, images=images)
def _convert_markitdown(self, content: bytes, ext: str | None, *, keep_data_uris: bool):
try:
return self.markitdown.convert(
io.BytesIO(content),
file_extension=ext,
keep_data_uris=True
keep_data_uris=keep_data_uris,
)
return Document(content=result.text_content)
except Exception:
if keep_data_uris:
return None
raise
class MarkitdownParser(PipelineParser):

View File

@@ -0,0 +1,351 @@
"""PDF parser backed by OpenDataLoader PDF (Apache-2.0).
Requires Java 11+ on PATH and the ``opendataloader-pdf`` Python package.
Each ``convert()`` spawns a JVM; concurrency is limited via
``DOCREADER_ODL_MAX_WORKERS``.
Hybrid mode (``docling-fast``, etc.) needs a running
``opendataloader-pdf-hybrid`` server — configure ``DOCREADER_ODL_HYBRID_URL``.
"""
from __future__ import annotations
import base64
import html
import logging
import os
import re
import shutil
import tempfile
import urllib.error
import urllib.request
from typing import Any, Dict, Mapping, Optional, Tuple
from docreader.config import CONFIG
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.concurrency import parser_worker_limit
logger = logging.getLogger(__name__)
_MIN_CHARS_PER_PAGE = 20
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp")
_MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
_IMAGE_FILE_NUM_RE = re.compile(r"^imageFile(\d+)\.", re.I)
def _override_str(overrides: Optional[Mapping[str, Any]], key: str, default: str = "") -> str:
if overrides:
v = overrides.get(key)
if v is not None and str(v).strip() != "":
return str(v).strip()
return default
def _override_bool(overrides: Optional[Mapping[str, Any]], key: str, default: bool) -> bool:
if overrides:
v = overrides.get(key)
if v is not None and str(v).strip() != "":
return str(v).strip().lower() in {"1", "true", "yes", "y", "on"}
return default
def _java_available() -> Tuple[bool, str]:
if not shutil.which("java"):
return False, "需要 Java 11+JRE请安装并在 PATH 中配置 java"
return True, ""
def _package_available() -> Tuple[bool, str]:
try:
import opendataloader_pdf # noqa: F401
except ImportError as e:
return False, f"opendataloader-pdf 未安装: {e}"
return True, ""
def _ping_hybrid(
url: str,
*,
timeout_sec: float = 5.0,
retries: int = 3,
retry_delay_sec: float = 2.0,
) -> Tuple[bool, str]:
import time
base = url.rstrip("/")
health_url = f"{base}/health"
last_err = ""
for attempt in range(max(1, retries)):
try:
req = urllib.request.Request(health_url, method="GET")
with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
if 200 <= resp.status < 300:
return True, ""
last_err = f"hybrid 健康检查 HTTP {resp.status}: {health_url}"
except urllib.error.URLError as e:
last_err = f"无法连接 OpenDataLoader hybrid 服务 ({health_url}): {e}"
except Exception as e:
last_err = f"hybrid 健康检查失败: {e}"
if attempt + 1 < retries:
time.sleep(retry_delay_sec)
hint = (
";若刚执行 make dev-start --odl-hybrid请等待镜像构建/服务就绪"
"docker logs WeKnora-odl-hybrid"
)
return False, last_err + hint
def opendataloader_available(
    overrides: Optional[Mapping[str, Any]] = None,
) -> Tuple[bool, str]:
    """Registry / ListEngines availability probe.

    Checks Java, then the Python package. When a hybrid backend other than
    ``off`` is configured and a URL is set, the hybrid health endpoint is
    probed as well. Returns ``(ok, reason)``.
    """
    for probe in (_java_available, _package_available):
        ok, msg = probe()
        if not ok:
            return False, msg

    hybrid = _resolve_hybrid(overrides)
    if not hybrid or hybrid.lower() in ("off", ""):
        return True, ""

    url = _resolve_hybrid_url(overrides)
    if not url:
        return True, ""
    return _ping_hybrid(url, retries=6, retry_delay_sec=5.0, timeout_sec=5.0)
def _resolve_hybrid(overrides: Optional[Mapping[str, Any]] = None) -> str:
    """Return the hybrid backend: override ``odl_hybrid`` or ``CONFIG.odl_hybrid``."""
    configured = CONFIG.odl_hybrid
    return _override_str(overrides, "odl_hybrid", configured)
def _resolve_hybrid_url(overrides: Optional[Mapping[str, Any]] = None) -> str:
    """Return the hybrid URL: override ``odl_hybrid_url`` or ``CONFIG.odl_hybrid_url``."""
    configured = CONFIG.odl_hybrid_url
    return _override_str(overrides, "odl_hybrid_url", configured)
def _find_markdown_file(output_dir: str, pdf_stem: str) -> str:
candidates = []
for root, _, files in os.walk(output_dir):
for name in files:
if name.lower().endswith(".md"):
path = os.path.join(root, name)
candidates.append(path)
if not candidates:
raise FileNotFoundError(f"OpenDataLoader 未在 {output_dir} 生成 markdown 文件")
for path in candidates:
base = os.path.splitext(os.path.basename(path))[0]
if base == pdf_stem or base.startswith(pdf_stem):
return path
candidates.sort(key=lambda p: os.path.getmtime(p), reverse=True)
return candidates[0]
def _normalize_odl_image_url(raw: str) -> str:
"""OpenDataLoader wraps paths as ``<images/foo.png>``; storage may HTML-escape them."""
s = html.unescape((raw or "").strip())
s = s.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
s = s.strip().strip("<>").strip().strip('"').strip("'")
if s.startswith("./"):
s = s[2:]
return s.replace("\\", "/")
def _canonical_image_ref(abs_path: str, output_dir: str) -> str:
"""Use ``images/<file>`` keys to match OpenDataLoader markdown conventions."""
rel = os.path.relpath(abs_path, output_dir).replace("\\", "/")
name = os.path.basename(abs_path)
if rel.startswith("images/"):
return rel
return f"images/{name}"
def _collect_images_under_output(output_dir: str) -> Dict[str, str]:
    """Read every image file below ``output_dir`` into a ref -> base64 mapping.

    Files are selected by ``_IMAGE_SUFFIXES`` (case-insensitive). When two
    files map to the same canonical ref, the first one encountered is kept.
    """
    collected: Dict[str, str] = {}
    for folder, _dirs, filenames in os.walk(output_dir):
        image_names = (n for n in filenames if n.lower().endswith(_IMAGE_SUFFIXES))
        for filename in image_names:
            full_path = os.path.join(folder, filename)
            key = _canonical_image_ref(full_path, output_dir)
            if key not in collected:
                with open(full_path, "rb") as fh:
                    collected[key] = base64.b64encode(fh.read()).decode("utf-8")
    return collected
def _register_image_alias(aliases: Dict[str, str], alias: str, canonical: str) -> None:
    """Map the normalized form of ``alias`` to ``canonical`` (no-op if it normalizes to empty)."""
    normalized = _normalize_odl_image_url(alias)
    if not normalized:
        return
    aliases[normalized] = canonical
def _build_path_alias_map(images: Dict[str, str]) -> Dict[str, str]:
    """Build a lookup from the spellings ODL markdown may use to ``images`` keys.

    For every key the following spellings are registered: the key itself, its
    basename, ``images/<basename>``, and the angle-bracket and HTML-entity
    wrapped forms of the key and of ``images/<basename>``.
    """
    lookup: Dict[str, str] = {}
    for canonical in images:
        filename = os.path.basename(canonical)
        in_images_dir = f"images/{filename}"
        spellings = (
            canonical,
            filename,
            in_images_dir,
            f"<{canonical}>",
            f"<{in_images_dir}>",
            f"&lt;{canonical}&gt;",
            f"&lt;{in_images_dir}&gt;",
        )
        for spelling in spellings:
            _register_image_alias(lookup, spelling, canonical)
    return lookup
def _resolve_image_ref(url: str, aliases: Dict[str, str]) -> Optional[str]:
    """Resolve a markdown image URL to a canonical key of the images dict.

    Lookup order: the normalized URL, its basename, ``images/<basename>``,
    then a numbered fallback. The fallback parses a number from the file name
    with ``_IMAGE_FILE_NUM_RE`` and returns the canonical ref carrying the
    same number, or failing that the ``num``-th numbered ref (1-based).

    Args:
        url: Image URL as it appears in the markdown.
        aliases: Mapping produced by ``_build_path_alias_map``.

    Returns:
        The canonical ref, or ``None`` for empty / ``data:`` URLs and for
        URLs that cannot be matched.
    """
    key = _normalize_odl_image_url(url)
    if not key or key.startswith("data:"):
        return None
    if key in aliases:
        return aliases[key]

    base = os.path.basename(key)
    for candidate in (base, f"images/{base}"):
        if candidate in aliases:
            return aliases[candidate]

    m = _IMAGE_FILE_NUM_RE.match(base)
    if m is None:
        return None
    num = int(m.group(1))

    numbered = []
    for ref in set(aliases.values()):
        bm = _IMAGE_FILE_NUM_RE.match(os.path.basename(ref))
        if bm:
            numbered.append((int(bm.group(1)), ref))
    # Sort on (number, ref): sorting on the number alone left ties in set
    # iteration order, which varies between processes (string hash seed).
    numbered.sort()

    for n, ref in numbered:
        if n == num:
            return ref
    if 1 <= num <= len(numbered):
        return numbered[num - 1][1]
    return None
def _rewrite_markdown_image_refs(
    markdown: str, images: Dict[str, str]
) -> str:
    """Rewrite image links in ``markdown`` to the canonical keys of ``images``.

    Every match of ``_MD_IMAGE_RE`` whose URL resolves to a collected image is
    replaced by ``![alt](<canonical>)``; unresolved links are left untouched.
    Returns ``markdown`` unchanged when ``images`` is empty.
    """
    if not images:
        return markdown
    aliases = _build_path_alias_map(images)

    def extract_url(raw_url: str) -> str:
        # Angle-bracket (or entity-escaped) destinations may contain spaces,
        # so take everything up to the closing bracket instead of the first
        # whitespace-delimited token.
        text = (raw_url or "").strip()
        if text.startswith("<") and ">" in text:
            return text[: text.index(">") + 1]
        if text.startswith("&lt;") and "&gt;" in text:
            return text[: text.index("&gt;") + len("&gt;")]
        # A whitespace-only destination yields no tokens; treat it as empty
        # rather than raising IndexError.
        tokens = text.split()
        return tokens[0] if tokens else ""

    def repl(match: re.Match[str]) -> str:
        alt, raw_url = match.group(1), match.group(2)
        canonical = _resolve_image_ref(extract_url(raw_url), aliases)
        if canonical is None:
            return match.group(0)
        return f"![{alt}]({canonical})"

    return _MD_IMAGE_RE.sub(repl, markdown)
def _run_convert(
    pdf_path: str,
    output_dir: str,
    image_dir: str,
    overrides: Optional[Mapping[str, Any]] = None,
) -> None:
    """Invoke ``opendataloader_pdf.convert`` for one PDF.

    Always requests markdown output with external images written to
    ``image_dir``. Hybrid options (``hybrid``, ``hybrid_url``,
    ``hybrid_mode``, ``hybrid_fallback``) are only passed when the resolved
    hybrid backend is something other than ``off`` / empty.
    """
    import opendataloader_pdf

    with_html = _override_bool(
        overrides, "odl_markdown_with_html", CONFIG.odl_markdown_with_html
    )
    convert_args: Dict[str, Any] = dict(
        input_path=pdf_path,
        output_dir=output_dir,
        format="markdown",
        image_output="external",
        image_dir=image_dir,
        quiet=True,
        markdown_with_html=with_html,
    )

    mode = _resolve_hybrid(overrides)
    if mode and mode.lower() not in ("off", ""):
        convert_args["hybrid"] = mode
        optional_settings = (
            ("hybrid_url", _resolve_hybrid_url(overrides)),
            (
                "hybrid_mode",
                _override_str(overrides, "odl_hybrid_mode", CONFIG.odl_hybrid_mode),
            ),
        )
        for option, value in optional_settings:
            if value:
                convert_args[option] = value
        if _override_bool(overrides, "odl_hybrid_fallback", CONFIG.odl_hybrid_fallback):
            convert_args["hybrid_fallback"] = True

    opendataloader_pdf.convert(**convert_args)
class OpenDataLoaderParser(BaseParser):
    """PDF parser backed by OpenDataLoader (layout-aware markdown plus external images)."""

    def __init__(self, *args: Any, **kwargs: Any):
        """Remember engine-specific keyword overrides, then defer to ``BaseParser``.

        Keys starting with ``odl_`` and the two ``mineru_*`` keys are copied
        into ``self._engine_overrides``; all kwargs are still forwarded to the
        base class unchanged.
        """
        passthrough_keys = ("mineru_endpoint", "mineru_api_key")
        overrides: Dict[str, Any] = {}
        for key, value in kwargs.items():
            if key.startswith("odl_") or key in passthrough_keys:
                overrides[key] = value
        self._engine_overrides = overrides
        super().__init__(*args, **kwargs)

    def parse_into_text(self, content: bytes) -> Document:
        """Convert PDF bytes to a ``Document`` with markdown text and base64 images.

        Falls back to ``PDFScannedParser`` when the extracted text is shorter
        than ``_MIN_CHARS_PER_PAGE``.

        Raises:
            RuntimeError: The engine is reported unavailable.
        """
        available, reason = opendataloader_available(self._engine_overrides)
        if not available:
            raise RuntimeError(reason)

        # Work on a sanitized "<stem>.pdf" name inside the temp directory.
        safe_name = os.path.basename(self.file_name) or "document.pdf"
        if not safe_name.lower().endswith(".pdf"):
            stem_only = os.path.splitext(safe_name)[0] or "document"
            safe_name = f"{stem_only}.pdf"
        pdf_stem = os.path.splitext(safe_name)[0]

        max_workers = CONFIG.odl_max_workers
        with parser_worker_limit("opendataloader", max_workers):
            with tempfile.TemporaryDirectory(prefix="weknora-odl-") as work_dir:
                pdf_path = os.path.join(work_dir, safe_name)
                with open(pdf_path, "wb") as sink:
                    sink.write(content)

                image_dir = os.path.join(work_dir, "images")
                os.makedirs(image_dir, exist_ok=True)
                _run_convert(
                    pdf_path,
                    work_dir,
                    image_dir,
                    overrides=self._engine_overrides,
                )

                markdown_path = _find_markdown_file(work_dir, pdf_stem)
                with open(markdown_path, encoding="utf-8", errors="replace") as src:
                    text = src.read()
                # Images must be read before the temp directory is removed.
                images = _collect_images_under_output(work_dir)
                text = _rewrite_markdown_image_refs(text, images)

        if len(text.strip()) < _MIN_CHARS_PER_PAGE:
            logger.info(
                "OpenDataLoaderParser: %s yielded little text; "
                "falling back to builtin scanned renderer",
                self.file_name,
            )
            from docreader.parser.pdf_parser import PDFScannedParser

            fallback = PDFScannedParser(
                file_name=self.file_name, file_type=self.file_type
            )
            return fallback.parse_into_text(content)

        logger.info(
            "OpenDataLoaderParser: %s -> content_len=%d images=%d",
            self.file_name,
            len(text),
            len(images),
        )
        return Document(
            content=text,
            images=images,
            metadata={
                "parser_engine": "opendataloader",
                "odl_hybrid": _resolve_hybrid(self._engine_overrides) or "off",
            },
        )

View File

@@ -20,6 +20,7 @@ import base64
import io
import logging
import os
import re
import statistics
from docreader.config import CONFIG
@@ -87,12 +88,51 @@ EMBED_MAX_IMAGES = _env_int("DOCREADER_PDF_EMBED_MAX_IMAGES", 50)
# Reconstruct reading order with a geometric XY-cut so multi-column pages are
# linearised column-by-column instead of line-interleaved.
LAYOUT_ORDERING = _env_bool("DOCREADER_PDF_LAYOUT_ORDERING", True)
# When glyphs are positioned without explicit space characters (common in OCR /
# search text layers), insert a space if the horizontal gap exceeds this
# multiple of the line's median glyph width.
WORD_GAP_WIDTH_RATIO = _env_float("DOCREADER_PDF_WORD_GAP_WIDTH_RATIO", 0.4)
# Promote visually larger lines to markdown headings (font-size proxy = rect
# height relative to the page's median line height).
DETECT_HEADINGS = _env_bool("DOCREADER_PDF_DETECT_HEADINGS", True)
# Drop invisible (render-mode 3), off-page and degenerate text — a cheap guard
# against hidden-text prompt injection and OCR artefacts.
FILTER_HIDDEN_TEXT = _env_bool("DOCREADER_PDF_FILTER_HIDDEN_TEXT", True)
# Narrow side strips (arXiv watermarks, page labels) narrower than this share of
# page width are dropped when they look like vertical / single-glyph noise.
MARGIN_COL_WIDTH_RATIO = _env_float("DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO", 0.12)
# Minimum characters on a line before font-size heuristics may promote it to a
# markdown heading (avoids ``### C`` from margin glyphs).
MIN_HEADING_LINE_CHARS = _env_int("DOCREADER_PDF_MIN_HEADING_LINE_CHARS", 8)
# Strip pdfium placeholder glyphs (U+FFFE) and soft hyphens; remove axis/legend text
# from vector figures when a Figure caption is present on the page.
SANITIZE_PDF_TEXT = _env_bool("DOCREADER_PDF_SANITIZE_TEXT", True)
# Gates the removal of runs of axis/legend lines in text post-processing.
STRIP_CHART_TEXT_DEBRIS = _env_bool("DOCREADER_PDF_STRIP_CHART_DEBRIS", True)
# Render detected vector chart regions (no embedded bitmap) as JPEG for VLM/OCR.
RENDER_VECTOR_FIGURES = _env_bool("DOCREADER_PDF_RENDER_VECTOR_FIGURES", True)
# Minimum number of axis-tick glyphs on a page before their bounding box is
# considered a chart region.
MIN_CHART_REGION_CHARS = _env_int("DOCREADER_PDF_MIN_CHART_REGION_CHARS", 18)
# Accepted range for a figure/chart bounding box, as a fraction of page area.
# NOTE: the environment variable names carry no ``_RATIO`` suffix.
MIN_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MIN_CHART_REGION_AREA", 0.015)
MAX_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MAX_CHART_REGION_AREA", 0.42)
# Upper bound on figure height above its caption, as a fraction of page height.
MAX_FIGURE_HEIGHT_RATIO = _env_float("DOCREADER_PDF_MAX_FIGURE_HEIGHT_RATIO", 0.38)
# pdfium / Adobe text layers often emit U+FFFE for missing hyphenation or ligatures.
# Characters removed outright: soft hyphen, U+200B..U+200F, U+FEFF, U+FFFE, U+FFFF.
_PDF_ARTIFACT_RE = re.compile(r"[\u00ad\u200b-\u200f\ufeff\ufffe\uffff]")
# A soft hyphen or U+FFFE sitting between two word characters.
_PDF_ARTIFACT_JOIN_RE = re.compile(r"(\w)[\u00ad\ufffe](\w)")
# Whole-line patterns treated as chart debris: digits/whitespace/dots only,
# "<n>-layer", "iter. (1e4)", and "training|test error (%)".
_CHART_DEBRIS_LINE_RE = re.compile(
r"^(?:"
r"[\d\s.]+|"
r"\d{1,2}|"
r"\d+-layer|"
r"iter\.\s*\(1e4\)|"
r"(?:training|test)\s+error\s*\(%\)"
r")$",
re.IGNORECASE,
)
# Legend entry of the form "<n>-layer".
_CHART_LAYER_RE = re.compile(r"^\d+-layer$", re.IGNORECASE)
# Caption at the start of a line ("Figure 3 ...").
_FIGURE_CAPTION_RE = re.compile(r"^Figure\s+\d+\b", re.IGNORECASE)
# "Figure <n>" anywhere in a line; group 1 is the figure number.
_FIGURE_CAPTION_SEARCH_RE = re.compile(r"\bFigure\s+(\d+)\b", re.IGNORECASE)
# Line that starts with an arXiv identifier ("arXiv:1512.03385v1 ...").
_ARXIV_LINE_RE = re.compile(r"^arXiv:\s*\S+", re.IGNORECASE)
# Line consisting only of a 1-3 digit number (treated as a page number).
_PAGE_NUM_LINE_RE = re.compile(r"^\d{1,3}$")
def _close_pdfium_resource(resource) -> None:
@@ -150,6 +190,394 @@ def _extract_page_text(page) -> str:
_close_pdfium_resource(textpage)
def _sanitize_pdf_text(text: str) -> str:
    """Remove PDF text-layer placeholders and repair broken hyphenations.

    The in-word join runs first: previously the blanket removal ran before
    it and deleted the very characters the join pattern looks for, so the
    join could never match. The resulting text is the same; the join step is
    simply no longer dead code.

    Returns ``text`` unchanged when it is empty or ``None``.
    """
    if not text:
        return text
    # 1) "hy<U+00AD>phen" / "hy<U+FFFE>phen" -> "hyphen"
    text = _PDF_ARTIFACT_JOIN_RE.sub(r"\1\2", text)
    # 2) drop every remaining artifact character
    return _PDF_ARTIFACT_RE.sub("", text)
def _is_chart_debris_line(line: str) -> bool:
    """Tell whether ``line`` looks like axis / legend text leaked from a chart.

    True when the stripped line matches ``_CHART_DEBRIS_LINE_RE`` or
    ``_CHART_LAYER_RE``, or when it is a short (<= 24 chars) run of digits,
    whitespace and ``.()-`` containing at least three digits. Blank lines
    are never debris.
    """
    candidate = line.strip()
    if not candidate:
        return False
    if _CHART_DEBRIS_LINE_RE.match(candidate) or _CHART_LAYER_RE.match(candidate):
        return True
    # Tick labels like "0 1 2 3 4 5 6 0"
    looks_like_ticks = re.fullmatch(r"[\d\s.()-]+", candidate) is not None
    digit_count = sum(1 for ch in candidate if ch.isdigit())
    return looks_like_ticks and len(candidate) <= 24 and digit_count >= 3
def _strip_chart_text_debris(text: str) -> str:
    """Drop runs of axis/legend lines leaked from vector figures into the text layer.

    A run starts at a debris line and extends over following debris or blank
    lines. It is removed only when it contains at least three debris lines.
    Blank lines no longer count toward that threshold; previously a single
    numeric line followed by two blank lines was deleted as a "run".

    Line endings are normalized to ``\\n``. Empty input is returned as is.
    """
    if not text:
        return text
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    total = len(lines)
    kept: list = []
    i = 0
    while i < total:
        if _is_chart_debris_line(lines[i]):
            j = i
            debris_lines = 0
            while j < total:
                if _is_chart_debris_line(lines[j]):
                    debris_lines += 1
                elif lines[j].strip():
                    break  # real content ends the run
                j += 1
            if debris_lines >= 3:
                i = j
                continue
        kept.append(lines[i])
        i += 1
    return "\n".join(kept)
def _strip_arxiv_and_page_num_lines(text: str) -> str:
    """Remove arXiv stamp lines and bare page-number lines from ``text``.

    Lines matching ``_ARXIV_LINE_RE`` or ``_PAGE_NUM_LINE_RE`` are dropped.
    On any other line containing ``arXiv:``, the identifier and everything
    after it on that line is cut; the line is dropped if nothing remains.
    Line endings are normalized to ``\\n``.
    """
    survivors: list = []
    for raw_line in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
        probe = raw_line.strip()
        if _ARXIV_LINE_RE.match(probe) or _PAGE_NUM_LINE_RE.match(probe):
            continue
        if "arXiv:" in raw_line:
            raw_line = re.sub(
                r"\s*arXiv:\s*\S+\s*(?:\[[^\]]+\])?\s*[^\n]*", "", raw_line
            ).strip()
            if not raw_line:
                continue
        survivors.append(raw_line)
    return "\n".join(survivors)
def _strip_lines_above_figure_captions(text: str) -> str:
    """Remove diagram/chart label lines that sit immediately above a Figure caption.

    When a line containing a figure caption is reached, already-kept lines
    directly before it are popped for as long as they qualify as figure
    interior lines. Line endings are normalized to ``\\n``.
    """
    kept: list = []
    for current in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
        if _line_has_figure_caption(current):
            while kept and _is_figure_interior_line(kept[-1]):
                kept.pop()
        kept.append(current)
    return "\n".join(kept)
def _is_body_paragraph_line(text: str) -> bool:
t = text.strip()
if len(t) < 48:
return False
return len(t.split()) >= 8
def _is_figure_interior_line(text: str) -> bool:
    """Short, non-body line directly above a Figure caption (diagram labels, ticks).

    Decision order:
      * blank lines and lines starting with a figure caption -> False
      * arXiv stamp / bare page number -> True
      * body paragraph line -> False
      * chart debris -> True
      * sentence-like line (ends with a sentence terminator, >= 15 chars) -> False
      * seven or more words -> False
      * otherwise True only when at most 40 characters long
    """
    t = text.strip()
    if not t or _FIGURE_CAPTION_RE.match(t):
        return False
    if _ARXIV_LINE_RE.match(t) or _PAGE_NUM_LINE_RE.match(t):
        return True
    if _is_body_paragraph_line(t):
        return False
    if _is_chart_debris_line(t):
        return True
    # Prose sentence above a figure (wrapped paragraph tail) — keep in text.
    # The suffix tuple previously contained empty strings; ``endswith("")`` is
    # always true, which reduced this test to ``len(t) >= 15``. ASCII and CJK /
    # fullwidth sentence terminators are listed explicitly instead.
    sentence_terminators = (".", "!", "?", "\u3002", "\uff01", "\uff1f")
    if t.endswith(sentence_terminators) and len(t) >= 15:
        return False
    if len(t.split()) >= 7:
        return False
    return len(t) <= 40
def _postprocess_pdf_text(text: str) -> str:
    """Run the page-text clean-up passes in order.

    Artifact sanitizing (if ``SANITIZE_PDF_TEXT``), arXiv / page-number line
    removal, removal of label lines above figure captions, then chart debris
    stripping (if ``STRIP_CHART_TEXT_DEBRIS``).
    """
    cleaned = _sanitize_pdf_text(text) if SANITIZE_PDF_TEXT else text
    cleaned = _strip_arxiv_and_page_num_lines(cleaned)
    cleaned = _strip_lines_above_figure_captions(cleaned)
    return _strip_chart_text_debris(cleaned) if STRIP_CHART_TEXT_DEBRIS else cleaned
def _char_looks_chart_axis_tick(ch: str) -> bool:
    """Axis tick / numeric chart labels only (not words like ``layer`` in diagrams).

    Accepts a single character from ``0123456789.%()-``, a ``<n>-layer``
    token, or one of the literal axis titles ``iter. (1e4)`` and
    ``training|test error (%)`` (case-insensitive).
    """
    token = ch.strip()
    if not token:
        return False
    if len(token) == 1 and token in "0123456789.%()-":
        return True
    if _CHART_LAYER_RE.match(token):
        return True
    axis_titles = (r"iter\.\s*\(1e4\)", r"(?:training|test)\s+error\s*\(%\)")
    return any(re.fullmatch(pattern, token, re.I) for pattern in axis_titles)
def _chars_bbox(char_list: list) -> tuple:
return (
min(c["x0"] for c in char_list),
min(c["y0"] for c in char_list),
max(c["x1"] for c in char_list),
max(c["y1"] for c in char_list),
)
def _bbox_area_ratio(bbox, page_w: float, page_h: float) -> float:
page_area = float(page_w) * float(page_h)
if page_area <= 0:
return 0.0
x0, y0, x1, y1 = bbox
return max(0.0, (x1 - x0) * (y1 - y0) / page_area)
def _chart_region_bbox(chars: list, page_w: float, page_h: float):
    """Padded bounding box around the page's axis-tick glyphs, or ``None``.

    ``None`` is returned when fewer than ``MIN_CHART_REGION_CHARS`` tick
    glyphs exist or when the box area ratio falls outside
    ``[MIN_CHART_REGION_AREA_RATIO, MAX_CHART_REGION_AREA_RATIO]``. The box is
    padded by 8% per side (at least 8 units) and clamped to the page.
    """
    tick_glyphs = [g for g in chars if _char_looks_chart_axis_tick(g["ch"])]
    if len(tick_glyphs) < MIN_CHART_REGION_CHARS:
        return None

    raw_box = _chars_bbox(tick_glyphs)
    area_ratio = _bbox_area_ratio(raw_box, page_w, page_h)
    if not (MIN_CHART_REGION_AREA_RATIO <= area_ratio <= MAX_CHART_REGION_AREA_RATIO):
        return None

    left, bottom, right, top = raw_box
    pad_x = max(8.0, (right - left) * 0.08)
    pad_y = max(8.0, (top - bottom) * 0.08)
    return (
        max(0.0, left - pad_x),
        max(0.0, bottom - pad_y),
        min(page_w, right + pad_x),
        min(page_h, top + pad_y),
    )
def _expand_chart_bbox(bbox, page_w: float, page_h: float, margin_frac: float = 0.18):
x0, y0, x1, y1 = bbox
dx = (x1 - x0) * margin_frac
dy = (y1 - y0) * margin_frac
return (
max(0.0, x0 - dx),
max(0.0, y0 - dy),
min(page_w, x1 + dx),
min(page_h, y1 + dy),
)
def _render_page_clip_jpeg(page, bbox, scale: float, quality: int, max_edge: int) -> bytes:
    """Render a PDF page region to JPEG (bbox in PDF points, bottom-left origin).

    The whole page is rendered at the effective scale and then cropped; the
    vertical coordinates are flipped because image rows grow downwards.

    Raises:
        ValueError: The crop rectangle has no positive width or height.
    """
    left, bottom, right, top = bbox
    zoom = _effective_scale(page, scale, max_edge)

    bitmap = None
    try:
        bitmap = page.render(scale=zoom)
        image = bitmap.to_pil().convert("RGB")
    finally:
        _close_pdfium_resource(bitmap)

    _, page_height = page.get_size()
    crop_box = (
        int(left * zoom),
        int((page_height - top) * zoom),
        int(right * zoom),
        int((page_height - bottom) * zoom),
    )
    if crop_box[2] <= crop_box[0] or crop_box[3] <= crop_box[1]:
        raise ValueError("degenerate clip bbox")
    return _pil_to_jpeg_bytes(image.crop(crop_box), quality)
def _pil_to_jpeg_bytes(pil, quality: int) -> bytes:
buf = io.BytesIO()
if pil.mode not in ("RGB", "L"):
pil = pil.convert("RGB")
pil.save(buf, format="JPEG", quality=quality, optimize=True)
return buf.getvalue()
def _group_lines_with_chars(chars: list) -> list:
    """Group glyphs into lines; each line includes its char list and bbox.

    Glyphs are visited by descending vertical centre. A glyph joins the
    current line while its centre is within half the median glyph height of
    the line's first glyph. Each result is a dict with ``text``, ``h``
    (median glyph height of the line), ``chars`` (sorted by ``x0``) and
    ``bbox``; lines whose joined text is empty are omitted.
    """
    if not chars:
        return []

    glyph_heights = [g["y1"] - g["y0"] for g in chars if g["y1"] > g["y0"]]
    typical_h = statistics.median(glyph_heights) if glyph_heights else 1.0
    tolerance = 0.5 * typical_h

    rows: list = []
    current: list = []
    anchor_y = None
    for glyph in sorted(chars, key=lambda c: -(c["y0"] + c["y1"]) / 2):
        centre_y = (glyph["y0"] + glyph["y1"]) / 2
        if anchor_y is not None and abs(centre_y - anchor_y) > tolerance:
            # Too far from the line's first glyph: close the row, start anew.
            rows.append(current)
            current = []
            anchor_y = None
        if anchor_y is None:
            anchor_y = centre_y
        current.append(glyph)
    if current:
        rows.append(current)

    result: list = []
    for row in rows:
        by_x = sorted(row, key=lambda c: c["x0"])
        joined = _join_line_glyphs(by_x)
        if not joined:
            continue
        row_heights = [g["y1"] - g["y0"] for g in by_x if g["y1"] > g["y0"]]
        result.append(
            {
                "text": joined,
                "h": statistics.median(row_heights) if row_heights else typical_h,
                "chars": by_x,
                "bbox": _chars_bbox(by_x),
            }
        )
    return result
def _line_has_figure_caption(text: str) -> bool:
    """True when ``Figure <n>`` occurs anywhere in ``text`` (``None`` counts as empty)."""
    candidate = (text or "").strip()
    return _FIGURE_CAPTION_SEARCH_RE.search(candidate) is not None
def _bbox_above_caption(lines: list, cap_i: int, page_w: float, page_h: float):
    """Region above a Figure caption line (PDF coords, bottom-left origin).

    Starts from a box as wide as the caption and up to
    ``min(MAX_FIGURE_HEIGHT_RATIO, 0.35)`` of the page tall, then walks the
    preceding lines: figure-interior, debris and blank lines widen and raise
    the box, a body paragraph line stops the walk. The result is at least
    8% of the page height tall and gets a horizontal margin of 5% (min 8).
    """
    caption_box = lines[cap_i]["bbox"]
    left, right = caption_box[0], caption_box[2]
    floor_y = caption_box[3]
    nominal_h = page_h * min(MAX_FIGURE_HEIGHT_RATIO, 0.35)
    ceiling_y = min(page_h, floor_y + nominal_h)

    for idx in reversed(range(cap_i)):
        label = lines[idx]["text"]
        box = lines[idx]["bbox"]
        if box[3] < floor_y - 4:
            continue  # entirely below the caption top
        if box[1] > ceiling_y + 4:
            break  # beyond the current figure ceiling
        interior = _is_figure_interior_line(label)
        if _is_body_paragraph_line(label) and not interior:
            break
        if interior or _is_chart_debris_line(label) or not label.strip():
            left = min(left, box[0])
            right = max(right, box[2])
            ceiling_y = max(ceiling_y, min(page_h, box[3] + nominal_h * 0.15))

    minimum_h = page_h * 0.08
    if ceiling_y - floor_y < minimum_h:
        ceiling_y = min(page_h, floor_y + minimum_h)

    side_margin = max(8.0, (right - left) * 0.05)
    return (
        max(0.0, left - side_margin),
        floor_y,
        min(page_w, right + side_margin),
        ceiling_y,
    )
def _cap_bbox_height(bbox, page_h: float, cap_y_top: float) -> tuple:
    """Limit figure bbox height (PDF coords, bottom-left origin).

    The top edge is lowered to ``cap_y_top + page_h * MAX_FIGURE_HEIGHT_RATIO``
    when that is below the current top; ``bbox`` is returned unchanged if the
    limit would not leave a positive height.
    """
    left, bottom, right, top = bbox
    limit = min(top, cap_y_top + page_h * MAX_FIGURE_HEIGHT_RATIO)
    return bbox if limit <= bottom else (left, bottom, right, limit)
def _inject_figure_markdown_before_captions(text: str, clips: list) -> str:
    """Place ``![...]()`` immediately before each Figure caption line in page text.

    Clips are matched to caption lines by figure number, parsed from the
    caption stored as the fourth element of each clip tuple. Purely positional
    assignment attached images to the wrong caption whenever a figure on the
    page produced no clip. Clips without a parsable caption (including
    3-element tuples) are still consumed in order.

    Args:
        text: Page text; line endings are normalized to ``\\n``.
        clips: ``[(ref_path, b64, y_sort, caption_line), ...]``.

    Returns:
        ``text`` with an image line and a blank line inserted before each
        caption line that received a clip. A caption line directly following
        an image line is left alone.
    """
    if not clips:
        return text

    def figure_number(caption):
        found = _FIGURE_CAPTION_SEARCH_RE.search((caption or "").strip())
        return int(found.group(1)) if found else None

    pending = [
        (clip[0], figure_number(clip[3]) if len(clip) > 3 else None)
        for clip in clips
    ]

    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    for i, ln in enumerate(lines):
        if not pending:
            break
        wanted = figure_number(ln)
        if wanted is None:
            continue
        if i > 0 and lines[i - 1].lstrip().startswith("!["):
            continue
        # Prefer the clip rendered for this figure number; otherwise fall back
        # to the first clip whose number is unknown. Clips known to belong to
        # a different figure are never used here.
        pick = next((k for k, (_, num) in enumerate(pending) if num == wanted), None)
        if pick is None:
            pick = next((k for k, (_, num) in enumerate(pending) if num is None), None)
        if pick is None:
            continue
        ref_path, _ = pending.pop(pick)
        fname = os.path.basename(ref_path)
        lines[i] = f"![{fname}]({ref_path})\n\n{ln}"
    return "\n".join(lines)
def _extract_vector_figure_clips(
    page,
    page_index: int,
    plain_text: str,
    raw,
    base_name: str,
    scale: float,
    quality: int,
    max_edge: int,
) -> list:
    """Render vector figure regions anchored at each ``Figure N.`` caption on the page.

    Returns ``[(ref_path, b64, y_sort, caption_line), ...]`` for markdown injection.

    Returns an empty list when ``RENDER_VECTOR_FIGURES`` is off, when
    ``plain_text`` mentions no figure, or when page-level extraction fails.
    A failure while processing one caption is logged at debug level and skips
    only that figure; previously it discarded every clip already rendered
    for the page.
    """
    if not RENDER_VECTOR_FIGURES or not re.search(r"\bFigure\s+\d+", plain_text, re.I):
        return []
    textpage = None
    try:
        textpage = page.get_textpage()
        chars, page_w = _page_chars(textpage, page, raw)
        if not chars:
            return []
        page_h = page.get_size()[1]
        lines = _merge_orphan_punctuation_lines(_group_lines_with_chars(chars))
        caption_indices = [
            i for i, ln in enumerate(lines) if _line_has_figure_caption(ln["text"])
        ]
        if not caption_indices:
            return []

        results: list = []
        for fig_idx, cap_i in enumerate(caption_indices):
            try:
                cap_line = lines[cap_i]["text"].strip()
                m = _FIGURE_CAPTION_SEARCH_RE.search(cap_line)
                if m:
                    # Keep the caption from "Figure N" onwards, first line only.
                    cap_line = cap_line[m.start() :].split("\n", 1)[0].strip()

                bbox = _bbox_above_caption(lines, cap_i, page_w, page_h)
                if bbox is None:
                    bbox = _chart_region_bbox(chars, page_w, page_h)
                if bbox is None:
                    continue

                ratio = _bbox_area_ratio(bbox, page_w, page_h)
                if ratio > MAX_CHART_REGION_AREA_RATIO:
                    # Too large: retry once with the height capped.
                    bbox = _cap_bbox_height(bbox, page_h, lines[cap_i]["bbox"][3])
                    ratio = _bbox_area_ratio(bbox, page_w, page_h)
                if ratio > MAX_CHART_REGION_AREA_RATIO:
                    continue
                if ratio < MIN_CHART_REGION_AREA_RATIO:
                    continue

                bbox = _expand_chart_bbox(bbox, page_w, page_h, margin_frac=0.06)
                jpeg = _render_page_clip_jpeg(page, bbox, scale, quality, max_edge)
            except Exception:
                logger.debug(
                    "vector figure clip %d failed on page %d",
                    fig_idx + 1,
                    page_index,
                    exc_info=True,
                )
                continue

            fname = f"{base_name}_p{page_index + 1}_fig{fig_idx + 1}.jpg"
            results.append(
                (
                    f"images/{fname}",
                    base64.b64encode(jpeg).decode("utf-8"),
                    bbox[3],
                    cap_line,
                )
            )
        return results
    except Exception:
        logger.debug("vector figure clip failed on page %d", page_index, exc_info=True)
        return []
    finally:
        _close_pdfium_resource(textpage)
def _collect_invisible_boxes(page, raw) -> list:
"""Bounding boxes of invisible (render-mode 3) text objects on the page."""
boxes: list = []
@@ -251,6 +679,109 @@ def _split_columns(chars: list, scale: float, width: float, depth: int = 0) -> l
)
def _column_x_span(chars: list) -> float:
if not chars:
return 0.0
return max(c["x1"] for c in chars) - min(c["x0"] for c in chars)
def _column_single_line_fraction(lines: list) -> float:
if not lines:
return 0.0
single = sum(1 for ln in lines if len(ln["text"]) <= 2)
return single / len(lines)
def _is_artifact_column(chars: list, width: float) -> bool:
    """Detect margin strips and vertical watermarks (e.g. arXiv sidebar).

    Geometry-only heuristic. A column is an artifact when it is empty or has
    no horizontal extent, when it is narrower than ``MARGIN_COL_WIDTH_RATIO``
    of ``width`` with at least 45% tiny lines, or when it is a tall stack
    (vertical extent > 3.5x horizontal, 8+ glyphs) with at least 35% tiny lines.
    """
    if not chars or width <= 0:
        return True
    x_extent = _column_x_span(chars)
    if x_extent <= 0:
        return True

    tiny_share = _column_single_line_fraction(_group_lines(chars))
    if x_extent / width < MARGIN_COL_WIDTH_RATIO and tiny_share >= 0.45:
        return True

    centres = [(glyph["y0"] + glyph["y1"]) / 2 for glyph in chars]
    y_extent = max(centres) - min(centres)
    # Vertical text: tall stack, narrow horizontal extent, mostly one char/line.
    return y_extent > x_extent * 3.5 and len(chars) >= 8 and tiny_share >= 0.35
def _filter_reading_columns(chars: list, scale: float, width: float) -> list:
    """Split into columns and drop margin / watermark strips.

    If every column is classified as an artifact, the widest one is kept
    (or the single column as is) so the page never ends up empty.
    """
    columns = _split_columns(chars, scale, width)
    readable = [col for col in columns if not _is_artifact_column(col, width)]
    if readable:
        return readable
    # All columns looked like noise — keep the widest glyph set (main body).
    return [max(columns, key=_column_x_span)] if len(columns) > 1 else columns
def _merge_orphan_punctuation_lines(lines: list) -> list:
"""Attach lines that are only punctuation to the previous visual line.
Many PDFs place ``.`` in figure labels or footnotes on a slightly different
baseline; grouping by y then leaves ``Figure 1`` and ``2:`` on separate lines.
"""
if not lines:
return []
merged: list = []
for ln in lines:
t = ln["text"].strip()
if (
merged
and t
and len(t) <= 4
and all(c in ".,;:!?…·" or c.isspace() for c in t)
):
suffix = "".join(t.split())
prev = merged[-1]["text"]
if suffix and prev and not prev.endswith((" ", "-")):
merged[-1]["text"] = prev + suffix
else:
merged[-1]["text"] = (prev + " " + t).strip()
continue
merged.append(dict(ln))
return merged
def _join_line_glyphs(ln_sorted: list) -> str:
    """Join a visual line's glyphs, inferring word spaces from horizontal gaps.

    A space is inserted between two non-space glyphs when the gap between
    them exceeds ``WORD_GAP_WIDTH_RATIO`` times the line's median glyph
    width. A whitespace glyph is skipped when the output already ends with a
    space. The result is stripped.
    """
    if not ln_sorted:
        return ""

    glyph_widths = [g["x1"] - g["x0"] for g in ln_sorted if g["x1"] > g["x0"]]
    typical_w = statistics.median(glyph_widths) if glyph_widths else 1.0
    min_gap = typical_w * WORD_GAP_WIDTH_RATIO

    pieces: list[str] = [ln_sorted[0]["ch"]]
    for before, glyph in zip(ln_sorted, ln_sorted[1:]):
        ch = glyph["ch"]
        if ch.isspace() or before["ch"].isspace():
            # Explicit whitespace involved: no gap inference, just avoid
            # emitting a whitespace glyph right after a space.
            if not ch.isspace() or not pieces[-1].endswith(" "):
                pieces.append(ch)
            continue
        if glyph["x0"] - before["x1"] > min_gap:
            pieces.append(" ")
        pieces.append(ch)
    return "".join(pieces).strip()
def _group_lines(chars: list) -> list:
"""Group a column's glyphs into lines (top-to-bottom, glyphs sorted by x)."""
if not chars:
@@ -277,7 +808,7 @@ def _group_lines(chars: list) -> list:
out: list = []
for ln in lines:
ln_sorted = sorted(ln, key=lambda c: c["x0"])
text = "".join(c["ch"] for c in ln_sorted).strip()
text = _join_line_glyphs(ln_sorted)
if not text:
continue
hs = [c["y1"] - c["y0"] for c in ln_sorted if c["y1"] - c["y0"] > 0]
@@ -293,7 +824,12 @@ def _segments_to_markdown(lines: list) -> str:
def level(ln) -> int:
txt = ln["text"]
if not DETECT_HEADINGS or body <= 0 or len(txt) > 80:
if (
not DETECT_HEADINGS
or body <= 0
or len(txt) > 80
or len(txt) < MIN_HEADING_LINE_CHARS
):
return 0
if txt[-1:] in ".。!?,;:":
return 0
@@ -317,6 +853,100 @@ def _segments_to_markdown(lines: list) -> str:
return "\n".join(out)
def _chars_to_layout_markdown(chars: list, scale: float, width: float) -> str:
    """Turn page glyphs into markdown, one block per reading column.

    Columns come from ``_filter_reading_columns``; each is grouped into lines,
    has orphan punctuation merged, and is converted to markdown. Empty blocks
    are skipped and the rest joined with newlines.
    """
    rendered = (
        _segments_to_markdown(_merge_orphan_punctuation_lines(_group_lines(column)))
        for column in _filter_reading_columns(chars, scale, width)
    )
    return "\n".join(block for block in rendered if block)
def _layout_line_stats(text: str) -> tuple:
"""Return (line_count, single_char_line_count, punct_only_line_count)."""
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
if not lines:
return 0, 0, 0
single = sum(1 for ln in lines if len(ln) <= 2)
punct_only = sum(
1
for ln in lines
if len(ln) <= 4 and re.fullmatch(r"[\s.,;:!?…·\-–—]+", ln)
)
return len(lines), single, punct_only
def _layout_garbled_line_fraction(text: str) -> float:
"""Share of lines that look like broken OCR (many 12 letter tokens)."""
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
if not lines:
return 0.0
garbled = 0
for ln in lines:
words = ln.split()
if len(words) >= 6 and sum(1 for w in words if len(w) <= 2) / len(words) > 0.45:
garbled += 1
return garbled / len(lines)
def _plain_is_well_formed(plain: str) -> bool:
"""True when pdfium plain text already has usable words and punctuation.
Academic PDFs (arXiv) and TOCs already expose a good text layer; running
geometric layout on them often destroys citations and words. Scanned books
with a poor text layer (no commas in refs, short glued tokens) still need
layout gap inference.
"""
plain = (plain or "").strip()
if not plain:
return False
if re.search(r"\[\w+,\s", plain):
return True
if plain.count(" . . ") >= 2:
return True
words = re.findall(r"\S+", plain)
if len(words) < 30:
return False
avg_len = sum(len(w) for w in words) / len(words)
return avg_len >= 5.0
def _should_prefer_plain(plain: str, layout: str) -> bool:
    """Fall back to pdfium plain text when layout reconstruction looks broken.

    Plain text is preferred when the layout is empty, has too many tiny or
    punctuation-only lines, is markedly more garbled than the plain text, has
    lost the comma of a bracketed citation, or no longer contains the first
    substantial plain-text line (compared on its leading alphanumerics).
    With an empty ``plain`` and a non-empty ``layout`` the layout is kept.
    """
    layout = (layout or "").strip()
    plain = (plain or "").strip()
    if not layout:
        return True
    if not plain:
        return False

    line_count, tiny_lines, punct_lines = _layout_line_stats(layout)
    if line_count == 0:
        return True
    if tiny_lines / line_count >= 0.18 or punct_lines / line_count >= 0.12:
        return True

    if (
        _layout_garbled_line_fraction(layout) >= 0.20
        and _layout_garbled_line_fraction(plain) < 0.08
    ):
        return True

    citation_in_plain = re.search(r"\[\w+,\s", plain)
    if citation_in_plain and re.search(r"\[\w+\s+\w+\s+\d", layout):
        return True

    # Title / lead sentence from plain should survive in layout.
    for raw_line in plain.splitlines():
        probe = raw_line.strip()
        if len(probe) < 24:
            continue
        fingerprint = "".join(filter(str.isalnum, probe))[:16]
        if len(fingerprint) < 12:
            continue
        # Only the first qualifying line is checked.
        return fingerprint not in "".join(filter(str.isalnum, layout))
    return False
def _extract_layout_text(page, raw) -> str:
"""Layout-aware extraction: reading order + headings + hidden-text filter.
@@ -331,12 +961,7 @@ def _extract_layout_text(page, raw) -> str:
return ""
heights = [c["y1"] - c["y0"] for c in chars if c["y1"] - c["y0"] > 0]
scale = (statistics.median(heights) if heights else 1.0) or 1.0
blocks = []
for col in _split_columns(chars, scale, width):
md = _segments_to_markdown(_group_lines(col))
if md:
blocks.append(md)
return "\n".join(blocks)
return _chars_to_layout_markdown(chars, scale, width)
except Exception:
logger.debug("layout extraction failed; using plain text", exc_info=True)
return _extract_page_text(page)
@@ -623,37 +1248,6 @@ def _extract_embedded_images(pdf, classes, raw, base_name: str, quality: int) ->
return result
def estimate_scanned_fraction(content: bytes, sample: int = 12) -> float:
    """Return the fraction of (sampled) pages that look image-dominated.

    Used by alternative engines that lack image-object access to decide
    whether a PDF is scanned, applying the same image-area signal the builtin
    router uses. At most ``sample`` evenly spaced pages are inspected.

    The stride is the ceiling of ``page_count / sample``; the earlier floor
    division could visit almost twice as many pages as requested. A
    non-positive ``sample`` is treated as 1 instead of dividing by zero.

    Args:
        content: Raw PDF bytes.
        sample: Upper bound on the number of pages inspected.

    Returns:
        Value in ``[0.0, 1.0]``; ``0.0`` for a document without pages.
    """
    import pypdfium2 as pdfium
    import pypdfium2.raw as pdfium_r

    pdf = pdfium.PdfDocument(content)
    try:
        page_count = len(pdf)
        if page_count <= 0:
            return 0.0
        budget = max(1, sample)
        step = max(1, -(-page_count // budget))  # ceiling division
        indices = range(0, page_count, step)
        scanned = 0
        for i in indices:
            page = pdf[i]
            try:
                ratio = _page_image_area_ratio(page, pdfium_r)
            finally:
                _close_pdfium_resource(page)
            if ratio >= SCAN_IMAGE_AREA_RATIO:
                scanned += 1
        return scanned / len(indices)
    finally:
        _close_pdfium_resource(pdf)
def _strip_repeating_lines(texts: list, classes: list) -> list:
"""Remove running headers/footers that repeat across most text pages.
@@ -791,6 +1385,7 @@ class PDFParser(BaseParser):
# Pass 1: cheap text extraction + image-area classification.
texts: list = []
classes: list = []
vector_clips: dict = {}
for i in range(page_count):
page = pdf[i]
try:
@@ -800,9 +1395,36 @@ class PDFParser(BaseParser):
# Layout reconstruction only pays off (and is only spent) on
# native text pages; scanned pages are rendered, not read.
if cls == "text" and LAYOUT_ORDERING:
text = _extract_layout_text(page, pdfium_r) or plain
if _plain_is_well_formed(plain):
text = plain
else:
layout = _extract_layout_text(page, pdfium_r)
if layout and not _should_prefer_plain(plain, layout):
text = layout
else:
text = plain
else:
text = plain
if cls == "text":
clips = _extract_vector_figure_clips(
page,
i,
plain,
pdfium_r,
base_name,
scale,
quality,
CONFIG.pdf_render_max_edge,
)
if clips:
vector_clips[i] = clips
for ref_path, b64, _y, _cap in clips:
images[ref_path] = b64
text = _postprocess_pdf_text(text)
if cls == "text" and vector_clips.get(i):
text = _inject_figure_markdown_before_captions(
text, vector_clips[i]
)
finally:
_close_pdfium_resource(page)
texts.append(text)
@@ -841,6 +1463,7 @@ class PDFParser(BaseParser):
# Assemble markdown in reading order.
embedded_count = 0
vector_figure_count = 0
blocks = []
for i in range(page_count):
if classes[i] == "scanned":
@@ -850,7 +1473,10 @@ class PDFParser(BaseParser):
stripped = texts[i].strip()
if stripped:
blocks.append(stripped)
for ref_path, _b64, _y in embedded.get(i, []):
vector_figure_count += len(vector_clips.get(i, []))
page_images = list(embedded.get(i, []))
page_images.sort(key=lambda item: item[2], reverse=True)
for ref_path, _b64, _y in page_images:
fname = os.path.basename(ref_path)
blocks.append(f"![{fname}]({ref_path})")
embedded_count += 1
@@ -862,6 +1488,7 @@ class PDFParser(BaseParser):
"scanned_page_count": len(scanned_indices),
"text_page_count": page_count - len(scanned_indices),
"embedded_image_count": embedded_count,
"vector_figure_count": vector_figure_count,
"image_source_type": "scanned_pdf" if scanned_indices else "pdf_text_layer",
}

View File

@@ -0,0 +1,116 @@
"""LibreOffice helpers for legacy binary PowerPoint (.ppt) uploads."""
from __future__ import annotations
import logging
import os
import subprocess
import tempfile
import time
from pathlib import Path
from docreader.parser.excel_convert import find_soffice
logger = logging.getLogger(__name__)
_OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
_ZIP_MAGIC = b"PK\x03\x04"
def is_ole_compound(content: bytes) -> bool:
    """True when ``content`` begins with the OLE compound-file signature."""
    # startswith() is already False for inputs shorter than the signature.
    return content.startswith(_OLE_MAGIC)
def is_zip_openxml(content: bytes) -> bool:
    """True when ``content`` begins with the ZIP local-file-header signature."""
    # startswith() is already False for inputs shorter than the signature.
    return content.startswith(_ZIP_MAGIC)
def needs_ppt_to_pptx_conversion(content: bytes, file_type: str | None) -> bool:
    """Decide whether the upload must be converted from legacy ``.ppt`` to ``.pptx``.

    Never for a ``pptx`` extension or ZIP content; otherwise true when the
    extension is ``ppt`` or the content carries the OLE signature.
    """
    ext = (file_type or "").lstrip(".").lower()
    already_modern = ext == "pptx" or is_zip_openxml(content)
    if already_modern:
        return False
    return ext == "ppt" or is_ole_compound(content)
def convert_ppt_to_pptx_bytes(content: bytes, suffix: str = ".ppt") -> bytes | None:
    """Convert legacy PowerPoint bytes to PPTX using LibreOffice, if available.

    Each of up to three attempts runs ``soffice --headless --convert-to pptx``
    in fresh temporary working and profile directories, with a pause of
    ``0.5 * attempt`` seconds between attempts.

    Returns:
        The converted bytes, or ``None`` when ``soffice`` is not found, the
        process cannot be run or times out (no retry in that case), or no
        attempt yields a ``.pptx`` file.
    """
    soffice = find_soffice()
    if not soffice:
        return None

    max_attempts = 3
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        can_retry = attempt < max_attempts
        with tempfile.TemporaryDirectory() as work_dir, tempfile.TemporaryDirectory() as profile_dir:
            source_path = os.path.join(work_dir, f"input{suffix}")
            with open(source_path, "wb") as sink:
                sink.write(content)

            # A private profile directory per run.
            profile_uri = Path(profile_dir).as_uri()
            command = [
                soffice,
                "--headless",
                f"-env:UserInstallation={profile_uri}",
                "--convert-to",
                "pptx",
                "--outdir",
                work_dir,
                source_path,
            ]
            try:
                proc = subprocess.run(command, capture_output=True, timeout=120)
            except (OSError, subprocess.TimeoutExpired) as exc:
                logger.warning("LibreOffice PPT convert failed to start: %s", exc)
                return None

            if proc.returncode != 0:
                stderr = proc.stderr.decode("utf-8", errors="ignore")
                logger.warning(
                    "LibreOffice PPT convert failed (attempt %s/%s): %s",
                    attempt,
                    max_attempts,
                    stderr,
                )
                if not can_retry:
                    return None
                time.sleep(0.5 * attempt)
                continue

            produced = [n for n in os.listdir(work_dir) if n.endswith(".pptx")]
            if produced:
                with open(os.path.join(work_dir, produced[0]), "rb") as src:
                    converted = src.read()
                logger.info(
                    "Converted presentation via LibreOffice (%s -> pptx, %d bytes)",
                    suffix,
                    len(converted),
                )
                return converted

        # Exit code 0 but no output file: pause, then try again.
        if can_retry:
            time.sleep(0.5 * attempt)
    return None
def normalize_ppt_bytes(content: bytes, file_type: str | None) -> tuple[bytes, str]:
    """Return (bytes, extension) suitable for MarkItDown (pptx when converted).

    ZIP content is passed through as ``.pptx``. Content that needs no
    conversion keeps its own extension (``.pptx`` when none is given).
    Legacy content is converted through LibreOffice.

    Raises:
        ValueError: Conversion is required but produced no result.
    """
    ext = (file_type or "").lstrip(".").lower()
    if is_zip_openxml(content):
        return content, ".pptx"

    if needs_ppt_to_pptx_conversion(content, ext):
        source_suffix = f".{ext}" if ext not in ("", "ppt") else ".ppt"
        pptx_bytes = convert_ppt_to_pptx_bytes(content, suffix=source_suffix)
        if pptx_bytes:
            return pptx_bytes, ".pptx"
        raise ValueError(
            "Legacy PowerPoint (.ppt) is not supported by MarkItDown directly; "
            "LibreOffice is required to convert it to .pptx. Install LibreOffice "
            "(soffice) in the docreader environment or upload .pptx instead."
        )

    return content, (f".{ext}" if ext else ".pptx")

View File

@@ -0,0 +1,154 @@
"""Extract and rasterize images embedded in PPTX (e.g. WMF) when MarkItDown cannot inline them."""
from __future__ import annotations
import base64
import io
import logging
import os
import re
import subprocess
import tempfile
import uuid
import zipfile
from typing import Dict, List, Tuple
logger = logging.getLogger(__name__)
# Markdown image syntax ![alt](target): group 1 is the alt text, group 2 the target.
_MARKDOWN_IMAGE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
# Extensions that rasterize_media_bytes tries with Pillow before ImageMagick.
_RASTER_EXT = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}
# Vector extensions; rasterize_media_bytes sends these straight to ImageMagick.
_VECTOR_EXT = {".wmf", ".emf", ".svg"}
def _find_convert() -> str | None:
for path in ("/usr/bin/convert", "/usr/local/bin/convert"):
if os.path.isfile(path):
return path
try:
result = subprocess.run(
["which", "convert"], capture_output=True, text=True, check=False
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
except OSError:
pass
return None
def _rasterize_with_imagemagick(data: bytes, suffix: str) -> bytes | None:
    """Convert media bytes to PNG by running ``convert <input> output.png``.

    Args:
        data: Raw media bytes.
        suffix: Extension (with dot) given to the temporary input file.

    Returns:
        PNG bytes, or None when the executable is missing, the process fails
        or times out (60 s), or no output file is produced.
    """
    convert_bin = _find_convert()
    if not convert_bin:
        return None

    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, f"input{suffix}")
        output_path = os.path.join(workdir, "output.png")
        with open(input_path, "wb") as fh:
            fh.write(data)

        try:
            proc = subprocess.run(
                [convert_bin, input_path, output_path],
                capture_output=True,
                timeout=60,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            logger.warning("ImageMagick convert failed: %s", exc)
            return None

        # Success requires both a zero exit code and an output file on disk.
        if proc.returncode == 0 and os.path.isfile(output_path):
            with open(output_path, "rb") as fh:
                return fh.read()

        stderr = (proc.stderr or b"").decode("utf-8", errors="ignore")
        logger.warning("ImageMagick convert exit %s: %s", proc.returncode, stderr)
        return None
def _rasterize_with_pillow(data: bytes) -> bytes | None:
    """Re-encode image bytes as PNG with Pillow.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before saving.

    Returns:
        PNG bytes, or None when Pillow is not importable or the bytes cannot
        be opened or saved.
    """
    try:
        from PIL import Image
    except ImportError:
        return None

    png_buffer = io.BytesIO()
    try:
        picture = Image.open(io.BytesIO(data))
        keep_mode = picture.mode == "RGB" or picture.mode == "L"
        if not keep_mode:
            picture = picture.convert("RGB")
        picture.save(png_buffer, format="PNG")
    except Exception as exc:
        # Best effort: the caller falls back to another rasterizer.
        logger.debug("Pillow could not open media bytes: %s", exc)
        return None
    return png_buffer.getvalue()
def rasterize_media_bytes(name: str, data: bytes) -> bytes | None:
    """Rasterize one media file to PNG bytes.

    Files with a raster extension are tried with Pillow first. Everything
    else, and any raster file Pillow could not handle, goes to ImageMagick.

    Args:
        name: File name; only its extension is used.
        data: Raw media bytes.

    Returns:
        PNG bytes, or None when no rasterizer succeeded.
    """
    extension = os.path.splitext(name)[1].lower()

    if extension in _RASTER_EXT:
        pillow_png = _rasterize_with_pillow(data)
        if pillow_png:
            return pillow_png

    # Extension-less names are written as "input.bin" for ImageMagick.
    return _rasterize_with_imagemagick(data, extension or ".bin")
def list_pptx_media(pptx_bytes: bytes) -> List[Tuple[str, bytes]]:
    """Collect the files stored under ``ppt/media/`` in a PPTX archive.

    Entries with an empty base name, or a base name starting with ".",
    are skipped.

    Returns:
        ``(zip path, raw bytes)`` pairs in archive order.
    """
    with zipfile.ZipFile(io.BytesIO(pptx_bytes)) as pptx_zip:
        media_members = [
            member
            for member in pptx_zip.namelist()
            if member.startswith("ppt/media/")
            and os.path.basename(member)
            and not os.path.basename(member).startswith(".")
        ]
        return [(member, pptx_zip.read(member)) for member in media_members]
def extract_pptx_media_rasterized(pptx_bytes: bytes) -> List[bytes]:
    """Rasterize every ``ppt/media`` asset to PNG.

    Assets that cannot be rasterized are logged and left out, so the result
    may be shorter than the media list.

    Returns:
        PNG payloads in archive order.
    """
    png_payloads: List[bytes] = []
    for zip_path, payload in list_pptx_media(pptx_bytes):
        converted = rasterize_media_bytes(os.path.basename(zip_path), payload)
        if not converted:
            logger.warning("Failed to rasterize pptx media %s", zip_path)
            continue
        png_payloads.append(converted)
        logger.info(
            "Rasterized pptx media %s (%d -> %d bytes)",
            zip_path,
            len(payload),
            len(converted),
        )
    return png_payloads
def _is_unresolved_image_ref(url: str) -> bool:
if not url or url.startswith("data:") or url.startswith("images/"):
return False
if url.startswith(("http://", "https://")):
return False
return True
def attach_pptx_media_to_markdown(
    markdown: str, pptx_bytes: bytes
) -> Tuple[str, Dict[str, str]]:
    """Point unresolved Markdown image refs at rasterized PPTX media.

    Unresolved refs are paired with rasterized media strictly in order of
    appearance. Each paired ref is rewritten to a fresh ``images/<uuid>.png``
    path. Refs left over once the media runs out are kept unchanged.

    Returns:
        ``(rewritten markdown, {image path: base64 PNG})``. When no media
        could be rasterized, the markdown is returned untouched with ``{}``.
    """
    rasterized = extract_pptx_media_rasterized(pptx_bytes)
    if not rasterized:
        return markdown, {}

    payloads: Dict[str, str] = {}
    remaining = iter(rasterized)
    exhausted = object()  # sentinel for "no media left"

    def _substitute(match: re.Match[str]) -> str:
        alt_text, target = match.group(1), match.group(2)
        if not _is_unresolved_image_ref(target):
            return match.group(0)
        png_bytes = next(remaining, exhausted)
        if png_bytes is exhausted:
            return match.group(0)
        new_ref = f"images/{uuid.uuid4()}.png"
        payloads[new_ref] = base64.b64encode(png_bytes).decode()
        return f"![{alt_text}]({new_ref})"

    rewritten = _MARKDOWN_IMAGE.sub(_substitute, markdown)
    return rewritten, payloads
def markdown_needs_pptx_media_attach(markdown: str) -> bool:
    """Return True when at least one Markdown image target is unresolved."""
    for found in _MARKDOWN_IMAGE.finditer(markdown):
        if _is_unresolved_image_ref(found.group(2)):
            return True
    return False

View File

@@ -7,8 +7,11 @@ from docreader.parser.docx2_parser import Docx2Parser
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.image_parser import ImageParser
from docreader.parser.markdown_parser import MarkdownParser
from docreader.parser.liteparse_parser import LiteParseParser, liteparse_available
from docreader.parser.markitdown_parser import MarkitdownParser
from docreader.parser.opendataloader_parser import (
OpenDataLoaderParser,
opendataloader_available,
)
from docreader.parser.pdf_parser import PDFParser
logger = logging.getLogger(__name__)
@@ -151,11 +154,11 @@ def _build_default_registry() -> ParserEngineRegistry:
)
reg.register(
"liteparse",
{"pdf": LiteParseParser},
description="LiteParse 解析引擎(快速空间阅读顺序,适合数字版 PDF",
check_available=liteparse_available,
unavailable_hint="liteparse 未安装",
"opendataloader",
{"pdf": OpenDataLoaderParser},
description="OpenDataLoader PDF版面分析需 Java 11+",
check_available=opendataloader_available,
unavailable_hint="请安装 opendataloader-pdf 与 Java 11+",
)
# NOTE: Engine listing is managed by Go-side engine registry

View File

@@ -1,9 +1,11 @@
import asyncio
import logging
import re
from dataclasses import dataclass
from typing import Optional
from lxml.etree import XPath
from playwright.async_api import async_playwright
from playwright.async_api import Page, async_playwright
from trafilatura import extract, utils, xpaths
from docreader.config import CONFIG
@@ -15,6 +17,14 @@ from docreader.utils import endecode
logger = logging.getLogger(__name__)
# Timeout (ms) passed to page.goto() for the initial navigation.
_GOTO_TIMEOUT_MS = 30_000
# Timeout (ms) for the "networkidle" load-state wait after navigation.
_NETWORK_IDLE_TIMEOUT_MS = 10_000
# Timeout (ms) for the wait until the page root holds enough visible text.
_SPA_WAIT_TIMEOUT_MS = 15_000
# Minimum visible characters before treating an SPA shell as "rendered".
_SPA_MIN_TEXT_LEN = 80
# Minimum visible characters for Playwright text fallback when trafilatura fails.
_MIN_FALLBACK_TEXT_LEN = 50
# Monkey-patch trafilatura internals to better support WeChat Official Account
# articles, whose images live on `mmbiz.qpic.cn` without a standard file
# extension and whose main content sits inside `#js_content` /
@@ -40,6 +50,78 @@ except (AttributeError, ImportError) as e:
)
@dataclass(frozen=True)
class _ScrapeResult:
    """Immutable result of one page scrape; all fields are empty strings on failure."""

    # Page markup as returned by page.content().
    html: str
    # Text returned by read_visible_text() for the page.
    visible_text: str
    # Title returned by page.title().
    page_title: str
def extract_markdown_from_html(html: str) -> Optional[str]:
    """Convert HTML to Markdown with trafilatura.

    Metadata, images, tables and links are requested in the output.

    Returns:
        The Markdown text, or None when the input is blank or trafilatura
        extracted nothing but whitespace.
    """
    has_markup = bool(html) and bool(html.strip())
    if not has_markup:
        return None

    extraction_options = {
        "output_format": "markdown",
        "with_metadata": True,
        "include_images": True,
        "include_tables": True,
        "include_links": True,
    }
    markdown = extract(html, **extraction_options)

    if markdown and markdown.strip():
        return markdown
    return None
def build_visible_text_fallback(visible_text: str, page_title: str = "") -> Optional[str]:
    """Turn the page's visible text into Markdown for when extraction found nothing.

    Args:
        visible_text: Text read from the rendered page.
        page_title: Optional title, added as a ``#`` heading unless the text
            already starts with it.

    Returns:
        The Markdown string, or None when the stripped text is shorter than
        ``_MIN_FALLBACK_TEXT_LEN``.
    """
    body = (visible_text or "").strip()
    if len(body) < _MIN_FALLBACK_TEXT_LEN:
        return None

    heading = (page_title or "").strip()
    if not heading or body.startswith(heading):
        return body
    return f"# {heading}\n\n{body}"
async def wait_for_rendered_content(page: Page) -> None:
    """Wait for SPA/JS pages beyond the initial HTML shell.

    Two best-effort waits run in sequence:

    1. The ``networkidle`` load state, bounded by ``_NETWORK_IDLE_TIMEOUT_MS``.
    2. A poll until the content root holds at least ``_SPA_MIN_TEXT_LEN``
       trimmed characters of ``innerText``, bounded by ``_SPA_WAIT_TIMEOUT_MS``.

    The content root is chosen with the same precedence as
    ``read_visible_text``: ``#app``, ``main``, ``[role="main"]``, then
    ``document.body``. This ensures the wait measures the element whose
    text is read afterwards.

    Args:
        page: Playwright page that has already been navigated.

    Returns:
        None. A failure or timeout in either wait is logged at info level
        and otherwise ignored.
    """
    try:
        await page.wait_for_load_state("networkidle", timeout=_NETWORK_IDLE_TIMEOUT_MS)
        logger.info("Network idle after navigation")
    except Exception:
        # Best effort: continue with whatever has loaded so far.
        logger.info("Network idle wait timed out, continuing")
    try:
        await page.wait_for_function(
            """(minLen) => {
                const root = document.querySelector('#app')
                    || document.querySelector('main')
                    || document.querySelector('[role="main"]')
                    || document.body;
                return ((root?.innerText || '').trim().length >= minLen);
            }""",
            arg=_SPA_MIN_TEXT_LEN,
            timeout=_SPA_WAIT_TIMEOUT_MS,
        )
        logger.info("SPA/root visible text reached minimum length")
    except Exception:
        # Best effort: the caller reads the DOM as it currently stands.
        logger.info("SPA text wait timed out, using current DOM")
async def read_visible_text(page: Page) -> str:
    """Return the trimmed ``innerText`` of the page's content root.

    The root is the first match among ``#app``, ``main``, ``[role="main"]``,
    falling back to ``document.body``.
    """
    root_text_script = """() => {
const root = document.querySelector('#app')
|| document.querySelector('main')
|| document.querySelector('[role="main"]')
|| document.body;
return (root?.innerText || '').trim();
}"""
    visible = await page.evaluate(root_text_script)
    return visible
class StdWebParser(BaseParser):
"""Standard web page parser using Playwright and Trafilatura.
@@ -61,16 +143,17 @@ class StdWebParser(BaseParser):
super().__init__(file_name=title, **kwargs)
logger.info(f"Initialized WebParser with title: {title}")
async def scrape(self, url: str) -> str:
async def scrape(self, url: str) -> _ScrapeResult:
"""Scrape web page content using Playwright.
Args:
url: The URL of the web page to scrape
Returns:
HTML content of the web page as string, empty string on error
HTML, visible text, and document title; empty fields on hard failure
"""
logger.info(f"Starting web page scraping for URL: {url}")
empty = _ScrapeResult(html="", visible_text="", page_title="")
try:
async with async_playwright() as p:
kwargs = {}
@@ -83,30 +166,42 @@ class StdWebParser(BaseParser):
logger.info(f"Navigating to URL: {url}")
try:
# Navigate to URL with 30 second timeout
await page.goto(url, timeout=30000)
await page.goto(
url,
timeout=_GOTO_TIMEOUT_MS,
wait_until="domcontentloaded",
)
logger.info("Initial page load complete")
except Exception as e:
logger.error(f"Error navigating to URL: {str(e)}")
await browser.close()
return ""
return empty
logger.info("Retrieving page HTML content")
# Get the full HTML content of the page
await wait_for_rendered_content(page)
page_title = await page.title()
visible_text = await read_visible_text(page)
content = await page.content()
logger.info(f"Retrieved {len(content)} bytes of HTML content")
logger.info(
"Retrieved %d bytes HTML, %d chars visible text, title=%r",
len(content),
len(visible_text),
page_title[:80] if page_title else "",
)
await browser.close()
logger.info("Browser closed")
# Return raw HTML content for further processing
logger.info("Successfully retrieved HTML content")
return content
return _ScrapeResult(
html=content,
visible_text=visible_text,
page_title=page_title or "",
)
except Exception as e:
logger.error(f"Failed to scrape web page: {str(e)}")
# Return empty string on error
return ""
return empty
def parse_into_text(self, content: bytes) -> Document:
"""Parse web page content into a Document object.
@@ -117,36 +212,49 @@ class StdWebParser(BaseParser):
Returns:
Document object containing the parsed markdown content
"""
# Decode bytes to get the URL string
url = endecode.decode_bytes(content)
logger.info(f"Scraping web page: {url}")
# Run async scraping in sync context
chtml = asyncio.run(self.scrape(url))
# Extract clean content from HTML using Trafilatura
# Convert to markdown format with metadata, images, tables, and links
md_text = extract(
chtml,
output_format="markdown",
with_metadata=True,
include_images=True,
include_tables=True,
include_links=True,
)
scrape_result = asyncio.run(self.scrape(url))
if not scrape_result.html and not scrape_result.visible_text:
logger.error("Failed to scrape web page (no HTML or visible text)")
return Document(content=f"Error parsing web page: {url}")
md_text = extract_markdown_from_html(scrape_result.html)
if not md_text:
md_text = build_visible_text_fallback(
scrape_result.visible_text,
scrape_result.page_title,
)
if md_text:
logger.info(
"Trafilatura empty; using Playwright visible-text fallback (%d chars)",
len(md_text),
)
if not md_text:
logger.error("Failed to parse web page")
return Document(content=f"Error parsing web page: {url}")
# Extract title from trafilatura metadata output (e.g. "title: xxx" line)
metadata = {}
title_match = re.search(r"^title:\s*(.+)", md_text, re.MULTILINE)
if title_match:
extracted_title = title_match.group(1).strip()
if extracted_title:
metadata["title"] = extracted_title
logger.info(f"Extracted article title from trafilatura: {extracted_title}")
logger.info(
f"Extracted article title from trafilatura: {extracted_title}"
)
elif scrape_result.page_title:
metadata["title"] = scrape_result.page_title.strip()
logger.info(
"Using page title from Playwright: %s", metadata["title"]
)
else:
logger.info(f"No title found in trafilatura output, first 200 chars: {md_text[:200]!r}")
logger.info(
"No title found in trafilatura output, first 200 chars: %r",
md_text[:200],
)
return Document(content=md_text, metadata=metadata)

View File

@@ -0,0 +1,42 @@
"""Fill merged cell values before pandas reads an XLSX workbook."""
from __future__ import annotations
import logging
import zipfile
from io import BytesIO
logger = logging.getLogger(__name__)
def fill_merged_cells_xlsx(content: bytes) -> bytes:
    """Unmerge every merged range and copy its top-left value into each cell.

    openpyxl keeps a merged range's value only on its top-left cell, so
    pandas sees NaN for the rest. Filling the range lets row-wise RAG chunks
    keep that context.

    Args:
        content: Workbook bytes.

    Returns:
        Re-saved workbook bytes when at least one range was filled. Otherwise
        the input is returned unchanged, including when it is not a zip file.
    """
    if not zipfile.is_zipfile(BytesIO(content)):
        return content

    from openpyxl import load_workbook

    workbook = load_workbook(BytesIO(content), data_only=True)
    filled_any = False
    for sheet in workbook.worksheets:
        # Iterate over a copy: ranges are unmerged inside the loop.
        for merged in list(sheet.merged_cells.ranges):
            anchor_value = sheet.cell(merged.min_row, merged.min_col).value
            sheet.unmerge_cells(str(merged))
            row_span = range(merged.min_row, merged.max_row + 1)
            col_span = range(merged.min_col, merged.max_col + 1)
            for row_idx in row_span:
                for col_idx in col_span:
                    sheet.cell(row_idx, col_idx).value = anchor_value
            filled_any = True

    if filled_any:
        buffer = BytesIO()
        workbook.save(buffer)
        logger.info("Filled merged cells in XLSX before parse")
        return buffer.getvalue()
    return content

View File

@@ -0,0 +1,126 @@
"""Repair common XLSX packaging issues before openpyxl/pandas read."""
from __future__ import annotations
import io
import re
import zipfile
from typing import Callable, Dict, Iterable, Set
# Archive path the shared-strings part is moved to by _rename_shared_strings_part.
SST_PART = "xl/sharedStrings.xml"
# Self-closing <Override .../> element whose PartName ends in sharedStrings.xml
# (case-insensitive); removed from [Content_Types].xml when stripping the manifest.
_SST_OVERRIDE_RE = re.compile(
    r'<Override[^>]*PartName="[^"]*sharedStrings\.xml"[^>]*/>',
    re.IGNORECASE,
)
# Self-closing <Relationship .../> element whose Type ends in sharedStrings
# (case-insensitive); removed from xl/_rels/workbook.xml.rels.
_SST_REL_RE = re.compile(
    r'<Relationship[^>]*Type="[^"]*sharedStrings"[^>]*/>',
    re.IGNORECASE,
)
def repair_xlsx_bytes(content: bytes) -> bytes | None:
    """Return repaired XLSX bytes, or None if no repair was applied.

    Two repairs are attempted:

    * A shared-strings part exists under a non-canonical path: the archive
      is rewritten with the part moved to ``xl/sharedStrings.xml``.
    * No shared-strings part exists, package metadata still references one,
      and no worksheet uses shared-string cells: the dangling manifest
      entries are stripped so openpyxl can read the file.

    In every other case, including input that is not a zip file, None is
    returned.
    """
    if not zipfile.is_zipfile(io.BytesIO(content)):
        return None

    with zipfile.ZipFile(io.BytesIO(content), "r") as archive:
        member_names = _normalized_names(archive.namelist())
        existing_sst = _find_shared_strings_path(member_names)

        if existing_sst:
            if existing_sst != SST_PART:

                def _move_to_canonical(files: Dict[str, bytes]) -> Dict[str, bytes]:
                    return _rename_shared_strings_part(files, existing_sst)

                return _rewrite_zip(archive, _move_to_canonical)
            # Part is already at the canonical path: nothing to repair.
            return None

        # Strip the manifest only for a dangling reference that no cell depends on.
        dangling_reference = _package_references_shared_strings(
            archive, member_names
        ) and not _worksheets_use_shared_string_cells(archive, member_names)
        if dangling_reference:
            return _rewrite_zip(archive, _strip_shared_strings_manifest)
        return None
def _normalized_names(namelist: Iterable[str]) -> Set[str]:
return {name.replace("\\", "/") for name in namelist}
def _find_shared_strings_path(names: Set[str]) -> str | None:
    """Find the archive path of the shared-strings part, if any.

    The canonical path (``SST_PART``) wins whenever it is present. This
    stops a second look-alike member from being selected and then renamed
    over the real part. Otherwise the alphabetically first name ending in
    ``sharedstrings.xml`` (case-insensitive) is returned, so the result does
    not depend on set iteration order.

    Args:
        names: Normalised member names of the archive.

    Returns:
        The matching member name, or None when there is no match.
    """
    if SST_PART in names:
        return SST_PART
    candidates = sorted(
        name for name in names if name.lower().endswith("sharedstrings.xml")
    )
    return candidates[0] if candidates else None
def _package_references_shared_strings(
zin: zipfile.ZipFile, names: Set[str]
) -> bool:
content_types = "[Content_Types].xml"
if content_types in names:
ct = zin.read(content_types).decode("utf-8", errors="replace")
if "sharedstrings.xml" in ct.lower():
return True
rels_path = "xl/_rels/workbook.xml.rels"
if rels_path in names:
rels = zin.read(rels_path).decode("utf-8", errors="replace")
if "sharedstrings" in rels.lower():
return True
return False
def _worksheets_use_shared_string_cells(
zin: zipfile.ZipFile, names: Set[str]
) -> bool:
for name in names:
if not name.startswith("xl/worksheets/") or not name.endswith(".xml"):
continue
sheet = zin.read(name).decode("utf-8", errors="replace")
if re.search(r'\bt="s"', sheet):
return True
return False
def _rename_shared_strings_part(
    files: Dict[str, bytes], source_path: str
) -> Dict[str, bytes]:
    """Return a copy of ``files`` with ``source_path`` moved to ``SST_PART``.

    The input mapping is not modified. Raises KeyError when ``source_path``
    is not a key of ``files``.
    """
    renamed = {**files}
    shared_strings_payload = renamed.pop(source_path)
    renamed[SST_PART] = shared_strings_payload
    return renamed
def _strip_shared_strings_manifest(files: Dict[str, bytes]) -> Dict[str, bytes]:
    """Return a copy of ``files`` with shared-strings manifest entries removed.

    The ``<Override>`` entry is dropped from ``[Content_Types].xml`` and the
    ``<Relationship>`` entry from ``xl/_rels/workbook.xml.rels``, for
    whichever of those parts is present. The input mapping is not modified.
    """
    cleaned = dict(files)
    removals = (
        ("[Content_Types].xml", _SST_OVERRIDE_RE),
        ("xl/_rels/workbook.xml.rels", _SST_REL_RE),
    )
    for part_name, entry_pattern in removals:
        if part_name not in cleaned:
            continue
        xml_text = cleaned[part_name].decode("utf-8")
        cleaned[part_name] = entry_pattern.sub("", xml_text).encode("utf-8")
    return cleaned
def _rewrite_zip(
zin: zipfile.ZipFile,
transform: Callable[[Dict[str, bytes]], Dict[str, bytes]],
) -> bytes:
files: Dict[str, bytes] = {}
for info in zin.infolist():
name = info.filename.replace("\\", "/")
files[name] = zin.read(info.filename)
files = transform(files)
out = io.BytesIO()
with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
for name, data in files.items():
zout.writestr(name, data)
return out.getvalue()

View File

@@ -9,9 +9,12 @@ dependencies = [
"grpcio>=1.78.0",
"grpcio-health-checking>=1.78.0",
"grpcio-tools>=1.78.0",
"liteparse>=2.0.4",
"lxml>=6.1.0",
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
"opendataloader-pdf>=2.4.7",
"openpyxl>=3.1.0",
"pandas>=2.0.0",
"xlrd>=2.0.0",
"pillow>=12.0.0",
"playwright>=1.55.0",
"protobuf>=6.33.0",

View File

@@ -62,6 +62,67 @@ DEFAULT_CONFIGS = [
]
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
# A line made only of pipe-delimited cells: optional leading whitespace,
# one or more "|cell" groups, a closing "|", optional trailing whitespace.
_TABLE_ROW_PATTERN = re.compile(r"^\s*(?:\|[^|\n]*)+\|\s*$", re.MULTILINE)
# Key under which HeaderTracker stores the Markdown table header.
# NOTE(review): presumably equals the priority of the Markdown-table hook in
# DEFAULT_CONFIGS — confirm.
_MARKDOWN_TABLE_PRIORITY = 15
def _is_empty_table_header_row(header: str) -> bool:
"""True when the column-name line is only pipes/whitespace (MarkItDown quirk)."""
newline = header.find("\n")
if newline < 0:
return False
row = header[:newline].strip()
return bool(row) and all(ch in "| \t" for ch in row)
def _extract_separator_line(header: str) -> str:
for line in header.split("\n"):
if "---" in line:
return line + "\n"
return ""
def _table_row_column_count(line: str) -> int:
line = line.strip()
if not line.startswith("|"):
return 0
parts = line.split("|")
if parts and parts[0].strip() == "":
parts = parts[1:]
if parts and parts[-1].strip() == "":
parts = parts[:-1]
return len(parts)
def _first_table_row_column_count(text: str) -> int:
    """Return the column count of the first table-row line in ``text``, or 0 if none."""
    stripped_lines = (raw.strip() for raw in text.split("\n"))
    for candidate in stripped_lines:
        if candidate and _TABLE_ROW_PATTERN.match(candidate):
            return _table_row_column_count(candidate)
    return 0
def _header_table_column_count(header: str) -> int:
    """Return the column count of the first countable header line.

    Blank lines and lines containing ``---`` are skipped. Returns 0 when no
    remaining line yields a positive column count.
    """
    for raw in header.split("\n"):
        candidate = raw.strip()
        if candidate and "---" not in candidate:
            columns = _table_row_column_count(candidate)
            if columns > 0:
                return columns
    return 0
def _split_ends_with_paragraph_break(split: str) -> bool:
trimmed = split.rstrip(" \t\r")
return trimmed.endswith("\n\n") or trimmed.endswith("\r\n\r\n")
def header_column_mismatch(headers: str, next_unit: str) -> bool:
    """True when header and next unit both have a countable table row and the counts differ.

    Returns False whenever either side has no countable row.
    """
    expected_columns = _header_table_column_count(headers)
    actual_columns = _first_table_row_column_count(next_unit)
    if expected_columns <= 0 or actual_columns <= 0:
        return False
    return expected_columns != actual_columns
# 定义Hook状态数据结构
class HeaderTracker(BaseModel):
@@ -70,10 +131,28 @@ class HeaderTracker(BaseModel):
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
active_headers: Dict[int, str] = Field(default_factory=dict)
ended_headers: set[int] = Field(default_factory=set)
pending_extend: Dict[int, bool] = Field(default_factory=dict)
pending_table_break: bool = Field(default=False)
header_ended_this_unit: bool = Field(default=False)
def _clear_table_header(self) -> None:
self.ended_headers.add(_MARKDOWN_TABLE_PRIORITY)
self.active_headers.pop(_MARKDOWN_TABLE_PRIORITY, None)
self.pending_extend.pop(_MARKDOWN_TABLE_PRIORITY, None)
def update(self, split: str) -> Dict[int, str]:
"""检测当前split中的表头开始/结束更新Hook状态"""
new_headers: Dict[int, str] = {}
self.header_ended_this_unit = False
if self.pending_table_break:
self.pending_table_break = False
if _MARKDOWN_TABLE_PRIORITY in self.active_headers:
if _first_table_row_column_count(split) > 0:
self._clear_table_header()
self.header_ended_this_unit = True
else:
self._clear_table_header()
# 1. 检查是否有表头结束标记
for config in self.header_hook_configs:
@@ -82,8 +161,31 @@ class HeaderTracker(BaseModel):
):
self.ended_headers.add(config.priority)
del self.active_headers[config.priority]
self.pending_extend.pop(config.priority, None)
# 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
# 1b. \n\n 分块会吞掉表间空行:段尾 \n\n 或列数变化时结束表头追踪
if (
_MARKDOWN_TABLE_PRIORITY in self.active_headers
and not self.pending_extend.get(_MARKDOWN_TABLE_PRIORITY)
):
if _split_ends_with_paragraph_break(split):
self.pending_table_break = True
else:
header = self.active_headers[_MARKDOWN_TABLE_PRIORITY]
row_cols = _first_table_row_column_count(split)
header_cols = _header_table_column_count(header)
if row_cols > 0 and header_cols > 0 and row_cols != header_cols:
self._clear_table_header()
self.header_ended_this_unit = True
# 2. 空表头行:用首个数据行补全列名(与 Go header_tracker 一致)
for priority in list(self.pending_extend.keys()):
if priority in self.active_headers and _TABLE_ROW_PATTERN.search(split):
sep = _extract_separator_line(self.active_headers[priority])
self.active_headers[priority] = split + sep
self.pending_extend.pop(priority, None)
# 3. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
for config in self.header_hook_configs:
if (
config.priority not in self.active_headers
@@ -94,8 +196,10 @@ class HeaderTracker(BaseModel):
header = config.extract_header_fn(match)
self.active_headers[config.priority] = header
new_headers[config.priority] = header
if _is_empty_table_header_row(header):
self.pending_extend[config.priority] = True
# 3. 检查是否所有活跃表头都已结束(清空结束标记)
# 4. 检查是否所有活跃表头都已结束(清空结束标记)
if not self.active_headers:
self.ended_headers.clear()

View File

@@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, PrivateAttr
from docreader.splitter.header_hook import (
HeaderTracker,
header_column_mismatch,
)
from docreader.utils.split import split_by_char, split_by_sep
@@ -225,6 +226,16 @@ class TextSplitter(BaseModel, Generic[T]):
# Update header tracking with current split
self.header_hook.update(split)
if self.header_hook.header_ended_this_unit and len(cur_chunk) > 0:
chunks.append(
(
cur_chunk[0][0],
cur_chunk[-1][1],
"".join([c[2] for c in cur_chunk]),
)
)
cur_chunk = []
cur_len = 0
cur_headers = self.header_hook.get_headers()
cur_headers_len = self.len_function(cur_headers)
@@ -276,6 +287,7 @@ class TextSplitter(BaseModel, Generic[T]):
cur_headers
and split_len + cur_headers_len < self.chunk_size
and cur_headers not in split
and not header_column_mismatch(cur_headers, split)
):
next_start = cur_chunk[0][0] if cur_chunk else cur_start

View File

@@ -0,0 +1,210 @@
import io
import os
import shutil
import subprocess
import tempfile
import unittest
import zipfile
import openpyxl
import pandas as pd
from docreader.parser.excel_convert import detect_excel_format, engine_for_format
from docreader.parser.excel_parser import ExcelParser
from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
from docreader.parser.xlsx_repair import repair_xlsx_bytes
def _xlsx_with_phantom_shared_strings() -> bytes:
    """Build a workbook whose ``[Content_Types].xml`` has an extra sharedStrings Override.

    A two-cell workbook is saved with openpyxl and unpacked to a temporary
    directory. An ``<Override>`` for ``/xl/sharedStrings.xml`` is inserted
    before ``</Types>``, and the directory is zipped again.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet["A1"] = "hello"
    sheet["B1"] = 42
    pristine = io.BytesIO()
    workbook.save(pristine)

    phantom_override = (
        '<Override PartName="/xl/sharedStrings.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.'
        'spreadsheetml.sharedStrings+xml"/>'
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(io.BytesIO(pristine.getvalue()), "r") as zin:
            zin.extractall(tmpdir)

        ct_path = f"{tmpdir}/[Content_Types].xml"
        with open(ct_path, encoding="utf-8") as f:
            content_types = f.read()
        patched = content_types.replace("</Types>", phantom_override + "</Types>")
        with open(ct_path, "w", encoding="utf-8") as f:
            f.write(patched)

        repacked = io.BytesIO()
        with zipfile.ZipFile(repacked, "w", zipfile.ZIP_DEFLATED) as zout:
            for root, _, files in os.walk(tmpdir):
                for name in files:
                    full_path = os.path.join(root, name)
                    zout.write(full_path, os.path.relpath(full_path, tmpdir))
        return repacked.getvalue()
class ExcelFormatDetectionTest(unittest.TestCase):
    """Format detection from content bytes and the engine name chosen per format."""

    def test_detect_xlsx_and_engine(self):
        # A freshly saved openpyxl workbook must be detected as xlsx.
        buffer = io.BytesIO()
        openpyxl.Workbook().save(buffer)
        self.assertEqual(detect_excel_format(buffer.getvalue()), "xlsx")
        self.assertEqual(engine_for_format("xlsx"), "openpyxl")

    def test_detect_xls_magic(self):
        # Only the leading signature bytes matter; the rest is zero padding.
        xls_signature = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        payload = xls_signature + b"\x00" * 512
        self.assertEqual(detect_excel_format(payload), "xls")
        self.assertEqual(engine_for_format("xls"), "xlrd")

    def test_open_legacy_xls_bytes_with_xlsx_extension(self):
        # Needs LibreOffice to produce a real .xls file.
        if not shutil.which("soffice"):
            self.skipTest("LibreOffice not available")

        workbook = openpyxl.Workbook()
        workbook.active["A1"] = "legacy"
        xlsx_buffer = io.BytesIO()
        workbook.save(xlsx_buffer)

        with tempfile.TemporaryDirectory() as workdir:
            xlsx_path = os.path.join(workdir, "sheet.xlsx")
            with open(xlsx_path, "wb") as fh:
                fh.write(xlsx_buffer.getvalue())
            convert_cmd = [
                "soffice",
                "--headless",
                "--convert-to",
                "xls",
                "--outdir",
                workdir,
                xlsx_path,
            ]
            subprocess.run(convert_cmd, check=True, capture_output=True)
            with open(os.path.join(workdir, "sheet.xls"), "rb") as fh:
                legacy_bytes = fh.read()

        # The parser is told "xlsx" although the bytes are legacy .xls.
        parser = ExcelParser(file_name="fake.xlsx", file_type="xlsx")
        document = parser.parse_into_text(legacy_bytes)
        self.assertIn("legacy", document.content)
class XlsxRepairTest(unittest.TestCase):
    """Behaviour of repair_xlsx_bytes on workbooks with sharedStrings problems."""

    def test_repair_removes_phantom_shared_strings_reference(self):
        damaged = _xlsx_with_phantom_shared_strings()
        # Unrepaired, pandas fails on the missing part.
        with self.assertRaises(KeyError):
            pd.read_excel(io.BytesIO(damaged))

        fixed = repair_xlsx_bytes(damaged)
        self.assertIsNotNone(fixed)
        frame = pd.read_excel(io.BytesIO(fixed), header=None)
        self.assertEqual(frame.values.tolist(), [["hello", 42]])

    def test_repair_skips_when_shared_string_cells_need_table(self):
        import xlsxwriter

        source = io.BytesIO()
        workbook = xlsxwriter.Workbook(source, {"in_memory": True})
        workbook.add_worksheet().write(0, 0, "hello")
        workbook.close()

        with tempfile.TemporaryDirectory() as workdir:
            with zipfile.ZipFile(io.BytesIO(source.getvalue()), "r") as zin:
                zin.extractall(workdir)
            # Remove the part while the worksheet still points at it.
            os.remove(f"{workdir}/xl/sharedStrings.xml")

            repacked = io.BytesIO()
            with zipfile.ZipFile(repacked, "w", zipfile.ZIP_DEFLATED) as zout:
                for root, _, files in os.walk(workdir):
                    for name in files:
                        full_path = os.path.join(root, name)
                        zout.write(full_path, os.path.relpath(full_path, workdir))

        # No repair is expected for this workbook.
        self.assertIsNone(repair_xlsx_bytes(repacked.getvalue()))
class XlsxMergeFillTest(unittest.TestCase):
    """Merged-cell filling, directly and through ExcelParser."""

    def test_fill_merged_cells_propagates_master_value(self):
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet["A1"] = "title"
        sheet.merge_cells("A1:B1")
        sheet["A2"] = "left"
        sheet["B2"] = "right"
        sheet.merge_cells("A2:A3")
        sheet["B3"] = "only-b"
        saved = io.BytesIO()
        workbook.save(saved)

        filled_bytes = fill_merged_cells_xlsx(saved.getvalue())

        result_sheet = openpyxl.load_workbook(
            io.BytesIO(filled_bytes), data_only=True
        ).active
        # Covered cells take the master value; cells outside a merge keep theirs.
        expectations = {"B1": "title", "A3": "left", "B3": "only-b"}
        for coordinate, expected in expectations.items():
            self.assertEqual(result_sheet[coordinate].value, expected)

    def test_parse_en_mergecell_workbook(self):
        fixture_parts = (
            os.path.dirname(__file__),
            "..",
            "..",
            "testdata",
            "rag_test",
            "xlsx",
            "en_mergecell.xlsx",
        )
        fixture = os.path.join(*fixture_parts)
        if not os.path.isfile(fixture):
            self.skipTest("en_mergecell.xlsx fixture not available")

        with open(fixture, "rb") as fh:
            document = ExcelParser().parse_into_text(fh.read())
        chunk_texts = [chunk.content.strip() for chunk in document.chunks]

        self.assertEqual(len(chunk_texts), 12)
        self.assertIn("A: A1", chunk_texts[0])
        self.assertIn("A: A2", chunk_texts[1])
        self.assertIn("B: B3", chunk_texts[2])
        self.assertNotIn("Unnamed:", document.content)
        self.assertIn("A: A7", chunk_texts[6])
        self.assertIn("A: A7", chunk_texts[7])
        self.assertIn("D: D10", chunk_texts[9])
class ExcelParserTest(unittest.TestCase):
    """End-to-end ExcelParser checks on workbooks with sharedStrings quirks."""

    def test_parse_phantom_shared_strings_workbook(self):
        workbook_bytes = _xlsx_with_phantom_shared_strings()
        document = ExcelParser().parse_into_text(workbook_bytes)
        for expected in ("hello", "42"):
            self.assertIn(expected, document.content)
        self.assertGreater(len(document.chunks), 0)

    def test_parse_en_calcchain_shared_strings_case(self):
        fixture_parts = (
            os.path.dirname(__file__),
            "..",
            "..",
            "testdata",
            "rag_test",
            "xlsx",
            "en_calcchain.xlsx",
        )
        fixture = os.path.join(*fixture_parts)
        if not os.path.isfile(fixture):
            self.skipTest("en_calcchain.xlsx fixture not available")

        with open(fixture, "rb") as fh:
            document = ExcelParser().parse_into_text(fh.read())
        self.assertGreater(len(document.content), 0)
        self.assertGreater(len(document.chunks), 0)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,57 @@
import io
import unittest
from pathlib import Path
from markitdown import MarkItDown
from docreader.parser.markdown_parser import MarkdownTableUtil
class TestMarkdownTableUtil(unittest.TestCase):
    """Formatting checks for MarkdownTableUtil.format_table."""

    def test_preserves_empty_cells(self):
        """Interior empty cells must not be dropped during formatting."""
        raw = "| a | | c |\n| --- | --- | --- |\n| 1 | 2 | 3 |"
        formatted = MarkdownTableUtil().format_table(raw)
        self.assertIn("| a | | c |", formatted)
        # The pipe count is unchanged, so no cell was added or removed.
        self.assertEqual(formatted.count("|"), raw.count("|"))

    def test_format_nonempty_table(self):
        """A compact table gains single-space padding around every cell."""
        raw = "|Name|Age|\n|---|---|\n|John|30|"
        formatted = MarkdownTableUtil().format_table(raw)
        self.assertIn("| Name | Age |", formatted)
        self.assertIn("| --- | --- |", formatted)
        self.assertIn("| John | 30 |", formatted)

    def test_normalize_markitdown_en_tables(self):
        """MarkItDown output for en_tables.docx is normalized into well-formed tables."""
        docx = (
            Path(__file__).resolve().parents[2]
            / "testdata"
            / "rag_test"
            / "docx"
            / "en_tables.docx"
        )
        if not docx.is_file():
            # Second candidate location, one directory level further up.
            docx = Path(__file__).resolve().parents[2].parent / "testdata/rag_test/docx/en_tables.docx"
        if not docx.is_file():
            # Skip instead of failing with FileNotFoundError when the fixture
            # is absent, as the other fixture-based tests do.
            self.skipTest("en_tables.docx fixture not available")
        raw = MarkItDown().convert(io.BytesIO(docx.read_bytes()), file_extension=".docx").text_content
        normalized = MarkdownTableUtil().format_table(raw)
        self.assertNotIn("| | | | |", normalized)
        self.assertIn("| Name | Game | Fame | Blame |", normalized)
        # The delimiter row must come after the column-name row.
        idx_name = normalized.index("| Name | Game | Fame | Blame |")
        idx_sep = normalized.index("| --- | --- | --- | --- |", idx_name)
        self.assertLess(idx_name, idx_sep)
        self.assertIn("| Lebron James | Basketball |", normalized)
        # Headerless 2-row tables: delimiter inserted so GFM renderers show a table
        self.assertIn(
            "| Sinple | Table |\n| --- | --- |\n| Without | Header |", normalized
        )
        self.assertIn(
            "| Simple Multiparagraph | Table Full |\n| --- | --- |\n"
            "| Of Paragraphs | In each Cell. |",
            normalized,
        )
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,103 @@
"""Unit tests for OpenDataLoader parser helpers (no JVM required)."""
import os
import tempfile
import unittest
from unittest import mock
from docreader.parser.opendataloader_parser import (
OpenDataLoaderParser,
_collect_images_under_output,
_find_markdown_file,
_normalize_odl_image_url,
_rewrite_markdown_image_refs,
opendataloader_available,
)
class OpenDataLoaderHelpersTest(unittest.TestCase):
    """Helper-level tests for Markdown discovery and image-reference rewriting."""

    def test_find_markdown_prefers_stem_match(self):
        with tempfile.TemporaryDirectory() as outdir:
            decoy_md = os.path.join(outdir, "other.md")
            wanted_md = os.path.join(outdir, "paper.md")
            for path, text in ((decoy_md, "x"), (wanted_md, "# Title")):
                with open(path, "w") as fh:
                    fh.write(text)
            self.assertEqual(_find_markdown_file(outdir, "paper"), wanted_md)

    def test_collect_and_rewrite_images(self):
        with tempfile.TemporaryDirectory() as outdir:
            image_dir = os.path.join(outdir, "images")
            os.makedirs(image_dir)
            with open(os.path.join(image_dir, "fig1.png"), "wb") as fh:
                fh.write(b"\x89PNG\r\n\x1a\n")

            collected = _collect_images_under_output(outdir)
            self.assertIn("images/fig1.png", collected)

            markdown = "See ![fig](images/fig1.png) and ![alt](./fig1.png)."
            rewritten = _rewrite_markdown_image_refs(markdown, collected)
            # Both spellings must end up as the collected images/ key.
            self.assertIn("![fig](images/fig1.png)", rewritten)
            self.assertIn("![alt](images/fig1.png)", rewritten)

    def test_rewrite_odl_angle_bracket_and_entity_urls(self):
        images = {"images/imageFile1.png": "e30="}
        wrapped_variants = (
            "![image 1](<images/imageFile1.png>)",
            "![image 1](&lt;images/imageFile1.png&gt;)",
        )
        for markdown in wrapped_variants:
            rewritten = _rewrite_markdown_image_refs(markdown, images)
            self.assertEqual("![image 1](images/imageFile1.png)", rewritten)

    def test_normalize_odl_image_url(self):
        wrapped_variants = (
            "&lt;images/imageFile2.png&gt;",
            "<images/imageFile2.png>",
        )
        for wrapped in wrapped_variants:
            self.assertEqual(
                _normalize_odl_image_url(wrapped),
                "images/imageFile2.png",
            )

    def test_rewrite_skips_data_uris(self):
        markdown = "![x](data:image/png;base64,abc)"
        rewritten = _rewrite_markdown_image_refs(markdown, {"images/a.png": "e30="})
        self.assertEqual(rewritten, markdown)
class OpenDataLoaderParserTest(unittest.TestCase):
    """Parser-level checks with the conversion step and availability probe mocked."""

    @mock.patch("docreader.parser.opendataloader_parser.opendataloader_available")
    @mock.patch("docreader.parser.opendataloader_parser._run_convert")
    def test_parse_reads_markdown_and_images(self, mock_convert, mock_avail):
        """Markdown and images written by the converter end up on the document."""
        mock_avail.return_value = (True, "")

        def fake_convert(pdf_path, output_dir, image_dir, overrides=None):
            # Stand-in for the real converter: write <stem>.md plus one image.
            stem, _ext = os.path.splitext(os.path.basename(pdf_path))
            markdown_path = os.path.join(output_dir, stem + ".md")
            with open(markdown_path, "w") as handle:
                handle.write("# Hello\n\n![pic](images/pic.png)\n")
            os.makedirs(image_dir, exist_ok=True)
            picture_path = os.path.join(image_dir, "pic.png")
            with open(picture_path, "wb") as handle:
                handle.write(b"png")

        mock_convert.side_effect = fake_convert

        parser = OpenDataLoaderParser(file_name="doc.pdf", file_type="pdf")
        doc = parser.parse_into_text(b"%PDF-1.4 fake")

        self.assertIn("# Hello", doc.content)
        self.assertIn("images/pic.png", doc.content)
        self.assertIn("images/pic.png", doc.images)
        self.assertEqual(doc.metadata.get("parser_engine"), "opendataloader")

    @mock.patch("docreader.parser.opendataloader_parser.shutil.which", return_value=None)
    def test_availability_requires_java(self, _which):
        """With the package present but `which` finding nothing, the probe fails."""
        package_probe = mock.patch(
            "docreader.parser.opendataloader_parser._package_available",
            return_value=(True, ""),
        )
        with package_probe:
            ok, msg = opendataloader_available()
        self.assertFalse(ok)
        self.assertIn("Java", msg)
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@@ -6,10 +6,15 @@ from PIL import Image
from docreader.parser.pdf_parser import (
PDFParser,
_classify_page,
_filter_reading_columns,
_group_lines,
_is_artifact_column,
_join_line_glyphs,
_merge_orphan_punctuation_lines,
_point_in_boxes,
_segments_to_markdown,
_select_embedded_images,
_should_prefer_plain,
_split_columns,
_strip_repeating_lines,
)
@@ -122,13 +127,25 @@ class ReadingOrderTest(unittest.TestCase):
def test_group_lines_orders_by_y_then_x(self):
    """Glyphs are grouped into visual lines and ordered by x within each line.

    Only the current fixture rows are kept: "B" directly abuts "A", so the
    first line must read "AB" with no inserted space.
    """
    # Two visual lines; within the first line glyphs are given out of x-order.
    chars = [
        _char("B", 110, 120, 700, 712),  # adjacent to A (no word-sized gap)
        _char("A", 100, 110, 700, 712),  # same line, left of B
        _char("C", 100, 110, 680, 692),  # next line down
    ]
    lines = _group_lines(chars)
    self.assertEqual([ln["text"] for ln in lines], ["AB", "C"])
def test_join_line_glyphs_inserts_word_spaces(self):
    """Two glyphs separated by a wide horizontal gap are joined with a space."""
    # Wide gap between the glyphs mimics positioned OCR / text layers,
    # e.g. the end of "copy" followed by "of".
    first_glyph = _char("c", 0, 4, 0, 10)
    second_glyph = _char("f", 10, 14, 0, 10)
    joined = _join_line_glyphs([first_glyph, second_glyph])
    self.assertEqual(joined, "c f")
def test_join_line_glyphs_keeps_adjacent_letters(self):
    """Glyphs whose boxes touch are concatenated without a space."""
    touching = [
        _char("A", 100, 110, 700, 712),
        _char("B", 110, 120, 700, 712),
    ]
    joined = _join_line_glyphs(touching)
    self.assertEqual(joined, "AB")
class HeadingDetectionTest(unittest.TestCase):
def test_promotes_large_line_to_heading(self):
@@ -156,6 +173,235 @@ class HiddenTextFilterTest(unittest.TestCase):
self.assertFalse(_point_in_boxes(20.0, 5.0, boxes))
class MarginColumnFilterTest(unittest.TestCase):
    """Column filtering: narrow margin strips go, real columns stay."""

    def test_drops_narrow_vertical_margin_column(self):
        """A one-glyph-per-line sidebar is removed, leaving only the body column."""
        # Mimics arXiv sidebar: narrow x span, one glyph per line.
        sidebar = []
        for row, glyph in enumerate("0202luJ22"):
            sidebar.append(_char(glyph, 20, 28, 500 - row * 14, 512 - row * 14))
        body = [
            _char(glyph, left, left + 10, 700, 712)
            for glyph, left in (("L", 160), ("a", 170), ("n", 180))
        ]
        columns = _filter_reading_columns(sidebar + body, scale=10.0, width=612.0)
        self.assertEqual(len(columns), 1)
        self.assertEqual(columns[0][0]["ch"], "L")

    def test_keeps_real_two_column_layout(self):
        """Two wide columns of four lines each are both retained."""
        row_offsets = [row * 12 for row in range(4)]
        left_column = [_char("L", 50, 150, 700 - dy, 712 - dy) for dy in row_offsets]
        right_column = [_char("R", 400, 500, 700 - dy, 712 - dy) for dy in row_offsets]
        columns = _filter_reading_columns(
            left_column + right_column, scale=12.0, width=600.0
        )
        self.assertEqual(len(columns), 2)
class PunctuationMergeTest(unittest.TestCase):
    """Lines made only of stray punctuation are folded into the previous line."""

    def test_merges_orphan_periods(self):
        """A ". ." line is appended (without its space) to the line above it."""
        source_texts = ("Figure 1 2", ". .", "Next")
        lines = [_line(text, 10.0) for text in source_texts]
        merged = _merge_orphan_punctuation_lines(lines)
        merged_texts = [entry["text"] for entry in merged]
        self.assertEqual(merged_texts, ["Figure 1 2..", "Next"])
class PdfTextSanitizeTest(unittest.TestCase):
    """Checks for the clean-up pass applied to extracted PDF text."""

    def test_removes_fffe_placeholder(self):
        """U+FFFE placeholders inside words vanish without leaving a gap."""
        from docreader.parser.pdf_parser import _postprocess_pdf_text

        dirty = "multi\ufffelayer and non\ufffetrivial"
        cleaned = _postprocess_pdf_text(dirty)
        self.assertEqual(cleaned, "multilayer and nontrivial")

    def test_strips_chart_axis_run(self):
        """Axis ticks and legend labels before a caption go; prose and caption stay."""
        from docreader.parser.pdf_parser import _postprocess_pdf_text

        page_lines = [
            "Deep convolutional neural networks have led to breakthroughs.",
            "0 1 2 3 4 5 6 0",
            "10",
            "20",
            "iter. (1e4)",
            "training error (%)",
            "56-layer",
            "20-layer",
            "Figure 1. Training error on CIFAR-10.",
        ]
        # Every line, including the last, is newline-terminated.
        raw = "".join(line + "\n" for line in page_lines)
        cleaned = _postprocess_pdf_text(raw)
        self.assertIn("breakthroughs", cleaned)
        self.assertNotIn("56-layer", cleaned)
        self.assertIn("Figure 1.", cleaned)

    def test_strips_diagram_labels_above_caption(self):
        """Short diagram labels directly above a figure caption are removed."""
        from docreader.parser.pdf_parser import _postprocess_pdf_text

        page_lines = [
            "Paragraph before.",
            "identity",
            "weight layer",
            "relu",
            "Figure 2. Residual learning block.",
            "Paragraph after.",
        ]
        raw = "".join(line + "\n" for line in page_lines)
        cleaned = _postprocess_pdf_text(raw)
        for kept in ("Paragraph before.", "Figure 2.", "Paragraph after."):
            self.assertIn(kept, cleaned)
        for dropped in ("identity", "weight layer"):
            self.assertNotIn(dropped, cleaned)

    def test_strips_arxiv_header_line(self):
        """The arXiv identifier line is dropped while body text survives."""
        from docreader.parser.pdf_parser import _postprocess_pdf_text

        raw = "\n".join(
            [
                "Body text.",
                "1",
                "arXiv:1512.03385v1 [cs.CV] 10 Dec 2015",
                "More body.",
            ]
        )
        cleaned = _postprocess_pdf_text(raw)
        self.assertNotIn("arXiv:", cleaned)
        self.assertIn("Body text.", cleaned)
class PlainWellFormedTest(unittest.TestCase):
    """Checks for the heuristic that judges whether plain text is well formed."""

    def test_academic_plain_skips_layout(self):
        """Normally spaced academic prose is reported as well formed."""
        from docreader.parser.pdf_parser import _plain_is_well_formed

        sentence = (
            "Recent work [DL15, MBXS17] shows progress on NLP tasks "
            "with pre-trained models."
        )
        verdict = _plain_is_well_formed(sentence)
        self.assertTrue(verdict)

    def test_glued_scan_plain_needs_layout(self):
        """Text with all word spaces missing is reported as not well formed."""
        from docreader.parser.pdf_parser import _plain_is_well_formed

        verdict = _plain_is_well_formed("Thisisadigitalcopyofabook")
        self.assertFalse(verdict)
class LayoutQualityFallbackTest(unittest.TestCase):
    """Checks for the decision between plain and layout-derived text."""

    def test_prefers_plain_when_many_single_char_lines(self):
        """A layout result dominated by one-character lines loses to plain text."""
        plain_text = "Language Models are Few-Shot Learners\nTom Brown"
        layout_text = "0\n2\n0\n2\nl\nu\nJ\nLan ua e Models"
        prefer_plain = _should_prefer_plain(plain_text, layout_text)
        self.assertTrue(prefer_plain)

    def test_keeps_good_layout(self):
        """Identical plain and layout text does not trigger the plain fallback."""
        same_text = "Hello world"
        prefer_plain = _should_prefer_plain(same_text, "Hello world")
        self.assertFalse(prefer_plain)
class ResNetPaperFigureTest(unittest.TestCase):
    """Regression: ResNet PDF (arXiv:1512.03385) vector figures and captions."""

    def test_resnet_figures_and_captions(self):
        """Figures render as images next to their captions; header noise is gone."""
        import os

        from docreader.parser.pdf_parser import PDFParser

        # First existing candidate wins: the repo fixture, then a /tmp copy.
        candidates = (
            os.path.join(
                os.path.dirname(__file__),
                "..",
                "..",
                "testdata",
                "rag_test",
                "pdf_en",
                "resnet.pdf",
            ),
            "/tmp/resnet.pdf",
        )
        path = next((item for item in candidates if os.path.isfile(item)), None)
        if path is None:
            self.skipTest("resnet.pdf not available")

        with open(path, "rb") as handle:
            payload = handle.read()
        parser = PDFParser(file_name="resnet.pdf", file_type="pdf")
        doc = parser.parse_into_text(payload)

        self.assertGreater(doc.metadata.get("vector_figure_count", 0), 0)
        self.assertIn("![", doc.content)
        self.assertIn("Figure 2. Residual learning", doc.content)
        self.assertNotIn("arXiv:", doc.content)

        # The 120 characters before the Figure 2 caption must hold the image
        # reference and none of the diagram's own labels.
        caption_at = doc.content.find("Figure 2. Residual learning")
        window_start = max(0, caption_at - 120)
        before = doc.content[window_start:caption_at]
        self.assertIn("![", before)
        self.assertNotIn("identity", before)
class Gpt3PaperLayoutTest(unittest.TestCase):
    """Regression: arXiv GPT-3 paper title page must not be one-glyph-per-line."""

    def test_gpt3_page0_title_and_authors(self):
        """Page 0 layout text and the full parse both keep the title and authors."""
        import os

        import pypdfium2 as pdfium
        import pypdfium2.raw as pdfium_r

        from docreader.parser.pdf_parser import PDFParser, _extract_layout_text

        fixture_parts = ("..", "..", "testdata", "rag_test", "pdf_en", "gpt3.pdf")
        pdf_path = os.path.join(os.path.dirname(__file__), *fixture_parts)
        if not os.path.isfile(pdf_path):
            self.skipTest("gpt3.pdf not in testdata")

        with open(pdf_path, "rb") as handle:
            content = handle.read()

        with pdfium.PdfDocument(content) as pdf:
            first_page = pdf[0]
            try:
                layout = _extract_layout_text(first_page, pdfium_r)
            finally:
                first_page.close()

        # Margin sidebar must not appear as one-glyph-per-line prefix.
        self.assertNotRegex(layout[:300], r"^0\n2\n0\n2")
        self.assertIn("Few-Shot Learners", layout)

        parser = PDFParser(file_name="gpt3.pdf", file_type="pdf")
        doc = parser.parse_into_text(content)
        self.assertIn("Language Models are Few-Shot Learners", doc.content)
        self.assertIn("Tom B. Brown", doc.content[:1200])
        self.assertIn("[DL15, MBXS17, PNZtY18]", doc.content)
        self.assertIn("task-specific architectures), and more recently", doc.content)
        self.assertNotIn("k ifi hi d l", doc.content)
class ScanEnglishDictLayoutTest(unittest.TestCase):
    """Regression: Google Books-style PDFs lose spaces without gap inference."""

    def test_scan_en_dict_page0_has_word_spaces(self):
        """Layout text of page 0 contains the notice with its word spaces intact.

        Skipped when the scan_en_dict.pdf fixture is not present.
        """
        import os

        import pypdfium2 as pdfium
        import pypdfium2.raw as pdfium_r

        from docreader.parser.pdf_parser import _extract_layout_text

        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "testdata",
            "rag_test",
            "pdf_scan",
            "scan_en_dict.pdf",
        )
        if not os.path.isfile(pdf_path):
            self.skipTest("scan_en_dict.pdf not in testdata")

        with open(pdf_path, "rb") as f:
            content = f.read()

        # Same handling as the GPT-3 regression test: the document is a context
        # manager and the page is closed explicitly before the document is.
        with pdfium.PdfDocument(content) as pdf:
            page = pdf[0]
            try:
                text = _extract_layout_text(page, pdfium_r)
            finally:
                page.close()

        self.assertIn("This is a digital copy of a book", text)
        self.assertNotIn("Thisisadigitalcopyofabook", text)
class PDFRouterIntegrationTest(unittest.TestCase):
def test_image_only_pdf_routes_to_scanned(self):
pdf_bytes = _make_image_only_pdf(2)

View File

@@ -0,0 +1,86 @@
import shutil
import unittest
from pathlib import Path
from docreader.parser.ppt_convert import (
convert_ppt_to_pptx_bytes,
is_ole_compound,
is_zip_openxml,
needs_ppt_to_pptx_conversion,
normalize_ppt_bytes,
)
# Fixture root: the "testdata/rag_test" folder under the third ancestor of this file.
TESTDATA = Path(__file__).resolve().parents[2] / "testdata" / "rag_test"
# Legacy .ppt sample used by the magic-byte and conversion tests.
LEGACY_PPT = TESTDATA / "ppt_old" / "en_38256.ppt"
# The WMF-image test uses the same file as LEGACY_PPT.
WMF_IMAGE_PPT = LEGACY_PPT
# Second legacy .ppt sample, used by the image-extraction test.
IMAGE_HEAVY_PPT = TESTDATA / "ppt_old" / "en_41384.ppt"
# .pptx sample used by the no-conversion / passthrough tests.
PPTX_SAMPLE = TESTDATA / "pptx" / "en_marker.pptx"
class TestPptConvert(unittest.TestCase):
    """Tests for legacy .ppt detection and conversion to .pptx.

    Every fixture-backed case skips, rather than erroring with
    FileNotFoundError, when its sample file under testdata/rag_test is absent.
    Cases that need external tools also skip when ``soffice`` or ``convert``
    is not on PATH.
    """

    def _fixture_bytes(self, path):
        """Return the bytes of fixture ``path``, skipping the test if it is missing."""
        if not path.is_file():
            self.skipTest("testdata missing")
        return path.read_bytes()

    def test_legacy_ppt_magic(self):
        """A legacy .ppt is an OLE compound file, not a zip, and needs conversion."""
        content = self._fixture_bytes(LEGACY_PPT)
        self.assertTrue(is_ole_compound(content))
        self.assertFalse(is_zip_openxml(content))
        self.assertTrue(needs_ppt_to_pptx_conversion(content, "ppt"))

    def test_pptx_does_not_need_conversion(self):
        """A .pptx is detected as zip/OpenXML and is not flagged for conversion."""
        content = self._fixture_bytes(PPTX_SAMPLE)
        self.assertTrue(is_zip_openxml(content))
        self.assertFalse(needs_ppt_to_pptx_conversion(content, "pptx"))

    def test_normalize_pptx_passthrough(self):
        """Normalizing a .pptx returns the same bytes with a .pptx extension."""
        content = self._fixture_bytes(PPTX_SAMPLE)
        out, ext = normalize_ppt_bytes(content, "pptx")
        self.assertEqual(out, content)
        self.assertEqual(ext, ".pptx")

    def test_legacy_ppt_requires_soffice(self):
        """Without soffice, normalization raises; with it, output is .pptx."""
        content = self._fixture_bytes(LEGACY_PPT)
        if not shutil.which("soffice"):
            # No converter installed: the error must name LibreOffice.
            # The rest of the test cannot run, so it is reported as skipped.
            with self.assertRaises(ValueError) as ctx:
                normalize_ppt_bytes(content, "ppt")
            self.assertIn("LibreOffice", str(ctx.exception))
            self.skipTest("LibreOffice not available")
        converted = convert_ppt_to_pptx_bytes(content, suffix=".ppt")
        self.assertIsNotNone(converted)
        self.assertTrue(is_zip_openxml(converted))
        out, ext = normalize_ppt_bytes(content, "ppt")
        self.assertEqual(ext, ".pptx")
        self.assertTrue(is_zip_openxml(out))

    def test_wmf_legacy_ppt_extracts_rasterized_image(self):
        """The WMF sample yields exactly one image referenced under images/."""
        if not shutil.which("soffice"):
            self.skipTest("LibreOffice not available")
        if not shutil.which("convert"):
            self.skipTest("ImageMagick convert not available")
        content = self._fixture_bytes(WMF_IMAGE_PPT)
        from docreader.parser.markitdown_parser import MarkitdownParser

        doc = MarkitdownParser(file_type="ppt").parse_into_text(content)
        self.assertEqual(len(doc.images), 1)
        self.assertNotIn("bd10496_.jpg", doc.content)
        self.assertIn("images/", doc.content)

    def test_image_heavy_legacy_ppt_extracts_images(self):
        """The image-heavy sample yields at least two images, all under images/."""
        if not shutil.which("soffice"):
            self.skipTest("LibreOffice not available")
        content = self._fixture_bytes(IMAGE_HEAVY_PPT)
        from docreader.parser.markitdown_parser import MarkitdownParser

        doc = MarkitdownParser(file_type="ppt").parse_into_text(content)
        self.assertGreaterEqual(len(doc.images), 2)
        self.assertNotIn("![](.jpg)", doc.content)
        for ref in doc.images:
            self.assertTrue(ref.startswith("images/"))
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,42 @@
import unittest
from docreader.parser.web_parser import (
build_visible_text_fallback,
extract_markdown_from_html,
)
class TestWebParserHelpers(unittest.TestCase):
    """Checks for the HTML-to-markdown and visible-text fallback helpers."""

    def test_extract_markdown_empty_html(self):
        """Empty or whitespace-only HTML produces no markdown."""
        for blank in ("", " "):
            self.assertIsNone(extract_markdown_from_html(blank))

    def test_extract_markdown_article_html(self):
        """An article with a heading and a paragraph yields markdown with the heading."""
        html = (
            "\n"
            "<html><head><title>Demo</title></head><body>\n"
            "<article><h1>Hello</h1><p>World paragraph with enough text for extraction.</p></article>\n"
            "</body></html>\n"
        )
        markdown = extract_markdown_from_html(html)
        self.assertIsNotNone(markdown)
        self.assertIn("Hello", markdown)

    def test_build_fallback_too_short(self):
        """Very short or empty visible text gives no fallback."""
        for snippet in ("short", ""):
            self.assertIsNone(build_visible_text_fallback(snippet))

    def test_build_fallback_with_title(self):
        """With a page title, the fallback starts with it as an H1 and keeps the text."""
        body = "A" * 60
        markdown = build_visible_text_fallback(body, page_title="WeKnora")
        self.assertIsNotNone(markdown)
        self.assertTrue(markdown.startswith("# WeKnora"))
        self.assertIn(body, markdown)

    def test_build_fallback_without_title(self):
        """With an empty title, the fallback is exactly the visible text."""
        body = "B" * 60
        markdown = build_visible_text_fallback(body, page_title="")
        self.assertEqual(markdown, body)
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
unittest.main()

50
docreader/uv.lock generated
View File

@@ -463,9 +463,11 @@ dependencies = [
{ name = "grpcio" },
{ name = "grpcio-health-checking" },
{ name = "grpcio-tools" },
{ name = "liteparse" },
{ name = "lxml" },
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
{ name = "opendataloader-pdf" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "pillow" },
{ name = "playwright" },
{ name = "protobuf" },
@@ -476,6 +478,7 @@ dependencies = [
{ name = "requests" },
{ name = "textract" },
{ name = "trafilatura" },
{ name = "xlrd" },
]
[package.metadata]
@@ -484,9 +487,11 @@ requires-dist = [
{ name = "grpcio", specifier = ">=1.78.0" },
{ name = "grpcio-health-checking", specifier = ">=1.78.0" },
{ name = "grpcio-tools", specifier = ">=1.78.0" },
{ name = "liteparse", specifier = ">=2.0.4" },
{ name = "lxml", specifier = ">=6.1.0" },
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
{ name = "opendataloader-pdf", specifier = ">=2.4.7" },
{ name = "openpyxl", specifier = ">=3.1.0" },
{ name = "pandas", specifier = ">=2.0.0" },
{ name = "pillow", specifier = ">=12.0.0" },
{ name = "playwright", specifier = ">=1.55.0" },
{ name = "protobuf", specifier = ">=6.33.0" },
@@ -497,6 +502,7 @@ requires-dist = [
{ name = "requests", specifier = ">=2.32.5" },
{ name = "textract", specifier = "==1.5.0" },
{ name = "trafilatura", specifier = ">=2.0.0" },
{ name = "xlrd", specifier = ">=2.0.0" },
]
[[package]]
@@ -786,37 +792,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
]
[[package]]
name = "liteparse"
version = "2.0.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0d/e7/ecf68643604a59247a0a7b2f8c73bee7415ea99e0165bb32e2838ddd0d3f/liteparse-2.0.4.tar.gz", hash = "sha256:17f6119f38e80b956c1ce3dc998ea7b0a8e80777ce1f49178f2b14bb17b35a9c", size = 115487, upload-time = "2026-05-30T06:32:12.351Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/b0/4f5007a52ef13679437a892a06ea58448b825de7ea78276e19b9d7fb9dcb/liteparse-2.0.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6df1e1199ffbeb2191bb64d7fcbff6af6bdfd1592973e0ad67a82eb09d377c08", size = 13027870, upload-time = "2026-05-30T06:31:11.022Z" },
{ url = "https://files.pythonhosted.org/packages/83/2f/c7977a2d6f376e31c8c465ee010c238e27e06cbb2c3200d63f41983e40db/liteparse-2.0.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:35a72946b965d3b6b87a602051919e7ce243da15ad143d301152fb5e8cd0f6d2", size = 13149255, upload-time = "2026-05-30T06:31:13.636Z" },
{ url = "https://files.pythonhosted.org/packages/6b/9d/e7f1a1b8cb14ac867b1220fdb0c87bfe07b86c69bf98578573ab37b1a103/liteparse-2.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:c1bbc8b7206b8bfbf7aabc5341d2cf851b7464641d58375bd218b4e1dd3517f9", size = 11115466, upload-time = "2026-05-30T06:31:16.201Z" },
{ url = "https://files.pythonhosted.org/packages/9c/2d/be89a429a6a6bc78ce8d620974a4f8fbe9f566ea3592a2f1da8dc6bdda4a/liteparse-2.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:537ab6347a384f81980e48cc181d6cd33fc6ad2b7478e3db61350076744d952e", size = 11029024, upload-time = "2026-05-30T06:31:19.11Z" },
{ url = "https://files.pythonhosted.org/packages/a9/c8/7429622d86bf00ceaec95bf211adf1c9a7bdf46f8c2cd806685f9c02c0f1/liteparse-2.0.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:feae0c530197130cb38f176d718eeae639d9091264aa5f954835986c59470813", size = 13028074, upload-time = "2026-05-30T06:31:21.725Z" },
{ url = "https://files.pythonhosted.org/packages/21/d0/a97174ae281d353251994ed080c8855ea9b0b5d81a60ab3b6b065e911c49/liteparse-2.0.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2012a3a9b5a3f7e13ce34b5a770158971da43bb9d266c7c5a3ea62bdda7ca851", size = 13148977, upload-time = "2026-05-30T06:31:24.498Z" },
{ url = "https://files.pythonhosted.org/packages/60/48/f41ebe428d8d8d70c53ddd47523baa7300c5cc96e404417d7af25578be01/liteparse-2.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:2d05d10f0d14b1beb34ef8c5e9a14d6cc966adf19f60c7ea1ec5717adc4c986f", size = 11115791, upload-time = "2026-05-30T06:31:26.961Z" },
{ url = "https://files.pythonhosted.org/packages/ea/58/be78c7c47147aeb1350d475336c6c2e17d5aa513be9244e9d95a170ced34/liteparse-2.0.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:87680616fae276b04ace6e5fc5e4e0c93980391b0d46c2d66d72c0742a3cb19e", size = 11026045, upload-time = "2026-05-30T06:31:29.405Z" },
{ url = "https://files.pythonhosted.org/packages/86/1f/105ccdd9bc4608a836fe409394d68e8765e699fa7393c2f2f464c612057f/liteparse-2.0.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:414599a922aa51f567fa939183929579d1668ef74846fe25f7f46742bb31fcd8", size = 13022571, upload-time = "2026-05-30T06:31:32.321Z" },
{ url = "https://files.pythonhosted.org/packages/58/9f/4bf4e9b112b47025ae085503fe9cbf13631673ffc41bfb864a3091285c22/liteparse-2.0.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:efdaa6b471084a1f4594574555eb6abb5f85de25f2155c8d539542239eacaa56", size = 13146871, upload-time = "2026-05-30T06:31:34.705Z" },
{ url = "https://files.pythonhosted.org/packages/ff/f0/bf10611e409732bd4e19f0fc0faf3194040e8e09bb75a166ee126d09b70f/liteparse-2.0.4-cp312-cp312-win_amd64.whl", hash = "sha256:fb67326ba957388214762acea35d24cf0d1230ae6a2fe1fdeaf74024e92e3c40", size = 11116682, upload-time = "2026-05-30T06:31:36.992Z" },
{ url = "https://files.pythonhosted.org/packages/6e/b5/02ed5fff6418fdc970688190eab4470f4f9c116f4de1e39a7deea0d9968a/liteparse-2.0.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8f02c6e0d8f71da671a3527d52d8f1e2c42fddebf81d1b4931c3d035e4ec1e6a", size = 11025231, upload-time = "2026-05-30T06:31:39.696Z" },
{ url = "https://files.pythonhosted.org/packages/be/d7/b4633483502940d43d583f8057e0aed68b9091087a86d021f8bd7558ba0b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:924e3f669341e22e625b13d08535644d1dfd779bc6781e4ab6f6e54ea90a53d6", size = 13022754, upload-time = "2026-05-30T06:31:42.434Z" },
{ url = "https://files.pythonhosted.org/packages/8a/e0/3938561ad66d4a216922c8e1e6a878f63df82ce5f00f15a935f779fb7c5b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:137f169002f3abe21e3dd2e6781fbd86841096a0f3b0162afc1fd64eb21fa607", size = 13146432, upload-time = "2026-05-30T06:31:46.706Z" },
{ url = "https://files.pythonhosted.org/packages/e6/7f/a2017df8031677d7940ad1ce33640219aa28defae4a8171844ea8bed68ca/liteparse-2.0.4-cp313-cp313-win_amd64.whl", hash = "sha256:098fba3ecb2337f78426d9e077d1f70bc75871d4387ab8c3774b0cc5d26b890d", size = 11116383, upload-time = "2026-05-30T06:31:49.546Z" },
{ url = "https://files.pythonhosted.org/packages/59/63/b2bb03bc30103e93c87695f63eae3ed007b08796a6cc06ea29acace54c4a/liteparse-2.0.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8aeaf821151aaaa854294f3499d64264dbea7d10e682fa9a2443f9177cd444c6", size = 11024196, upload-time = "2026-05-30T06:31:52.091Z" },
{ url = "https://files.pythonhosted.org/packages/b9/c1/6dedc6b4325aa8de3249694123a74bc9506e0d65a28c85aa5fad4bfdea5c/liteparse-2.0.4-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:d2efbaf7453d2bedc86db51b2b808078567817d7fc537122389b65a317927902", size = 13022936, upload-time = "2026-05-30T06:31:54.619Z" },
{ url = "https://files.pythonhosted.org/packages/7a/04/7e7c3a8edd01c9904b6eef76bf4a008f987a5df64b8334c61e742861ac84/liteparse-2.0.4-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:34c53d9cefa35f77dc67a19a875e6dca32b4f35006c2015a22eb30c9c810653b", size = 13146821, upload-time = "2026-05-30T06:31:57.284Z" },
{ url = "https://files.pythonhosted.org/packages/66/f4/da191e881cad5941dc0065782497eb81027bc3f48ac0a3143deab094be33/liteparse-2.0.4-cp314-cp314-win_amd64.whl", hash = "sha256:6546ee0359dc56eebd9f45008bb59708118c234140ecf466f6c7121d9161d9e4", size = 11114558, upload-time = "2026-05-30T06:31:59.799Z" },
{ url = "https://files.pythonhosted.org/packages/16/59/c554f376c0bdd1bf4c313ac5d77a34817740f021ab6ada9d3226a23fa4b6/liteparse-2.0.4-cp315-cp315-manylinux_2_28_aarch64.whl", hash = "sha256:7c02d0bb31cd5aefa3297ce6e58388abd6f3e109c62ac0fdeef07d8eac4b769e", size = 13023454, upload-time = "2026-05-30T06:32:02.464Z" },
{ url = "https://files.pythonhosted.org/packages/89/96/04c595ab45162d81bc73218870d1459560428c3f40957e594a6c1c5ea2be/liteparse-2.0.4-cp315-cp315-manylinux_2_28_x86_64.whl", hash = "sha256:acdf3c76cb3215f8d389a935b6b68007fac2ffa9ce0b681dd53650b69d580521", size = 13146859, upload-time = "2026-05-30T06:32:05.175Z" },
{ url = "https://files.pythonhosted.org/packages/97/9c/59cdd88ebc6c27312ea6cbd0a894002e78b6f8a3dead2b2bf60d7febba85/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4cf31cb3987df1190e59b73d9f10976e538ff577f41c40281fd14b84fe4f9da1", size = 13030767, upload-time = "2026-05-30T06:32:07.9Z" },
{ url = "https://files.pythonhosted.org/packages/7d/ae/9b85e510ddb390ed63b407851d412152b7006487d06703d931f6a0b1414e/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:a1f9cb9c24f2df0d4f71ddd66ddb474bfdec8a434ecc1428b791f83aab2a688b", size = 13152103, upload-time = "2026-05-30T06:32:10.457Z" },
]
[[package]]
name = "lxml"
version = "6.1.0"
@@ -1294,6 +1269,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
]
[[package]]
name = "opendataloader-pdf"
version = "2.4.7"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/1d/5a/98cd2079f2828f7886ee447eae21ee60a858930596aebcc8d275a1fe2b12/opendataloader_pdf-2.4.7.tar.gz", hash = "sha256:a16e995f2f526d706045218d9e359a31f50371a0bc0e3bb1bc15abb467c08fb7", size = 22554865, upload-time = "2026-05-27T10:04:54.285Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/14/f897eabf04eab4e6a40dce9214d921558165f3eaed68335892a5b1a004d0/opendataloader_pdf-2.4.7-py3-none-any.whl", hash = "sha256:1c359183650f4c012875010c156f13b6d3477b00762b8e3fbd8479fa03feb628", size = 22568934, upload-time = "2026-05-27T10:04:49.902Z" },
]
[[package]]
name = "openpyxl"
version = "3.1.5"

View File

@@ -86,6 +86,15 @@ export interface ParserEngineConfig {
mineru_cloud_enable_table?: boolean | null
mineru_cloud_enable_ocr?: boolean | null
mineru_cloud_language?: string
// PaddleOCR-VL self-hosted parameters
paddleocr_vl_endpoint?: string
paddleocr_vl_use_seal_recognition?: boolean | null
paddleocr_vl_use_chart_recognition?: boolean | null
// PaddleOCR-VL cloud API parameters
paddleocr_vl_cloud_token?: string
paddleocr_vl_cloud_model?: string
paddleocr_vl_cloud_use_seal_recognition?: boolean | null
paddleocr_vl_cloud_use_chart_recognition?: boolean | null
}
export interface ParserEnginesResponse {

View File

@@ -11,6 +11,7 @@ import { onMounted, ref, nextTick, onUnmounted, watch, computed } from "vue";
import { downKnowledgeDetails, deleteGeneratedQuestion, getChunkByIdOnly, previewKnowledgeFile } from "@/api/knowledge-base/index";
import { MessagePlugin, DialogPlugin } from "tdesign-vue-next";
import { sanitizeHTML, safeMarkdownToHTML, createSafeImage, isValidImageURL, hydrateProtectedFileImages, isValidURL } from '@/utils/security';
import { normalizeSpuriousTablePrefixes } from '@/utils/markdownTableNormalize';
import { openMermaidFullscreen } from '@/utils/mermaidViewer';
import { useI18n } from 'vue-i18n';
import { useAuthStore } from '@/stores/auth';
@@ -283,8 +284,15 @@ let page = 1;
// True while a chunk page request issued from this component is outstanding.
let loadingChunks = false;
// Page number of the outstanding request, or null when none is pending.
let pendingRequestedPage: number | null = null;
// Number of chunks already loaded when the outstanding request was issued.
let pendingChunksBeforeLoad = 0;
let doc = null;
// Chunks per page; used to derive the page count from details.total.
const CHUNK_PAGE_SIZE = 25;
/** Scroll container for the main doc drawer (not the first .t-drawer__body on the page). */
let docScrollEl: HTMLElement | null = null;
let mdContentWrap = ref()
// Drawer uses attach="body", so markdown nodes live outside mdContentWrap in the DOM.
const docMarkdownRoot = ref<HTMLElement | null>(null)
// Root to search for rendered markdown: docMarkdownRoot if set, else mdContentWrap, else null.
const getMarkdownRenderRoot = (): ParentNode | null =>
docMarkdownRoot.value ?? (mdContentWrap.value as ParentNode | null) ?? null
let url = ref('')
// View mode: chunks / merged / preview
// "file" type defaults to "preview"; URL / manually created entries default to "full text"
@@ -365,18 +373,40 @@ const mergeChunks = (chunks: any[]): string => {
return merged;
};
/** Look up the body element of the main document drawer; null when it is not in the DOM. */
const findDocDrawerScrollEl = (): HTMLElement | null => {
  const drawerBody = document.querySelector('.doc-main-drawer .t-drawer__body');
  return drawerBody as HTMLElement | null;
};
/** Detach the scroll listener from the tracked drawer body, if any, and forget it. */
const unbindDrawerScroll = () => {
  if (!docScrollEl) return;
  docScrollEl.removeEventListener('scroll', handleDetailsScroll);
  docScrollEl = null;
};
/** Re-resolve the drawer body and attach a passive scroll listener to it. */
const bindDrawerScroll = () => {
  // Drop any previous binding first so the listener is never attached twice.
  unbindDrawerScroll();
  docScrollEl = findDocDrawerScrollEl();
  docScrollEl?.addEventListener('scroll', handleDetailsScroll, { passive: true });
};
// Restore persisted drawer widths and track window resizes.
// Scroll binding is not done here: the props.visible watcher calls
// bindDrawerScroll() so the listener targets the main doc drawer rather
// than whichever .t-drawer__body comes first on the page.
onMounted(() => {
  loadTraceDrawerWidth();
  loadMainDrawerWidth();
  window.addEventListener('resize', onTraceDrawerWindowResize, { passive: true });
});
// Bind the drawer scroll listener when the drawer opens; release it on close.
watch(() => props.visible, (isOpen) => {
  if (!isOpen) {
    unbindDrawerScroll();
    return;
  }
  // Wait one tick so the drawer body exists before it is looked up.
  nextTick(() => {
    bindDrawerScroll();
    maybeLoadMoreChunks();
  });
});
watch(() => props.details?.id, () => {
page = 1;
loadingChunks = false;
@@ -396,15 +426,16 @@ watch(() => props.details?.chunkLoading, (val) => {
pendingRequestedPage = null;
pendingChunksBeforeLoad = 0;
loadingChunks = false;
if (props.visible) {
nextTick(() => maybeLoadMoreChunks());
}
}
});
onUnmounted(() => {
window.removeEventListener('resize', onTraceDrawerWindowResize);
cleanupTraceDrawerResize();
cleanupMainDrawerResize();
if (doc) {
doc.removeEventListener('scroll', handleDetailsScroll);
}
unbindDrawerScroll();
if (audioBlobUrl.value) {
URL.revokeObjectURL(audioBlobUrl.value);
}
@@ -561,7 +592,10 @@ const loadAudioPreview = async () => {
};
const runMarkdownPostRenderPipeline = async () => {
await nextTick();
const renderRoot = mdContentWrap.value as ParentNode;
const renderRoot = getMarkdownRenderRoot();
if (!renderRoot) {
return;
}
await hydrateProtectedFileImages(renderRoot);
const images = renderRoot?.querySelectorAll?.('img.markdown-image') as NodeListOf<HTMLImageElement> | undefined;
if (images) {
@@ -576,26 +610,29 @@ const runMarkdownPostRenderPipeline = async () => {
await renderMermaidDiagrams();
};
// Re-run the post-render pipeline (protected images, Mermaid) whenever the
// chunk list changes. flush: 'post' runs the callback after the DOM update.
watch(() => props.details.md, () => {
  runMarkdownPostRenderPipeline();
}, { immediate: true, deep: true, flush: 'post' })

// Switching to a markdown-backed view re-runs the pipeline. The chunks view
// also checks whether another page must be prefetched.
watch(() => viewMode.value, (mode) => {
  if ((mode === 'chunks' || mode === 'merged') && props.visible) {
    runMarkdownPostRenderPipeline();
    if (mode === 'chunks') {
      nextTick(() => maybeLoadMoreChunks());
    }
  }
}, { flush: 'post' });

// Opening the drawer while a markdown-backed view is active re-runs the pipeline.
watch(() => props.visible, (visible) => {
  if (visible && (viewMode.value === 'chunks' || viewMode.value === 'merged')) {
    runMarkdownPostRenderPipeline();
  }
}, { flush: 'post' });
// 渲染 Mermaid 图表的函数
const renderMermaidDiagrams = async () => {
try {
const mermaidElements = mdContentWrap.value?.querySelectorAll('.mermaid');
const mermaidElements = getMarkdownRenderRoot()?.querySelectorAll('.mermaid');
console.log('[Mermaid] Found mermaid elements:', mermaidElements?.length);
if (mermaidElements && mermaidElements.length > 0) {
await mermaid.run({
@@ -624,12 +661,13 @@ const handleMermaidClick = (e: Event) => {
// 为 Mermaid 容器绑定点击全屏事件(绑定在 div 上,不是 SVG 上)
const bindMermaidClickEvents = () => {
if (!mdContentWrap.value) {
console.log('[Mermaid] mdContentWrap is null');
const renderRoot = getMarkdownRenderRoot();
if (!renderRoot) {
console.log('[Mermaid] markdown render root is null');
return;
}
// 绑定在 .mermaid div 上,而不是 SVG 上
const mermaidDivs = mdContentWrap.value.querySelectorAll('.mermaid');
const mermaidDivs = renderRoot.querySelectorAll('.mermaid');
console.log('[Mermaid] Found mermaid divs:', mermaidDivs.length);
mermaidDivs.forEach((div, index) => {
const divEl = div as HTMLElement;
@@ -663,6 +701,9 @@ const processMarkdown = (markdownText) => {
// 处理被 <p> 包裹的表格行,转换为正常的表格行,并在前后补空行
processedText = processedText.replace(/<p>\s*(\|[\s\S]*?\|)\s*<\/p>/gi, '\n$1\n');
// MarkItDown 常在表格前插入空行 + 分隔行,渲染会出现多余空行
processedText = normalizeSpuriousTablePrefixes(processedText);
// 保留表格单元格中的 <br>,不转成换行,避免打散表格;其他区域原样交给 marked 处理
// 先预处理数学定界符,再做安全预处理
@@ -683,7 +724,8 @@ const processMarkdown = (markdownText) => {
};
/** Close the drawer, reset its scroll position, and return to the merged view. */
const handleClose = () => {
  emit("closeDoc", false);
  // Use the tracked scroll element; fall back to a fresh lookup if none is bound.
  const scrollEl = docScrollEl || findDocDrawerScrollEl();
  if (scrollEl) scrollEl.scrollTop = 0;
  viewMode.value = 'merged';
};
@@ -973,19 +1015,41 @@ const downloadFile = () => {
MessagePlugin.error(t('file.downloadFailed'));
});
};
/**
 * Ask the parent for the next page of chunks via the 'getDoc' event.
 * Does nothing while a load is in flight, when everything is loaded,
 * or when the next page would exceed the page count derived from details.total.
 */
const requestNextChunkPage = () => {
  const busy = loadingChunks || props.details?.chunkLoading;
  if (busy) return;

  const totalChunks = props.details?.total ?? 0;
  const loadedChunks = props.details?.md?.length ?? 0;
  if (loadedChunks >= totalChunks || totalChunks === 0) return;

  const lastPage = Math.ceil(totalChunks / CHUNK_PAGE_SIZE);
  const nextPage = page + 1;
  if (nextPage > lastPage) return;

  // Record the request before emitting.
  page = nextPage;
  loadingChunks = true;
  pendingRequestedPage = nextPage;
  pendingChunksBeforeLoad = loadedChunks;
  emit('getDoc', nextPage);
};
/**
 * Prefetch another page when the content does not overflow the drawer.
 * Without overflow no scroll event fires, so scrolling alone would never load more.
 */
const maybeLoadMoreChunks = () => {
  const idle = props.visible && !loadingChunks && !props.details?.chunkLoading;
  if (!idle) return;

  const container = docScrollEl || findDocDrawerScrollEl();
  if (!container) return;

  const loadedChunks = props.details?.md?.length ?? 0;
  const totalChunks = props.details?.total ?? 0;
  if (loadedChunks >= totalChunks) return;

  // 8px slack, the same tolerance the scroll handler uses.
  const fitsWithoutScrolling = container.scrollHeight <= container.clientHeight + 8;
  if (fitsWithoutScrolling) {
    requestNextChunkPage();
  }
};
/**
 * Scroll handler for the document drawer. When the user is within 8px of
 * the bottom, request the next chunk page. Paging rules (page count,
 * in-flight guard) live in requestNextChunkPage.
 */
const handleDetailsScroll = () => {
  if (loadingChunks || props.details?.chunkLoading) return;
  const el = docScrollEl || findDocDrawerScrollEl();
  if (!el) return;
  const { scrollTop, scrollHeight, clientHeight } = el;
  if (scrollTop + clientHeight >= scrollHeight - 8) {
    requestNextChunkPage();
  }
};
</script>
@@ -1052,6 +1116,7 @@ const handleDetailsScroll = () => {
</div>
</t-drawer>
<div ref="docMarkdownRoot" class="doc-markdown-root">
<!-- URL类型专属区域保留source 是真实链接,不与标题重复) -->
<div v-if="details.type === 'url'" class="url_box">
<span class="label">{{ $t('knowledgeBase.urlSource') }}</span>
@@ -1203,6 +1268,7 @@ const handleDetailsScroll = () => {
<DocumentPreview :knowledgeId="details.id" :fileType="details.file_type" :fileName="details.title"
:active="viewMode === 'preview'" />
</div>
</div>
</t-drawer>
</div>

View File

@@ -951,6 +951,8 @@ export default {
selfHostedEndpoint: 'Self-hosted Endpoint',
formulaRecognition: 'Formula Recognition',
tableRecognition: 'Table Recognition',
sealRecognition: 'Seal Recognition',
chartRecognition: 'Chart Recognition',
language: 'Language',
testConnection: 'Test Connection',
saveConfig: 'Save Configuration',
@@ -971,6 +973,9 @@ export default {
serverUrl: 'Server URL',
vlmServerUrlPlaceholder: 'e.g. http://your-vllm-server:8000',
vlmServerUrlHint: 'Required when Backend is vlm-http-client or hybrid-http-client',
paddleocrVlEndpointPlaceholder: 'e.g. http://your-paddleocr-vl:8080',
paddleocrVlEndpointHint: 'Base URL of the full PaddleOCR-VL pipeline service; no /layout-parsing suffix needed',
paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
},
storage: {
title: 'Storage Engine',
@@ -2814,6 +2819,9 @@ export default {
max_owned_per_user: 'Max tenants owned per user',
default_storage_quota_gb: 'Default storage quota for new tenants (GB)',
},
asynq: {
concurrency: 'Async task worker concurrency',
},
},
enumLabels: {
auth: {
@@ -4240,6 +4248,14 @@ export default {
name: 'MinerU Cloud',
desc: 'MinerU Cloud API',
},
paddleocr_vl: {
name: 'PaddleOCR-VL',
desc: 'PaddleOCR-VL self-hosted service',
},
paddleocr_vl_cloud: {
name: 'PaddleOCR-VL Cloud',
desc: 'PaddleOCR-VL Cloud API',
},
weknoracloud: {
name: 'WeKnora Cloud',
desc: 'Document parsing via WeKnora Cloud',

View File

@@ -811,6 +811,8 @@ export default {
selfHostedEndpoint: '자체 호스팅 엔드포인트',
formulaRecognition: '수식 인식',
tableRecognition: '표 인식',
sealRecognition: '인장 인식',
chartRecognition: '차트 인식',
language: '언어',
testConnection: '연결 테스트',
saveConfig: '설정 저장',
@@ -831,6 +833,9 @@ export default {
serverUrl: '서버 URL',
vlmServerUrlPlaceholder: '예: http://your-vllm-server:8000',
vlmServerUrlHint: 'Backend가 vlm-http-client 또는 hybrid-http-client인 경우 필요',
paddleocrVlEndpointPlaceholder: '예: http://your-paddleocr-vl:8080',
paddleocrVlEndpointHint: 'PaddleOCR-VL 전체 서비스(pipeline) 주소를 입력하세요. /layout-parsing 접미사는 불필요합니다',
paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
},
storage: {
title: '스토리지 엔진',
@@ -2053,6 +2058,9 @@ export default {
max_owned_per_user: "사용자당 최대 테넌트 수",
default_storage_quota_gb: "신규 테넌트 기본 저장 용량 (GB)",
},
asynq: {
concurrency: "비동기 작업 워커 동시 처리 수",
},
},
enumLabels: {
auth: {
@@ -4302,6 +4310,14 @@ export default {
name: 'MinerU Cloud',
desc: 'MinerU Cloud API',
},
paddleocr_vl: {
name: 'PaddleOCR-VL',
desc: 'PaddleOCR-VL 자체 호스팅 서비스',
},
paddleocr_vl_cloud: {
name: 'PaddleOCR-VL Cloud',
desc: 'PaddleOCR-VL Cloud API',
},
weknoracloud: {
name: 'WeKnora Cloud',
desc: 'WeKnora Cloud를 통한 문서 파싱',

View File

@@ -866,6 +866,8 @@ export default {
selfHostedEndpoint: 'Собственная конечная точка',
formulaRecognition: 'Распознавание формул',
tableRecognition: 'Распознавание таблиц',
sealRecognition: 'Распознавание печатей',
chartRecognition: 'Распознавание диаграмм',
language: 'Язык',
testConnection: 'Проверить с текущими параметрами',
saveConfig: 'Сохранить конфигурацию',
@@ -882,7 +884,10 @@ export default {
languagePlaceholder: 'напр. ch, en, ja (по умолчанию ch)',
mineruCloudApiKeyPlaceholder: 'MinerU Cloud API Key',
vlmLabel: 'vlm (визуальная языковая модель)',
mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)'
mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)',
paddleocrVlEndpointPlaceholder: 'напр. http://your-paddleocr-vl:8080',
paddleocrVlEndpointHint: 'Адрес полного сервиса PaddleOCR-VL (pipeline); суффикс /layout-parsing не требуется',
paddleocrVlCloudTokenPlaceholder: 'Токен PaddleOCR-VL AI Studio'
},
storage: {
title: 'Хранилище',
@@ -1772,6 +1777,9 @@ export default {
max_owned_per_user: 'Максимум тенантов на пользователя',
default_storage_quota_gb: 'Квота хранилища для новых тенантов по умолчанию (ГБ)',
},
asynq: {
concurrency: 'Параллелизм воркеров асинхронных задач',
},
},
enumLabels: {
auth: {
@@ -3802,6 +3810,14 @@ export default {
name: 'MinerU Cloud',
desc: 'MinerU Cloud API',
},
paddleocr_vl: {
name: 'PaddleOCR-VL',
desc: 'Самостоятельно развёрнутый сервис PaddleOCR-VL',
},
paddleocr_vl_cloud: {
name: 'PaddleOCR-VL Cloud',
desc: 'PaddleOCR-VL Cloud API',
},
weknoracloud: {
name: 'WeKnora Cloud',
desc: 'Парсинг документов через WeKnora Cloud',

View File

@@ -807,6 +807,8 @@ export default {
selfHostedEndpoint: "自建端点",
formulaRecognition: "公式识别",
tableRecognition: "表格识别",
sealRecognition: "印章识别",
chartRecognition: "图表识别",
language: "语言",
testConnection: "测试连接",
saveConfig: "保存配置",
@@ -827,6 +829,9 @@ export default {
serverUrl: "服务器地址",
vlmServerUrlPlaceholder: "如 http://your-vllm-server:8000",
vlmServerUrlHint: "当 Backend 选择 vlm-http-client 或 hybrid-http-client 时需要填写",
paddleocrVlEndpointPlaceholder: "如 http://your-paddleocr-vl:8080",
paddleocrVlEndpointHint: "填写 PaddleOCR-VL 完整服务pipeline地址无需 /layout-parsing 后缀",
paddleocrVlCloudTokenPlaceholder: "PaddleOCR-VL 飞桨星河社区 Token",
},
storage: {
title: "存储引擎",
@@ -2032,6 +2037,9 @@ export default {
max_owned_per_user: "每用户最大租户数",
default_storage_quota_gb: "新租户默认存储配额 (GB)",
},
asynq: {
concurrency: "异步任务并发数",
},
},
enumLabels: {
auth: {
@@ -4234,6 +4242,14 @@ export default {
name: "MinerU Cloud",
desc: "MinerU Cloud API",
},
paddleocr_vl: {
name: "PaddleOCR-VL",
desc: "PaddleOCR-VL 自部署服务",
},
paddleocr_vl_cloud: {
name: "PaddleOCR-VL Cloud",
desc: "PaddleOCR-VL 云 API",
},
weknoracloud: {
name: "WeKnora Cloud",
desc: "使用 WeKnora Cloud 进行文档解析",

View File

@@ -0,0 +1,72 @@
/**
 * Matches a GFM delimiter-row cell: one or more hyphens with an optional
 * leading and/or trailing colon (-, ---, :---, ---:, :-:).
 * GFM does not require three hyphens, so short delimiter rows such as
 * `| :-: | - |` must be recognised too; otherwise they are mistaken for data
 * rows and a second separator gets injected after the header.
 */
const SEPARATOR_CELL = /^:?-+:?$/;
/**
 * Split a Markdown table row into trimmed cell strings.
 *
 * Returns an empty array when the (trimmed) line does not start with `|`.
 * The empty fragments produced by the leading and trailing pipe are dropped.
 * A backslash escapes the following character, so `\|` stays inside its cell
 * instead of starting a new one (GFM table rule); without this the generated
 * separator row could end up with more columns than the header.
 */
function splitRowCells(line: string): string[] {
  const inner = line.trim();
  if (!inner.startsWith('|')) {
    return [];
  }

  // Manual scan instead of a lookbehind regex to stay compatible with
  // engines that lack lookbehind support.
  let parts: string[] = [];
  let current = '';
  for (let i = 0; i < inner.length; i += 1) {
    const ch = inner[i];
    if (ch === '\\' && i + 1 < inner.length) {
      // Keep the escape sequence verbatim and skip the escaped character.
      current += ch + inner[i + 1];
      i += 1;
    } else if (ch === '|') {
      parts.push(current);
      current = '';
    } else {
      current += ch;
    }
  }
  parts.push(current);

  if (parts.length && parts[0].trim() === '') {
    parts = parts.slice(1);
  }
  if (parts.length && parts[parts.length - 1].trim() === '') {
    parts = parts.slice(0, -1);
  }
  return parts.map((part) => part.trim());
}
/**
 * A line counts as a table row when, after trimming, it starts with a pipe
 * and contains at least one more pipe after that first character.
 */
function isTableRow(line: string): boolean {
  const text = line.trim();
  if (!text.startsWith('|')) {
    return false;
  }
  return text.indexOf('|', 1) !== -1;
}
/** True when the row has at least one cell and every cell is a delimiter cell. */
function isSeparatorRow(line: string): boolean {
  const cells = splitRowCells(line);
  if (cells.length === 0) {
    return false;
  }
  for (const cell of cells) {
    if (!SEPARATOR_CELL.test(cell)) {
      return false;
    }
  }
  return true;
}
/** True when the row has at least one cell and all of its cells are blank. */
function isEmptyRow(line: string): boolean {
  const cells = splitRowCells(line);
  if (cells.length === 0) {
    return false;
  }
  return !cells.some((cell) => cell !== '');
}
/** Build a `| --- | --- |` delimiter row with one cell per cell of `headerLine`. */
function separatorRowFor(headerLine: string): string {
  const columnCount = splitRowCells(headerLine).length;
  const dashes: string[] = [];
  for (let i = 0; i < columnCount; i += 1) {
    dashes.push('---');
  }
  return '| ' + dashes.join(' | ') + ' |';
}
/**
 * Clean up one run of consecutive table rows:
 *  1. drop leading all-blank rows,
 *  2. drop one delimiter row if it is now first,
 *  3. if at least two rows remain and the second is not a delimiter row,
 *     insert a generated delimiter row after the first (header) row.
 * The input array is not modified.
 */
function normalizeTableBlock(block: string[]): string[] {
  let start = 0;
  while (start < block.length && isEmptyRow(block[start])) {
    start += 1;
  }
  if (start < block.length && isSeparatorRow(block[start])) {
    start += 1;
  }

  const rows = block.slice(start);
  if (rows.length < 2 || isSeparatorRow(rows[1])) {
    return rows;
  }

  const [header, ...body] = rows;
  return [header, separatorRowFor(header), ...body];
}
/**
 * Fix MarkItDown-style tables (blank row + delimiter row emitted before the
 * real rows). Every maximal run of consecutive table rows is passed through
 * `normalizeTableBlock`; all other lines are copied through unchanged.
 */
export function normalizeSpuriousTablePrefixes(content: string): string {
  const result: string[] = [];
  let pending: string[] = [];

  // Emit the table rows collected so far (if any) in normalized form.
  const flushPending = (): void => {
    if (pending.length === 0) {
      return;
    }
    result.push(...normalizeTableBlock(pending));
    pending = [];
  };

  for (const line of content.split('\n')) {
    if (isTableRow(line)) {
      pending.push(line);
      continue;
    }
    flushPending();
    result.push(line);
  }
  flushPending();

  return result.join('\n');
}

View File

@@ -311,6 +311,60 @@
/>
</div>
</section>
<!-- Section 3 paddleocr_vl 自建配置 -->
<section v-if="currentEngine.Name === 'paddleocr_vl'" class="setting-drawer__section">
<h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
<div class="form-item">
<label class="form-label required">{{ t('settings.parser.selfHostedEndpoint') }}</label>
<t-input
v-model="config.paddleocr_vl_endpoint"
:placeholder="$t('settings.parser.paddleocrVlEndpointPlaceholder')"
clearable
/>
<p class="form-desc">{{ $t('settings.parser.paddleocrVlEndpointHint') }}</p>
</div>
<div class="form-item">
<label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
<div class="form-toggles">
<t-checkbox v-model="config.paddleocr_vl_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
<t-checkbox v-model="config.paddleocr_vl_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
</div>
</div>
</section>
<!-- Section 3 paddleocr_vl_cloud API 配置 -->
<section v-if="currentEngine.Name === 'paddleocr_vl_cloud'" class="setting-drawer__section">
<h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
<div class="form-item">
<label class="form-label required">Token</label>
<t-input
v-model="config.paddleocr_vl_cloud_token"
type="password"
:placeholder="$t('settings.parser.paddleocrVlCloudTokenPlaceholder')"
clearable
>
<template #prefix-icon><t-icon name="lock-on" /></template>
</t-input>
</div>
<div class="form-item">
<label class="form-label">Model</label>
<t-input
v-model="config.paddleocr_vl_cloud_model"
placeholder="PaddleOCR-VL-1.6"
clearable
/>
</div>
<div class="form-item">
<label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
<div class="form-toggles">
<t-checkbox v-model="config.paddleocr_vl_cloud_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
<t-checkbox v-model="config.paddleocr_vl_cloud_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
</div>
</div>
</section>
</div>
</SettingDrawer>
</div>
@@ -336,7 +390,7 @@ const { t } = useI18n()
const uiStore = useUIStore()
const authStore = useAuthStore()
const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud'])
const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud', 'paddleocr_vl', 'paddleocr_vl_cloud'])
/** 各解析引擎的项目/官方文档地址 */
const ENGINE_DOC_LINKS: Record<string, string> = {
@@ -344,6 +398,8 @@ const ENGINE_DOC_LINKS: Record<string, string> = {
markitdown: 'https://github.com/microsoft/markitdown',
mineru: 'https://github.com/opendatalab/MinerU',
mineru_cloud: 'https://mineru.net/apiManage/docs',
paddleocr_vl: 'https://github.com/PaddlePaddle/PaddleOCR',
paddleocr_vl_cloud: 'https://aistudio.baidu.com/paddleocr',
}
/** 解析引擎配置默认值(与 DocReader/Python 侧一致) */
@@ -363,6 +419,13 @@ const DEFAULT_PARSER_CONFIG: ParserEngineConfig = {
mineru_cloud_enable_table: true,
mineru_cloud_enable_ocr: true,
mineru_cloud_language: 'ch',
paddleocr_vl_endpoint: '',
paddleocr_vl_use_seal_recognition: true,
paddleocr_vl_use_chart_recognition: false,
paddleocr_vl_cloud_token: '',
paddleocr_vl_cloud_model: 'PaddleOCR-VL-1.6',
paddleocr_vl_cloud_use_seal_recognition: true,
paddleocr_vl_cloud_use_chart_recognition: false,
}
const engines = ref<ParserEngineInfo[]>([])
@@ -407,6 +470,8 @@ const ENGINE_ORDER: Record<string, number> = {
markitdown: 3,
mineru: 4,
mineru_cloud: 5,
paddleocr_vl: 6,
paddleocr_vl_cloud: 7,
}
const sortedEngines = computed(() => {
@@ -491,6 +556,13 @@ async function loadConfig() {
mineru_cloud_enable_table: data?.mineru_cloud_enable_table ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_table ?? true,
mineru_cloud_enable_ocr: data?.mineru_cloud_enable_ocr ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_ocr ?? true,
mineru_cloud_language: data?.mineru_cloud_language ?? DEFAULT_PARSER_CONFIG.mineru_cloud_language ?? 'ch',
paddleocr_vl_endpoint: data?.paddleocr_vl_endpoint ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_endpoint ?? '',
paddleocr_vl_use_seal_recognition: data?.paddleocr_vl_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_seal_recognition ?? true,
paddleocr_vl_use_chart_recognition: data?.paddleocr_vl_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_chart_recognition ?? false,
paddleocr_vl_cloud_token: data?.paddleocr_vl_cloud_token ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_token ?? '',
paddleocr_vl_cloud_model: data?.paddleocr_vl_cloud_model ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_model ?? 'PaddleOCR-VL-1.6',
paddleocr_vl_cloud_use_seal_recognition: data?.paddleocr_vl_cloud_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_seal_recognition ?? true,
paddleocr_vl_cloud_use_chart_recognition: data?.paddleocr_vl_cloud_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_chart_recognition ?? false,
}
} catch {
config.value = { ...DEFAULT_PARSER_CONFIG }
@@ -521,6 +593,13 @@ function buildConfigPayload(): ParserEngineConfig {
mineru_cloud_enable_table: config.value.mineru_cloud_enable_table,
mineru_cloud_enable_ocr: config.value.mineru_cloud_enable_ocr,
mineru_cloud_language: config.value.mineru_cloud_language?.trim() ?? '',
paddleocr_vl_endpoint: config.value.paddleocr_vl_endpoint?.trim() ?? '',
paddleocr_vl_use_seal_recognition: config.value.paddleocr_vl_use_seal_recognition,
paddleocr_vl_use_chart_recognition: config.value.paddleocr_vl_use_chart_recognition,
paddleocr_vl_cloud_token: config.value.paddleocr_vl_cloud_token?.trim() ?? '',
paddleocr_vl_cloud_model: config.value.paddleocr_vl_cloud_model?.trim() ?? '',
paddleocr_vl_cloud_use_seal_recognition: config.value.paddleocr_vl_cloud_use_seal_recognition,
paddleocr_vl_cloud_use_chart_recognition: config.value.paddleocr_vl_cloud_use_chart_recognition,
}
}
@@ -739,7 +818,9 @@ onMounted(loadAll)
color: #0089FF;
}
.engine-card--mineru .engine-card__badge,
.engine-card--mineru_cloud .engine-card__badge {
.engine-card--mineru_cloud .engine-card__badge,
.engine-card--paddleocr_vl .engine-card__badge,
.engine-card--paddleocr_vl_cloud .engine-card__badge {
background: rgba(98, 53, 187, 0.12);
color: #6235BB;
}
@@ -1086,7 +1167,9 @@ onMounted(loadAll)
color: #0089FF;
}
.parser-engine-drawer--mineru .setting-drawer__header-icon,
.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon {
.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon,
.parser-engine-drawer--paddleocr_vl .setting-drawer__header-icon,
.parser-engine-drawer--paddleocr_vl_cloud .setting-drawer__header-icon {
background: rgba(98, 53, 187, 0.12);
color: #6235BB;
}

View File

@@ -3256,6 +3256,10 @@ func (s *knowledgeService) resolveDocReader(ctx context.Context, engine, fileTyp
return docparser.NewMinerUReader(overrides)
case "mineru_cloud":
return docparser.NewMinerUCloudReader(overrides)
case "paddleocr_vl":
return docparser.NewPaddleOCRVLReader(overrides)
case "paddleocr_vl_cloud":
return docparser.NewPaddleOCRVLCloudReader(overrides)
case "builtin":
// 明确指定使用 builtin 引擎docreader不使用 simple format 兜底
return s.documentReader

View File

@@ -87,6 +87,10 @@ type settingSpec struct {
// Description is shown in the UI under the key. Stored on the row
// at first write (mirrors Category).
Description string
// RequiresRestart marks keys whose value is bound at process startup
// (e.g. asynq worker pool size). The UI shows a restart badge; the
// service persists the flag on first write.
RequiresRestart bool
}
// registry pins the set of legal keys. Expanding it is a deliberate,
@@ -161,6 +165,20 @@ var registry = map[string]settingSpec{
"仅在创建时读取,修改后只对之后新建的租户生效,不会回写已存在的租户。" +
"0 或负数表示使用内置默认值 10GB。",
},
// asynq.concurrency is the asynq worker pool size (parallel in-flight
// tasks). Read once when the asynq server starts — changing it in the
// UI requires a process restart to take effect. Mirrors
// WEKNORA_ASYNQ_CONCURRENCY (default 16).
"asynq.concurrency": {
Type: "int",
EnvName: "WEKNORA_ASYNQ_CONCURRENCY",
Default: int64(16),
Category: "worker",
RequiresRestart: true,
Description: "异步任务 worker 并发数asynq 线程池大小)。" +
"文档解析、嵌入等任务多为 I/O 等待,适当提高可缩短批量上传排队时间。" +
"修改后需重启服务进程方可生效。",
},
}
// systemSettingService wires the repository, audit log, and (P2)
@@ -655,7 +673,7 @@ func (s *systemSettingService) virtualSetting(key string, spec settingSpec) *typ
Category: category,
Description: spec.Description,
IsSecret: false,
RequiresRestart: false,
RequiresRestart: spec.RequiresRestart,
LastModifiedBy: "",
Enum: spec.Enum,
}
@@ -817,6 +835,7 @@ func (s *systemSettingService) Update(ctx context.Context, key string, rawValue
category = "general"
}
description = spec.Description
requiresRestart = spec.RequiresRestart
}
row := &types.SystemSetting{
@@ -1142,6 +1161,14 @@ func encodeForType(declared string, rawValue any) (types.JSON, error) {
// 400 body verbatim).
func validateRegistryEntry(key string, rawValue any) error {
switch key {
case "asynq.concurrency":
n, err := coerceToPositiveInt64(rawValue)
if err != nil {
return err
}
if n <= 0 {
return errors.New("concurrency must be a positive integer")
}
case "ssrf.whitelist":
// Coerce into the same shape encodeForType produced. We don't
// look at the encoded JSON because that's already canonicalised
@@ -1155,6 +1182,23 @@ func validateRegistryEntry(key string, rawValue any) error {
return nil
}
// coerceToPositiveInt64 converts a decoded setting value to int64.
//
// Accepted dynamic types are int, int64 and float64 (the shape JSON numbers
// take after decoding into `any`). A float64 is accepted only when it is a
// whole number that fits in an int64. Any other type yields an error.
//
// Despite the name, the sign is not checked here; callers reject
// non-positive results themselves.
func coerceToPositiveInt64(rawValue any) (int64, error) {
	// 2^63 as a float64: the smallest magnitude that no longer fits in int64.
	const int64Limit = float64(1 << 63)

	switch v := rawValue.(type) {
	case int:
		return int64(v), nil
	case int64:
		return v, nil
	case float64:
		// Range-check before converting: float->int conversion of an
		// out-of-range value is implementation-dependent in Go, so the
		// round-trip comparison alone is not reliable for huge inputs.
		// NaN fails the round-trip comparison and is rejected as well.
		if v >= int64Limit || v < -int64Limit || v != float64(int64(v)) {
			return 0, errors.New("expected integer value")
		}
		return int64(v), nil
	default:
		return 0, fmt.Errorf("expected integer, got %T", rawValue)
	}
}
// coerceToStringSlice mirrors the input shapes accepted by
// encodeForType for "string_list": []any of strings, []string, or a
// comma-separated string. Returns the trimmed, empty-stripped result.

View File

@@ -36,12 +36,22 @@ var defaultHeaderHooks = []headerTrackerHook{
// tableRowPattern matches a single Markdown table row: "| cell | cell | ... |\n"
var tableRowPattern = regexp.MustCompile(`(?m)^\s*(?:\|[^|\n]*)+\|\s*$`)
// markdownTableHookPriority matches DEFAULT_CONFIGS / defaultHeaderHooks table hook.
const markdownTableHookPriority = 15
// headerTracker maintains the state of active headers across split units.
// update() is called once per unit; it resets headerEndedThisUnit first and
// then re-evaluates the fields below against the unit's text.
type headerTracker struct {
// hooks are the header start/end detectors iterated by update().
hooks []headerTrackerHook
activeHeaders map[int]string // priority -> header text
endedHeaders map[int]bool // priorities that have been ended
pendingExtend map[int]bool // headers with empty column names awaiting first data row
// pendingTableBreak is set when a table row unit ends with a paragraph break
// (the blank line between tables is consumed by \n\n splitting). The header
// stays active until the next unit is seen so we can detect a new table.
pendingTableBreak bool
// headerEndedThisUnit tells mergeUnits to flush before the current unit when a
// new table starts (column mismatch or pendingTableBreak + table row).
headerEndedThisUnit bool
}
func newHeaderTracker() *headerTracker {
@@ -55,6 +65,20 @@ func newHeaderTracker() *headerTracker {
// update checks split text for header start/end markers and updates internal state.
func (ht *headerTracker) update(split string) {
ht.headerEndedThisUnit = false
if ht.pendingTableBreak {
ht.pendingTableBreak = false
if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
if firstTableRowColumnCount(split) > 0 {
ht.clearTableHeader()
ht.headerEndedThisUnit = true
} else {
ht.clearTableHeader()
}
}
}
// 1. Check for header-end markers among currently active headers
for _, hook := range ht.hooks {
if _, active := ht.activeHeaders[hook.priority]; active {
@@ -66,6 +90,19 @@ func (ht *headerTracker) update(split string) {
}
}
// 1b. Paragraph splits consume the blank line between tables. Mark a break
// after "| last row |\n\n" and resolve on the next unit; also end when a new
// table row has a different column count than the active header.
if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
if !ht.pendingExtend[markdownTableHookPriority] {
if splitEndsWithParagraphBreak(split) {
ht.pendingTableBreak = true
} else {
ht.endTableHeaderOnColumnMismatch(split)
}
}
}
// 2. If a header has an empty column-name row (e.g. "||"), replace it with
// a proper Markdown table header using the first data row as column names.
//
@@ -159,3 +196,73 @@ func extractSeparatorLine(header string) string {
}
return ""
}
// clearTableHeader ends the markdown table header: the table priority is
// marked as ended and removed from both the active and pending-extend sets.
func (ht *headerTracker) clearTableHeader() {
	const prio = markdownTableHookPriority

	ht.endedHeaders[prio] = true
	delete(ht.activeHeaders, prio)
	delete(ht.pendingExtend, prio)
}
// endTableHeaderOnColumnMismatch clears the active table header and raises
// headerEndedThisUnit when the first table row found in split has a column
// count different from the active header's. It is a no-op when no table
// header is active or when either column count cannot be determined.
func (ht *headerTracker) endTableHeaderOnColumnMismatch(split string) {
	activeHeader, present := ht.activeHeaders[markdownTableHookPriority]
	if !present {
		return
	}

	incoming := firstTableRowColumnCount(split)
	expected := headerTableColumnCount(activeHeader)
	if incoming <= 0 || expected <= 0 || incoming == expected {
		return
	}

	ht.clearTableHeader()
	ht.headerEndedThisUnit = true
}
// splitEndsWithParagraphBreak reports whether split, ignoring trailing
// spaces, tabs and carriage returns, ends with a blank line ("\n\n" or
// "\r\n\r\n").
func splitEndsWithParagraphBreak(split string) bool {
	body := strings.TrimRight(split, " \t\r")
	for _, ending := range []string{"\n\n", "\r\n\r\n"} {
		if strings.HasSuffix(body, ending) {
			return true
		}
	}
	return false
}
// tableRowColumnCount returns the number of cells in a Markdown table row.
//
// It returns 0 when the trimmed line does not start with "|". The empty
// fragments produced by the leading and trailing pipe are not counted.
// A backslash escapes the following byte, so an escaped pipe ("\|") is part
// of its cell rather than a column separator; otherwise a header or row
// containing "\|" would be over-counted and trigger a false column mismatch.
func tableRowColumnCount(line string) int {
	line = strings.TrimSpace(line)
	if !strings.HasPrefix(line, "|") {
		return 0
	}

	// Byte-wise scan is safe: '|' and '\\' are ASCII and never occur inside
	// a multi-byte UTF-8 sequence.
	var parts []string
	start := 0
	escaped := false
	for i := 0; i < len(line); i++ {
		switch {
		case escaped:
			escaped = false
		case line[i] == '\\':
			escaped = true
		case line[i] == '|':
			parts = append(parts, line[start:i])
			start = i + 1
		}
	}
	parts = append(parts, line[start:])

	if len(parts) > 0 && strings.TrimSpace(parts[0]) == "" {
		parts = parts[1:]
	}
	if len(parts) > 0 && strings.TrimSpace(parts[len(parts)-1]) == "" {
		parts = parts[:len(parts)-1]
	}
	return len(parts)
}
// firstTableRowColumnCount scans text line by line and returns the column
// count of the first non-empty line matching tableRowPattern, or 0 when no
// line matches.
func firstTableRowColumnCount(text string) int {
	lines := strings.Split(text, "\n")
	for idx := range lines {
		candidate := strings.TrimSpace(lines[idx])
		if candidate == "" || !tableRowPattern.MatchString(candidate) {
			continue
		}
		return tableRowColumnCount(candidate)
	}
	return 0
}
// headerTableColumnCount returns the column count of the first line of
// header that yields a positive count, skipping blank lines and any line
// containing "---" (the separator row). It returns 0 when none qualifies.
func headerTableColumnCount(header string) int {
	for _, raw := range strings.Split(header, "\n") {
		row := strings.TrimSpace(raw)
		skip := row == "" || strings.Contains(row, "---")
		if skip {
			continue
		}
		cols := tableRowColumnCount(row)
		if cols > 0 {
			return cols
		}
	}
	return 0
}
// headerColumnMismatch reports whether nextUnit begins a table whose column
// count differs from that of the active markdown table header in headers.
// It is false whenever either count cannot be determined.
func headerColumnMismatch(headers, nextUnit string) bool {
	expected := headerTableColumnCount(headers)
	actual := firstTableRowColumnCount(nextUnit)
	if expected <= 0 || actual <= 0 {
		return false
	}
	return expected != actual
}

View File

@@ -450,6 +450,13 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {
// Update header tracking
ht.update(u.text)
// Flush at table boundary so the next table is not merged into a chunk
// that still carries the previous table's prepended header context.
if ht.headerEndedThisUnit && len(current) > 0 {
chunks = append(chunks, buildChunk(current, len(chunks)))
current = nil
curLen = 0
}
headers := ht.getHeaders()
headersLen := runeLen(headers)
if headersLen > chunkSize {
@@ -475,7 +482,8 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {
// Prepend headers if the column-name context is not already present
// in the overlap or the next unit being added.
overlapText := unitsText(current)
if !headerAlreadyPresent(headers, overlapText, u.text) {
if !headerAlreadyPresent(headers, overlapText, u.text) &&
!headerColumnMismatch(headers, u.text) {
startPos := u.start
if len(current) > 0 {
startPos = current[0].start

View File

@@ -674,6 +674,66 @@ func TestSplitText_EmptyHeaderRowPrepend(t *testing.T) {
}
}
// TestHeaderTracker_ColumnMismatchEndsTable checks that a row with a
// different column count ends the currently active table header.
func TestHeaderTracker_ColumnMismatchEndsTable(t *testing.T) {
	tracker := newHeaderTracker()

	// A four-column header + separator activates the table header.
	tracker.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")
	if tracker.getHeaders() == "" {
		t.Fatal("expected active table header")
	}

	// A two-column row cannot belong to the four-column table.
	tracker.update("| Sinple | Table |\n")
	remaining := tracker.getHeaders()
	if remaining != "" {
		t.Fatalf("2-col row should end 4-col table header, still active:\n%s", remaining)
	}
}
// TestHeaderTracker_ParagraphBreakEndsOnNextUnit checks the two-step
// handling of a paragraph break after a table row: the break is only
// recorded at first, and the header is cleared (with a flush signal) once
// the following unit turns out to be a table row.
func TestHeaderTracker_ParagraphBreakEndsOnNextUnit(t *testing.T) {
	tracker := newHeaderTracker()
	tracker.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")

	// Step 1: the last row ends with a blank line; header must survive.
	tracker.update("| Russell Wilson | Football | High | Tacky uniform |\n\n")
	if tracker.getHeaders() == "" {
		t.Fatal("paragraph break alone should not clear header yet")
	}
	if !tracker.pendingTableBreak {
		t.Fatal("expected pendingTableBreak after row ending with \\n\\n")
	}

	// Step 2: the next unit starts a new table.
	tracker.update("| Sinple | Table |\n")
	if remaining := tracker.getHeaders(); remaining != "" {
		t.Fatalf("next table row should clear previous header, got %q", remaining)
	}
	if !tracker.headerEndedThisUnit {
		t.Fatal("expected flush signal when new table starts after paragraph break")
	}
}
// TestSplitText_EnTablesNoCrossTableHeader splits a document with three
// consecutive tables and checks that no chunk containing rows of the second
// or third table also carries the first table's header row.
func TestSplitText_EnTablesNoCrossTableHeader(t *testing.T) {
	const heading = "## A table, with and without a header row\n\n"
	const firstTable = "| Name | Game | Fame | Blame |\n" +
		"| --- | --- | --- | --- |\n" +
		"| Lebron James | Basketball | Very High | Leaving Cleveland |\n" +
		"| Ryan Braun | Baseball | Moderate | Steroids |\n" +
		"| Russell Wilson | Football | High | Tacky uniform |\n\n"
	const secondTable = "| Sinple | Table |\n" +
		"| Without | Header |\n\n"
	const thirdTable = "| Simple Multiparagraph | Table Full |\n" +
		"| Of Paragraphs | In each Cell. |\n"
	text := heading + firstTable + secondTable + thirdTable

	cfg := SplitterConfig{ChunkSize: 200, ChunkOverlap: 20, Separators: []string{"\n\n", "\n", "。"}}
	chunks := SplitText(text, cfg)
	if len(chunks) < 2 {
		t.Fatalf("expected multiple chunks, got %d", len(chunks))
	}

	for i, c := range chunks {
		fromLaterTable := strings.Contains(c.Content, "| Sinple | Table |") ||
			strings.Contains(c.Content, "| Simple Multiparagraph |")
		if !fromLaterTable {
			continue
		}
		if strings.Contains(c.Content, "| Name | Game | Fame | Blame |") {
			t.Errorf("chunk[%d] must not carry table-1 header into later tables:\n%s", i, c.Content)
		}
	}
}
func TestSplitText_MultipleTablesInDocument(t *testing.T) {
text := "" +
"第一个表格:\n\n" +

View File

@@ -30,6 +30,8 @@ func init() {
RegisterEngine(&weKnoraCloudEngine{})
RegisterEngine(&mineruEngine{})
RegisterEngine(&mineruCloudEngine{})
RegisterEngine(&paddleOCRVLEngine{})
RegisterEngine(&paddleOCRVLCloudEngine{})
}
// ---------------------------------------------------------------------------
@@ -133,6 +135,44 @@ func (e *mineruCloudEngine) CheckAvailable(_ bool, overrides map[string]string)
return PingMinerUCloud(apiKey)
}
// ---------------------------------------------------------------------------
// paddleocr_vl — Go-native, calls a self-hosted PaddleOCR-VL pipeline service
// ---------------------------------------------------------------------------
// paddleOCRVLEngine is the registry entry for the self-hosted PaddleOCR-VL
// service. It carries no state; availability is derived from the overrides
// passed to CheckAvailable.
type paddleOCRVLEngine struct{}

// Name returns the engine identifier.
func (e *paddleOCRVLEngine) Name() string {
	return "paddleocr_vl"
}

// Description returns a short human-readable label.
func (e *paddleOCRVLEngine) Description() string {
	return "PaddleOCR-VL self-hosted service"
}

// FileTypes lists the file extensions this engine handles; the flag is ignored.
func (e *paddleOCRVLEngine) FileTypes(_ bool) []string {
	supported := []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
	return supported
}

// CheckAvailable reports unavailable when no endpoint override is set;
// otherwise it returns the result of pinging that endpoint.
func (e *paddleOCRVLEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
	if endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"]); endpoint != "" {
		return PingPaddleOCRVL(endpoint)
	}
	return false, "PaddleOCR-VL service not configured"
}
// ---------------------------------------------------------------------------
// paddleocr_vl_cloud — Go-native, calls the PaddleOCR-VL AI Studio cloud API
// ---------------------------------------------------------------------------
// paddleOCRVLCloudEngine is the registry entry for the PaddleOCR-VL cloud
// API. It carries no state; availability is derived from the overrides
// passed to CheckAvailable.
type paddleOCRVLCloudEngine struct{}

// Name returns the engine identifier.
func (e *paddleOCRVLCloudEngine) Name() string {
	return "paddleocr_vl_cloud"
}

// Description returns a short human-readable label.
func (e *paddleOCRVLCloudEngine) Description() string {
	return "PaddleOCR-VL Cloud API"
}

// FileTypes lists the file extensions this engine handles; the flag is ignored.
func (e *paddleOCRVLCloudEngine) FileTypes(_ bool) []string {
	supported := []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
	return supported
}

// CheckAvailable reports unavailable when no token override is set;
// otherwise it returns the result of pinging the cloud API with that token.
func (e *paddleOCRVLCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
	if token := strings.TrimSpace(overrides["paddleocr_vl_cloud_token"]); token != "" {
		return PingPaddleOCRVLCloud(token)
	}
	return false, "PaddleOCR-VL Cloud Token not configured"
}
// ---------------------------------------------------------------------------
// ListAllEngines — merge local + remote
// ---------------------------------------------------------------------------

View File

@@ -0,0 +1,353 @@
package docparser
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime"
"mime/multipart"
"net/http"
"path/filepath"
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/utils"
)
// Defaults and timing values for the PaddleOCR-VL cloud reader.
const (
// Fallback jobs endpoint handed to stringOr when building the reader's baseURL.
paddleOCRVLCloudDefaultBaseURL = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
// Fallback model name handed to stringOr when building the reader's model.
paddleOCRVLCloudDefaultModel = "PaddleOCR-VL-1.6"
// NOTE(review): presumably the delay between job status polls — the polling
// loop is not visible from here; confirm against pollJob.
paddleOCRVLCloudPollInterval = 5 * time.Second
// pollJob computes its deadline as time.Now() plus this duration.
paddleOCRVLCloudTimeout = 600 * time.Second
)
// PaddleOCRVLCloudReader calls the PaddleOCR-VL AI Studio cloud API.
// Flow: POST /jobs (multipart) → poll GET /jobs/{id} → download result JSONL,
// then fetch each referenced image URL.
type PaddleOCRVLCloudReader struct {
// token is sent in the Authorization header on submit; Read returns a
// "not configured" result when it is empty.
token string
// baseURL is the jobs endpoint with trailing slashes removed; pollJob
// appends "/" + jobID to it.
baseURL string
// model is sent as the "model" multipart field on submit.
model string
// useSeal and useChart are passed to paddleOCRVLRecognitionParams to build
// the optional payload.
useSeal bool
useChart bool
}
// NewPaddleOCRVLCloudReader builds a reader from ParserEngineOverrides.
// The token is whitespace-trimmed, the base URL loses trailing slashes, and
// the base URL, model and recognition flags are resolved through stringOr /
// parseBoolOr with their respective fallbacks.
func NewPaddleOCRVLCloudReader(overrides map[string]string) *PaddleOCRVLCloudReader {
	reader := new(PaddleOCRVLCloudReader)

	reader.token = strings.TrimSpace(overrides["paddleocr_vl_cloud_token"])

	endpoint := stringOr(overrides["paddleocr_vl_cloud_base_url"], paddleOCRVLCloudDefaultBaseURL)
	reader.baseURL = strings.TrimRight(endpoint, "/")

	reader.model = stringOr(overrides["paddleocr_vl_cloud_model"], paddleOCRVLCloudDefaultModel)
	reader.useSeal = parseBoolOr(overrides["paddleocr_vl_cloud_use_seal_recognition"], true)
	reader.useChart = parseBoolOr(overrides["paddleocr_vl_cloud_use_chart_recognition"], false)

	return reader
}
// Read parses req.FileContent through the PaddleOCR-VL cloud API and returns
// the resulting markdown together with the downloaded image references.
//
// A missing token or empty file content is reported through
// ReadResult.Error with a nil error. Failures while submitting, polling or
// fetching results are returned as wrapped errors with a nil result.
func (c *PaddleOCRVLCloudReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
	if c.token == "" {
		return &types.ReadResult{Error: "PaddleOCR-VL Cloud token is not configured"}, nil
	}

	content := req.FileContent
	if len(content) == 0 {
		return &types.ReadResult{Error: "no file content provided"}, nil
	}

	// Log with the caller's context instead of context.Background() so that
	// any request-scoped values the logger reads from it are not dropped.
	logger.Infof(ctx, "[PaddleOCR-VL Cloud] Parsing file=%s size=%d model=%s",
		req.FileName, len(content), c.model)

	jobID, err := c.submitJob(ctx, req, content)
	if err != nil {
		return nil, fmt.Errorf("PaddleOCR-VL Cloud submit: %w", err)
	}

	jsonlURL, err := c.pollJob(ctx, jobID)
	if err != nil {
		return nil, fmt.Errorf("PaddleOCR-VL Cloud poll: %w", err)
	}

	mdContent, imagesURL, err := c.fetchResults(jsonlURL)
	if err != nil {
		return nil, fmt.Errorf("PaddleOCR-VL Cloud fetch results: %w", err)
	}

	imageRefs := c.downloadImages(mdContent, imagesURL)
	mdContent, imageRefs = ensureOriginalImageRef(req, mdContent, imageRefs)

	logger.Infof(ctx, "[PaddleOCR-VL Cloud] Parsed successfully, markdown=%d chars, images=%d",
		len(mdContent), len(imageRefs))

	return &types.ReadResult{
		MarkdownContent: mdContent,
		ImageRefs:       imageRefs,
	}, nil
}
// optionalPayload returns the recognition options for a job, built by
// paddleOCRVLRecognitionParams from the reader's seal/chart flags.
func (c *PaddleOCRVLCloudReader) optionalPayload() map[string]interface{} {
	// Shared with the self-hosted engine so both produce identical output.
	seal, chart := c.useSeal, c.useChart
	return paddleOCRVLRecognitionParams(seal, chart)
}
// --- job submit ---
// paddleOCRVLCloudSubmitResponse is the JSON body decoded from the job
// submit call. submitJob requires Data.JobID to be non-empty.
type paddleOCRVLCloudSubmitResponse struct {
Data struct {
JobID string `json:"jobId"`
} `json:"data"`
// ErrorCode and ErrorMsg are decoded from the top-level "errorCode" and
// "errorMsg" keys of the response.
ErrorCode int `json:"errorCode"`
ErrorMsg string `json:"errorMsg"`
}
// submitJob uploads the document to the jobs endpoint as a multipart form
// (fields "model", "optionalPayload" and the file itself) and returns the
// job ID from the response.
//
// When req.FileName is empty a placeholder name "document.<ext>" is used,
// where <ext> comes from req.FileType and defaults to "pdf". Only the base
// name of the file is sent.
//
// An error is returned when the form cannot be built, the request fails,
// the response cannot be read or decoded, the status is not 200, or the
// response carries no job ID.
func (c *PaddleOCRVLCloudReader) submitJob(ctx context.Context, req *types.ReadRequest, content []byte) (string, error) {
	optional, err := json.Marshal(c.optionalPayload())
	if err != nil {
		return "", fmt.Errorf("marshal optionalPayload: %w", err)
	}

	fileName := req.FileName
	if fileName == "" {
		ext := strings.TrimPrefix(req.FileType, ".")
		if ext == "" {
			ext = "pdf"
		}
		fileName = "document." + ext
	}

	var body bytes.Buffer
	writer := multipart.NewWriter(&body)
	if err := writer.WriteField("model", c.model); err != nil {
		return "", fmt.Errorf("write model field: %w", err)
	}
	if err := writer.WriteField("optionalPayload", string(optional)); err != nil {
		return "", fmt.Errorf("write optionalPayload field: %w", err)
	}
	part, err := writer.CreateFormFile("file", filepath.Base(fileName))
	if err != nil {
		return "", fmt.Errorf("create form file: %w", err)
	}
	if _, err := part.Write(content); err != nil {
		return "", fmt.Errorf("write file content: %w", err)
	}
	// Close writes the terminating boundary; without it the body is invalid.
	if err := writer.Close(); err != nil {
		return "", fmt.Errorf("finalize multipart body: %w", err)
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL, &body)
	if err != nil {
		return "", fmt.Errorf("create request: %w", err)
	}
	httpReq.Header.Set("Authorization", "bearer "+c.token)
	httpReq.Header.Set("Content-Type", writer.FormDataContentType())

	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})
	resp, err := client.Do(httpReq)
	if err != nil {
		return "", fmt.Errorf("HTTP request: %w", err)
	}
	defer resp.Body.Close()

	// Surface read failures instead of decoding a truncated body.
	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read response (status %d): %w", resp.StatusCode, err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("API status %d: %s", resp.StatusCode, string(respBody))
	}

	var result paddleOCRVLCloudSubmitResponse
	if err := json.Unmarshal(respBody, &result); err != nil {
		return "", fmt.Errorf("decode response: %w", err)
	}
	if result.Data.JobID == "" {
		// Prefer the API's own error fields when it supplied them.
		if result.ErrorMsg != "" {
			return "", fmt.Errorf("API error %d: %s", result.ErrorCode, result.ErrorMsg)
		}
		return "", fmt.Errorf("API returned no jobId: %s", string(respBody))
	}

	// Log with the caller's context so request-scoped values are kept.
	logger.Infof(ctx, "[PaddleOCR-VL Cloud] job submitted: jobId=%s", result.Data.JobID)
	return result.Data.JobID, nil
}
// --- polling ---
// paddleOCRVLCloudPollResponse is the JSON body decoded from a job-status
// poll. Only the fields pollJob reads are declared.
type paddleOCRVLCloudPollResponse struct {
	Data struct {
		// State is compared case-insensitively by pollJob; "done" and
		// "failed" are the two values it treats as terminal.
		State string `json:"state"`
		// ErrorMsg is included in the returned error when State is "failed".
		ErrorMsg string `json:"errorMsg"`
		// ExtractProgress is only used for progress logging.
		ExtractProgress struct {
			TotalPages int `json:"totalPages"`
			ExtractedPages int `json:"extractedPages"`
		} `json:"extractProgress"`
		ResultURL struct {
			// JSONURL is the download location handed to fetchResults once
			// the job is done.
			JSONURL string `json:"jsonUrl"`
		} `json:"resultUrl"`
	} `json:"data"`
}
// pollJob repeatedly queries {baseURL}/{jobID} until the job reaches a
// terminal state, the caller's context ends, or paddleOCRVLCloudTimeout
// elapses. On state "done" it returns the URL of the JSONL result file.
//
// Transport errors, non-200 responses and unreadable or undecodable bodies
// are treated as transient: they are logged and the poll is retried after
// paddleOCRVLCloudPollInterval. A "failed" state, a "done" state without a
// result URL, context cancellation and the overall timeout are returned as
// errors.
func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (string, error) {
	deadline := time.Now().Add(paddleOCRVLCloudTimeout)
	pollCount := 0
	url := c.baseURL + "/" + jobID
	// The client does not depend on the iteration; build it once so polls can
	// share it instead of constructing a new client every round.
	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 30 * time.Second, MaxRedirects: 5})

	for time.Now().Before(deadline) {
		// Stop as soon as the caller gives up. Without this check a cancelled
		// context makes every request fail and the loop would keep retrying
		// (and logging) until the deadline.
		if err := ctx.Err(); err != nil {
			return "", fmt.Errorf("polling aborted after %d polls: %w", pollCount, err)
		}
		pollCount++

		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return "", fmt.Errorf("create poll request: %w", err)
		}
		httpReq.Header.Set("Authorization", "bearer "+c.token)

		resp, err := client.Do(httpReq)
		if err != nil {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d failed: %v", pollCount, err)
			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
			continue
		}
		respBody, readErr := io.ReadAll(resp.Body)
		resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d status %d: %s", pollCount, resp.StatusCode, string(respBody))
			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
			continue
		}
		if readErr != nil {
			// A partially read body cannot be trusted; retry instead of
			// reporting it as a JSON decode problem.
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d read body error: %v", pollCount, readErr)
			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
			continue
		}

		var pollResp paddleOCRVLCloudPollResponse
		if err := json.Unmarshal(respBody, &pollResp); err != nil {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d decode error: %v", pollCount, err)
			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
			continue
		}

		state := strings.ToLower(pollResp.Data.State)
		// Log the first poll, every sixth poll and every terminal state to
		// keep long-running jobs visible without flooding the log.
		if pollCount == 1 || pollCount%6 == 0 || state == "done" || state == "failed" {
			logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] poll #%d: state=%s pages=%d/%d",
				pollCount, state, pollResp.Data.ExtractProgress.ExtractedPages, pollResp.Data.ExtractProgress.TotalPages)
		}

		switch state {
		case "done":
			if pollResp.Data.ResultURL.JSONURL == "" {
				return "", fmt.Errorf("state=done but no jsonUrl")
			}
			return pollResp.Data.ResultURL.JSONURL, nil
		case "failed":
			return "", fmt.Errorf("task failed: %s", pollResp.Data.ErrorMsg)
		}
		sleepCtx(ctx, paddleOCRVLCloudPollInterval)
	}
	return "", fmt.Errorf("task timed out after %d polls", pollCount)
}
// --- result parsing ---
// paddleOCRVLCloudResultLine is the shape decoded from each non-empty line of
// the downloaded JSONL result file.
type paddleOCRVLCloudResultLine struct {
	Result struct {
		// LayoutParsingResults holds the parsed entries carried by this line;
		// fetchResults concatenates their markdown in order.
		LayoutParsingResults []struct {
			Markdown struct {
				// Text is the markdown for this entry.
				Text string `json:"text"`
				// Images maps an image path to the URL it is downloaded from
				// (see downloadImages).
				Images map[string]string `json:"images"`
			} `json:"markdown"`
		} `json:"layoutParsingResults"`
	} `json:"result"`
}
// fetchResults downloads the JSONL result file at jsonlURL and merges it into
// a single markdown document plus an image path -> URL table.
//
// Each non-empty line is decoded independently; malformed lines are logged
// and skipped. Markdown entries whose text is blank after trimming are
// dropped, the rest are joined with a blank line. When the same image path
// appears more than once, the first URL seen is kept.
//
// It returns an error when the URL fails SSRF validation, the download fails,
// the status is not 200, or the body cannot be read.
func (c *PaddleOCRVLCloudReader) fetchResults(jsonlURL string) (string, map[string]string, error) {
	if err := utils.ValidateURLForSSRF(jsonlURL); err != nil {
		// %w (not %v) keeps the validation error in the chain for
		// errors.Is / errors.As, like every other error returned here.
		return "", nil, fmt.Errorf("jsonl URL blocked by SSRF check: %w", err)
	}
	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 120 * time.Second, MaxRedirects: 5})
	resp, err := client.Get(jsonlURL)
	if err != nil {
		return "", nil, fmt.Errorf("download jsonl: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("download jsonl status %d", resp.StatusCode)
	}
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", nil, fmt.Errorf("read jsonl body: %w", err)
	}

	texts := make([]string, 0)
	images := make(map[string]string)
	for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		var parsed paddleOCRVLCloudResultLine
		if err := json.Unmarshal([]byte(line), &parsed); err != nil {
			// Best effort: one bad line must not discard the whole document.
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] skip malformed jsonl line: %v", err)
			continue
		}
		for _, p := range parsed.Result.LayoutParsingResults {
			// The trimmed text only decides whether the entry is kept; the
			// original, untrimmed text is what gets appended.
			if t := strings.TrimSpace(p.Markdown.Text); t != "" {
				texts = append(texts, p.Markdown.Text)
			}
			for path, u := range p.Markdown.Images {
				if _, ok := images[path]; !ok {
					images[path] = u
				}
			}
		}
	}
	logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] fetched %d page(s), images=%d", len(texts), len(images))
	return strings.Join(texts, "\n\n"), images, nil
}
// downloadImages downloads the images listed in imagesURL (image path -> URL)
// and returns one ImageRef per reference that mineruImageOriginalRefs finds
// for that path in mdContent.
//
// Paths with no reference are not downloaded. A URL that fails SSRF
// validation, a failed request, a read error or a non-200 status is logged
// and that image is skipped. The MIME type is derived from the path's
// extension and falls back to "image/png".
func (c *PaddleOCRVLCloudReader) downloadImages(mdContent string, imagesURL map[string]string) []types.ImageRef {
	httpClient := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})

	var collected []types.ImageRef
	for imgPath, imgURL := range imagesURL {
		originals := mineruImageOriginalRefs(mdContent, imgPath)
		if len(originals) == 0 {
			continue
		}

		if ssrfErr := utils.ValidateURLForSSRF(imgURL); ssrfErr != nil {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] image URL blocked %s: %v", imgPath, ssrfErr)
			continue
		}

		resp, getErr := httpClient.Get(imgURL)
		if getErr != nil {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] download image %s: %v", imgPath, getErr)
			continue
		}
		payload, readErr := io.ReadAll(resp.Body)
		resp.Body.Close()
		if readErr != nil || resp.StatusCode != http.StatusOK {
			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] read image %s status=%d err=%v", imgPath, resp.StatusCode, readErr)
			continue
		}

		// A missing extension (or a bare trailing dot) is treated as PNG.
		suffix := filepath.Ext(imgPath)
		if suffix == "" || suffix == "." {
			suffix = ".png"
		}
		contentType := mime.TypeByExtension(suffix)
		if contentType == "" {
			contentType = "image/png"
		}

		// The same downloaded bytes back every reference to this path.
		for i := range originals {
			collected = append(collected, types.ImageRef{
				Filename:    imgPath,
				OriginalRef: originals[i],
				MimeType:    contentType,
				ImageData:   payload,
			})
		}
	}
	return collected
}
// PingPaddleOCRVLCloud reports whether the cloud engine is configured. No
// network request is made: the check passes when token contains at least one
// non-whitespace character. On failure the second return value carries the
// user-facing reason; on success it is empty.
func PingPaddleOCRVLCloud(token string) (bool, string) {
	if strings.TrimSpace(token) != "" {
		return true, ""
	}
	return false, "未配置 PaddleOCR-VL Cloud Token"
}

View File

@@ -0,0 +1,282 @@
package docparser
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
"path/filepath"
"strings"
"time"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/utils"
)
const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a while
// PaddleOCRVLReader calls a self-hosted PaddleOCR-VL pipeline service
// (the full document-parsing API, not the bare VLM inference server).
//
// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
// response containing per-page markdown + inline base64 images.
type PaddleOCRVLReader struct {
	// endpoint is the service base URL with trailing slashes removed; an
	// empty value makes Read report a configuration error.
	endpoint string
	// useSeal and useChart are forwarded as the "useSealRecognition" and
	// "useChartRecognition" request fields.
	useSeal bool
	useChart bool
	// useLayout, when false, overrides "useLayoutDetection" to false in the
	// request.
	useLayout bool
}
// NewPaddleOCRVLReader builds a reader from a ParserEngineOverrides map.
//
// Keys read: "paddleocr_vl_endpoint" (trailing slashes are stripped),
// "paddleocr_vl_use_seal_recognition" (fallback true),
// "paddleocr_vl_use_chart_recognition" (fallback false) and
// "paddleocr_vl_use_layout_detection" (fallback true).
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
	reader := new(PaddleOCRVLReader)
	reader.endpoint = strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/")
	reader.useSeal = parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true)
	reader.useChart = parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false)
	reader.useLayout = parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true)
	return reader
}
// Read sends req.FileContent to the self-hosted PaddleOCR-VL service in one
// synchronous call and returns the merged markdown together with the decoded
// inline images.
//
// A missing endpoint or empty file content is reported through
// ReadResult.Error with a nil error; a failed service call is returned as a
// wrapped error with a nil result.
func (c *PaddleOCRVLReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
	switch {
	case c.endpoint == "":
		return &types.ReadResult{Error: "PaddleOCR-VL endpoint is not configured"}, nil
	case len(req.FileContent) == 0:
		return &types.ReadResult{Error: "no file content provided"}, nil
	}
	fileBytes := req.FileContent

	logger.Infof(context.Background(), "[PaddleOCR-VL] Parsing file=%s size=%d via %s",
		req.FileName, len(fileBytes), c.endpoint)

	markdown, inlineImages, callErr := c.callLayoutParsing(ctx, req, fileBytes)
	if callErr != nil {
		return nil, fmt.Errorf("PaddleOCR-VL layout-parsing: %w", callErr)
	}

	// Decode the inline images, then let the shared helper post-process the
	// markdown / image list for this request.
	refs, markdown := c.processImages(markdown, inlineImages)
	markdown, refs = ensureOriginalImageRef(req, markdown, refs)

	logger.Infof(context.Background(), "[PaddleOCR-VL] Parsed successfully, markdown=%d chars, images=%d",
		len(markdown), len(refs))

	result := &types.ReadResult{
		MarkdownContent: markdown,
		ImageRefs:       refs,
	}
	return result, nil
}
// paddleOCRVLRecognitionParams returns a fresh map of the recognition and
// page-restructuring parameters used by both PaddleOCR-VL engines: the
// self-hosted one places them at the top level of the /layout-parsing body,
// the cloud one sends them as optionalPayload. Sharing one definition keeps
// the two requests identical apart from the two caller-controlled switches.
//
// useSeal and useChart set "useSealRecognition" and "useChartRecognition";
// every other value is fixed. Callers may add or override keys on the
// returned map.
func paddleOCRVLRecognitionParams(useSeal, useChart bool) map[string]interface{} {
	params := make(map[string]interface{}, 18)

	// Layout labels left out of the generated markdown.
	params["markdownIgnoreLabels"] = []string{
		"header", "header_image", "footer", "footer_image",
		"number", "footnote", "aside_text",
	}

	// Pre-processing and recognition switches.
	params["useDocOrientationClassify"] = false
	params["useDocUnwarping"] = false
	params["useLayoutDetection"] = true
	params["useChartRecognition"] = useChart
	params["useSealRecognition"] = useSeal
	params["useOcrForImageBlock"] = false

	// Document restructuring.
	params["mergeTables"] = true
	params["relevelTitles"] = true
	params["restructurePages"] = true

	// Layout and prompt settings.
	params["layoutShapeMode"] = "auto"
	params["promptLabel"] = "ocr"
	params["layoutNms"] = true

	// Sampling and resolution settings.
	params["repetitionPenalty"] = 1
	params["temperature"] = 0
	params["topP"] = 1
	params["minPixels"] = 147384
	params["maxPixels"] = 2822400

	return params
}
// fileTypeCode returns the value sent as the PaddleOCR-VL "fileType" field:
// 0 for PDF, 1 for everything else (treated as an image, including TIFF).
//
// The type comes from req.FileType (leading dot removed, case-insensitive);
// when that is empty the extension of req.FileName is used instead.
func fileTypeCode(req *types.ReadRequest) int {
	kind := strings.ToLower(strings.TrimPrefix(req.FileType, "."))
	if kind == "" {
		kind = strings.TrimPrefix(strings.ToLower(filepath.Ext(req.FileName)), ".")
	}
	switch kind {
	case "pdf":
		return 0
	default:
		return 1
	}
}
// paddleOCRVLResponse mirrors the relevant fields of the PaddleX serving
// /layout-parsing response. The service returns one entry per page.
type paddleOCRVLResponse struct {
	// ErrorCode is treated as a failure by callLayoutParsing when non-zero;
	// ErrorMsg is included in the returned error.
	ErrorCode int `json:"errorCode"`
	ErrorMsg string `json:"errorMsg"`
	Result struct {
		LayoutParsingResults []struct {
			Markdown struct {
				// Text is the markdown of this entry.
				Text string `json:"text"`
				// Images maps an image path to its base64 payload (raw, or a
				// data URI — see processImages).
				Images map[string]string `json:"images"`
			} `json:"markdown"`
		} `json:"layoutParsingResults"`
	} `json:"result"`
}
// callLayoutParsing POSTs the document to {endpoint}/layout-parsing and
// merges the per-page results into one markdown string plus an image
// path -> base64 payload table.
//
// The JSON body is the shared recognition parameters plus "file" (base64 of
// content), "fileType" and "visualize": false; "useLayoutDetection" follows
// the reader's useLayout setting. Pages whose markdown is blank after
// trimming are dropped and the rest are joined with a blank line. When an
// image path repeats across pages the first payload is kept.
//
// A response with no pages is logged and yields ("", nil, nil). Transport
// failures, non-200 statuses, undecodable bodies and a non-zero errorCode are
// returned as errors.
func (c *PaddleOCRVLReader) callLayoutParsing(
	ctx context.Context, req *types.ReadRequest, content []byte,
) (string, map[string]string, error) {
	reqFields := paddleOCRVLRecognitionParams(c.useSeal, c.useChart)
	// The shared defaults enable layout detection; mirror the reader setting.
	reqFields["useLayoutDetection"] = c.useLayout
	reqFields["file"] = base64.StdEncoding.EncodeToString(content)
	reqFields["fileType"] = fileTypeCode(req)
	reqFields["visualize"] = false

	encoded, err := json.Marshal(reqFields)
	if err != nil {
		return "", nil, fmt.Errorf("marshal payload: %w", err)
	}

	target := c.endpoint + "/layout-parsing"
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, target, bytes.NewReader(encoded))
	if err != nil {
		return "", nil, fmt.Errorf("create request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")

	httpClient := &http.Client{Timeout: paddleOCRVLTimeout}
	resp, err := httpClient.Do(httpReq)
	if err != nil {
		return "", nil, fmt.Errorf("HTTP request: %w", err)
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", nil, fmt.Errorf("read response body: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("PaddleOCR-VL API status %d: %s", resp.StatusCode, string(raw))
	}

	var decoded paddleOCRVLResponse
	if err := json.Unmarshal(raw, &decoded); err != nil {
		return "", nil, fmt.Errorf("decode response: %w", err)
	}
	if decoded.ErrorCode != 0 {
		return "", nil, fmt.Errorf("PaddleOCR-VL error %d: %s", decoded.ErrorCode, decoded.ErrorMsg)
	}

	pages := decoded.Result.LayoutParsingResults
	if len(pages) == 0 {
		logger.Errorf(context.Background(), "[PaddleOCR-VL] response has no layoutParsingResults")
		return "", nil, nil
	}

	// Fold the per-page markdown and image tables into one document.
	mergedImages := map[string]string{}
	pageTexts := make([]string, 0, len(pages))
	for i := range pages {
		md := &pages[i].Markdown
		if strings.TrimSpace(md.Text) != "" {
			// Keep the original, untrimmed page text.
			pageTexts = append(pageTexts, md.Text)
		}
		for name, payload := range md.Images {
			if _, seen := mergedImages[name]; seen {
				continue
			}
			mergedImages[name] = payload
		}
	}

	logger.Infof(context.Background(), "[PaddleOCR-VL] parsed %d page(s), images=%d", len(pages), len(mergedImages))
	return strings.Join(pageTexts, "\n\n"), mergedImages, nil
}
// processImages decodes the inline base64 images returned by PaddleOCR-VL and
// returns one ImageRef per reference that mineruImageOriginalRefs finds for
// the image path in mdContent. mdContent is returned unchanged.
//
// A payload matching b64DataURIPattern takes its extension from the pattern's
// first capture group and its data from the second; any other payload is
// decoded as raw base64 and takes its extension from the image path
// (defaulting to "png"). Images with no reference, or whose payload fails to
// decode, are skipped; decode failures are logged.
func (c *PaddleOCRVLReader) processImages(
	mdContent string, imagesB64 map[string]string,
) ([]types.ImageRef, string) {
	var collected []types.ImageRef

	for imgPath, encoded := range imagesB64 {
		originals := mineruImageOriginalRefs(mdContent, imgPath)
		if len(originals) == 0 {
			continue
		}

		var (
			payload []byte
			ext     string
			decErr  error
		)
		if match := b64DataURIPattern.FindStringSubmatch(encoded); len(match) == 3 {
			// Data URI form: the extension comes from the URI itself.
			ext = match[1]
			if payload, decErr = base64.StdEncoding.DecodeString(match[2]); decErr != nil {
				logger.Errorf(context.Background(), "[PaddleOCR-VL] decode base64 image %s: %v", imgPath, decErr)
				continue
			}
		} else {
			// Raw base64 form: fall back to the extension of the image path.
			if payload, decErr = base64.StdEncoding.DecodeString(encoded); decErr != nil {
				logger.Errorf(context.Background(), "[PaddleOCR-VL] decode raw base64 image %s: %v", imgPath, decErr)
				continue
			}
			if ext = strings.TrimPrefix(filepath.Ext(imgPath), "."); ext == "" {
				ext = "png"
			}
		}

		contentType := mime.TypeByExtension("." + ext)
		if contentType == "" {
			contentType = "image/png"
		}

		// The same decoded bytes back every reference to this path.
		for i := range originals {
			collected = append(collected, types.ImageRef{
				Filename:    imgPath,
				OriginalRef: originals[i],
				MimeType:    contentType,
				ImageData:   payload,
			})
		}
	}
	return collected, mdContent
}
// PingPaddleOCRVL probes a self-hosted PaddleOCR-VL service with a GET to
// {endpoint}/layout-parsing using a 5-second timeout.
//
// The service counts as reachable when any response with a status below 500
// comes back. On failure the second return value carries the user-facing
// reason (empty endpoint, request error, or 5xx status); on success it is
// empty.
func PingPaddleOCRVL(endpoint string) (bool, string) {
	base := strings.TrimRight(endpoint, "/")
	if base == "" {
		return false, "未配置 PaddleOCR-VL 端点"
	}

	probeCfg := utils.SSRFSafeHTTPClientConfig{
		Timeout:      5 * time.Second,
		MaxRedirects: 5,
	}
	// Only POST /layout-parsing is served by the pipeline, so a plain GET is
	// expected to be answered with a routed status such as 404 or 405 when
	// the service is up.
	resp, err := utils.NewSSRFSafeHTTPClient(probeCfg).Get(base + "/layout-parsing")
	if err != nil {
		return false, fmt.Sprintf("PaddleOCR-VL 服务不可达: %v", err)
	}
	status := resp.StatusCode
	resp.Body.Close()

	if status < 500 {
		return true, ""
	}
	return false, fmt.Sprintf("PaddleOCR-VL 服务返回状态 %d", status)
}

View File

@@ -120,18 +120,15 @@ func asynqRetryDelayFunc(n int, e error, t *asynq.Task) time.Duration {
// not on local CPU).
const defaultAsynqConcurrency = 16
func readAsynqConcurrency() int {
if v := strings.TrimSpace(os.Getenv("WEKNORA_ASYNQ_CONCURRENCY")); v != "" {
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
return parsed
func NewAsynqServer(svc interfaces.SystemSettingService) *asynq.Server {
opt := getAsynqRedisClientOpt()
concurrency := defaultAsynqConcurrency
if svc != nil {
n := svc.GetInt(context.Background(), "asynq.concurrency", "WEKNORA_ASYNQ_CONCURRENCY", defaultAsynqConcurrency)
if n > 0 {
concurrency = int(n)
}
}
return defaultAsynqConcurrency
}
func NewAsynqServer() *asynq.Server {
opt := getAsynqRedisClientOpt()
concurrency := readAsynqConcurrency()
log.Printf("asynq server starting with concurrency=%d redis_op_timeout=%dms",
concurrency, readRedisOpTimeoutMs())
srv := asynq.NewServer(

View File

@@ -285,6 +285,24 @@ type ParserEngineConfig struct {
MinerUCloudEnableTable *bool `json:"mineru_cloud_enable_table,omitempty"`
MinerUCloudEnableOCR *bool `json:"mineru_cloud_enable_ocr,omitempty"`
MinerUCloudLanguage string `json:"mineru_cloud_language,omitempty"`
// OpenDataLoader PDF (docreader engine); hybrid requires opendataloader-pdf-hybrid service.
ODLHybrid string `json:"odl_hybrid,omitempty"` // off (default), docling-fast, hancom-ai
ODLHybridURL string `json:"odl_hybrid_url,omitempty"` // e.g. http://odl-hybrid:5002
ODLHybridMode string `json:"odl_hybrid_mode,omitempty"` // auto, full
ODLHybridFallback *bool `json:"odl_hybrid_fallback,omitempty"`
ODLMarkdownWithHTML *bool `json:"odl_markdown_with_html,omitempty"`
// PaddleOCR-VL self-hosted pipeline service (full /layout-parsing API).
PaddleOCRVLEndpoint string `json:"paddleocr_vl_endpoint,omitempty"` // e.g. http://paddleocr-vl:8080
PaddleOCRVLUseSealRecognition *bool `json:"paddleocr_vl_use_seal_recognition,omitempty"`
PaddleOCRVLUseChartRecognition *bool `json:"paddleocr_vl_use_chart_recognition,omitempty"`
// PaddleOCR-VL AI Studio cloud API.
PaddleOCRVLCloudToken string `json:"paddleocr_vl_cloud_token,omitempty"`
PaddleOCRVLCloudModel string `json:"paddleocr_vl_cloud_model,omitempty"` // e.g. PaddleOCR-VL-1.6
PaddleOCRVLCloudUseSealRecognition *bool `json:"paddleocr_vl_cloud_use_seal_recognition,omitempty"`
PaddleOCRVLCloudUseChartRecognition *bool `json:"paddleocr_vl_cloud_use_chart_recognition,omitempty"`
}
// ToOverridesMap returns a map suitable for ParserEngineOverrides in parse requests.
@@ -333,6 +351,42 @@ func (c *ParserEngineConfig) ToOverridesMap() map[string]string {
if c.MinerUCloudLanguage != "" {
m["mineru_cloud_language"] = c.MinerUCloudLanguage
}
if c.ODLHybrid != "" {
m["odl_hybrid"] = c.ODLHybrid
}
if c.ODLHybridURL != "" {
m["odl_hybrid_url"] = c.ODLHybridURL
}
if c.ODLHybridMode != "" {
m["odl_hybrid_mode"] = c.ODLHybridMode
}
if c.ODLHybridFallback != nil {
m["odl_hybrid_fallback"] = fmt.Sprintf("%v", *c.ODLHybridFallback)
}
if c.ODLMarkdownWithHTML != nil {
m["odl_markdown_with_html"] = fmt.Sprintf("%v", *c.ODLMarkdownWithHTML)
}
if c.PaddleOCRVLEndpoint != "" {
m["paddleocr_vl_endpoint"] = c.PaddleOCRVLEndpoint
}
if c.PaddleOCRVLUseSealRecognition != nil {
m["paddleocr_vl_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseSealRecognition)
}
if c.PaddleOCRVLUseChartRecognition != nil {
m["paddleocr_vl_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseChartRecognition)
}
if c.PaddleOCRVLCloudToken != "" {
m["paddleocr_vl_cloud_token"] = c.PaddleOCRVLCloudToken
}
if c.PaddleOCRVLCloudModel != "" {
m["paddleocr_vl_cloud_model"] = c.PaddleOCRVLCloudModel
}
if c.PaddleOCRVLCloudUseSealRecognition != nil {
m["paddleocr_vl_cloud_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseSealRecognition)
}
if c.PaddleOCRVLCloudUseChartRecognition != nil {
m["paddleocr_vl_cloud_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseChartRecognition)
}
if len(m) == 0 {
return nil
}

View File

@@ -91,6 +91,7 @@ WeKnora `docker-compose.yml` 大量服务是 **profile 限定**,本镜像只
| `jaeger` | OpenTelemetry trace UI |
| `langfuse` | 自建 Langfuse 可观测平台 |
| `dex` | OIDC 登录 |
| `odl-hybrid` | OpenDataLoader Docling hybrid体积大无预发布镜像`--build` |
启用方式:
@@ -99,6 +100,7 @@ cd /opt/WeKnora
docker compose --profile neo4j up -d # 启用 GraphRAG
docker compose --profile langfuse up -d # 启用自建 Langfuse
docker compose --profile qdrant up -d # 切换到 Qdrant
docker compose --profile odl-hybrid up -d --build odl-hybrid # Docling hybrid按需
```
---

View File

@@ -72,14 +72,17 @@ show_help() {
echo " --dex 启动 DexOIDC 身份认证)"
echo " --langfuse 启动 Langfuse默认已开启"
echo " --no-langfuse 不启动 Langfuse"
echo " --full 启动所有可选服务"
echo " --odl-hybrid 启动 OpenDataLoader hybridDocling镜像较大按需启用"
echo " --full 启动所有可选服务(不含 odl-hybrid需另加 --odl-hybrid"
echo ""
echo "示例:"
echo " $0 start # 启动基础服务"
echo " $0 start --qdrant # 启动基础服务 + Qdrant"
echo " $0 start --qdrant --jaeger # 启动基础服务 + Qdrant + Jaeger"
echo " $0 start --dex # 启动基础服务 + Dex"
echo " $0 start --odl-hybrid # 启动基础服务 + OpenDataLoader hybrid"
echo " $0 start --full # 启动所有服务"
echo " make dev-start DEV_ARGS=--odl-hybrid # 同上Makefile 传参)"
echo " $0 app # 在另一个终端启动后端"
echo " $0 frontend # 在另一个终端启动前端"
}
@@ -104,6 +107,46 @@ check_docker() {
return 0
}
# Report whether the environment enables hybrid mode (used to decide whether
# docreader must be recreated after starting with --odl-hybrid).
# Reads DOCREADER_ODL_HYBRID (default "off"), lower-cases it and removes all
# whitespace. Returns 1 when the result is "off" or empty, 0 otherwise.
_should_enable_odl_hybrid_from_env() {
    local mode
    mode=$(printf '%s' "${DOCREADER_ODL_HYBRID:-off}" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
    if [ -z "$mode" ] || [ "$mode" = "off" ]; then
        return 1
    fi
    return 0
}
# Append the odl-hybrid compose profile to PROFILES and record the service
# name in ENABLED_SERVICES (both are space-separated lists).
_enable_odl_hybrid_profile() {
    PROFILES+=" --profile odl-hybrid"
    ENABLED_SERVICES+=" odl-hybrid"
}
# Wait for the odl-hybrid HTTP health check to pass (after compose starts the
# container the service may still be pulling dependencies).
# Reads ODL_HYBRID_PORT (default 5002) and ODL_HYBRID_STARTUP_WAIT_SEC
# (default 180) and probes http://127.0.0.1:<port>/health every 5 seconds.
# Returns 0 once the probe succeeds or when curl is not installed (the wait is
# skipped with a warning); returns 1 when the wait budget is exhausted.
_wait_odl_hybrid_ready() {
    local port="${ODL_HYBRID_PORT:-5002}"
    local max_wait="${ODL_HYBRID_STARTUP_WAIT_SEC:-180}"
    local step=5
    local elapsed=0

    # Without curl there is no way to probe; do not block the startup flow.
    command -v curl &> /dev/null || {
        log_warning "未安装 curl跳过 odl-hybrid 就绪等待;请手动检查 http://localhost:${port}/health"
        return 0
    }

    log_info "等待 odl-hybrid 就绪(最多 ${max_wait}s首次需构建镜像: docker compose ... build odl-hybrid..."
    while [ "$elapsed" -lt "$max_wait" ]; do
        # -f makes curl fail on HTTP error statuses, -s keeps it quiet.
        curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1 && {
            log_success "odl-hybrid 已就绪 (http://localhost:${port}/health)"
            return 0
        }
        sleep "$step"
        elapsed=$((elapsed + step))
    done

    log_warning "odl-hybrid 在 ${max_wait}s 内未就绪,请查看: docker logs WeKnora-odl-hybrid"
    return 1
}
# 启动基础设施服务
start_services() {
log_info "启动开发环境基础设施服务..."
@@ -120,6 +163,11 @@ start_services() {
log_error ".env 文件不存在,请先创建"
return 1
fi
set -a
# shellcheck source=/dev/null
source .env
set +a
# 解析 profile 参数
shift # 移除 "start" 命令本身
@@ -127,7 +175,6 @@ start_services() {
# 其余可选服务通过 --minio / --qdrant / --neo4j / --jaeger / --dex / --full 按需开启。
PROFILES="--profile langfuse"
ENABLED_SERVICES="langfuse"
while [ $# -gt 0 ]; do
case "$1" in
--minio)
@@ -158,6 +205,11 @@ start_services() {
PROFILES="${PROFILES//--profile langfuse/}"
ENABLED_SERVICES="${ENABLED_SERVICES//langfuse/}"
;;
--odl-hybrid)
if [[ "$ENABLED_SERVICES" != *"odl-hybrid"* ]]; then
_enable_odl_hybrid_profile
fi
;;
--full)
PROFILES="--profile full"
ENABLED_SERVICES="minio qdrant neo4j jaeger dex"
@@ -169,11 +221,22 @@ start_services() {
esac
shift
done
# 启动服务
# 启动服务odl-hybrid 单独 --build避免每次重建 docreader
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d
if [ $? -eq 0 ]; then
local compose_rc=$?
if [ "$compose_rc" -eq 0 ] && [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
log_info "构建/更新 odl-hybrid 镜像..."
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d --build odl-hybrid
_wait_odl_hybrid_ready || true
# docreader 需读取 DOCREADER_ODL_HYBRID若刚改 .env强制重建以注入环境变量
if _should_enable_odl_hybrid_from_env; then
log_info "重建 docreader 以应用 DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID} ..."
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml up -d --force-recreate docreader
fi
fi
if [ "$compose_rc" -eq 0 ]; then
log_success "基础设施服务已启动"
echo ""
log_info "服务访问地址:"
@@ -200,6 +263,10 @@ start_services() {
if [[ "$ENABLED_SERVICES" == *"langfuse"* ]]; then
echo " - Langfuse: http://localhost:${LANGFUSE_WEB_PORT:-3000}"
fi
if [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
echo " - ODL Hybrid: http://localhost:${ODL_HYBRID_PORT:-5002} (health: /health)"
echo " docreader 需 DOCREADER_ODL_HYBRID=docling-fast"
fi
echo ""
log_info "接下来的步骤:"