feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements

Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing. Optional odl-hybrid sidecar stays local-build only and is excluded from default dev-start and full profiles.
2026-06-04 13:30:32 +08:00 · 2026-06-03 12:00:09 +08:00
parent 7b1bb1054f
commit ef1047bf67
50 changed files with 4352 additions and 304 deletions
--- a/.env.example
+++ b/.env.example
@@ -562,6 +562,39 @@ DOCREADER_TRANSPORT=grpc
 # 渲染页图的最大长边像素（防止超大页面 PDF 渲染出 100+MP 图、撑爆 gRPC 消息上限）
 # 调小可进一步减小图片体积；过小会影响 OCR 识别（密集中文建议 >=1600）
 # DOCREADER_PDF_RENDER_MAX_EDGE=2000
+# Layout text: insert spaces when glyph gaps exceed this × median char width (default 0.4).
+# DOCREADER_PDF_WORD_GAP_WIDTH_RATIO=0.4
+# Native PDF layout: drop narrow margin columns (arXiv sidebar) below this page-width ratio (default 0.12).
+# DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO=0.12
+# DOCREADER_PDF_MIN_HEADING_LINE_CHARS=8
+# Remove U+FFFE/soft-hyphen artifacts; strip vector chart axis text; render chart areas as JPEG.
+# DOCREADER_PDF_SANITIZE_TEXT=true
+# DOCREADER_PDF_STRIP_CHART_DEBRIS=true
+# DOCREADER_PDF_RENDER_VECTOR_FIGURES=true
+
+# OpenDataLoader PDF（知识库 parser_engine_rules 指定 engine: opendataloader）
+# 需 Java 11+；docreader 镜像已包含 openjdk-17-jre-headless。
+# DOCREADER_ODL_MAX_WORKERS=1
+# fast 模式（默认）：DOCREADER_ODL_HYBRID=off
+# hybrid 需另起服务；镜像/模型较大，默认 pull/up 与 --full 均不含 odl-hybrid。
+# 默认 --no-ocr（不做 EasyOCR）。
+# 开发：make dev-start DEV_ARGS=--odl-hybrid（本地 build）
+# 生产/docker-compose.yml（需 DOCREADER_ODL_HYBRID=docling-fast 等）：
+#   docker compose --profile odl-hybrid up -d --build odl-hybrid
+# 该镜像未发布到 Docker Hub（本地 tag: weknora-odl-hybrid:local），make pull-images 不会拉取，只能按需 build。
+# 修改 Dockerfile.odl-hybrid 后需重建：docker compose --profile odl-hybrid build --no-cache odl-hybrid
+# ODL_HYBRID_EXTRA_ARGS=--no-ocr
+# 扫描件不要用 hybrid OCR，请用 builtin 扫描渲染 + Go OCR，或 MinerU；若坚持 hybrid OCR：
+# ODL_HYBRID_EXTRA_ARGS=--force-ocr
+# DOCREADER_ODL_HYBRID=docling-fast
+# DOCREADER_ODL_HYBRID_URL=http://odl-hybrid:5002
+# 开发环境 hybrid：make dev-start DEV_ARGS=--odl-hybrid
+# 仅用 fast 模式（不需 odl-hybrid 容器）时请保持 DOCREADER_ODL_HYBRID=off。
+# ODL_HYBRID_PORT=5002
+# ODL_HYBRID_STARTUP_WAIT_SEC=180
+# DOCREADER_ODL_HYBRID_MODE=auto
+# DOCREADER_ODL_HYBRID_FALLBACK=false
+# DOCREADER_ODL_MARKDOWN_WITH_HTML=false

 # VLM（视觉模型）单次 HTTP 请求的整体超时时间（秒）。
 # 扫描件整页 OCR（全文+版式抽取）在慢端点上很容易超过默认值，
@@ -648,7 +681,9 @@ DOCREADER_TRANSPORT=grpc
 # --- Async pipeline tuning (optional) -----------------------------------------
 # Worker pool size for the asynq server. Default 16 — raise it on machines
 # that handle many concurrent uploads (default Go runtime.NumCPU() under-
-# provisions for the I/O-bound document pipeline).
+# provisions for the I/O-bound document pipeline). Can also be set in the
+# management UI under Settings → System settings (asynq.concurrency);
+# UI changes require a process restart.
 # WEKNORA_ASYNQ_CONCURRENCY=16

 # Read/write timeout (in milliseconds) the asynq client uses against Redis.
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@ help:
 	@echo ""
 	@echo "开发模式（推荐）:"
 	@echo "  dev-start         启动开发环境基础设施（仅启动依赖服务）"
+	@echo "                    可选: make dev-start DEV_ARGS=--odl-hybrid"
 	@echo "  dev-stop          停止开发环境"
 	@echo "  dev-restart       重启开发环境"
 	@echo "  dev-logs          查看开发环境日志"
@@ -310,7 +311,7 @@ show-platform:

 # Development mode commands
 dev-start:
-	./scripts/dev.sh start
+	./scripts/dev.sh start $(DEV_ARGS)

 dev-stop:
 	./scripts/dev.sh stop
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -248,8 +248,13 @@ services:
      - docreader-tmp-dev:/tmp/docreader
    environment:
      - DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
-      - MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
      - MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
+      - DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
+      - DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
+      - DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
+      - DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
+      - DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
+      - DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
      - DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
      - DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
      - DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
@@ -272,6 +277,27 @@ services:
    extra_hosts:
      - "host.docker.internal:host-gateway"

+  # OpenDataLoader hybrid backend (optional). Enable profile "odl-hybrid" and set
+  # DOCREADER_ODL_HYBRID=docling-fast on docreader. Default --no-ocr (no EasyOCR).
+  # Local build only — not published to Docker Hub.
+  odl-hybrid:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.odl-hybrid
+    image: weknora-odl-hybrid:local
+    container_name: WeKnora-odl-hybrid
+    profiles:
+      - odl-hybrid
+    ports:
+      - "${ODL_HYBRID_PORT:-5002}:5002"
+    environment:
+      # Default --no-ocr (digital PDFs). Scanned PDFs: use builtin OCR / MinerU, or
+      # ODL_HYBRID_EXTRA_ARGS="--force-ocr" (needs EasyOCR + libGL in image).
+      - ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
+    networks:
+      - WeKnora-network-dev
+    restart: unless-stopped
+
  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: WeKnora-jaeger-dev
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -241,6 +241,12 @@ services:
    environment:
      - DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
      - MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
+      - DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
+      - DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
+      - DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
+      - DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
+      - DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
+      - DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
      - DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
      - DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
      - DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
@@ -250,13 +256,6 @@ services:
      - GRPC_TLS_KEY=${GRPC_TLS_KEY:-}
      - GRPC_TLS_CA=${GRPC_TLS_CA:-}
      - GRPC_AUTH_TOKEN=${GRPC_AUTH_TOKEN:-}
-      - OBS_ENDPOINT=${OBS_ENDPOINT:-}
-      - OBS_REGION=${OBS_REGION:-}
-      - OBS_ACCESS_KEY=${OBS_ACCESS_KEY:-}
-      - OBS_SECRET_KEY=${OBS_SECRET_KEY:-}
-      - OBS_BUCKET_NAME=${OBS_BUCKET_NAME:-}
-      - OBS_PATH_PREFIX=${OBS_PATH_PREFIX:-}
-      - OBS_PROXY_DOMAIN=${OBS_PROXY_DOMAIN:-}
    healthcheck:
      test: ["CMD", "grpc_health_probe", "-addr=localhost:50051"]
      interval: 30s
@@ -269,6 +268,24 @@ services:
    extra_hosts:
      - "host.docker.internal:host-gateway"

+  # OpenDataLoader hybrid backend (optional). Default --no-ocr (no EasyOCR).
+  # Local build only — not published to Docker Hub; use --profile odl-hybrid --build.
+  odl-hybrid:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.odl-hybrid
+    image: weknora-odl-hybrid:local
+    container_name: WeKnora-odl-hybrid
+    profiles:
+      - odl-hybrid
+    expose:
+      - "5002"
+    environment:
+      - ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
+    networks:
+      - WeKnora-network
+    restart: unless-stopped
+
  # 修改的PostgreSQL配置
  postgres:
    image: paradedb/paradedb:v0.22.2-pg17
--- a/docker/Dockerfile.docreader
+++ b/docker/Dockerfile.docreader
@@ -94,6 +94,7 @@ RUN apt-get update && apt-get install -y \
    libjpeg62-turbo \
    wget \
    gnupg \
+    openjdk-17-jre-headless \
    libgl1 \
    libglib2.0-0 \
    antiword \
--- a/docker/Dockerfile.odl-hybrid
+++ b/docker/Dockerfile.odl-hybrid
@@ -0,0 +1,29 @@
+# OpenDataLoader PDF hybrid backend (Docling). Pre-install deps so the
+# container listens on :5002 immediately instead of pip install on every start.
+#
+# Default --no-ocr: digital PDFs already have a text layer; Docling layout/table
+# still runs without EasyOCR (avoids the heavy OCR stack; libGL is still installed below for cv2).
+# For scanned PDFs use builtin docreader OCR, MinerU, or override with
+# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (requires extra system/Python deps).
+FROM python:3.10.18-bookworm
+
+# Docling table/layout models import cv2 (OpenCV), which needs libGL at runtime
+# even when hybrid runs with --no-ocr.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libgl1 \
+    libglib2.0-0 \
+    libgomp1 \
+    libsm6 \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir "opendataloader-pdf[hybrid]>=2.4.7"
+
+EXPOSE 5002
+
+ENV ODL_HYBRID_EXTRA_ARGS="--no-ocr"
+
+HEALTHCHECK --interval=30s --timeout=10s --retries=5 --start-period=120s \
+    CMD curl -f http://localhost:5002/health || exit 1
+
+CMD ["bash", "-c", "exec opendataloader-pdf-hybrid --host 0.0.0.0 --port 5002 ${ODL_HYBRID_EXTRA_ARGS}"]
--- a/docreader/config.py
+++ b/docreader/config.py
@@ -55,6 +55,12 @@ class DocReaderConfig:
    # Parser
    docx_max_pages: int
    markitdown_max_workers: int
+    odl_max_workers: int
+    odl_hybrid: str
+    odl_hybrid_url: str
+    odl_hybrid_mode: str
+    odl_hybrid_fallback: bool
+    odl_markdown_with_html: bool
    pdf_render_max_workers: int
    pdf_render_parallelism: int
    pdf_render_dpi: int
@@ -81,6 +87,17 @@ def load_config() -> DocReaderConfig:
    grpc_port = _get_int(["DOCREADER_GRPC_PORT", "PORT"], 50051)
    docx_max_pages = _get_int(["DOCREADER_DOCX_MAX_PAGES"], 0)
    markitdown_max_workers = _get_int(["DOCREADER_MARKITDOWN_MAX_WORKERS"], 1)
+    odl_max_workers = _get_int(["DOCREADER_ODL_MAX_WORKERS"], 1)
+    odl_hybrid = _get_str(["DOCREADER_ODL_HYBRID"], "off")
+    odl_hybrid_url = _get_str(
+        ["DOCREADER_ODL_HYBRID_URL"],
+        "http://127.0.0.1:5002",
+    )
+    odl_hybrid_mode = _get_str(["DOCREADER_ODL_HYBRID_MODE"], "auto")
+    odl_hybrid_fallback = _get_bool(["DOCREADER_ODL_HYBRID_FALLBACK"], False)
+    odl_markdown_with_html = _get_bool(
+        ["DOCREADER_ODL_MARKDOWN_WITH_HTML"], False
+    )
    pdf_render_max_workers = _get_int(["DOCREADER_PDF_RENDER_MAX_WORKERS"], 1)
    # Intra-document render parallelism: how many worker processes render the
    # scanned pages of a SINGLE PDF in parallel. pdfium is not thread-safe, so
@@ -117,6 +134,12 @@ def load_config() -> DocReaderConfig:
        grpc_port=grpc_port,
        docx_max_pages=docx_max_pages,
        markitdown_max_workers=markitdown_max_workers,
+        odl_max_workers=odl_max_workers,
+        odl_hybrid=odl_hybrid,
+        odl_hybrid_url=odl_hybrid_url,
+        odl_hybrid_mode=odl_hybrid_mode,
+        odl_hybrid_fallback=odl_hybrid_fallback,
+        odl_markdown_with_html=odl_markdown_with_html,
        pdf_render_max_workers=pdf_render_max_workers,
        pdf_render_parallelism=pdf_render_parallelism,
        pdf_render_dpi=pdf_render_dpi,
@@ -139,6 +162,12 @@ def dump_config(mask_secrets: bool = True) -> Dict[str, Any]:
        "DOCREADER_GRPC_PORT": cfg.grpc_port,
        "DOCREADER_DOCX_MAX_PAGES": cfg.docx_max_pages,
        "DOCREADER_MARKITDOWN_MAX_WORKERS": cfg.markitdown_max_workers,
+        "DOCREADER_ODL_MAX_WORKERS": cfg.odl_max_workers,
+        "DOCREADER_ODL_HYBRID": cfg.odl_hybrid,
+        "DOCREADER_ODL_HYBRID_URL": cfg.odl_hybrid_url,
+        "DOCREADER_ODL_HYBRID_MODE": cfg.odl_hybrid_mode,
+        "DOCREADER_ODL_HYBRID_FALLBACK": cfg.odl_hybrid_fallback,
+        "DOCREADER_ODL_MARKDOWN_WITH_HTML": cfg.odl_markdown_with_html,
        "DOCREADER_PDF_RENDER_MAX_WORKERS": cfg.pdf_render_max_workers,
        "DOCREADER_PDF_RENDER_PARALLELISM": cfg.pdf_render_parallelism,
        "DOCREADER_PDF_RENDER_DPI": cfg.pdf_render_dpi,
--- a/docreader/parser/excel_convert.py
+++ b/docreader/parser/excel_convert.py
@@ -0,0 +1,149 @@
+"""LibreOffice helpers for normalizing legacy or unusual Excel uploads."""
+
+from __future__ import annotations
+
+import logging
+import os
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+_ZIP_MAGIC = b"PK\x03\x04"
+
+
+def detect_excel_format(content: bytes) -> str | None:
+    """Sniff the spreadsheet container: "xlsx", "xls", "xlsb", "ods" or None.
+
+    pandas' sniffer opens ZIP payloads and raises on a damaged archive; fall
+    back to the magic bytes then, so callers still get a format to work with.
+    """
+    if not content:
+        return None
+    from pandas.io.excel._base import inspect_excel_format
+    try:
+        ext = inspect_excel_format(content_or_path=content)
+    except Exception:  # e.g. zipfile.BadZipFile on a truncated archive
+        ext = None
+    if ext in ("xlsx", "xls", "xlsb", "ods"):
+        return ext
+    if ext == "zip" or content.startswith(_ZIP_MAGIC):
+        return "xlsx"
+    return "xls" if content.startswith(_XLS_MAGIC) else None
+
+
+def engine_for_format(ext: str | None) -> str:
+    """Map a sniffed format id to the pandas ``ExcelFile`` engine name.
+
+    xlsx and unknown/None formats use the openpyxl default.
+    """
+    # openpyxl cannot read the binary .xlsb container; pandas reads it via pyxlsb.
+    engines = {"xls": "xlrd", "xlsb": "pyxlsb", "ods": "odf"}
+    return engines.get(ext, "openpyxl")
+
+
+def convert_excel_to_xlsx_bytes(content: bytes, suffix: str = ".xlsx") -> bytes | None:
+    """Convert arbitrary spreadsheet bytes to XLSX using LibreOffice, if available.
+
+    ``suffix`` is the extension given to the temporary input file. Returns the
+    converted bytes, or None when soffice is missing, cannot be run, times out,
+    or produces no .xlsx output after ``max_attempts`` tries.
+    """
+    soffice = find_soffice()
+    if not soffice:
+        return None
+
+    max_attempts = 3
+    for attempt in range(1, max_attempts + 1):
+        with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as profile_dir:
+            src = os.path.join(temp_dir, f"input{suffix}")
+            with open(src, "wb") as handle:
+                handle.write(content)
+            # Keep the output apart from the input: with suffix ".xlsx" both would
+            # be temp_dir/input.xlsx, so the unconverted input could be returned.
+            out_dir = os.path.join(temp_dir, "out")
+            os.mkdir(out_dir)
+            cmd = [
+                soffice,
+                "--headless",
+                f"-env:UserInstallation={Path(profile_dir).as_uri()}",
+                "--convert-to",
+                "xlsx",
+                "--outdir",
+                out_dir,
+                src,
+            ]
+            try:
+                result = subprocess.run(cmd, capture_output=True, timeout=120)
+            except (OSError, subprocess.TimeoutExpired) as exc:
+                logger.warning("LibreOffice convert failed to start: %s", exc)
+                return None
+            if result.returncode != 0:
+                logger.warning(
+                    "LibreOffice convert failed (attempt %s/%s): %s",
+                    attempt, max_attempts,
+                    result.stderr.decode("utf-8", errors="ignore"),
+                )
+            else:
+                for name in os.listdir(out_dir):
+                    if name.endswith(".xlsx"):
+                        with open(os.path.join(out_dir, name), "rb") as handle:
+                            converted = handle.read()
+                        logger.info(
+                            "Converted spreadsheet via LibreOffice (%s -> xlsx, %d bytes)",
+                            suffix,
+                            len(converted),
+                        )
+                        return converted
+            # Failed or produced nothing: back off a little before retrying.
+            if attempt < max_attempts:
+                time.sleep(0.5 * attempt)
+    return None
+
+
+def normalize_excel_bytes(content: bytes, file_type: str | None = None) -> bytes:
+    """Return spreadsheet bytes pandas can open, converting via LibreOffice if needed.
+
+    Recognized content is returned unchanged. Otherwise the bytes are offered to
+    LibreOffice under several candidate extensions (the caller's ``file_type``
+    first) until one conversion produces a recognizable workbook.
+
+    Raises:
+        ValueError: when no candidate yields a recognizable workbook.
+    """
+    if detect_excel_format(content) is not None:
+        return content
+    candidates = [f".{file_type.lstrip('.')}"] if file_type else []
+    candidates += [".xlsx", ".xls", ".et", ".csv"]
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    for suffix in dict.fromkeys(candidates):
+        converted = convert_excel_to_xlsx_bytes(content, suffix=suffix)
+        if converted and detect_excel_format(converted) is not None:
+            return converted
+    raise ValueError(
+        "Unrecognized Excel file format; the file may be corrupt, encrypted, "
+        "or not a spreadsheet"
+    )
+
+
+def find_soffice() -> Optional[str]:
+    """Return the path of the LibreOffice ``soffice`` binary, or None if absent."""
+    import shutil
+    known_locations = (
+        "/usr/bin/soffice",
+        "/usr/lib/libreoffice/program/soffice",
+        "/opt/libreoffice25.2/program/soffice",
+        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
+        "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
+        "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
+    )
+    for path in known_locations:
+        if os.path.exists(path):
+            return path
+    # shutil.which is portable; spawning the external `which` command raises
+    # FileNotFoundError on hosts without it (e.g. Windows).
+    return shutil.which("soffice")
--- a/docreader/parser/excel_parser.py
+++ b/docreader/parser/excel_parser.py
@@ -13,6 +13,14 @@ import pandas as pd

 from docreader.models.document import Chunk, Document
 from docreader.parser.base_parser import BaseParser
+from docreader.parser.excel_convert import (
+    convert_excel_to_xlsx_bytes,
+    detect_excel_format,
+    engine_for_format,
+    normalize_excel_bytes,
+)
+from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
+from docreader.parser.xlsx_repair import repair_xlsx_bytes

 logger = logging.getLogger(__name__)

@@ -60,13 +68,11 @@ class ExcelParser(BaseParser):
        text: List[str] = []
        start, end = 0, 0

-        # Load Excel file from bytes into pandas ExcelFile object
-        excel_file = pd.ExcelFile(BytesIO(content))
+        excel_file = _open_excel_file(content, file_type=self.file_type)
        
        # Process each sheet in the Excel file
        for excel_sheet_name in excel_file.sheet_names:
-            # Parse the sheet into a DataFrame
-            df = excel_file.parse(sheet_name=excel_sheet_name)
+            df = _read_sheet_dataframe(excel_file, excel_sheet_name)
            # Remove rows where all values are NaN (completely empty rows)
            df.dropna(how="all", inplace=True)

@@ -97,6 +103,90 @@ class ExcelParser(BaseParser):
        return Document(content="".join(text), chunks=chunks)


+def _read_sheet_dataframe(excel_file: pd.ExcelFile, sheet_name: str) -> pd.DataFrame:
+    """Load one worksheet as a DataFrame whose column labels are always usable.
+
+    openpyxl workbooks are read without a header row and labelled A, B, C, ...
+    Other engines use row 1 as the header unless that leaves the frame empty or
+    produces pandas' "Unnamed:" placeholders, in which case letters are used too.
+    """
+    from openpyxl.utils import get_column_letter
+    use_letters = excel_file.engine == "openpyxl"
+    if not use_letters:
+        frame = excel_file.parse(sheet_name=sheet_name, header=0)
+        use_letters = frame.empty or any(
+            str(label).startswith("Unnamed:") for label in frame.columns
+        )
+    if use_letters:
+        frame = excel_file.parse(sheet_name=sheet_name, header=None)
+        frame.columns = [get_column_letter(n) for n in range(1, len(frame.columns) + 1)]
+    return frame
+
+
+def _prepare_xlsx_bytes(data: bytes) -> bytes:
+    """Run the XLSX repair helper, then pass the result to fill_merged_cells_xlsx."""
+    repaired = repair_xlsx_bytes(data)
+    # None from the repair helper means "keep the original bytes".
+    return fill_merged_cells_xlsx(data if repaired is None else repaired)
+
+
+def _open_excel_file(content: bytes, file_type: str | None = None) -> pd.ExcelFile:
+    """Open an Excel workbook with explicit engine selection and fallbacks.
+
+    Each fallback (LibreOffice normalization, ODS -> XLSX conversion,
+    sharedStrings repair) runs at most once, so the retry loop always ends.
+    Raises ValueError when the format is undetermined or the engine is missing.
+    """
+    undetermined = "Excel file format cannot be determined, you must specify an engine manually."
+    data = content
+    converted_via_soffice = ods_converted = repaired_once = False
+
+    while True:
+        ext = detect_excel_format(data)
+        if ext is None:
+            if converted_via_soffice:
+                raise ValueError(undetermined)
+            try:
+                data = normalize_excel_bytes(data, file_type=file_type)
+            except ValueError as exc:
+                raise ValueError(undetermined) from exc
+            converted_via_soffice = True
+            continue
+
+        if ext == "ods" and not ods_converted:
+            ods_converted = True
+            converted = convert_excel_to_xlsx_bytes(data, suffix=".ods")
+            if converted:
+                data = converted
+                continue
+
+        engine = engine_for_format(ext)
+        if ext == "xlsx":
+            data = _prepare_xlsx_bytes(data)
+        try:
+            return pd.ExcelFile(BytesIO(data), engine=engine)
+        except ImportError as exc:
+            raise ValueError(
+                f"Excel engine {engine!r} is not available for .{ext} files"
+            ) from exc
+        except KeyError as exc:
+            if repaired_once or engine != "openpyxl" or "sharedStrings.xml" not in str(exc):
+                raise
+            repaired = repair_xlsx_bytes(data)
+            if repaired is None:
+                raise
+            repaired_once = True
+            logger.info("Repaired XLSX sharedStrings packaging before parse")
+            data = _prepare_xlsx_bytes(repaired)
+            continue
+        except ValueError as exc:
+            if converted_via_soffice or "cannot be determined" not in str(exc):
+                raise
+            data = normalize_excel_bytes(content, file_type=file_type)
+            converted_via_soffice = True
+            continue
+
+
 if __name__ == "__main__":
    # Example usage: Parse an Excel file and display results
    logging.basicConfig(level=logging.DEBUG)
--- a/docreader/parser/liteparse_parser.py
+++ b/docreader/parser/liteparse_parser.py
@@ -1,109 +0,0 @@
-"""Optional PDF engine backed by LiteParse (LlamaIndex, MIT).
-
-LiteParse is a fast Rust/PDFium text extractor that performs spatial reading-order
-reconstruction natively (multi-column aware) and is considerably faster than the
-Python text path. It is exposed as a *selectable* engine (``liteparse``) rather
-than replacing the builtin engine, so users can opt in per knowledge base.
-
-Scope/limitations (documented intentionally):
-  * Text-first engine: it returns reading-order plain text, not figures. Scanned
-    pages carry no text layer, so for image-dominated PDFs we fall back to the
-    builtin scanned renderer (page -> JPEG, OCR'd by the Go App) to stay robust.
-  * docreader never runs OCR itself; OCR/VLM remain Go-side responsibilities.
-"""
-
-import logging
-
-from docreader.models.document import Document
-from docreader.parser.base_parser import BaseParser
-
-logger = logging.getLogger(__name__)
-
-# If the extracted text averages fewer characters per page than this, the PDF is
-# treated as scanned/image-dominated and routed to the builtin image renderer.
-_MIN_CHARS_PER_PAGE = 20
-# If at least this fraction of sampled pages are image-dominated, the PDF is
-# scanned (even when it carries a garbled OCR text layer) and is routed to the
-# builtin image renderer rather than trusting the low-quality text.
-_SCANNED_PAGE_FRACTION = 0.5
-
-
-def liteparse_available(_overrides=None):
-    """Engine availability probe used by the registry/UI."""
-    try:
-        import liteparse  # noqa: F401
-    except Exception as e:  # pragma: no cover - depends on install
-        return False, f"liteparse 未安装: {e}"
-    return True, ""
-
-
-class LiteParseParser(BaseParser):
-    """Parse a PDF with LiteParse, falling back to scanned rendering when empty."""
-
-    def parse_into_text(self, content: bytes) -> Document:
-        import liteparse
-
-        from docreader.parser.pdf_parser import (
-            PDFScannedParser,
-            estimate_scanned_fraction,
-        )
-
-        # Image-dominated PDFs (incl. ones with a garbled OCR text layer) carry
-        # no trustworthy text; render them as images for Go-side OCR instead.
-        try:
-            scanned_frac = estimate_scanned_fraction(content)
-        except Exception:
-            scanned_frac = 0.0
-        if scanned_frac >= _SCANNED_PAGE_FRACTION:
-            logger.info(
-                "LiteParseParser: %s is image-dominated (%.0f%% scanned pages); "
-                "using builtin scanned renderer",
-                self.file_name,
-                scanned_frac * 100,
-            )
-            return PDFScannedParser(
-                file_name=self.file_name, file_type=self.file_type
-            ).parse_into_text(content)
-
-        engine = liteparse.LiteParse(ocr_enabled=False, quiet=True)
-        result = engine.parse(content)
-        page_count = int(result.num_pages)
-
-        page_texts = []
-        for i in range(page_count):
-            page = result.get_page(i)
-            page_texts.append((getattr(page, "text", "") or "").strip())
-
-        doc_text = (getattr(result, "text", "") or "").strip()
-        if not doc_text:
-            doc_text = "\n\n".join(t for t in page_texts if t)
-
-        # Image-dominated / scanned PDFs yield little to no text: defer to the
-        # builtin scanned renderer so the Go App can OCR the page images.
-        if page_count and len(doc_text) < _MIN_CHARS_PER_PAGE * page_count:
-            logger.info(
-                "LiteParseParser: %s looks scanned (%d chars / %d pages); "
-                "falling back to builtin scanned renderer",
-                self.file_name,
-                len(doc_text),
-                page_count,
-            )
-            return PDFScannedParser(
-                file_name=self.file_name, file_type=self.file_type
-            ).parse_into_text(content)
-
-        logger.info(
-            "LiteParseParser: %s -> %d pages, content_len=%d",
-            self.file_name,
-            page_count,
-            len(doc_text),
-        )
-        return Document(
-            content=doc_text,
-            images={},
-            metadata={
-                "page_count": page_count,
-                "image_source_type": "pdf_text_layer",
-                "parser_engine": "liteparse",
-            },
-        )
--- a/docreader/parser/markdown_parser.py
+++ b/docreader/parser/markdown_parser.py
@@ -18,6 +18,8 @@ import re
 import uuid
 from typing import Dict, List, Match, Optional, Tuple

+_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
+
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
 from docreader.parser.chain_parser import PipelineParser
@@ -58,6 +60,71 @@ class MarkdownTableUtil:
            re.MULTILINE,
        )

+    @staticmethod
+    def _split_row_cells(row_line: str) -> List[str]:
+        """Split a table row into stripped cells; [] if it lacks a leading "|"."""
+        inner = row_line.strip()
+        if not inner.startswith("|"):
+            return []
+        # GFM: a backslash-escaped pipe is cell content, not a cell boundary.
+        # [1:] drops the empty piece that precedes the leading pipe.
+        parts = re.split(r"(?<!\\)\|", inner)[1:]
+        if parts and parts[-1].strip() == "":
+            parts = parts[:-1]
+        return [part.strip() for part in parts]
+
+    @staticmethod
+    def _is_table_row(line: str) -> bool:
+        # Table row: first non-blank character is "|" and another "|" follows it.
+        return line.strip().startswith("|") and line.count("|") >= 2
+
+    @classmethod
+    def _is_separator_row(cls, line: str) -> bool:
+        cells = cls._split_row_cells(line)  # [] when the line has no leading "|"
+        return len(cells) > 0 and None not in map(_SEPARATOR_CELL.match, cells)
+
+    @classmethod
+    def _is_empty_row(cls, line: str) -> bool:
+        # Non-empty cell list made up solely of "" entries <=> its set is {""}.
+        return set(cls._split_row_cells(line)) == {""}
+
+    @classmethod
+    def _separator_row_for(cls, header_line: str) -> str:
+        width = len(cls._split_row_cells(header_line))  # one "---" per header cell
+        return "| " + " | ".join(["---"] * width) + " |"
+
+    @classmethod
+    def _normalize_table_block(cls, block: List[str]) -> List[str]:
+        """Fix MarkItDown-style tables: drop bogus prefix rows, ensure GFM delimiter."""
+        original_len = len(block)
+        while block and cls._is_empty_row(block[0]):
+            block.pop(0)
+        if block and cls._is_separator_row(block[0]):
+            block.pop(0)
+        # GFM needs "| --- |" after row 1, also for a lone row left by prefix stripping.
+        lone_row = len(block) == 1 and original_len > 1
+        if lone_row or (len(block) >= 2 and not cls._is_separator_row(block[1])):
+            block = [block[0], cls._separator_row_for(block[0])] + block[1:]
+        return block
+
+    def normalize_spurious_table_prefixes(self, content: str) -> str:
+        """Remove bogus empty/separator prefix rows from MarkItDown table output."""
+        result: List[str] = []
+        pending: List[str] = []  # consecutive table rows not yet normalized
+        for line in content.split("\n"):
+            if self._is_table_row(line):
+                pending.append(line)
+                continue
+            # Non-table line: flush the run collected so far, then keep the line.
+            if pending:
+                result.extend(self._normalize_table_block(pending))
+                pending = []
+            result.append(line)
+        # Flush a table that runs to the end of the content.
+        if pending:
+            result.extend(self._normalize_table_block(pending))
+        return "\n".join(result)
+
    def format_table(self, content: str) -> str:
        """Format all Markdown tables in the content.

@@ -70,8 +137,7 @@ class MarkdownTableUtil:

        def process_align(match: Match[str]) -> str:
            """Process alignment row to standardize format."""
-            # Split by | and remove empty strings
-            columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
+            columns = self._split_row_cells(match.group(0))

            processed = []
            for col in columns:
@@ -87,8 +153,7 @@ class MarkdownTableUtil:

        def process_line(match: Match[str]) -> str:
            """Process regular table row to standardize format."""
-            # Split by | and remove empty strings
-            columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
+            columns = self._split_row_cells(match.group(0))

            # Preserve original indentation
            prefix = match.group(1)
@@ -99,8 +164,7 @@ class MarkdownTableUtil:
        formatted_content = self.line_pattern.sub(process_line, formatted_content)
        # Then format alignment rows (must be done after to avoid conflicts)
        formatted_content = self.align_pattern.sub(process_align, formatted_content)
-
-        return formatted_content
+        return self.normalize_spurious_table_prefixes(formatted_content)

    @staticmethod
    def _self_test():
--- a/docreader/parser/markitdown_parser.py
+++ b/docreader/parser/markitdown_parser.py
@@ -9,6 +9,11 @@ from docreader.parser.base_parser import BaseParser
 from docreader.parser.chain_parser import PipelineParser
 from docreader.parser.concurrency import parser_worker_limit
 from docreader.parser.markdown_parser import MarkdownParser
+from docreader.parser.ppt_convert import normalize_ppt_bytes
+from docreader.parser.pptx_media import (
+    attach_pptx_media_to_markdown,
+    markdown_needs_pptx_media_attach,
+)

 logger = logging.getLogger(__name__)

@@ -32,16 +37,41 @@ class StdMarkitdownParser(BaseParser):
        Uses self.file_type (inherited from BaseParser) to hint the stream format.
        """
        ext = self.file_type
-        if ext and not ext.startswith('.'):
-            ext = '.' + ext
+        ft = (ext or "").lstrip(".").lower()
+        pptx_bytes: bytes | None = None
+        if ft in ("ppt", "pptx"):
+            content, ext = normalize_ppt_bytes(content, ft)
+            pptx_bytes = content
+            ft = "pptx"
+        elif ext and not ext.startswith("."):
+            ext = "." + ext

        with parser_worker_limit("markitdown", CONFIG.markitdown_max_workers):
-            result = self.markitdown.convert(
+            result = self._convert_markitdown(content, ext, keep_data_uris=True)
+            if result is None:
+                logger.warning(
+                    "MarkItDown failed with embedded images for %s; retrying without data URIs",
+                    ft or ext,
+                )
+                result = self._convert_markitdown(content, ext, keep_data_uris=False)
+
+        text = result.text_content
+        images: dict[str, str] = {}
+        if pptx_bytes is not None and markdown_needs_pptx_media_attach(text):
+            text, images = attach_pptx_media_to_markdown(text, pptx_bytes)
+        return Document(content=text, images=images)
+
+    def _convert_markitdown(self, content: bytes, ext: str | None, *, keep_data_uris: bool):
+        try:
+            return self.markitdown.convert(
                io.BytesIO(content),
                file_extension=ext,
-                keep_data_uris=True
+                keep_data_uris=keep_data_uris,
            )
-        return Document(content=result.text_content)
+        except Exception:
+            if keep_data_uris:
+                return None
+            raise


 class MarkitdownParser(PipelineParser):
--- a/docreader/parser/opendataloader_parser.py
+++ b/docreader/parser/opendataloader_parser.py
@@ -0,0 +1,351 @@
+"""PDF parser backed by OpenDataLoader PDF (Apache-2.0).
+
+Requires Java 11+ on PATH and the ``opendataloader-pdf`` Python package.
+Each ``convert()`` spawns a JVM; concurrency is limited via
+``DOCREADER_ODL_MAX_WORKERS``.
+
+Hybrid mode (``docling-fast``, etc.) needs a running
+``opendataloader-pdf-hybrid`` server — configure ``DOCREADER_ODL_HYBRID_URL``.
+"""
+
+from __future__ import annotations
+
+import base64
+import html
+import logging
+import os
+import re
+import shutil
+import tempfile
+import urllib.error
+import urllib.request
+from typing import Any, Dict, Mapping, Optional, Tuple
+
+from docreader.config import CONFIG
+from docreader.models.document import Document
+from docreader.parser.base_parser import BaseParser
+from docreader.parser.concurrency import parser_worker_limit
+
+logger = logging.getLogger(__name__)
+
+_MIN_CHARS_PER_PAGE = 20
+_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp")
+_MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
+_IMAGE_FILE_NUM_RE = re.compile(r"^imageFile(\d+)\.", re.I)
+
+
+def _override_str(overrides: Optional[Mapping[str, Any]], key: str, default: str = "") -> str:
+    """Return the stripped string form of ``overrides[key]``, or ``default`` when unset/blank."""
+    raw = overrides.get(key) if overrides else None
+    # Blank or whitespace-only values count as "not configured".
+    cleaned = "" if raw is None else str(raw).strip()
+    return cleaned if cleaned != "" else default
+
+
+def _override_bool(overrides: Optional[Mapping[str, Any]], key: str, default: bool) -> bool:
+    """Read ``overrides[key]`` as a boolean flag; unset or blank values yield ``default``."""
+    raw = overrides.get(key) if overrides else None
+    token = "" if raw is None else str(raw).strip().lower()
+    # Only these spellings are truthy; any other non-blank value is False.
+    return token in {"1", "true", "yes", "y", "on"} if token != "" else default
+
+
+def _java_available() -> Tuple[bool, str]:
+    if not shutil.which("java"):
+        return False, "需要 Java 11+（JRE），请安装并在 PATH 中配置 java"
+    return True, ""
+
+
+def _package_available() -> Tuple[bool, str]:
+    try:
+        import opendataloader_pdf  # noqa: F401
+    except ImportError as e:
+        return False, f"opendataloader-pdf 未安装: {e}"
+    return True, ""
+
+
+def _ping_hybrid(
+    url: str,
+    *,
+    timeout_sec: float = 5.0,
+    retries: int = 3,
+    retry_delay_sec: float = 2.0,
+) -> Tuple[bool, str]:
+    """GET ``<url>/health`` up to ``retries`` times (at least once); return ``(ok, error_message)``."""
+    import time
+
+    base = url.rstrip("/")
+    health_url = f"{base}/health"
+    last_err = ""
+    for attempt in range(max(1, retries)):
+        try:
+            req = urllib.request.Request(health_url, method="GET")
+            with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+                if 200 <= resp.status < 300:
+                    return True, ""
+                last_err = f"hybrid 健康检查 HTTP {resp.status}: {health_url}"
+        except urllib.error.URLError as e:
+            last_err = f"无法连接 OpenDataLoader hybrid 服务 ({health_url}): {e}"
+        except Exception as e:
+            last_err = f"hybrid 健康检查失败: {e}"
+        if attempt + 1 < retries:
+            time.sleep(retry_delay_sec)  # pause only between attempts, never after the last one
+    # The flag has to travel through DEV_ARGS; make itself would try to parse a bare ``--odl-hybrid``.
+    hint = "；若刚执行 make dev-start DEV_ARGS=--odl-hybrid，请等待镜像构建/服务就绪"
+    hint += "（docker logs WeKnora-odl-hybrid）"
+    return False, last_err + hint
+
+
+def opendataloader_available(
+    overrides: Optional[Mapping[str, Any]] = None,
+) -> Tuple[bool, str]:
+    """Registry / ListEngines availability probe."""
+    ok, msg = _java_available()
+    if not ok:
+        return False, msg
+    ok, msg = _package_available()
+    if not ok:
+        return False, msg
+
+    hybrid = _resolve_hybrid(overrides)
+    if hybrid and hybrid.lower() not in ("off", ""):
+        url = _resolve_hybrid_url(overrides)
+        if url:
+            return _ping_hybrid(url, retries=6, retry_delay_sec=5.0, timeout_sec=5.0)
+    return True, ""
+
+
+def _resolve_hybrid(overrides: Optional[Mapping[str, Any]] = None) -> str:
+    return _override_str(overrides, "odl_hybrid", CONFIG.odl_hybrid)
+
+
+def _resolve_hybrid_url(overrides: Optional[Mapping[str, Any]] = None) -> str:
+    return _override_str(overrides, "odl_hybrid_url", CONFIG.odl_hybrid_url)
+
+
+def _find_markdown_file(output_dir: str, pdf_stem: str) -> str:
+    """Pick the ``.md`` file under ``output_dir`` named after ``pdf_stem``, else the newest one."""
+    candidates = [
+        os.path.join(root, name)
+        for root, _, files in os.walk(output_dir)
+        for name in files
+        if name.lower().endswith(".md")
+    ]
+    if not candidates:
+        raise FileNotFoundError(f"OpenDataLoader 未在 {output_dir} 生成 markdown 文件")
+    for path in candidates:
+        stem = os.path.splitext(os.path.basename(path))[0]
+        if stem.startswith(pdf_stem):  # a prefix test also covers the exact-match case
+            return path
+    return max(candidates, key=os.path.getmtime)
+
+
+def _normalize_odl_image_url(raw: str) -> str:
+    """Canonicalise an ODL image target: unescape entities, drop ``<>``/quote wrappers and ``./``."""
+    path = html.unescape((raw or "").strip())
+    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
+        path = path.replace(entity, char)  # second pass catches double-escaped input
+    path = path.strip().strip("<>").strip().strip('"').strip("'")
+    path = path[2:] if path.startswith("./") else path
+    return path.replace("\\", "/")
+
+
+def _canonical_image_ref(abs_path: str, output_dir: str) -> str:
+    """Use ``images/<file>`` keys to match OpenDataLoader markdown conventions."""
+    rel = os.path.relpath(abs_path, output_dir).replace("\\", "/")
+    name = os.path.basename(abs_path)
+    if rel.startswith("images/"):
+        return rel
+    return f"images/{name}"
+
+
+def _collect_images_under_output(output_dir: str) -> Dict[str, str]:
+    """Collect every extracted image under the convert output tree."""
+    images: Dict[str, str] = {}
+    for root, _, files in os.walk(output_dir):
+        for name in files:
+            if not name.lower().endswith(_IMAGE_SUFFIXES):
+                continue
+            abs_path = os.path.join(root, name)
+            ref = _canonical_image_ref(abs_path, output_dir)
+            if ref in images:
+                continue
+            with open(abs_path, "rb") as f:
+                images[ref] = base64.b64encode(f.read()).decode("utf-8")
+    return images
+
+
+def _register_image_alias(aliases: Dict[str, str], alias: str, canonical: str) -> None:
+    key = _normalize_odl_image_url(alias)
+    if key:
+        aliases[key] = canonical
+
+
+def _build_path_alias_map(images: Dict[str, str]) -> Dict[str, str]:
+    """Map ODL markdown spellings (angle brackets, entities, basenames) to dict keys."""
+    aliases: Dict[str, str] = {}
+    for ref in images:
+        base = os.path.basename(ref)
+        variants = [
+            ref,
+            base,
+            f"images/{base}",
+            f"<{ref}>",
+            f"<images/{base}>",
+            f"&lt;{ref}&gt;",
+            f"&lt;images/{base}&gt;",
+        ]
+        for variant in variants:
+            _register_image_alias(aliases, variant, ref)
+    return aliases
+
+
+def _resolve_image_ref(url: str, aliases: Dict[str, str]) -> Optional[str]:
+    key = _normalize_odl_image_url(url)
+    if not key or key.startswith("data:"):
+        return None
+    if key in aliases:
+        return aliases[key]
+    base = os.path.basename(key)
+    for candidate in (base, f"images/{base}"):
+        if candidate in aliases:
+            return aliases[candidate]
+    m = _IMAGE_FILE_NUM_RE.match(base)
+    if m:
+        num = int(m.group(1))
+        numbered = []
+        for ref in {aliases[k] for k in aliases}:
+            bm = _IMAGE_FILE_NUM_RE.match(os.path.basename(ref))
+            if bm:
+                numbered.append((int(bm.group(1)), ref))
+        numbered.sort(key=lambda x: x[0])
+        for n, ref in numbered:
+            if n == num:
+                return ref
+        if numbered and 1 <= num <= len(numbered):
+            return numbered[num - 1][1]
+    return None
+
+
+def _rewrite_markdown_image_refs(markdown: str, images: Dict[str, str]) -> str:
+    """Rewrite markdown image targets to the keys of ``images``; unresolved refs stay untouched."""
+    if not images:
+        return markdown
+    aliases = _build_path_alias_map(images)
+
+    def repl(match: re.Match[str]) -> str:
+        alt, raw_url = match.group(1), match.group(2)
+        # ``![x]( )`` has a whitespace-only target: split() is then empty, so never index [0].
+        url = next(iter(raw_url.split()), "")
+        canonical = _resolve_image_ref(url, aliases)
+        if canonical is None:
+            return match.group(0)
+        return f"![{alt}]({canonical})"
+
+    return _MD_IMAGE_RE.sub(repl, markdown)
+
+
+def _run_convert(
+    pdf_path: str,
+    output_dir: str,
+    image_dir: str,
+    overrides: Optional[Mapping[str, Any]] = None,
+) -> None:
+    import opendataloader_pdf
+
+    kwargs: Dict[str, Any] = {
+        "input_path": pdf_path,
+        "output_dir": output_dir,
+        "format": "markdown",
+        "image_output": "external",
+        "image_dir": image_dir,
+        "quiet": True,
+        "markdown_with_html": _override_bool(
+            overrides, "odl_markdown_with_html", CONFIG.odl_markdown_with_html
+        ),
+    }
+    hybrid = _resolve_hybrid(overrides)
+    if hybrid and hybrid.lower() not in ("off", ""):
+        kwargs["hybrid"] = hybrid
+        hybrid_url = _resolve_hybrid_url(overrides)
+        if hybrid_url:
+            kwargs["hybrid_url"] = hybrid_url
+        hybrid_mode = _override_str(overrides, "odl_hybrid_mode", CONFIG.odl_hybrid_mode)
+        if hybrid_mode:
+            kwargs["hybrid_mode"] = hybrid_mode
+        if _override_bool(overrides, "odl_hybrid_fallback", CONFIG.odl_hybrid_fallback):
+            kwargs["hybrid_fallback"] = True
+
+    opendataloader_pdf.convert(**kwargs)
+
+
+class OpenDataLoaderParser(BaseParser):
+    """Parse PDFs with OpenDataLoader; near-empty output is re-parsed with ``PDFScannedParser``."""
+
+    def __init__(self, *args: Any, **kwargs: Any):
+        self._engine_overrides: Dict[str, Any] = {  # per-instance settings consulted before CONFIG
+            k: v
+            for k, v in kwargs.items()
+            if k.startswith("odl_") or k in ("mineru_endpoint", "mineru_api_key")  # NOTE(review): mineru_* keys are never read in this module — confirm they are needed
+        }
+        super().__init__(*args, **kwargs)
+
+    def parse_into_text(self, content: bytes) -> Document:
+        ok, msg = opendataloader_available(self._engine_overrides)  # probed on every parse call
+        if not ok:
+            raise RuntimeError(msg)
+
+        safe_name = os.path.basename(self.file_name) or "document.pdf"  # basename only: it is joined into tmp_dir below
+        if not safe_name.lower().endswith(".pdf"):
+            safe_name = f"{os.path.splitext(safe_name)[0] or 'document'}.pdf"
+        pdf_stem = os.path.splitext(safe_name)[0]  # used to locate the generated .md file
+
+        max_workers = CONFIG.odl_max_workers
+        with parser_worker_limit("opendataloader", max_workers):  # held for convert and read-back
+            with tempfile.TemporaryDirectory(prefix="weknora-odl-") as tmp_dir:
+                pdf_path = os.path.join(tmp_dir, safe_name)
+                with open(pdf_path, "wb") as f:
+                    f.write(content)
+                image_dir = os.path.join(tmp_dir, "images")
+                os.makedirs(image_dir, exist_ok=True)
+
+                _run_convert(
+                    pdf_path,
+                    tmp_dir,
+                    image_dir,
+                    overrides=self._engine_overrides,
+                )
+
+                md_path = _find_markdown_file(tmp_dir, pdf_stem)
+                with open(md_path, encoding="utf-8", errors="replace") as f:  # undecodable bytes are replaced, not raised
+                    text = f.read()
+
+                images = _collect_images_under_output(tmp_dir)  # loaded into memory before tmp_dir is removed
+                text = _rewrite_markdown_image_refs(text, images)
+
+        if len(text.strip()) < _MIN_CHARS_PER_PAGE:  # compared against the whole document's text, not per page
+            logger.info(
+                "OpenDataLoaderParser: %s yielded little text; "
+                "falling back to builtin scanned renderer",
+                self.file_name,
+            )
+            from docreader.parser.pdf_parser import PDFScannedParser  # function-scope import, used only on this path
+
+            return PDFScannedParser(
+                file_name=self.file_name, file_type=self.file_type
+            ).parse_into_text(content)
+
+        logger.info(
+            "OpenDataLoaderParser: %s -> content_len=%d images=%d",
+            self.file_name,
+            len(text),
+            len(images),
+        )
+        return Document(
+            content=text,
+            images=images,  # canonical ref -> base64 payload
+            metadata={
+                "parser_engine": "opendataloader",
+                "odl_hybrid": _resolve_hybrid(self._engine_overrides) or "off",
+            },
+        )
--- a/docreader/parser/pdf_parser.py
+++ b/docreader/parser/pdf_parser.py
@@ -20,6 +20,7 @@ import base64
 import io
 import logging
 import os
+import re
 import statistics

 from docreader.config import CONFIG
@@ -87,12 +88,51 @@ EMBED_MAX_IMAGES = _env_int("DOCREADER_PDF_EMBED_MAX_IMAGES", 50)
 # Reconstruct reading order with a geometric XY-cut so multi-column pages are
 # linearised column-by-column instead of line-interleaved.
 LAYOUT_ORDERING = _env_bool("DOCREADER_PDF_LAYOUT_ORDERING", True)
+# When glyphs are positioned without explicit space characters (common in OCR /
+# search text layers), insert a space if the horizontal gap exceeds this
+# multiple of the line's median glyph width.
+WORD_GAP_WIDTH_RATIO = _env_float("DOCREADER_PDF_WORD_GAP_WIDTH_RATIO", 0.4)
 # Promote visually larger lines to markdown headings (font-size proxy = rect
 # height relative to the page's median line height).
 DETECT_HEADINGS = _env_bool("DOCREADER_PDF_DETECT_HEADINGS", True)
 # Drop invisible (render-mode 3), off-page and degenerate text — a cheap guard
 # against hidden-text prompt injection and OCR artefacts.
 FILTER_HIDDEN_TEXT = _env_bool("DOCREADER_PDF_FILTER_HIDDEN_TEXT", True)
+# Narrow side strips (arXiv watermarks, page labels) narrower than this share of
+# page width are dropped when they look like vertical / single-glyph noise.
+MARGIN_COL_WIDTH_RATIO = _env_float("DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO", 0.12)
+# Minimum characters on a line before font-size heuristics may promote it to a
+# markdown heading (avoids ``### C`` from margin glyphs).
+MIN_HEADING_LINE_CHARS = _env_int("DOCREADER_PDF_MIN_HEADING_LINE_CHARS", 8)
+# Strip pdfium placeholder glyphs (U+FFFE) and soft hyphens; remove axis/legend text
+# from vector figures when a Figure caption is present on the page.
+SANITIZE_PDF_TEXT = _env_bool("DOCREADER_PDF_SANITIZE_TEXT", True)
+STRIP_CHART_TEXT_DEBRIS = _env_bool("DOCREADER_PDF_STRIP_CHART_DEBRIS", True)
+# Render detected vector chart regions (no embedded bitmap) as JPEG for VLM/OCR.
+RENDER_VECTOR_FIGURES = _env_bool("DOCREADER_PDF_RENDER_VECTOR_FIGURES", True)
+MIN_CHART_REGION_CHARS = _env_int("DOCREADER_PDF_MIN_CHART_REGION_CHARS", 18)
+MIN_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MIN_CHART_REGION_AREA", 0.015)
+MAX_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MAX_CHART_REGION_AREA", 0.42)
+MAX_FIGURE_HEIGHT_RATIO = _env_float("DOCREADER_PDF_MAX_FIGURE_HEIGHT_RATIO", 0.38)
+
+# pdfium / Adobe text layers often emit U+FFFE for missing hyphenation or ligatures.
+_PDF_ARTIFACT_RE = re.compile(r"[\u00ad\u200b-\u200f\ufeff\ufffe\uffff]")
+_PDF_ARTIFACT_JOIN_RE = re.compile(r"(\w)[\u00ad\ufffe](\w)")
+_CHART_DEBRIS_LINE_RE = re.compile(
+    r"^(?:"
+    r"[\d\s.]+|"
+    r"\d{1,2}|"
+    r"\d+-layer|"
+    r"iter\.\s*\(1e4\)|"
+    r"(?:training|test)\s+error\s*\(%\)"
+    r")$",
+    re.IGNORECASE,
+)
+_CHART_LAYER_RE = re.compile(r"^\d+-layer$", re.IGNORECASE)
+_FIGURE_CAPTION_RE = re.compile(r"^Figure\s+\d+\b", re.IGNORECASE)
+_FIGURE_CAPTION_SEARCH_RE = re.compile(r"\bFigure\s+(\d+)\b", re.IGNORECASE)
+_ARXIV_LINE_RE = re.compile(r"^arXiv:\s*\S+", re.IGNORECASE)
+_PAGE_NUM_LINE_RE = re.compile(r"^\d{1,3}$")


 def _close_pdfium_resource(resource) -> None:
@@ -150,6 +190,394 @@ def _extract_page_text(page) -> str:
        _close_pdfium_resource(textpage)


+def _sanitize_pdf_text(text: str) -> str:
+    """Remove PDF text-layer placeholders and repair broken hyphenations."""
+    if not text:
+        return text
+    # Join first: once the artifacts are stripped, the join pattern has nothing left to match.
+    text = _PDF_ARTIFACT_JOIN_RE.sub(r"\1\2", text)
+    return _PDF_ARTIFACT_RE.sub("", text)
+
+
+def _is_chart_debris_line(line: str) -> bool:
+    t = line.strip()
+    if not t:
+        return False
+    if _CHART_DEBRIS_LINE_RE.match(t):
+        return True
+    if _CHART_LAYER_RE.match(t):
+        return True
+    # Tick labels like "0 1 2 3 4 5 6 0"
+    if re.fullmatch(r"[\d\s.()-]+", t) and len(t) <= 24 and sum(c.isdigit() for c in t) >= 3:
+        return True
+    return False
+
+
+def _strip_chart_text_debris(text: str) -> str:
+    """Drop runs of 3+ axis/legend lines leaked from vector figures into the text layer."""
+    if not text:
+        return text
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    out: list = []
+    i = 0
+    while i < len(lines):
+        if _is_chart_debris_line(lines[i]):
+            # Blank lines may sit inside a run, but only real debris lines count towards it.
+            j, hits = i, 0
+            while j < len(lines) and (not lines[j].strip() or _is_chart_debris_line(lines[j])):
+                hits += bool(lines[j].strip())
+                j += 1
+            if hits >= 3:
+                i = j
+                continue
+        out.append(lines[i])
+        i += 1
+    return "\n".join(out)
+
+
+def _strip_arxiv_and_page_num_lines(text: str) -> str:
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    kept: list = []
+    for ln in lines:
+        t = ln.strip()
+        if _ARXIV_LINE_RE.match(t):
+            continue
+        if _PAGE_NUM_LINE_RE.match(t):
+            continue
+        if "arXiv:" in ln:
+            ln = re.sub(r"\s*arXiv:\s*\S+\s*(?:\[[^\]]+\])?\s*[^\n]*", "", ln).strip()
+            if not ln:
+                continue
+        kept.append(ln)
+    return "\n".join(kept)
+
+
+def _strip_lines_above_figure_captions(text: str) -> str:
+    """Remove diagram/chart label lines that sit immediately above a Figure caption.
+
+    A "caption" is any line in which ``_line_has_figure_caption`` finds ``Figure N``.
+    """
+    kept: list = []
+    for line in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
+        if _line_has_figure_caption(line):
+            while kept and _is_figure_interior_line(kept[-1]):
+                kept.pop()  # discard labels collected right before the caption
+        kept.append(line)
+    return "\n".join(kept)
+
+
+def _is_body_paragraph_line(text: str) -> bool:
+    """True for prose-sized lines: at least 48 stripped characters and at least 8 words."""
+    stripped = text.strip()
+    long_enough = len(stripped) >= 48
+    return long_enough and len(stripped.split()) >= 8
+
+
+def _is_figure_interior_line(text: str) -> bool:
+    """Short, non-body line directly above a Figure caption (diagram labels, ticks)."""
+    t = text.strip()
+    if not t or _FIGURE_CAPTION_RE.match(t):
+        return False
+    if _ARXIV_LINE_RE.match(t) or _PAGE_NUM_LINE_RE.match(t):
+        return True
+    if _is_body_paragraph_line(t):
+        return False
+    if _is_chart_debris_line(t):
+        return True
+    # Prose sentence above a figure (wrapped paragraph tail) — keep in text.
+    if t.endswith((".", "。", "!", "?", "！")) and len(t) >= 15:
+        return False
+    if len(t.split()) >= 7:
+        return False
+    if len(t) <= 40:
+        return True
+    return False
+
+
+def _postprocess_pdf_text(text: str) -> str:
+    if SANITIZE_PDF_TEXT:
+        text = _sanitize_pdf_text(text)
+    text = _strip_arxiv_and_page_num_lines(text)
+    text = _strip_lines_above_figure_captions(text)
+    if STRIP_CHART_TEXT_DEBRIS:
+        text = _strip_chart_text_debris(text)
+    return text
+
+
+def _char_looks_chart_axis_tick(ch: str) -> bool:
+    """Axis tick / numeric chart labels only (not words like ``layer`` in diagrams)."""
+    t = ch.strip()
+    if not t:
+        return False
+    if len(t) == 1 and t in "0123456789.%()-":
+        return True
+    if _CHART_LAYER_RE.match(t):
+        return True
+    if re.fullmatch(r"iter\.\s*\(1e4\)", t, re.I):
+        return True
+    if re.fullmatch(r"(?:training|test)\s+error\s*\(%\)", t, re.I):
+        return True
+    return False
+
+
+def _chars_bbox(char_list: list) -> tuple:
+    """Smallest ``(x0, y0, x1, y1)`` rectangle enclosing every glyph in ``char_list``."""
+    left = min(c["x0"] for c in char_list)
+    low = min(c["y0"] for c in char_list)
+    right = max(c["x1"] for c in char_list)
+    high = max(c["y1"] for c in char_list)
+    return (left, low, right, high)
+
+
+def _bbox_area_ratio(bbox, page_w: float, page_h: float) -> float:
+    page_area = float(page_w) * float(page_h)
+    if page_area <= 0:
+        return 0.0
+    x0, y0, x1, y1 = bbox
+    return max(0.0, (x1 - x0) * (y1 - y0) / page_area)
+
+
+def _chart_region_bbox(chars: list, page_w: float, page_h: float):
+    """Bounding box of numeric chart axis labels (fallback when caption walk fails)."""
+    chart = [c for c in chars if _char_looks_chart_axis_tick(c["ch"])]
+    if len(chart) < MIN_CHART_REGION_CHARS:
+        return None
+    bbox = _chars_bbox(chart)
+    ratio = _bbox_area_ratio(bbox, page_w, page_h)
+    if ratio < MIN_CHART_REGION_AREA_RATIO or ratio > MAX_CHART_REGION_AREA_RATIO:
+        return None
+    x0, y0, x1, y1 = bbox
+    pad_x = max(8.0, (x1 - x0) * 0.08)
+    pad_y = max(8.0, (y1 - y0) * 0.08)
+    return (
+        max(0.0, x0 - pad_x),
+        max(0.0, y0 - pad_y),
+        min(page_w, x1 + pad_x),
+        min(page_h, y1 + pad_y),
+    )
+
+
+def _expand_chart_bbox(bbox, page_w: float, page_h: float, margin_frac: float = 0.18):
+    """Grow ``bbox`` by ``margin_frac`` of its own size per side, clamped to the page rectangle."""
+    x0, y0, x1, y1 = bbox
+    pad_x = (x1 - x0) * margin_frac
+    pad_y = (y1 - y0) * margin_frac
+    left = max(0.0, x0 - pad_x)
+    low = max(0.0, y0 - pad_y)
+    right = min(page_w, x1 + pad_x)
+    high = min(page_h, y1 + pad_y)
+    return (left, low, right, high)
+
+
+def _render_page_clip_jpeg(page, bbox, scale: float, quality: int, max_edge: int) -> bytes:
+    """Render a PDF page region to JPEG (bbox in PDF points, bottom-left origin)."""
+    left, bottom, right, top = bbox
+    scale_eff = _effective_scale(page, scale, max_edge)
+    bitmap = None
+    try:
+        bitmap = page.render(scale=scale_eff)
+        pil = bitmap.to_pil().convert("RGB")
+    finally:
+        _close_pdfium_resource(bitmap)
+    page_w, page_h = page.get_size()
+    x0 = int(left * scale_eff)
+    x1 = int(right * scale_eff)
+    y0 = int((page_h - top) * scale_eff)
+    y1 = int((page_h - bottom) * scale_eff)
+    if x1 <= x0 or y1 <= y0:
+        raise ValueError("degenerate clip bbox")
+    return _pil_to_jpeg_bytes(pil.crop((x0, y0, x1, y1)), quality)
+
+
+def _pil_to_jpeg_bytes(pil, quality: int) -> bytes:
+    buf = io.BytesIO()
+    if pil.mode not in ("RGB", "L"):
+        pil = pil.convert("RGB")
+    pil.save(buf, format="JPEG", quality=quality, optimize=True)
+    return buf.getvalue()
+
+
+def _group_lines_with_chars(chars: list) -> list:
+    """Group glyphs into lines; each line includes its char list and bbox."""
+    if not chars:
+        return []
+    heights = [c["y1"] - c["y0"] for c in chars if c["y1"] > c["y0"]]
+    med_h = statistics.median(heights) if heights else 1.0
+    ordered = sorted(chars, key=lambda c: -(c["y0"] + c["y1"]) / 2)
+    groups: list = []
+    cur: list = []
+    ref = None
+    for c in ordered:
+        yc = (c["y0"] + c["y1"]) / 2
+        if ref is None or abs(yc - ref) <= 0.5 * med_h:
+            cur.append(c)
+            ref = yc if ref is None else ref
+        else:
+            groups.append(cur)
+            cur = [c]
+            ref = yc
+    if cur:
+        groups.append(cur)
+
+    lines: list = []
+    for grp in groups:
+        grp_sorted = sorted(grp, key=lambda c: c["x0"])
+        text = _join_line_glyphs(grp_sorted)
+        if not text:
+            continue
+        hs = [c["y1"] - c["y0"] for c in grp_sorted if c["y1"] > c["y0"]]
+        lines.append(
+            {
+                "text": text,
+                "h": statistics.median(hs) if hs else med_h,
+                "chars": grp_sorted,
+                "bbox": _chars_bbox(grp_sorted),
+            }
+        )
+    return lines
+
+
+def _line_has_figure_caption(text: str) -> bool:
+    return bool(_FIGURE_CAPTION_SEARCH_RE.search((text or "").strip()))
+
+
+def _bbox_above_caption(lines: list, cap_i: int, page_w: float, page_h: float):
+    """Region above a Figure caption line (PDF coords, bottom-left origin)."""
+    cap_bbox = lines[cap_i]["bbox"]
+    cap_top = cap_bbox[3]
+    x0, x1 = cap_bbox[0], cap_bbox[2]
+    fig_h = page_h * min(MAX_FIGURE_HEIGHT_RATIO, 0.35)
+    y_bottom = cap_top
+    y_top = min(page_h, cap_top + fig_h)
+
+    for j in range(cap_i - 1, -1, -1):
+        t = lines[j]["text"]
+        b = lines[j]["bbox"]
+        if b[3] < y_bottom - 4:
+            continue
+        if b[1] > y_top + 4:
+            break
+        if _is_body_paragraph_line(t) and not _is_figure_interior_line(t):
+            break
+        if _is_figure_interior_line(t) or _is_chart_debris_line(t) or not t.strip():
+            x0 = min(x0, b[0])
+            x1 = max(x1, b[2])
+            y_top = max(y_top, min(page_h, b[3] + fig_h * 0.15))
+
+    min_h = page_h * 0.08
+    if y_top - y_bottom < min_h:
+        y_top = min(page_h, y_bottom + min_h)
+    margin_x = max(8.0, (x1 - x0) * 0.05)
+    return (
+        max(0.0, x0 - margin_x),
+        y_bottom,
+        min(page_w, x1 + margin_x),
+        y_top,
+    )
+
+
+def _cap_bbox_height(bbox, page_h: float, cap_y_top: float) -> tuple:
+    """Limit figure bbox height (PDF coords, bottom-left origin)."""
+    x0, y0, x1, y1 = bbox
+    max_top = min(y1, cap_y_top + page_h * MAX_FIGURE_HEIGHT_RATIO)
+    if max_top <= y0:
+        return bbox
+    return (x0, y0, x1, max_top)
+
+
+def _inject_figure_markdown_before_captions(text: str, clips: list) -> str:
+    """Place ``![...]()`` immediately before each Figure caption line in page text."""
+    if not clips:
+        return text
+    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
+    clip_idx = 0
+    for i, ln in enumerate(lines):
+        if clip_idx >= len(clips):
+            break
+        if not _line_has_figure_caption(ln):
+            continue
+        if i > 0 and lines[i - 1].lstrip().startswith("!["):
+            continue
+        ref_path = clips[clip_idx][0]
+        fname = os.path.basename(ref_path)
+        img_md = f"![{fname}]({ref_path})"
+        lines[i] = f"{img_md}\n\n{ln}"
+        clip_idx += 1
+    return "\n".join(lines)
+
+
+def _extract_vector_figure_clips(
+    page,
+    page_index: int,
+    plain_text: str,
+    raw,
+    base_name: str,
+    scale: float,
+    quality: int,
+    max_edge: int,
+) -> list:
+    """Render vector figure regions anchored at each ``Figure N.`` caption on the page.
+
+    Returns ``[(ref_path, b64, y_sort, caption_line), ...]`` for markdown injection.
+    """
+    if not RENDER_VECTOR_FIGURES or not re.search(r"\bFigure\s+\d+", plain_text, re.I):
+        return []
+    textpage = None
+    try:
+        textpage = page.get_textpage()
+        chars, page_w = _page_chars(textpage, page, raw)
+        if not chars:
+            return []
+        page_h = page.get_size()[1]
+        lines = _merge_orphan_punctuation_lines(_group_lines_with_chars(chars))
+        caption_indices = [
+            i for i, ln in enumerate(lines) if _line_has_figure_caption(ln["text"])
+        ]
+        if not caption_indices:
+            return []
+
+        results: list = []
+        for fig_idx, cap_i in enumerate(caption_indices):
+            cap_line = lines[cap_i]["text"].strip()
+            m = _FIGURE_CAPTION_SEARCH_RE.search(cap_line)
+            if m:
+                cap_line = cap_line[m.start() :].split("\n", 1)[0].strip()
+
+            bbox = _bbox_above_caption(lines, cap_i, page_w, page_h)
+            if bbox is None:
+                bbox = _chart_region_bbox(chars, page_w, page_h)
+            if bbox is None:
+                continue
+
+            ratio = _bbox_area_ratio(bbox, page_w, page_h)
+            if ratio > MAX_CHART_REGION_AREA_RATIO:
+                bbox = _cap_bbox_height(bbox, page_h, lines[cap_i]["bbox"][3])
+                ratio = _bbox_area_ratio(bbox, page_w, page_h)
+                if ratio > MAX_CHART_REGION_AREA_RATIO:
+                    continue
+            if ratio < MIN_CHART_REGION_AREA_RATIO:
+                continue
+
+            bbox = _expand_chart_bbox(bbox, page_w, page_h, margin_frac=0.06)
+            jpeg = _render_page_clip_jpeg(page, bbox, scale, quality, max_edge)
+            fname = f"{base_name}_p{page_index + 1}_fig{fig_idx + 1}.jpg"
+            ref_path = f"images/{fname}"
+            results.append(
+                (
+                    ref_path,
+                    base64.b64encode(jpeg).decode("utf-8"),
+                    bbox[3],
+                    cap_line,
+                )
+            )
+        return results
+    except Exception:
+        logger.debug("vector figure clip failed on page %d", page_index, exc_info=True)
+        return []
+    finally:
+        _close_pdfium_resource(textpage)
+
+
 def _collect_invisible_boxes(page, raw) -> list:
    """Bounding boxes of invisible (render-mode 3) text objects on the page."""
    boxes: list = []
@@ -251,6 +679,109 @@ def _split_columns(chars: list, scale: float, width: float, depth: int = 0) -> l
    )


+def _column_x_span(chars: list) -> float:
+    """Return the horizontal extent covered by a list of glyph dicts.
+
+    The extent is the largest ``x1`` minus the smallest ``x0``; an empty
+    list yields ``0.0``.
+    """
+    if not chars:
+        return 0.0
+    right_edges = [glyph["x1"] for glyph in chars]
+    left_edges = [glyph["x0"] for glyph in chars]
+    return max(right_edges) - min(left_edges)
+
+
+def _column_single_line_fraction(lines: list) -> float:
+    """Return the share of ``lines`` whose text is at most two characters long.
+
+    Each entry must be a dict with a ``text`` key. An empty list yields
+    ``0.0``.
+    """
+    if not lines:
+        return 0.0
+    short_lines = [entry for entry in lines if len(entry["text"]) <= 2]
+    return len(short_lines) / len(lines)
+
+
+def _is_artifact_column(chars: list, width: float) -> bool:
+    """Decide from geometry alone whether a column is a margin strip or watermark.
+
+    A column is reported as an artifact when any of these holds:
+
+    * it has no glyphs, ``width`` is not positive, or its x-extent is not
+      positive;
+    * its x-extent is below ``MARGIN_COL_WIDTH_RATIO`` of ``width`` and at
+      least 45% of its lines are one or two characters long;
+    * it looks like vertical text: the spread of glyph centres in y exceeds
+      3.5x the x-extent, it has at least 8 glyphs, and at least 35% of its
+      lines are one or two characters long.
+    """
+    if not chars or width <= 0:
+        return True
+    x_extent = _column_x_span(chars)
+    if x_extent <= 0:
+        return True
+
+    short_share = _column_single_line_fraction(_group_lines(chars))
+
+    # Narrow strip made mostly of very short lines (e.g. a page-margin sidebar).
+    if x_extent / width < MARGIN_COL_WIDTH_RATIO and short_share >= 0.45:
+        return True
+
+    # Vertical text: tall stack, narrow horizontal extent, mostly one char/line.
+    centres = [(glyph["y0"] + glyph["y1"]) / 2 for glyph in chars]
+    y_extent = max(centres) - min(centres)
+    if y_extent > x_extent * 3.5 and len(chars) >= 8 and short_share >= 0.35:
+        return True
+    return False
+
+
+def _filter_reading_columns(chars: list, scale: float, width: float) -> list:
+    """Split glyphs into columns and discard those flagged as artifacts.
+
+    If every column is flagged, the single widest one (by x-extent) is kept
+    so that the page is not left empty; with zero or one column the split
+    result is returned unchanged.
+    """
+    columns = _split_columns(chars, scale, width)
+    survivors = [col for col in columns if not _is_artifact_column(col, width)]
+    if survivors:
+        return survivors
+    if len(columns) <= 1:
+        return columns
+    # Everything looked like noise: fall back to the widest glyph set.
+    widest = max(columns, key=_column_x_span)
+    return [widest]
+
+
+def _merge_orphan_punctuation_lines(lines: list) -> list:
+    """Fold punctuation-only lines into the line that precedes them.
+
+    A line qualifies when, after stripping, it is 1-4 characters long and
+    consists solely of ``.,;:!?…·`` and whitespace (for example a stray ``.``
+    that was grouped onto its own visual line). The first line is never
+    merged because it has no predecessor.
+
+    The punctuation is glued directly onto the previous text unless that text
+    is empty or ends with a space or hyphen, in which case it is joined with a
+    single space. Only the ``text`` field of the previous line is updated.
+
+    Returns a new list of shallow-copied line dicts; the input dicts are not
+    modified.
+    """
+    if not lines:
+        return []
+    result: list = []
+    for entry in lines:
+        stripped = entry["text"].strip()
+        is_orphan = (
+            bool(result)
+            and bool(stripped)
+            and len(stripped) <= 4
+            and all(ch in ".,;:!?…·" or ch.isspace() for ch in stripped)
+        )
+        if not is_orphan:
+            result.append(dict(entry))
+            continue
+
+        target = result[-1]
+        before = target["text"]
+        glued = "".join(stripped.split())
+        if glued and before and not before.endswith((" ", "-")):
+            target["text"] = before + glued
+        else:
+            target["text"] = (before + " " + stripped).strip()
+    return result
+
+
+def _join_line_glyphs(ln_sorted: list) -> str:
+    """Join a visual line's glyphs, inferring word spaces from horizontal gaps.
+
+    Args:
+        ln_sorted: Glyph dicts for one line, already sorted left to right.
+            Each needs ``ch`` (the character) and ``x0``/``x1`` (its
+            horizontal extent).
+
+    Returns:
+        The line text with leading/trailing whitespace stripped. A space is
+        inserted between two non-whitespace glyphs when the gap between them
+        exceeds ``WORD_GAP_WIDTH_RATIO`` times the median glyph width. Every
+        run of whitespace glyphs (space, tab, NBSP, CR/LF, ...) is emitted as
+        a single ``" "`` so that control or non-breaking whitespace from the
+        text layer cannot leak into, or be doubled inside, the line.
+    """
+    if not ln_sorted:
+        return ""
+    widths = [c["x1"] - c["x0"] for c in ln_sorted if c["x1"] > c["x0"]]
+    # Fall back to 1.0 when no glyph has a positive width.
+    med_w = statistics.median(widths) if widths else 1.0
+    gap_threshold = med_w * WORD_GAP_WIDTH_RATIO
+
+    parts: list[str] = []
+    for i, cur in enumerate(ln_sorted):
+        ch = cur["ch"]
+        if ch.isspace():
+            # Normalise any whitespace glyph to one space and collapse runs;
+            # leading whitespace is dropped (it would be stripped anyway).
+            if parts and not parts[-1][-1:].isspace():
+                parts.append(" ")
+            continue
+        if i > 0:
+            prev = ln_sorted[i - 1]
+            # Only infer a gap-space between two real glyphs; an explicit
+            # whitespace glyph has already produced the separator.
+            if not prev["ch"].isspace() and cur["x0"] - prev["x1"] > gap_threshold:
+                parts.append(" ")
+        parts.append(ch)
+    return "".join(parts).strip()
+
+
 def _group_lines(chars: list) -> list:
    """Group a column's glyphs into lines (top-to-bottom, glyphs sorted by x)."""
    if not chars:
@@ -277,7 +808,7 @@ def _group_lines(chars: list) -> list:
    out: list = []
    for ln in lines:
        ln_sorted = sorted(ln, key=lambda c: c["x0"])
-        text = "".join(c["ch"] for c in ln_sorted).strip()
+        text = _join_line_glyphs(ln_sorted)
        if not text:
            continue
        hs = [c["y1"] - c["y0"] for c in ln_sorted if c["y1"] - c["y0"] > 0]
@@ -293,7 +824,12 @@ def _segments_to_markdown(lines: list) -> str:

    def level(ln) -> int:
        txt = ln["text"]
-        if not DETECT_HEADINGS or body <= 0 or len(txt) > 80:
+        if (
+            not DETECT_HEADINGS
+            or body <= 0
+            or len(txt) > 80
+            or len(txt) < MIN_HEADING_LINE_CHARS
+        ):
            return 0
        if txt[-1:] in ".。!！?？,，;；:：":
            return 0
@@ -317,6 +853,100 @@ def _segments_to_markdown(lines: list) -> str:
    return "\n".join(out)


+def _chars_to_layout_markdown(chars: list, scale: float, width: float) -> str:
+    """Render page glyphs as markdown, one block per retained reading column.
+
+    Each column returned by ``_filter_reading_columns`` is grouped into
+    lines, has orphan punctuation lines merged, and is converted with
+    ``_segments_to_markdown``. Empty results are skipped and the remaining
+    blocks are joined with newlines.
+    """
+    rendered = (
+        _segments_to_markdown(_merge_orphan_punctuation_lines(_group_lines(column)))
+        for column in _filter_reading_columns(chars, scale, width)
+    )
+    return "\n".join(block for block in rendered if block)
+
+
+def _layout_line_stats(text: str) -> tuple:
+    """Count non-blank lines, very short lines and punctuation-only lines.
+
+    Returns:
+        ``(line_count, single_char_line_count, punct_only_line_count)`` where
+        lines are stripped and blank ones ignored, "single char" means at most
+        two characters, and "punct only" means at most four characters drawn
+        from whitespace, ``.,;:!?…·`` and hyphen/dash characters.
+    """
+    stripped_lines = [part.strip() for part in text.splitlines()]
+    stripped_lines = [part for part in stripped_lines if part]
+
+    short_total = 0
+    punct_total = 0
+    for part in stripped_lines:
+        if len(part) <= 2:
+            short_total += 1
+        if len(part) <= 4 and re.fullmatch(r"[\s.,;:!?…·\-–—]+", part):
+            punct_total += 1
+    return len(stripped_lines), short_total, punct_total
+
+
+def _layout_garbled_line_fraction(text: str) -> float:
+    """Return the share of non-blank lines dominated by 1-2 character tokens.
+
+    A line counts as garbled when it has at least six whitespace-separated
+    tokens and more than 45% of them are one or two characters long. Text
+    with no non-blank lines yields ``0.0``.
+    """
+    rows = [row.strip() for row in text.splitlines() if row.strip()]
+    if not rows:
+        return 0.0
+
+    def looks_garbled(row: str) -> bool:
+        tokens = row.split()
+        if len(tokens) < 6:
+            return False
+        tiny = len([tok for tok in tokens if len(tok) <= 2])
+        return tiny / len(tokens) > 0.45
+
+    return sum(1 for row in rows if looks_garbled(row)) / len(rows)
+
+
+def _plain_is_well_formed(plain: str) -> bool:
+    """Report whether the plain text layer already looks usable as-is.
+
+    The text is accepted when any of these holds:
+
+    * it contains a bracketed, comma-separated citation such as ``[Smith, ``;
+    * it contains at least two `` . . `` dot-leader runs (as in a table of
+      contents);
+    * it has at least 30 whitespace-separated tokens whose average length is
+      5.0 characters or more.
+
+    Empty or ``None`` input is rejected.
+    """
+    text = (plain or "").strip()
+    if not text:
+        return False
+
+    has_citation = re.search(r"\[\w+,\s", text) is not None
+    if has_citation or text.count(" . . ") >= 2:
+        return True
+
+    tokens = re.findall(r"\S+", text)
+    if len(tokens) < 30:
+        return False
+    total_chars = sum(map(len, tokens))
+    return total_chars / len(tokens) >= 5.0
+
+
+def _should_prefer_plain(plain: str, layout: str) -> bool:
+    """Decide whether the plain text should replace the reconstructed layout.
+
+    Returns ``True`` when the layout is empty, or when it shows one of these
+    signs of damage: too many 1-2 character lines (>= 18%) or punctuation-only
+    lines (>= 12%); at least 20% garbled lines while the plain text has fewer
+    than 8%; a citation that has commas in the plain text but appears
+    comma-less in the layout; or the first sufficiently long plain line not
+    being found (compared on alphanumerics only) in the layout.
+
+    Returns ``False`` when the layout is non-empty but there is no plain text
+    to fall back to, or when none of the checks fire.
+    """
+    layout_text = (layout or "").strip()
+    plain_text = (plain or "").strip()
+    if not layout_text:
+        return True
+    if not plain_text:
+        return False
+
+    total, short_lines, punct_lines = _layout_line_stats(layout_text)
+    if total == 0:
+        return True
+    if short_lines / total >= 0.18 or punct_lines / total >= 0.12:
+        return True
+
+    if (
+        _layout_garbled_line_fraction(layout_text) >= 0.20
+        and _layout_garbled_line_fraction(plain_text) < 0.08
+    ):
+        return True
+
+    plain_has_citation = re.search(r"\[\w+,\s", plain_text)
+    if plain_has_citation and re.search(r"\[\w+\s+\w+\s+\d", layout_text):
+        return True
+
+    # Title / lead sentence from plain should survive in layout: take the
+    # first line that is long enough and has enough alphanumerics to probe.
+    probe_key = None
+    for raw_line in plain_text.splitlines():
+        candidate = raw_line.strip()
+        if len(candidate) < 24:
+            continue
+        key = "".join(filter(str.isalnum, candidate))[:16]
+        if len(key) < 12:
+            continue
+        probe_key = key
+        break
+    if probe_key is None:
+        return False
+    layout_key = "".join(filter(str.isalnum, layout_text))
+    return probe_key not in layout_key
+
+
 def _extract_layout_text(page, raw) -> str:
    """Layout-aware extraction: reading order + headings + hidden-text filter.

@@ -331,12 +961,7 @@ def _extract_layout_text(page, raw) -> str:
            return ""
        heights = [c["y1"] - c["y0"] for c in chars if c["y1"] - c["y0"] > 0]
        scale = (statistics.median(heights) if heights else 1.0) or 1.0
-        blocks = []
-        for col in _split_columns(chars, scale, width):
-            md = _segments_to_markdown(_group_lines(col))
-            if md:
-                blocks.append(md)
-        return "\n".join(blocks)
+        return _chars_to_layout_markdown(chars, scale, width)
    except Exception:
        logger.debug("layout extraction failed; using plain text", exc_info=True)
        return _extract_page_text(page)
@@ -623,37 +1248,6 @@ def _extract_embedded_images(pdf, classes, raw, base_name: str, quality: int) ->
    return result


-def estimate_scanned_fraction(content: bytes, sample: int = 12) -> float:
-    """Return the fraction of (sampled) pages that look image-dominated.
-
-    Used by alternative engines (e.g. liteparse) that lack image-object access
-    to decide whether a PDF is scanned, applying the same image-area signal the
-    builtin router uses. Samples up to ``sample`` pages for speed on big PDFs.
-    """
-    import pypdfium2 as pdfium
-    import pypdfium2.raw as pdfium_r
-
-    pdf = pdfium.PdfDocument(content)
-    try:
-        page_count = len(pdf)
-        if page_count <= 0:
-            return 0.0
-        step = max(1, page_count // sample)
-        indices = list(range(0, page_count, step))
-        scanned = 0
-        for i in indices:
-            page = pdf[i]
-            try:
-                ratio = _page_image_area_ratio(page, pdfium_r)
-            finally:
-                _close_pdfium_resource(page)
-            if ratio >= SCAN_IMAGE_AREA_RATIO:
-                scanned += 1
-        return scanned / len(indices) if indices else 0.0
-    finally:
-        _close_pdfium_resource(pdf)
-
-
 def _strip_repeating_lines(texts: list, classes: list) -> list:
    """Remove running headers/footers that repeat across most text pages.

@@ -791,6 +1385,7 @@ class PDFParser(BaseParser):
            # Pass 1: cheap text extraction + image-area classification.
            texts: list = []
            classes: list = []
+            vector_clips: dict = {}
            for i in range(page_count):
                page = pdf[i]
                try:
@@ -800,9 +1395,36 @@ class PDFParser(BaseParser):
                    # Layout reconstruction only pays off (and is only spent) on
                    # native text pages; scanned pages are rendered, not read.
                    if cls == "text" and LAYOUT_ORDERING:
-                        text = _extract_layout_text(page, pdfium_r) or plain
+                        if _plain_is_well_formed(plain):
+                            text = plain
+                        else:
+                            layout = _extract_layout_text(page, pdfium_r)
+                            if layout and not _should_prefer_plain(plain, layout):
+                                text = layout
+                            else:
+                                text = plain
                    else:
                        text = plain
+                    if cls == "text":
+                        clips = _extract_vector_figure_clips(
+                            page,
+                            i,
+                            plain,
+                            pdfium_r,
+                            base_name,
+                            scale,
+                            quality,
+                            CONFIG.pdf_render_max_edge,
+                        )
+                        if clips:
+                            vector_clips[i] = clips
+                            for ref_path, b64, _y, _cap in clips:
+                                images[ref_path] = b64
+                    text = _postprocess_pdf_text(text)
+                    if cls == "text" and vector_clips.get(i):
+                        text = _inject_figure_markdown_before_captions(
+                            text, vector_clips[i]
+                        )
                finally:
                    _close_pdfium_resource(page)
                texts.append(text)
@@ -841,6 +1463,7 @@ class PDFParser(BaseParser):

        # Assemble markdown in reading order.
        embedded_count = 0
+        vector_figure_count = 0
        blocks = []
        for i in range(page_count):
            if classes[i] == "scanned":
@@ -850,7 +1473,10 @@ class PDFParser(BaseParser):
                stripped = texts[i].strip()
                if stripped:
                    blocks.append(stripped)
-                for ref_path, _b64, _y in embedded.get(i, []):
+                vector_figure_count += len(vector_clips.get(i, []))
+                page_images = list(embedded.get(i, []))
+                page_images.sort(key=lambda item: item[2], reverse=True)
+                for ref_path, _b64, _y in page_images:
                    fname = os.path.basename(ref_path)
                    blocks.append(f"![{fname}]({ref_path})")
                    embedded_count += 1
@@ -862,6 +1488,7 @@ class PDFParser(BaseParser):
            "scanned_page_count": len(scanned_indices),
            "text_page_count": page_count - len(scanned_indices),
            "embedded_image_count": embedded_count,
+            "vector_figure_count": vector_figure_count,
            "image_source_type": "scanned_pdf" if scanned_indices else "pdf_text_layer",
        }

--- a/docreader/parser/ppt_convert.py
+++ b/docreader/parser/ppt_convert.py
@@ -0,0 +1,116 @@
+"""LibreOffice helpers for legacy binary PowerPoint (.ppt) uploads."""
+
+from __future__ import annotations
+
+import logging
+import os
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+from docreader.parser.excel_convert import find_soffice
+
+logger = logging.getLogger(__name__)
+
+_OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+_ZIP_MAGIC = b"PK\x03\x04"
+
+
+def is_ole_compound(content: bytes) -> bool:
+    """Return True when ``content`` begins with the ``_OLE_MAGIC`` signature."""
+    if len(content) < len(_OLE_MAGIC):
+        return False
+    return content.startswith(_OLE_MAGIC)
+
+
+def is_zip_openxml(content: bytes) -> bool:
+    """Return True when ``content`` begins with the ``_ZIP_MAGIC`` signature."""
+    if len(content) < len(_ZIP_MAGIC):
+        return False
+    return content.startswith(_ZIP_MAGIC)
+
+
+def needs_ppt_to_pptx_conversion(content: bytes, file_type: str | None) -> bool:
+    """Report whether the upload must be converted before it can be parsed.
+
+    Returns ``False`` when the extension is ``pptx`` or the bytes carry the
+    ZIP signature. Otherwise returns ``True`` when the extension is ``ppt`` or
+    the bytes carry the OLE signature. ``file_type`` may be given with or
+    without a leading dot and in any letter case.
+    """
+    ext = (file_type or "").lstrip(".").lower()
+    already_modern = ext == "pptx" or is_zip_openxml(content)
+    if already_modern:
+        return False
+    return ext == "ppt" or is_ole_compound(content)
+
+
+def convert_ppt_to_pptx_bytes(content: bytes, suffix: str = ".ppt") -> bytes | None:
+    """Convert legacy PowerPoint bytes to PPTX using LibreOffice, if available.
+
+    Args:
+        content: Raw bytes of the presentation to convert.
+        suffix: Extension (with leading dot) given to the temporary input
+            file that is handed to LibreOffice.
+
+    Returns:
+        The converted ``.pptx`` bytes, or ``None`` when ``find_soffice()``
+        finds no binary, the process cannot be started or times out, or all
+        three attempts fail to produce a ``.pptx`` file.
+    """
+    soffice = find_soffice()
+    if not soffice:
+        return None
+
+    max_attempts = 3
+    for attempt in range(1, max_attempts + 1):
+        # Every attempt gets a fresh work directory and a fresh LibreOffice
+        # profile directory; both are removed when the ``with`` block exits.
+        with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as profile_dir:
+            src = os.path.join(temp_dir, f"input{suffix}")
+            with open(src, "wb") as handle:
+                handle.write(content)
+
+            # Point LibreOffice at the throwaway profile (presumably so that
+            # concurrent or crashed runs cannot clash over a shared one).
+            user_installation = Path(profile_dir).as_uri()
+            cmd = [
+                soffice,
+                "--headless",
+                f"-env:UserInstallation={user_installation}",
+                "--convert-to",
+                "pptx",
+                "--outdir",
+                temp_dir,
+                src,
+            ]
+            try:
+                result = subprocess.run(cmd, capture_output=True, timeout=120)
+            except subprocess.TimeoutExpired as exc:
+                # A timeout is not retried: another attempt could block for
+                # the same 120 seconds again.
+                logger.warning("LibreOffice PPT convert timed out: %s", exc)
+                return None
+            except OSError as exc:
+                logger.warning("LibreOffice PPT convert failed to start: %s", exc)
+                return None
+
+            if result.returncode != 0:
+                stderr = result.stderr.decode("utf-8", errors="ignore")
+                logger.warning(
+                    "LibreOffice PPT convert failed (attempt %s/%s): %s",
+                    attempt,
+                    max_attempts,
+                    stderr,
+                )
+                if attempt < max_attempts:
+                    time.sleep(0.5 * attempt)
+                    continue
+                return None
+
+            for name in os.listdir(temp_dir):
+                if name.endswith(".pptx"):
+                    with open(os.path.join(temp_dir, name), "rb") as handle:
+                        converted = handle.read()
+                    logger.info(
+                        "Converted presentation via LibreOffice (%s -> pptx, %d bytes)",
+                        suffix,
+                        len(converted),
+                    )
+                    return converted
+
+            # Exit code 0 but nothing written: make the silent failure visible
+            # before retrying or giving up.
+            logger.warning(
+                "LibreOffice PPT convert produced no .pptx output (attempt %s/%s)",
+                attempt,
+                max_attempts,
+            )
+            if attempt < max_attempts:
+                time.sleep(0.5 * attempt)
+    return None
+
+
+def normalize_ppt_bytes(content: bytes, file_type: str | None) -> tuple[bytes, str]:
+    """Return ``(bytes, extension)`` ready to hand to MarkItDown.
+
+    ZIP-signed content is returned unchanged as ``.pptx``. Content that needs
+    conversion is run through LibreOffice and returned as ``.pptx``. Anything
+    else is returned unchanged with its own extension (``.pptx`` when none was
+    given).
+
+    Raises:
+        ValueError: Conversion was required but yielded no bytes.
+    """
+    ext = (file_type or "").lstrip(".").lower()
+
+    if is_zip_openxml(content):
+        return content, ".pptx"
+
+    if needs_ppt_to_pptx_conversion(content, ext):
+        input_suffix = f".{ext}" if ext not in ("", "ppt") else ".ppt"
+        pptx_bytes = convert_ppt_to_pptx_bytes(content, suffix=input_suffix)
+        if pptx_bytes:
+            return pptx_bytes, ".pptx"
+        raise ValueError(
+            "Legacy PowerPoint (.ppt) is not supported by MarkItDown directly; "
+            "LibreOffice is required to convert it to .pptx. Install LibreOffice "
+            "(soffice) in the docreader environment or upload .pptx instead."
+        )
+
+    return content, (f".{ext}" if ext else ".pptx")
--- a/docreader/parser/pptx_media.py
+++ b/docreader/parser/pptx_media.py
@@ -0,0 +1,154 @@
+"""Extract and rasterize images embedded in PPTX (e.g. WMF) when MarkItDown cannot inline them."""
+
+from __future__ import annotations
+
+import base64
+import io
+import logging
+import os
+import re
+import subprocess
+import tempfile
+import uuid
+import zipfile
+from typing import Dict, List, Tuple
+
+logger = logging.getLogger(__name__)
+
+_MARKDOWN_IMAGE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
+_RASTER_EXT = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}
+_VECTOR_EXT = {".wmf", ".emf", ".svg"}
+
+
+def _find_convert() -> str | None:
+    """Locate the ImageMagick ``convert`` executable.
+
+    The two conventional install locations are checked first; otherwise
+    ``PATH`` is searched with ``shutil.which`` instead of spawning an external
+    ``which`` process.
+
+    Returns:
+        Path to the executable, or ``None`` when it cannot be found.
+    """
+    import shutil  # function-scope import: the module's import block is untouched
+
+    for path in ("/usr/bin/convert", "/usr/local/bin/convert"):
+        if os.path.isfile(path):
+            return path
+    if os.name == "nt":
+        # On Windows a bare ``convert`` resolves to the system volume-conversion
+        # utility rather than ImageMagick, so PATH is not searched there.
+        return None
+    return shutil.which("convert")
+
+
+def _rasterize_with_imagemagick(data: bytes, suffix: str) -> bytes | None:
+    """Convert image bytes to PNG by running ImageMagick ``convert``.
+
+    The bytes are written to ``input<suffix>`` in a temporary directory and
+    converted to ``output.png`` with a 60 second timeout.
+
+    Returns:
+        The PNG bytes, or ``None`` when no ``convert`` binary is found, the
+        process cannot be run or times out, it exits non-zero, or it leaves no
+        output file behind.
+    """
+    binary = _find_convert()
+    if not binary:
+        return None
+    with tempfile.TemporaryDirectory() as workdir:
+        in_path = os.path.join(workdir, f"input{suffix}")
+        out_path = os.path.join(workdir, "output.png")
+        with open(in_path, "wb") as sink:
+            sink.write(data)
+        try:
+            proc = subprocess.run(
+                [binary, in_path, out_path],
+                capture_output=True,
+                timeout=60,
+            )
+        except (OSError, subprocess.TimeoutExpired) as exc:
+            logger.warning("ImageMagick convert failed: %s", exc)
+            return None
+        if proc.returncode == 0 and os.path.isfile(out_path):
+            with open(out_path, "rb") as source:
+                return source.read()
+        stderr = (proc.stderr or b"").decode("utf-8", errors="ignore")
+        logger.warning("ImageMagick convert exit %s: %s", proc.returncode, stderr)
+        return None
+
+
+def _rasterize_with_pillow(data: bytes) -> bytes | None:
+    """Re-encode image bytes as PNG using Pillow.
+
+    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
+    before saving.
+
+    Returns:
+        The PNG bytes, or ``None`` when Pillow is not installed or raises
+        while opening, converting or saving the image.
+    """
+    try:
+        from PIL import Image
+    except ImportError:
+        return None
+
+    buffer = io.BytesIO()
+    try:
+        image = Image.open(io.BytesIO(data))
+        if image.mode not in ("RGB", "L"):
+            image = image.convert("RGB")
+        image.save(buffer, format="PNG")
+    except Exception as exc:
+        logger.debug("Pillow could not open media bytes: %s", exc)
+        return None
+    return buffer.getvalue()
+
+
+def rasterize_media_bytes(name: str, data: bytes) -> bytes | None:
+    """Rasterize one media file to PNG bytes, choosing a backend by extension.
+
+    Files with a raster extension are tried with Pillow first. Everything
+    else, and any raster file Pillow could not handle, goes to ImageMagick; a
+    file without an extension is passed to it as ``.bin``.
+
+    Returns:
+        PNG bytes, or ``None`` when no backend succeeds.
+    """
+    ext = os.path.splitext(name)[1].lower()
+    if ext in _RASTER_EXT:
+        via_pillow = _rasterize_with_pillow(data)
+        if via_pillow:
+            return via_pillow
+    # Vector formats, unknown extensions and Pillow failures share one path.
+    return _rasterize_with_imagemagick(data, ext or ".bin")
+
+
+def list_pptx_media(pptx_bytes: bytes) -> List[Tuple[str, bytes]]:
+    """Return ``(zip path, raw bytes)`` for each file under ``ppt/media/``.
+
+    Entries are returned in archive order. Entries whose base name is empty
+    (the directory entry itself) or starts with a dot are skipped.
+    """
+    with zipfile.ZipFile(io.BytesIO(pptx_bytes)) as archive:
+        wanted = [
+            member
+            for member in archive.namelist()
+            if member.startswith("ppt/media/")
+            and os.path.basename(member)
+            and not os.path.basename(member).startswith(".")
+        ]
+        return [(member, archive.read(member)) for member in wanted]
+
+
+def extract_pptx_media_rasterized(pptx_bytes: bytes) -> List[bytes]:
+    """Rasterize every ``ppt/media`` asset to PNG bytes, in archive order.
+
+    Assets that cannot be rasterized are logged and left out, so the result
+    may be shorter than the number of media files in the archive.
+    """
+    pngs: List[bytes] = []
+    for member, payload in list_pptx_media(pptx_bytes):
+        converted = rasterize_media_bytes(os.path.basename(member), payload)
+        if not converted:
+            logger.warning("Failed to rasterize pptx media %s", member)
+            continue
+        pngs.append(converted)
+        logger.info("Rasterized pptx media %s (%d -> %d bytes)", member, len(payload), len(converted))
+    return pngs
+
+
+def _is_unresolved_image_ref(url: str) -> bool:
+    """Report whether a markdown image URL still needs a local image attached.
+
+    Empty URLs, ``data:`` URIs, paths already under ``images/`` and
+    ``http(s)://`` links count as resolved; anything else is unresolved.
+    """
+    if not url:
+        return False
+    resolved_prefixes = ("data:", "images/", "http://", "https://")
+    return not url.startswith(resolved_prefixes)
+
+
+def attach_pptx_media_to_markdown(
+    markdown: str, pptx_bytes: bytes
+) -> Tuple[str, Dict[str, str]]:
+    """Point unresolved ``![](...)`` refs at rasterized media from the PPTX.
+
+    Pairing is purely positional: the n-th unresolved reference in the
+    markdown receives the n-th successfully rasterized media file; names are
+    not compared. References beyond the available media, and references that
+    are already resolved, are left untouched.
+
+    Returns:
+        ``(markdown, images)`` where ``images`` maps each new
+        ``images/<uuid>.png`` reference to its base64-encoded PNG payload.
+        When nothing could be rasterized the markdown is returned unchanged
+        with an empty mapping.
+    """
+    pngs = extract_pptx_media_rasterized(pptx_bytes)
+    if not pngs:
+        return markdown, {}
+
+    payloads: Dict[str, str] = {}
+    remaining = iter(pngs)
+    exhausted = object()  # sentinel: no rasterized media left to hand out
+
+    def swap_ref(found: re.Match[str]) -> str:
+        if not _is_unresolved_image_ref(found.group(2)):
+            return found.group(0)
+        png = next(remaining, exhausted)
+        if png is exhausted:
+            return found.group(0)
+        new_ref = f"images/{uuid.uuid4()}.png"
+        payloads[new_ref] = base64.b64encode(png).decode()
+        return f"![{found.group(1)}]({new_ref})"
+
+    rewritten = _MARKDOWN_IMAGE.sub(swap_ref, markdown)
+    return rewritten, payloads
+
+
+def markdown_needs_pptx_media_attach(markdown: str) -> bool:
+    """Return True when the markdown has at least one unresolved image ref."""
+    for found in _MARKDOWN_IMAGE.finditer(markdown):
+        if _is_unresolved_image_ref(found.group(2)):
+            return True
+    return False
--- a/docreader/parser/registry.py
+++ b/docreader/parser/registry.py
@@ -7,8 +7,11 @@ from docreader.parser.docx2_parser import Docx2Parser
 from docreader.parser.excel_parser import ExcelParser
 from docreader.parser.image_parser import ImageParser
 from docreader.parser.markdown_parser import MarkdownParser
-from docreader.parser.liteparse_parser import LiteParseParser, liteparse_available
 from docreader.parser.markitdown_parser import MarkitdownParser
+from docreader.parser.opendataloader_parser import (
+    OpenDataLoaderParser,
+    opendataloader_available,
+)
 from docreader.parser.pdf_parser import PDFParser

 logger = logging.getLogger(__name__)
@@ -151,11 +154,11 @@ def _build_default_registry() -> ParserEngineRegistry:
    )

    reg.register(
-        "liteparse",
-        {"pdf": LiteParseParser},
-        description="LiteParse 解析引擎（快速空间阅读顺序，适合数字版 PDF）",
-        check_available=liteparse_available,
-        unavailable_hint="liteparse 未安装",
+        "opendataloader",
+        {"pdf": OpenDataLoaderParser},
+        description="OpenDataLoader PDF（版面分析，需 Java 11+）",
+        check_available=opendataloader_available,
+        unavailable_hint="请安装 opendataloader-pdf 与 Java 11+",
    )

    # NOTE: Engine listing is managed by Go-side engine registry
--- a/docreader/parser/web_parser.py
+++ b/docreader/parser/web_parser.py
@@ -1,9 +1,11 @@
 import asyncio
 import logging
 import re
+from dataclasses import dataclass
+from typing import Optional

 from lxml.etree import XPath
-from playwright.async_api import async_playwright
+from playwright.async_api import Page, async_playwright
 from trafilatura import extract, utils, xpaths

 from docreader.config import CONFIG
@@ -15,6 +17,14 @@ from docreader.utils import endecode

 logger = logging.getLogger(__name__)

+_GOTO_TIMEOUT_MS = 30_000
+_NETWORK_IDLE_TIMEOUT_MS = 10_000
+_SPA_WAIT_TIMEOUT_MS = 15_000
+# Minimum visible characters before treating an SPA shell as "rendered".
+_SPA_MIN_TEXT_LEN = 80
+# Minimum visible characters for Playwright text fallback when trafilatura fails.
+_MIN_FALLBACK_TEXT_LEN = 50
+
 # Monkey-patch trafilatura internals to better support WeChat Official Account
 # articles, whose images live on `mmbiz.qpic.cn` without a standard file
 # extension and whose main content sits inside `#js_content` /
@@ -40,6 +50,78 @@ except (AttributeError, ImportError) as e:
    )


+@dataclass(frozen=True)
+class _ScrapeResult:
+    """Immutable bundle of what a single page scrape produced.
+
+    All three fields are empty strings when scraping failed outright.
+    """
+
+    # Page HTML as returned by Playwright's ``page.content()``.
+    html: str
+    # Inner text of the page's main container, read via ``read_visible_text``.
+    visible_text: str
+    # Document title from ``page.title()``; empty string when there is none.
+    page_title: str
+
+
+def extract_markdown_from_html(html: str) -> Optional[str]:
+    """Convert HTML to markdown with trafilatura.
+
+    Metadata, images, tables and links are all requested in the output.
+
+    Returns:
+        The markdown text, or ``None`` when the input is empty/blank or
+        trafilatura yields nothing but whitespace.
+    """
+    if not (html and html.strip()):
+        return None
+    options = {
+        "output_format": "markdown",
+        "with_metadata": True,
+        "include_images": True,
+        "include_tables": True,
+        "include_links": True,
+    }
+    markdown = extract(html, **options)
+    if markdown and markdown.strip():
+        return markdown
+    return None
+
+
+def build_visible_text_fallback(visible_text: str, page_title: str = "") -> Optional[str]:
+    """Turn Playwright-visible text into markdown when no article body was found.
+
+    Returns:
+        ``None`` when the stripped text is shorter than
+        ``_MIN_FALLBACK_TEXT_LEN``. Otherwise the stripped text, prefixed with
+        a ``# <title>`` heading when a title is given and the text does not
+        already start with it.
+    """
+    body = (visible_text or "").strip()
+    if len(body) < _MIN_FALLBACK_TEXT_LEN:
+        return None
+    heading = (page_title or "").strip()
+    needs_heading = bool(heading) and not body.startswith(heading)
+    return f"# {heading}\n\n{body}" if needs_heading else body
+
+
+async def wait_for_rendered_content(page: Page) -> None:
+    """Wait for SPA/JS pages beyond the initial HTML shell.
+
+    Two best-effort waits run in sequence; a timeout or error in either one
+    is logged at info level and never raised:
+
+    1. the ``networkidle`` load state, for up to ``_NETWORK_IDLE_TIMEOUT_MS``;
+    2. the root element's trimmed ``innerText`` reaching ``_SPA_MIN_TEXT_LEN``
+       characters, for up to ``_SPA_WAIT_TIMEOUT_MS``.
+
+    The root element is picked with the same selector chain that
+    ``read_visible_text`` uses (``#app``, ``main``, ``[role="main"]``, then
+    ``body``), so the text that is waited for is the text that is later read.
+
+    Args:
+        page: Playwright page that has already been navigated.
+    """
+    try:
+        await page.wait_for_load_state("networkidle", timeout=_NETWORK_IDLE_TIMEOUT_MS)
+        logger.info("Network idle after navigation")
+    except Exception:
+        logger.info("Network idle wait timed out, continuing")
+
+    try:
+        await page.wait_for_function(
+            """(minLen) => {
+                const root = document.querySelector('#app')
+                    || document.querySelector('main')
+                    || document.querySelector('[role="main"]')
+                    || document.body;
+                return ((root?.innerText || '').trim().length >= minLen);
+            }""",
+            arg=_SPA_MIN_TEXT_LEN,
+            timeout=_SPA_WAIT_TIMEOUT_MS,
+        )
+        logger.info("SPA/root visible text reached minimum length")
+    except Exception:
+        logger.info("SPA text wait timed out, using current DOM")
+
+
+async def read_visible_text(page: Page) -> str:
+    """Return the trimmed ``innerText`` of the page's main container.
+
+    The container is the first match of ``#app``, ``main`` or
+    ``[role="main"]``, falling back to ``document.body``.
+    """
+    script = """() => {
+            const root = document.querySelector('#app')
+                || document.querySelector('main')
+                || document.querySelector('[role="main"]')
+                || document.body;
+            return (root?.innerText || '').trim();
+        }"""
+    return await page.evaluate(script)
+
+
 class StdWebParser(BaseParser):
    """Standard web page parser using Playwright and Trafilatura.

@@ -61,16 +143,17 @@ class StdWebParser(BaseParser):
        super().__init__(file_name=title, **kwargs)
        logger.info(f"Initialized WebParser with title: {title}")

-    async def scrape(self, url: str) -> str:
+    async def scrape(self, url: str) -> _ScrapeResult:
        """Scrape web page content using Playwright.

        Args:
            url: The URL of the web page to scrape

        Returns:
-            HTML content of the web page as string, empty string on error
+            HTML, visible text, and document title; empty fields on hard failure
        """
        logger.info(f"Starting web page scraping for URL: {url}")
+        empty = _ScrapeResult(html="", visible_text="", page_title="")
        try:
            async with async_playwright() as p:
                kwargs = {}
@@ -83,30 +166,42 @@ class StdWebParser(BaseParser):

                logger.info(f"Navigating to URL: {url}")
                try:
-                    # Navigate to URL with 30 second timeout
-                    await page.goto(url, timeout=30000)
+                    await page.goto(
+                        url,
+                        timeout=_GOTO_TIMEOUT_MS,
+                        wait_until="domcontentloaded",
+                    )
                    logger.info("Initial page load complete")
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
-                    return ""
+                    return empty

-                logger.info("Retrieving page HTML content")
-                # Get the full HTML content of the page
+                await wait_for_rendered_content(page)
+
+                page_title = await page.title()
+                visible_text = await read_visible_text(page)
                content = await page.content()
-                logger.info(f"Retrieved {len(content)} bytes of HTML content")
+                logger.info(
+                    "Retrieved %d bytes HTML, %d chars visible text, title=%r",
+                    len(content),
+                    len(visible_text),
+                    page_title[:80] if page_title else "",
+                )

                await browser.close()
                logger.info("Browser closed")

-            # Return raw HTML content for further processing
            logger.info("Successfully retrieved HTML content")
-            return content
+            return _ScrapeResult(
+                html=content,
+                visible_text=visible_text,
+                page_title=page_title or "",
+            )

        except Exception as e:
            logger.error(f"Failed to scrape web page: {str(e)}")
-            # Return empty string on error
-            return ""
+            return empty

    def parse_into_text(self, content: bytes) -> Document:
        """Parse web page content into a Document object.
@@ -117,36 +212,49 @@ class StdWebParser(BaseParser):
        Returns:
            Document object containing the parsed markdown content
        """
-        # Decode bytes to get the URL string
        url = endecode.decode_bytes(content)

        logger.info(f"Scraping web page: {url}")
-        # Run async scraping in sync context
-        chtml = asyncio.run(self.scrape(url))
-        # Extract clean content from HTML using Trafilatura
-        # Convert to markdown format with metadata, images, tables, and links
-        md_text = extract(
-            chtml,
-            output_format="markdown",
-            with_metadata=True,
-            include_images=True,
-            include_tables=True,
-            include_links=True,
-        )
+        scrape_result = asyncio.run(self.scrape(url))
+        if not scrape_result.html and not scrape_result.visible_text:
+            logger.error("Failed to scrape web page (no HTML or visible text)")
+            return Document(content=f"Error parsing web page: {url}")
+
+        md_text = extract_markdown_from_html(scrape_result.html)
+        if not md_text:
+            md_text = build_visible_text_fallback(
+                scrape_result.visible_text,
+                scrape_result.page_title,
+            )
+            if md_text:
+                logger.info(
+                    "Trafilatura empty; using Playwright visible-text fallback (%d chars)",
+                    len(md_text),
+                )
+
        if not md_text:
            logger.error("Failed to parse web page")
            return Document(content=f"Error parsing web page: {url}")

-        # Extract title from trafilatura metadata output (e.g. "title: xxx" line)
        metadata = {}
        title_match = re.search(r"^title:\s*(.+)", md_text, re.MULTILINE)
        if title_match:
            extracted_title = title_match.group(1).strip()
            if extracted_title:
                metadata["title"] = extracted_title
-                logger.info(f"Extracted article title from trafilatura: {extracted_title}")
+                logger.info(
+                    f"Extracted article title from trafilatura: {extracted_title}"
+                )
+        elif scrape_result.page_title:
+            metadata["title"] = scrape_result.page_title.strip()
+            logger.info(
+                "Using page title from Playwright: %s", metadata["title"]
+            )
        else:
-            logger.info(f"No title found in trafilatura output, first 200 chars: {md_text[:200]!r}")
+            logger.info(
+                "No title found in trafilatura output, first 200 chars: %r",
+                md_text[:200],
+            )
        return Document(content=md_text, metadata=metadata)


--- a/docreader/parser/xlsx_merge.py
+++ b/docreader/parser/xlsx_merge.py
@@ -0,0 +1,50 @@
+"""Fill merged cell values before pandas reads an XLSX workbook."""
+
+from __future__ import annotations
+
+import logging
+import zipfile
+from io import BytesIO
+
+logger = logging.getLogger(__name__)
+
+
+def fill_merged_cells_xlsx(content: bytes) -> bytes:
+    """Expand every merged range so each covered cell carries the anchor value.
+
+    A merged range keeps its value only in the top-left cell, which makes
+    pandas report NaN for the remaining cells. Copying the value across keeps
+    the context available to row-wise RAG chunks.
+
+    Args:
+        content: Raw workbook bytes.
+
+    Returns:
+        The input unchanged when it is not a ZIP container or holds no merged
+        ranges; otherwise the re-saved workbook with the ranges unmerged and
+        filled.
+    """
+    if not zipfile.is_zipfile(BytesIO(content)):
+        return content
+
+    from openpyxl import load_workbook
+
+    workbook = load_workbook(BytesIO(content), data_only=True)
+    touched = False
+    for sheet in workbook.worksheets:
+        # Iterate over a snapshot because the ranges are unmerged inside the loop.
+        for span in list(sheet.merged_cells.ranges):
+            anchor_value = sheet.cell(span.min_row, span.min_col).value
+            sheet.unmerge_cells(str(span))
+            for row_idx in range(span.min_row, span.max_row + 1):
+                for col_idx in range(span.min_col, span.max_col + 1):
+                    sheet.cell(row_idx, col_idx).value = anchor_value
+            touched = True
+
+    if not touched:
+        return content
+
+    buffer = BytesIO()
+    workbook.save(buffer)
+    logger.info("Filled merged cells in XLSX before parse")
+    return buffer.getvalue()
--- a/docreader/parser/xlsx_repair.py
+++ b/docreader/parser/xlsx_repair.py
@@ -0,0 +1,170 @@
+"""Repair common XLSX packaging issues before openpyxl/pandas read."""
+
+from __future__ import annotations
+
+import io
+import re
+import zipfile
+from typing import Callable, Dict, Iterable, Set
+
+# Canonical location of the shared-strings part inside an XLSX package.
+SST_PART = "xl/sharedStrings.xml"
+# Matches the <Override> element that declares the shared-strings part.
+_SST_OVERRIDE_RE = re.compile(
+    r'<Override[^>]*PartName="[^"]*sharedStrings\.xml"[^>]*/>',
+    re.IGNORECASE,
+)
+# Matches the workbook <Relationship> element that points at shared strings.
+_SST_REL_RE = re.compile(
+    r'<Relationship[^>]*Type="[^"]*sharedStrings"[^>]*/>',
+    re.IGNORECASE,
+)
+# A cell with type "s" holds an index into the shared-strings table; XML allows
+# either quote style around the attribute value.
+_SST_CELL_RE = re.compile(r"""\bt=["']s["']""")
+
+
+def repair_xlsx_bytes(content: bytes) -> bytes | None:
+    """Return repaired XLSX bytes, or None if no repair was applied.
+
+    Handles workbooks that reference ``xl/sharedStrings.xml`` in package
+    metadata but omit the part (common with some exporters). When worksheets
+    only use inline strings, manifest references are stripped so openpyxl can
+    read the file. A shared-strings part stored under a non-canonical name is
+    moved to ``xl/sharedStrings.xml``.
+    """
+    if not zipfile.is_zipfile(io.BytesIO(content)):
+        return None
+
+    with zipfile.ZipFile(io.BytesIO(content), "r") as zin:
+        names = _normalized_names(zin.namelist())
+        sst_path = _find_shared_strings_path(names)
+        if sst_path:
+            if sst_path == SST_PART:
+                return None
+            return _rewrite_zip(
+                zin, lambda files: _rename_shared_strings_part(files, sst_path)
+            )
+        if not _package_references_shared_strings(zin, names):
+            return None
+        # Cells indexing into the missing table cannot be repaired here.
+        if _worksheets_use_shared_string_cells(zin, names):
+            return None
+        return _rewrite_zip(zin, _strip_shared_strings_manifest)
+
+
+def _normalized_names(namelist: Iterable[str]) -> Set[str]:
+    """Return member names with backslashes converted to forward slashes."""
+    return {name.replace("\\", "/") for name in namelist}
+
+
+def _read_member(zin: zipfile.ZipFile, name: str) -> bytes:
+    """Read a member by its normalized (forward-slash) name.
+
+    ``zipfile`` looks members up by the name stored in the archive, so a
+    normalized name may not exist verbatim when the archive was written with
+    backslash separators.
+    """
+    try:
+        return zin.read(name)
+    except KeyError:
+        for info in zin.infolist():
+            if info.filename.replace("\\", "/") == name:
+                return zin.read(info)
+        raise
+
+
+def _find_shared_strings_path(names: Set[str]) -> str | None:
+    """Locate the shared-strings part, preferring the canonical path.
+
+    Without that preference a second entry whose name merely ends in
+    ``sharedStrings.xml`` could be picked and renamed over the valid part.
+    """
+    if SST_PART in names:
+        return SST_PART
+    # Sorted so the pick does not depend on set iteration order.
+    for name in sorted(names):
+        if name.lower().endswith("sharedstrings.xml"):
+            return name
+    return None
+
+
+def _package_references_shared_strings(
+    zin: zipfile.ZipFile, names: Set[str]
+) -> bool:
+    """True when the content types or workbook rels mention shared strings."""
+    content_types = "[Content_Types].xml"
+    if content_types in names:
+        ct = _read_member(zin, content_types).decode("utf-8", errors="replace")
+        if "sharedstrings.xml" in ct.lower():
+            return True
+
+    rels_path = "xl/_rels/workbook.xml.rels"
+    if rels_path in names:
+        rels = _read_member(zin, rels_path).decode("utf-8", errors="replace")
+        if "sharedstrings" in rels.lower():
+            return True
+    return False
+
+
+def _worksheets_use_shared_string_cells(
+    zin: zipfile.ZipFile, names: Set[str]
+) -> bool:
+    """True when any worksheet part contains a shared-string typed cell."""
+    for name in names:
+        if not name.startswith("xl/worksheets/") or not name.endswith(".xml"):
+            continue
+        sheet = _read_member(zin, name).decode("utf-8", errors="replace")
+        if _SST_CELL_RE.search(sheet):
+            return True
+    return False
+
+
+def _rename_shared_strings_part(
+    files: Dict[str, bytes], source_path: str
+) -> Dict[str, bytes]:
+    """Return a copy of ``files`` with ``source_path`` moved to ``SST_PART``."""
+    updated = dict(files)
+    updated[SST_PART] = updated.pop(source_path)
+    return updated
+
+
+def _strip_shared_strings_manifest(files: Dict[str, bytes]) -> Dict[str, bytes]:
+    """Return a copy of ``files`` without shared-strings manifest entries.
+
+    ``surrogateescape`` carries bytes that are not valid UTF-8 through the
+    decode/encode round trip unchanged instead of raising, in line with the
+    tolerant decoding used when the references were detected.
+    """
+    updated = dict(files)
+    for part, pattern in (
+        ("[Content_Types].xml", _SST_OVERRIDE_RE),
+        ("xl/_rels/workbook.xml.rels", _SST_REL_RE),
+    ):
+        if part in updated:
+            text = updated[part].decode("utf-8", errors="surrogateescape")
+            text = pattern.sub("", text)
+            updated[part] = text.encode("utf-8", errors="surrogateescape")
+    return updated
+
+
+def _rewrite_zip(
+    zin: zipfile.ZipFile,
+    transform: Callable[[Dict[str, bytes]], Dict[str, bytes]],
+) -> bytes:
+    """Rebuild the archive after applying ``transform`` to its members.
+
+    Member names are normalized to forward slashes and every entry is written
+    back with deflate compression.
+    """
+    files: Dict[str, bytes] = {}
+    for info in zin.infolist():
+        name = info.filename.replace("\\", "/")
+        files[name] = zin.read(info)
+    files = transform(files)
+
+    out = io.BytesIO()
+    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
+        for name, data in files.items():
+            zout.writestr(name, data)
+    return out.getvalue()
--- a/docreader/pyproject.toml
+++ b/docreader/pyproject.toml
@@ -9,9 +9,12 @@ dependencies = [
    "grpcio>=1.78.0",
    "grpcio-health-checking>=1.78.0",
    "grpcio-tools>=1.78.0",
-    "liteparse>=2.0.4",
    "lxml>=6.1.0",
    "markitdown[docx,pdf,xls,xlsx]>=0.1.3",
+    "opendataloader-pdf>=2.4.7",
+    "openpyxl>=3.1.0",
+    "pandas>=2.0.0",
+    "xlrd>=2.0.0",
    "pillow>=12.0.0",
    "playwright>=1.55.0",
    "protobuf>=6.33.0",
--- a/docreader/splitter/header_hook.py
+++ b/docreader/splitter/header_hook.py
@@ -62,6 +62,67 @@ DEFAULT_CONFIGS = [
 ]
 DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)

+_TABLE_ROW_PATTERN = re.compile(r"^\s*(?:\|[^|\n]*)+\|\s*$", re.MULTILINE)
+_MARKDOWN_TABLE_PRIORITY = 15
+
+
+def _is_empty_table_header_row(header: str) -> bool:
+    """True when the column-name line is only pipes/whitespace (MarkItDown quirk)."""
+    newline = header.find("\n")
+    if newline < 0:
+        return False
+    row = header[:newline].strip()
+    return bool(row) and all(ch in "| \t" for ch in row)
+
+
+def _extract_separator_line(header: str) -> str:
+    for line in header.split("\n"):
+        if "---" in line:
+            return line + "\n"
+    return ""
+
+
+def _table_row_column_count(line: str) -> int:
+    line = line.strip()
+    if not line.startswith("|"):
+        return 0
+    parts = line.split("|")
+    if parts and parts[0].strip() == "":
+        parts = parts[1:]
+    if parts and parts[-1].strip() == "":
+        parts = parts[:-1]
+    return len(parts)
+
+
+def _first_table_row_column_count(text: str) -> int:
+    for line in text.split("\n"):
+        line = line.strip()
+        if line and _TABLE_ROW_PATTERN.match(line):
+            return _table_row_column_count(line)
+    return 0
+
+
+def _header_table_column_count(header: str) -> int:
+    for line in header.split("\n"):
+        line = line.strip()
+        if not line or "---" in line:
+            continue
+        count = _table_row_column_count(line)
+        if count > 0:
+            return count
+    return 0
+
+
+def _split_ends_with_paragraph_break(split: str) -> bool:
+    trimmed = split.rstrip(" \t\r")
+    return trimmed.endswith("\n\n") or trimmed.endswith("\r\n\r\n")
+
+
+def header_column_mismatch(headers: str, next_unit: str) -> bool:
+    header_cols = _header_table_column_count(headers)
+    row_cols = _first_table_row_column_count(next_unit)
+    return header_cols > 0 and row_cols > 0 and header_cols != row_cols
+

 # 定义Hook状态数据结构
 class HeaderTracker(BaseModel):
@@ -70,10 +131,28 @@ class HeaderTracker(BaseModel):
    header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
    active_headers: Dict[int, str] = Field(default_factory=dict)
    ended_headers: set[int] = Field(default_factory=set)
+    pending_extend: Dict[int, bool] = Field(default_factory=dict)
+    pending_table_break: bool = Field(default=False)
+    header_ended_this_unit: bool = Field(default=False)
+
+    def _clear_table_header(self) -> None:
+        self.ended_headers.add(_MARKDOWN_TABLE_PRIORITY)
+        self.active_headers.pop(_MARKDOWN_TABLE_PRIORITY, None)
+        self.pending_extend.pop(_MARKDOWN_TABLE_PRIORITY, None)

    def update(self, split: str) -> Dict[int, str]:
        """检测当前split中的表头开始/结束，更新Hook状态"""
        new_headers: Dict[int, str] = {}
+        self.header_ended_this_unit = False
+
+        if self.pending_table_break:
+            self.pending_table_break = False
+            if _MARKDOWN_TABLE_PRIORITY in self.active_headers:
+                if _first_table_row_column_count(split) > 0:
+                    self._clear_table_header()
+                    self.header_ended_this_unit = True
+                else:
+                    self._clear_table_header()

        # 1. 检查是否有表头结束标记
        for config in self.header_hook_configs:
@@ -82,8 +161,31 @@ class HeaderTracker(BaseModel):
            ):
                self.ended_headers.add(config.priority)
                del self.active_headers[config.priority]
+                self.pending_extend.pop(config.priority, None)

-        # 2. 检查是否有新的表头开始标记（只处理未活跃且未结束的）
+        # 1b. \n\n 分块会吞掉表间空行：段尾 \n\n 或列数变化时结束表头追踪
+        if (
+            _MARKDOWN_TABLE_PRIORITY in self.active_headers
+            and not self.pending_extend.get(_MARKDOWN_TABLE_PRIORITY)
+        ):
+            if _split_ends_with_paragraph_break(split):
+                self.pending_table_break = True
+            else:
+                header = self.active_headers[_MARKDOWN_TABLE_PRIORITY]
+                row_cols = _first_table_row_column_count(split)
+                header_cols = _header_table_column_count(header)
+                if row_cols > 0 and header_cols > 0 and row_cols != header_cols:
+                    self._clear_table_header()
+                    self.header_ended_this_unit = True
+
+        # 2. 空表头行：用首个数据行补全列名（与 Go header_tracker 一致）
+        for priority in list(self.pending_extend.keys()):
+            if priority in self.active_headers and _TABLE_ROW_PATTERN.search(split):
+                sep = _extract_separator_line(self.active_headers[priority])
+                self.active_headers[priority] = split + sep
+            self.pending_extend.pop(priority, None)
+
+        # 3. 检查是否有新的表头开始标记（只处理未活跃且未结束的）
        for config in self.header_hook_configs:
            if (
                config.priority not in self.active_headers
@@ -94,8 +196,10 @@ class HeaderTracker(BaseModel):
                    header = config.extract_header_fn(match)
                    self.active_headers[config.priority] = header
                    new_headers[config.priority] = header
+                    if _is_empty_table_header_row(header):
+                        self.pending_extend[config.priority] = True

-        # 3. 检查是否所有活跃表头都已结束（清空结束标记）
+        # 4. 检查是否所有活跃表头都已结束（清空结束标记）
        if not self.active_headers:
            self.ended_headers.clear()

--- a/docreader/splitter/splitter.py
+++ b/docreader/splitter/splitter.py
@@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, PrivateAttr

 from docreader.splitter.header_hook import (
    HeaderTracker,
+    header_column_mismatch,
 )
 from docreader.utils.split import split_by_char, split_by_sep

@@ -225,6 +226,16 @@ class TextSplitter(BaseModel, Generic[T]):

            # Update header tracking with current split
            self.header_hook.update(split)
+            if self.header_hook.header_ended_this_unit and len(cur_chunk) > 0:
+                chunks.append(
+                    (
+                        cur_chunk[0][0],
+                        cur_chunk[-1][1],
+                        "".join([c[2] for c in cur_chunk]),
+                    )
+                )
+                cur_chunk = []
+                cur_len = 0
            cur_headers = self.header_hook.get_headers()
            cur_headers_len = self.len_function(cur_headers)

@@ -276,6 +287,7 @@ class TextSplitter(BaseModel, Generic[T]):
                    cur_headers
                    and split_len + cur_headers_len < self.chunk_size
                    and cur_headers not in split
+                    and not header_column_mismatch(cur_headers, split)
                ):
                    next_start = cur_chunk[0][0] if cur_chunk else cur_start

--- a/docreader/tests/test_excel_parser.py
+++ b/docreader/tests/test_excel_parser.py
@@ -0,0 +1,210 @@
+import io
+import os
+import shutil
+import subprocess
+import tempfile
+import unittest
+import zipfile
+
+import openpyxl
+import pandas as pd
+
+from docreader.parser.excel_convert import detect_excel_format, engine_for_format
+from docreader.parser.excel_parser import ExcelParser
+from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
+from docreader.parser.xlsx_repair import repair_xlsx_bytes
+
+
+def _xlsx_with_phantom_shared_strings() -> bytes:
+    """Workbook with inline strings but a dangling sharedStrings manifest entry."""
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws["A1"] = "hello"
+    ws["B1"] = 42
+    bio = io.BytesIO()
+    wb.save(bio)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with zipfile.ZipFile(io.BytesIO(bio.getvalue()), "r") as zin:
+            zin.extractall(tmpdir)
+
+        ct_path = f"{tmpdir}/[Content_Types].xml"
+        with open(ct_path, encoding="utf-8") as f:
+            ct = f.read()
+        override = (
+            '<Override PartName="/xl/sharedStrings.xml" '
+            'ContentType="application/vnd.openxmlformats-officedocument.'
+            'spreadsheetml.sharedStrings+xml"/>'
+        )
+        with open(ct_path, "w", encoding="utf-8") as f:
+            f.write(ct.replace("</Types>", override + "</Types>"))
+
+        out = io.BytesIO()
+        with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
+            for root, _, files in os.walk(tmpdir):
+                for name in files:
+                    path = os.path.join(root, name)
+                    arc = os.path.relpath(path, tmpdir)
+                    zout.write(path, arc)
+        return out.getvalue()
+
+
+class ExcelFormatDetectionTest(unittest.TestCase):
+    def test_detect_xlsx_and_engine(self):
+        wb = openpyxl.Workbook()
+        bio = io.BytesIO()
+        wb.save(bio)
+        content = bio.getvalue()
+        self.assertEqual(detect_excel_format(content), "xlsx")
+        self.assertEqual(engine_for_format("xlsx"), "openpyxl")
+
+    def test_detect_xls_magic(self):
+        content = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" + b"\x00" * 512
+        self.assertEqual(detect_excel_format(content), "xls")
+        self.assertEqual(engine_for_format("xls"), "xlrd")
+
+    def test_open_legacy_xls_bytes_with_xlsx_extension(self):
+        if not shutil.which("soffice"):
+            self.skipTest("LibreOffice not available")
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws["A1"] = "legacy"
+        xlsx_bio = io.BytesIO()
+        wb.save(xlsx_bio)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src = os.path.join(tmpdir, "sheet.xlsx")
+            with open(src, "wb") as handle:
+                handle.write(xlsx_bio.getvalue())
+            subprocess.run(
+                [
+                    "soffice",
+                    "--headless",
+                    "--convert-to",
+                    "xls",
+                    "--outdir",
+                    tmpdir,
+                    src,
+                ],
+                check=True,
+                capture_output=True,
+            )
+            xls_path = os.path.join(tmpdir, "sheet.xls")
+            with open(xls_path, "rb") as handle:
+                xls_bytes = handle.read()
+
+        document = ExcelParser(file_name="fake.xlsx", file_type="xlsx").parse_into_text(
+            xls_bytes
+        )
+        self.assertIn("legacy", document.content)
+
+
+class XlsxRepairTest(unittest.TestCase):
+    def test_repair_removes_phantom_shared_strings_reference(self):
+        broken = _xlsx_with_phantom_shared_strings()
+        with self.assertRaises(KeyError):
+            pd.read_excel(io.BytesIO(broken))
+
+        repaired = repair_xlsx_bytes(broken)
+        self.assertIsNotNone(repaired)
+        df = pd.read_excel(io.BytesIO(repaired), header=None)
+        self.assertEqual(df.values.tolist(), [["hello", 42]])
+
+    def test_repair_skips_when_shared_string_cells_need_table(self):
+        """Cells that need the missing shared-strings table block the repair."""
+        try:
+            import xlsxwriter
+        except ImportError:
+            self.skipTest("xlsxwriter not available")  # skip, do not error
+        bio = io.BytesIO()
+        wb = xlsxwriter.Workbook(bio, {"in_memory": True})
+        ws = wb.add_worksheet()
+        ws.write(0, 0, "hello")
+        wb.close()
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(io.BytesIO(bio.getvalue()), "r") as zin:
+                zin.extractall(tmpdir)
+            os.remove(f"{tmpdir}/xl/sharedStrings.xml")
+            out = io.BytesIO()
+            with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
+                for root, _, files in os.walk(tmpdir):
+                    for name in files:
+                        path = os.path.join(root, name)
+                        zout.write(path, os.path.relpath(path, tmpdir))
+            broken = out.getvalue()
+        self.assertIsNone(repair_xlsx_bytes(broken))
+
+
+class XlsxMergeFillTest(unittest.TestCase):
+    def test_fill_merged_cells_propagates_master_value(self):
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws["A1"] = "title"
+        ws.merge_cells("A1:B1")
+        ws["A2"] = "left"
+        ws["B2"] = "right"
+        ws.merge_cells("A2:A3")
+        ws["B3"] = "only-b"
+        bio = io.BytesIO()
+        wb.save(bio)
+
+        filled = fill_merged_cells_xlsx(bio.getvalue())
+        out_wb = openpyxl.load_workbook(io.BytesIO(filled), data_only=True)
+        out_ws = out_wb.active
+        self.assertEqual(out_ws["B1"].value, "title")
+        self.assertEqual(out_ws["A3"].value, "left")
+        self.assertEqual(out_ws["B3"].value, "only-b")
+
+    def test_parse_en_mergecell_workbook(self):
+        path = os.path.join(
+            os.path.dirname(__file__),
+            "..",
+            "..",
+            "testdata",
+            "rag_test",
+            "xlsx",
+            "en_mergecell.xlsx",
+        )
+        if not os.path.isfile(path):
+            self.skipTest("en_mergecell.xlsx fixture not available")
+        with open(path, "rb") as handle:
+            document = ExcelParser().parse_into_text(handle.read())
+
+        chunks = [chunk.content.strip() for chunk in document.chunks]
+        self.assertEqual(len(chunks), 12)
+        self.assertIn("A: A1", chunks[0])
+        self.assertIn("A: A2", chunks[1])
+        self.assertIn("B: B3", chunks[2])
+        self.assertNotIn("Unnamed:", document.content)
+        self.assertIn("A: A7", chunks[6])
+        self.assertIn("A: A7", chunks[7])
+        self.assertIn("D: D10", chunks[9])
+
+
+class ExcelParserTest(unittest.TestCase):
+    def test_parse_phantom_shared_strings_workbook(self):
+        document = ExcelParser().parse_into_text(_xlsx_with_phantom_shared_strings())
+        self.assertIn("hello", document.content)
+        self.assertIn("42", document.content)
+        self.assertGreater(len(document.chunks), 0)
+
+    def test_parse_en_calcchain_shared_strings_case(self):
+        path = os.path.join(
+            os.path.dirname(__file__),
+            "..",
+            "..",
+            "testdata",
+            "rag_test",
+            "xlsx",
+            "en_calcchain.xlsx",
+        )
+        if not os.path.isfile(path):
+            self.skipTest("en_calcchain.xlsx fixture not available")
+        with open(path, "rb") as f:
+            document = ExcelParser().parse_into_text(f.read())
+        self.assertGreater(len(document.content), 0)
+        self.assertGreater(len(document.chunks), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/docreader/tests/test_markdown_table_util.py
+++ b/docreader/tests/test_markdown_table_util.py
@@ -0,0 +1,68 @@
+import io
+import unittest
+from pathlib import Path
+
+from markitdown import MarkItDown
+
+from docreader.parser.markdown_parser import MarkdownTableUtil
+
+
+class TestMarkdownTableUtil(unittest.TestCase):
+    def test_preserves_empty_cells(self):
+        """Interior empty cells must not be dropped during formatting."""
+        raw = "| a |  | c |\n| --- | --- | --- |\n| 1 | 2 | 3 |"
+        formatted = MarkdownTableUtil().format_table(raw)
+        self.assertIn("| a |  | c |", formatted)
+        self.assertEqual(formatted.count("|"), raw.count("|"))
+
+    def test_format_nonempty_table(self):
+        raw = "|Name|Age|\n|---|---|\n|John|30|"
+        formatted = MarkdownTableUtil().format_table(raw)
+        self.assertIn("| Name | Age |", formatted)
+        self.assertIn("| --- | --- |", formatted)
+        self.assertIn("| John | 30 |", formatted)
+
+    def test_normalize_markitdown_en_tables(self):
+        """MarkItDown DOCX output is normalized into well-formed GFM tables."""
+        docx = (
+            Path(__file__).resolve().parents[2]
+            / "testdata"
+            / "rag_test"
+            / "docx"
+            / "en_tables.docx"
+        )
+        if not docx.is_file():
+            docx = (
+                Path(__file__).resolve().parents[2].parent
+                / "testdata/rag_test/docx/en_tables.docx"
+            )
+        if not docx.is_file():
+            # Skip rather than fail with FileNotFoundError from read_bytes().
+            self.skipTest("en_tables.docx fixture not available")
+        raw = (
+            MarkItDown()
+            .convert(io.BytesIO(docx.read_bytes()), file_extension=".docx")
+            .text_content
+        )
+        normalized = MarkdownTableUtil().format_table(raw)
+
+        self.assertNotIn("|  |  |  |  |", normalized)
+        self.assertIn("| Name | Game | Fame | Blame |", normalized)
+        idx_name = normalized.index("| Name | Game | Fame | Blame |")
+        idx_sep = normalized.index("| --- | --- | --- | --- |", idx_name)
+        self.assertLess(idx_name, idx_sep)
+        self.assertIn("| Lebron James | Basketball |", normalized)
+
+        # Headerless 2-row tables: delimiter inserted so GFM renderers show a table
+        self.assertIn(
+            "| Sinple | Table |\n| --- | --- |\n| Without | Header |", normalized
+        )
+        self.assertIn(
+            "| Simple  Multiparagraph | Table  Full |\n| --- | --- |\n"
+            "| Of  Paragraphs | In each  Cell. |",
+            normalized,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/docreader/tests/test_opendataloader_parser.py
+++ b/docreader/tests/test_opendataloader_parser.py
@@ -0,0 +1,103 @@
+"""Unit tests for OpenDataLoader parser helpers (no JVM required)."""
+
+import os
+import tempfile
+import unittest
+from unittest import mock
+
+from docreader.parser.opendataloader_parser import (
+    OpenDataLoaderParser,
+    _collect_images_under_output,
+    _find_markdown_file,
+    _normalize_odl_image_url,
+    _rewrite_markdown_image_refs,
+    opendataloader_available,
+)
+
+
+class OpenDataLoaderHelpersTest(unittest.TestCase):
+    def test_find_markdown_prefers_stem_match(self):
+        with tempfile.TemporaryDirectory() as d:
+            other = os.path.join(d, "other.md")
+            target = os.path.join(d, "paper.md")
+            with open(other, "w") as f:
+                f.write("x")
+            with open(target, "w") as f:
+                f.write("# Title")
+            self.assertEqual(_find_markdown_file(d, "paper"), target)
+
+    def test_collect_and_rewrite_images(self):
+        with tempfile.TemporaryDirectory() as d:
+            img_dir = os.path.join(d, "images")
+            os.makedirs(img_dir)
+            png = os.path.join(img_dir, "fig1.png")
+            with open(png, "wb") as f:
+                f.write(b"\x89PNG\r\n\x1a\n")
+            images = _collect_images_under_output(d)
+            self.assertIn("images/fig1.png", images)
+            md = "See ![fig](images/fig1.png) and ![alt](./fig1.png)."
+            out = _rewrite_markdown_image_refs(md, images)
+            self.assertIn("![fig](images/fig1.png)", out)
+            self.assertIn("![alt](images/fig1.png)", out)
+
+    def test_rewrite_odl_angle_bracket_and_entity_urls(self):
+        images = {"images/imageFile1.png": "e30="}
+        for md_in in (
+            "![image 1](<images/imageFile1.png>)",
+            "![image 1](&lt;images/imageFile1.png&gt;)",
+        ):
+            out = _rewrite_markdown_image_refs(md_in, images)
+            self.assertEqual("![image 1](images/imageFile1.png)", out)
+
+    def test_normalize_odl_image_url(self):
+        self.assertEqual(
+            _normalize_odl_image_url("&lt;images/imageFile2.png&gt;"),
+            "images/imageFile2.png",
+        )
+        self.assertEqual(
+            _normalize_odl_image_url("<images/imageFile2.png>"),
+            "images/imageFile2.png",
+        )
+
+    def test_rewrite_skips_data_uris(self):
+        md = "![x](data:image/png;base64,abc)"
+        self.assertEqual(_rewrite_markdown_image_refs(md, {"images/a.png": "e30="}), md)
+
+
+class OpenDataLoaderParserTest(unittest.TestCase):
+    @mock.patch("docreader.parser.opendataloader_parser.opendataloader_available")
+    @mock.patch("docreader.parser.opendataloader_parser._run_convert")
+    def test_parse_reads_markdown_and_images(self, mock_convert, mock_avail):
+        mock_avail.return_value = (True, "")
+
+        def fake_convert(pdf_path, output_dir, image_dir, overrides=None):
+            stem = os.path.splitext(os.path.basename(pdf_path))[0]
+            md_path = os.path.join(output_dir, f"{stem}.md")
+            with open(md_path, "w") as f:
+                f.write("# Hello\n\n![pic](images/pic.png)\n")
+            os.makedirs(image_dir, exist_ok=True)
+            with open(os.path.join(image_dir, "pic.png"), "wb") as f:
+                f.write(b"png")
+
+        mock_convert.side_effect = fake_convert
+
+        parser = OpenDataLoaderParser(file_name="doc.pdf", file_type="pdf")
+        doc = parser.parse_into_text(b"%PDF-1.4 fake")
+        self.assertIn("# Hello", doc.content)
+        self.assertIn("images/pic.png", doc.content)
+        self.assertIn("images/pic.png", doc.images)
+        self.assertEqual(doc.metadata.get("parser_engine"), "opendataloader")
+
+    @mock.patch("docreader.parser.opendataloader_parser.shutil.which", return_value=None)
+    def test_availability_requires_java(self, _which):
+        with mock.patch(
+            "docreader.parser.opendataloader_parser._package_available",
+            return_value=(True, ""),
+        ):
+            ok, msg = opendataloader_available()
+        self.assertFalse(ok)
+        self.assertIn("Java", msg)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/docreader/tests/test_pdf_router.py
+++ b/docreader/tests/test_pdf_router.py
@@ -6,10 +6,15 @@ from PIL import Image
 from docreader.parser.pdf_parser import (
    PDFParser,
    _classify_page,
+    _filter_reading_columns,
    _group_lines,
+    _is_artifact_column,
+    _join_line_glyphs,
+    _merge_orphan_punctuation_lines,
    _point_in_boxes,
    _segments_to_markdown,
    _select_embedded_images,
+    _should_prefer_plain,
    _split_columns,
    _strip_repeating_lines,
 )
@@ -122,13 +127,25 @@ class ReadingOrderTest(unittest.TestCase):
    def test_group_lines_orders_by_y_then_x(self):
        # Two visual lines; within a line glyphs given out of x-order.
        chars = [
-            _char("B", 120, 130, 700, 712),
-            _char("A", 100, 110, 700, 712),  # same line, left of B
+            _char("B", 110, 120, 700, 712),  # adjacent to A (no word-sized gap)
+            _char("A", 100, 110, 700, 712),
            _char("C", 100, 110, 680, 692),  # next line down
        ]
        lines = _group_lines(chars)
        self.assertEqual([ln["text"] for ln in lines], ["AB", "C"])

+    def test_join_line_glyphs_inserts_word_spaces(self):
+        # Wide gap between "copy" and "of" mimics positioned OCR / text layers.
+        chars = [
+            _char("c", 0, 4, 0, 10),
+            _char("f", 10, 14, 0, 10),
+        ]
+        self.assertEqual(_join_line_glyphs(chars), "c f")
+
+    def test_join_line_glyphs_keeps_adjacent_letters(self):
+        chars = [_char("A", 100, 110, 700, 712), _char("B", 110, 120, 700, 712)]
+        self.assertEqual(_join_line_glyphs(chars), "AB")
+

 class HeadingDetectionTest(unittest.TestCase):
    def test_promotes_large_line_to_heading(self):
@@ -156,6 +173,235 @@ class HiddenTextFilterTest(unittest.TestCase):
        self.assertFalse(_point_in_boxes(20.0, 5.0, boxes))


+class MarginColumnFilterTest(unittest.TestCase):
+    """Narrow margin columns are filtered out; real text columns survive."""
+
+    def test_drops_narrow_vertical_margin_column(self):
+        # arXiv-style sidebar: nine glyphs stacked one per line in a narrow x span.
+        sidebar = [
+            _char(glyph, 20, 28, 500 - row * 14, 512 - row * 14)
+            for row, glyph in enumerate("0202luJ22")
+        ]
+        text_line = [
+            _char(c, 160 + 10 * i, 170 + 10 * i, 700, 712) for i, c in enumerate("Lan")
+        ]
+        kept = _filter_reading_columns(sidebar + text_line, scale=10.0, width=612.0)
+        self.assertEqual(len(kept), 1)
+        self.assertEqual(kept[0][0]["ch"], "L")
+
+    def test_keeps_real_two_column_layout(self):
+        col_a = [_char("L", 50, 150, 700 - r * 12, 712 - r * 12) for r in range(4)]
+        col_b = [_char("R", 400, 500, 700 - r * 12, 712 - r * 12) for r in range(4)]
+        kept = _filter_reading_columns(col_a + col_b, scale=12.0, width=600.0)
+        self.assertEqual(len(kept), 2)
+
+
+class PunctuationMergeTest(unittest.TestCase):
+    """Punctuation-only lines are folded into the preceding line."""
+
+    def test_merges_orphan_periods(self):
+        texts = ("Figure 1 2", ". .", "Next")
+        rows = [_line(text, 10.0) for text in texts]
+        merged = _merge_orphan_punctuation_lines(rows)
+        # The ". ." row disappears and its periods end up on the caption line.
+        self.assertEqual([ln["text"] for ln in merged], ["Figure 1 2..", "Next"])
+
+
+class PdfTextSanitizeTest(unittest.TestCase):
+    """Text clean-up applied by _postprocess_pdf_text."""
+
+    @staticmethod
+    def _clean(text):
+        from docreader.parser.pdf_parser import _postprocess_pdf_text
+
+        return _postprocess_pdf_text(text)
+
+    def test_removes_fffe_placeholder(self):
+        noisy = "multi\ufffelayer and non\ufffetrivial"
+        cleaned = self._clean(noisy)
+        self.assertEqual(cleaned, "multilayer and nontrivial")
+
+    def test_strips_chart_axis_run(self):
+        source = (
+            "Deep convolutional neural networks have led to breakthroughs.\n"
+            "0 1 2 3 4 5 6 0\n"
+            "10\n"
+            "20\n"
+            "iter. (1e4)\n"
+            "training error (%)\n"
+            "56-layer\n"
+            "20-layer\n"
+            "Figure 1. Training error on CIFAR-10.\n"
+        )
+        cleaned = self._clean(source)
+        self.assertIn("breakthroughs", cleaned)
+        self.assertNotIn("56-layer", cleaned)
+        self.assertIn("Figure 1.", cleaned)
+
+    def test_strips_diagram_labels_above_caption(self):
+        source = (
+            "Paragraph before.\n"
+            "identity\n"
+            "weight layer\n"
+            "relu\n"
+            "Figure 2. Residual learning block.\n"
+            "Paragraph after.\n"
+        )
+        cleaned = self._clean(source)
+        self.assertIn("Paragraph before.", cleaned)
+        self.assertIn("Figure 2.", cleaned)
+        self.assertIn("Paragraph after.", cleaned)
+        self.assertNotIn("identity", cleaned)
+        self.assertNotIn("weight layer", cleaned)
+
+    def test_strips_arxiv_header_line(self):
+        source = "Body text.\n1\narXiv:1512.03385v1 [cs.CV] 10 Dec 2015\nMore body."
+        cleaned = self._clean(source)
+        self.assertNotIn("arXiv:", cleaned)
+        self.assertIn("Body text.", cleaned)
+
+
+class PlainWellFormedTest(unittest.TestCase):
+    def test_academic_plain_skips_layout(self):
+        from docreader.parser.pdf_parser import _plain_is_well_formed as well_formed
+
+        # Ordinary prose with bracketed citations and normal word spacing.
+        citation_sentence = "Recent work [DL15, MBXS17] shows progress on NLP tasks "
+        citation_sentence += "with pre-trained models."
+        self.assertTrue(well_formed(citation_sentence))
+
+    def test_glued_scan_plain_needs_layout(self):
+        from docreader.parser.pdf_parser import _plain_is_well_formed as well_formed
+
+        # Run-together text with no spaces at all.
+        self.assertFalse(well_formed("Thisisadigitalcopyofabook"))
+
+
+class LayoutQualityFallbackTest(unittest.TestCase):
+    def test_prefers_plain_when_many_single_char_lines(self):
+        # Layout output opens with a one-glyph-per-line run and has broken words.
+        fragmented = "0\n2\n0\n2\nl\nu\nJ\nLan ua e Models"
+        clean = "Language Models are Few-Shot Learners\nTom Brown"
+        self.assertTrue(_should_prefer_plain(clean, fragmented))
+
+    def test_keeps_good_layout(self):
+        same_text = "Hello world"
+        self.assertFalse(_should_prefer_plain(same_text, same_text))
+
+
+class ResNetPaperFigureTest(unittest.TestCase):
+    """Regression: ResNet PDF (arXiv:1512.03385) vector figures and captions."""
+
+    def test_resnet_figures_and_captions(self):
+        import os
+
+        from docreader.parser.pdf_parser import PDFParser
+
+        bundled_dir = os.path.join(
+            os.path.dirname(__file__), "..", "..", "testdata", "rag_test", "pdf_en"
+        )
+        # Bundled fixture first, then a copy placed in /tmp; first existing file wins.
+        candidates = (
+            os.path.join(bundled_dir, "resnet.pdf"),
+            "/tmp/resnet.pdf",
+        )
+        path = next((p for p in candidates if os.path.isfile(p)), None)
+        if path is None:
+            self.skipTest("resnet.pdf not available")
+
+        with open(path, "rb") as fh:
+            pdf_bytes = fh.read()
+        parser = PDFParser(file_name="resnet.pdf", file_type="pdf")
+        doc = parser.parse_into_text(pdf_bytes)
+        text = doc.content
+
+        # At least one vector figure is counted and an image reference is emitted.
+        self.assertGreater(doc.metadata.get("vector_figure_count", 0), 0)
+        self.assertIn("![", text)
+        self.assertIn("Figure 2. Residual learning", text)
+        self.assertNotIn("arXiv:", text)
+
+        # Just before the Figure 2 caption: an image reference, no "identity" label.
+        caption_at = text.find("Figure 2. Residual learning")
+        preceding = text[max(0, caption_at - 120) : caption_at]
+        self.assertIn("![", preceding)
+        self.assertNotIn("identity", preceding)
+
+
+class Gpt3PaperLayoutTest(unittest.TestCase):
+    """Regression: arXiv GPT-3 paper title page must not be one-glyph-per-line."""
+
+    def test_gpt3_page0_title_and_authors(self):
+        import os
+
+        import pypdfium2 as pdfium
+        import pypdfium2.raw as pdfium_r
+
+        from docreader.parser.pdf_parser import PDFParser, _extract_layout_text
+
+        fixture_dir = os.path.join(
+            os.path.dirname(__file__), "..", "..", "testdata", "rag_test", "pdf_en"
+        )
+        fixture = os.path.join(fixture_dir, "gpt3.pdf")
+        if not os.path.isfile(fixture):
+            self.skipTest("gpt3.pdf not in testdata")
+
+        with open(fixture, "rb") as fh:
+            pdf_bytes = fh.read()
+
+        # Step 1: layout text of page 0 on its own.
+        with pdfium.PdfDocument(pdf_bytes) as pdf:
+            first_page = pdf[0]
+            try:
+                layout = _extract_layout_text(first_page, pdfium_r)
+            finally:
+                first_page.close()
+        # Margin sidebar must not appear as one-glyph-per-line prefix.
+        self.assertNotRegex(layout[:300], r"^0\n2\n0\n2")
+        self.assertIn("Few-Shot Learners", layout)
+
+        # Step 2: full parse keeps title, authors, citations and body wording.
+        parser = PDFParser(file_name="gpt3.pdf", file_type="pdf")
+        parsed = parser.parse_into_text(pdf_bytes).content
+        self.assertIn("Language Models are Few-Shot Learners", parsed)
+        self.assertIn("Tom B. Brown", parsed[:1200])
+        self.assertIn("[DL15, MBXS17, PNZtY18]", parsed)
+        self.assertIn("task-specific architectures), and more recently", parsed)
+        self.assertNotIn("k ifi hi d l", parsed)
+
+
+class ScanEnglishDictLayoutTest(unittest.TestCase):
+    """Regression: Google Books-style PDFs lose spaces without gap inference."""
+
+    def test_scan_en_dict_page0_has_word_spaces(self):
+        import os
+
+        import pypdfium2 as pdfium
+        import pypdfium2.raw as pdfium_r
+
+        from docreader.parser.pdf_parser import _extract_layout_text
+
+        fixture_dir = os.path.join(
+            os.path.dirname(__file__), "..", "..", "testdata", "rag_test", "pdf_scan"
+        )
+        fixture = os.path.join(fixture_dir, "scan_en_dict.pdf")
+        if not os.path.isfile(fixture):
+            self.skipTest("scan_en_dict.pdf not in testdata")
+
+        with open(fixture, "rb") as fh:
+            document = pdfium.PdfDocument(fh.read())
+        try:
+            page_text = _extract_layout_text(document[0], pdfium_r)
+        finally:
+            document.close()
+
+        # The spaced phrase must be present and its glued form absent.
+        spaced = "This is a digital copy of a book"
+        glued = "Thisisadigitalcopyofabook"
+        self.assertIn(spaced, page_text)
+        self.assertNotIn(glued, page_text)
+
+
 class PDFRouterIntegrationTest(unittest.TestCase):
    def test_image_only_pdf_routes_to_scanned(self):
        pdf_bytes = _make_image_only_pdf(2)
--- a/docreader/tests/test_ppt_convert.py
+++ b/docreader/tests/test_ppt_convert.py
@@ -0,0 +1,86 @@
+import shutil
+import unittest
+from pathlib import Path
+
+from docreader.parser.ppt_convert import (
+    convert_ppt_to_pptx_bytes,
+    is_ole_compound,
+    is_zip_openxml,
+    needs_ppt_to_pptx_conversion,
+    normalize_ppt_bytes,
+)
+
+TESTDATA = Path(__file__).resolve().parents[2] / "testdata" / "rag_test"
+LEGACY_PPT = TESTDATA / "ppt_old" / "en_38256.ppt"
+WMF_IMAGE_PPT = LEGACY_PPT
+IMAGE_HEAVY_PPT = TESTDATA / "ppt_old" / "en_41384.ppt"
+PPTX_SAMPLE = TESTDATA / "pptx" / "en_marker.pptx"
+
+
+class TestPptConvert(unittest.TestCase):
+    def _read(self, path):  # fixture bytes; skips the test when the file is absent
+        if not path.is_file():
+            self.skipTest("testdata missing")
+        return path.read_bytes()
+
+    def test_legacy_ppt_magic(self):
+        content = self._read(LEGACY_PPT)
+        self.assertTrue(is_ole_compound(content))
+        self.assertFalse(is_zip_openxml(content))
+        self.assertTrue(needs_ppt_to_pptx_conversion(content, "ppt"))
+
+    def test_pptx_does_not_need_conversion(self):
+        content = self._read(PPTX_SAMPLE)
+        self.assertTrue(is_zip_openxml(content))
+        self.assertFalse(needs_ppt_to_pptx_conversion(content, "pptx"))
+
+    def test_normalize_pptx_passthrough(self):
+        content = self._read(PPTX_SAMPLE)
+        out, ext = normalize_ppt_bytes(content, "pptx")
+        self.assertEqual(out, content)
+        self.assertEqual(ext, ".pptx")
+
+    def test_legacy_ppt_requires_soffice(self):
+        content = self._read(LEGACY_PPT)
+        if not shutil.which("soffice"):
+            with self.assertRaises(ValueError) as ctx:
+                normalize_ppt_bytes(content, "ppt")
+            self.assertIn("LibreOffice", str(ctx.exception))
+            self.skipTest("LibreOffice not available")
+        converted = convert_ppt_to_pptx_bytes(content, suffix=".ppt")
+        self.assertIsNotNone(converted)
+        self.assertTrue(is_zip_openxml(converted))
+        out, ext = normalize_ppt_bytes(content, "ppt")
+        self.assertEqual(ext, ".pptx")
+        self.assertTrue(is_zip_openxml(out))
+
+    def test_wmf_legacy_ppt_extracts_rasterized_image(self):
+        if not shutil.which("soffice"):
+            self.skipTest("LibreOffice not available")
+        if not shutil.which("convert"):
+            self.skipTest("ImageMagick convert not available")
+        data = self._read(WMF_IMAGE_PPT)
+
+        from docreader.parser.markitdown_parser import MarkitdownParser
+
+        doc = MarkitdownParser(file_type="ppt").parse_into_text(data)
+        self.assertEqual(len(doc.images), 1)
+        self.assertNotIn("bd10496_.jpg", doc.content)
+        self.assertIn("images/", doc.content)
+
+    def test_image_heavy_legacy_ppt_extracts_images(self):
+        if not shutil.which("soffice"):
+            self.skipTest("LibreOffice not available")
+        data = self._read(IMAGE_HEAVY_PPT)
+
+        from docreader.parser.markitdown_parser import MarkitdownParser
+
+        doc = MarkitdownParser(file_type="ppt").parse_into_text(data)
+        self.assertGreaterEqual(len(doc.images), 2)
+        self.assertNotIn("![](.jpg)", doc.content)
+        for ref in doc.images:
+            self.assertTrue(ref.startswith("images/"))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/docreader/tests/test_web_parser.py
+++ b/docreader/tests/test_web_parser.py
@@ -0,0 +1,42 @@
+import unittest
+
+from docreader.parser.web_parser import (
+    build_visible_text_fallback,
+    extract_markdown_from_html,
+)
+
+
+class TestWebParserHelpers(unittest.TestCase):
+    def test_extract_markdown_empty_html(self):
+        for blank_html in ("", "   "):
+            self.assertIsNone(extract_markdown_from_html(blank_html))
+
+    def test_extract_markdown_article_html(self):
+        page = """
+        <html><head><title>Demo</title></head><body>
+        <article><h1>Hello</h1><p>World paragraph with enough text for extraction.</p></article>
+        </body></html>
+        """
+        markdown = extract_markdown_from_html(page)
+        self.assertIsNotNone(markdown)
+        self.assertIn("Hello", markdown)
+
+    def test_build_fallback_too_short(self):
+        for tiny in ("short", ""):
+            self.assertIsNone(build_visible_text_fallback(tiny))
+
+    def test_build_fallback_with_title(self):
+        body = "A" * 60
+        markdown = build_visible_text_fallback(body, page_title="WeKnora")
+        self.assertIsNotNone(markdown)
+        self.assertTrue(markdown.startswith("# WeKnora"))
+        self.assertIn(body, markdown)
+
+    def test_build_fallback_without_title(self):
+        body = "B" * 60
+        # An empty title adds no heading: the fallback is the text itself.
+        self.assertEqual(build_visible_text_fallback(body, page_title=""), body)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/docreader/uv.lock
+++ b/docreader/uv.lock
@@ -463,9 +463,11 @@ dependencies = [
    { name = "grpcio" },
    { name = "grpcio-health-checking" },
    { name = "grpcio-tools" },
-    { name = "liteparse" },
    { name = "lxml" },
    { name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
+    { name = "opendataloader-pdf" },
+    { name = "openpyxl" },
+    { name = "pandas" },
    { name = "pillow" },
    { name = "playwright" },
    { name = "protobuf" },
@@ -476,6 +478,7 @@ dependencies = [
    { name = "requests" },
    { name = "textract" },
    { name = "trafilatura" },
+    { name = "xlrd" },
 ]

 [package.metadata]
@@ -484,9 +487,11 @@ requires-dist = [
    { name = "grpcio", specifier = ">=1.78.0" },
    { name = "grpcio-health-checking", specifier = ">=1.78.0" },
    { name = "grpcio-tools", specifier = ">=1.78.0" },
-    { name = "liteparse", specifier = ">=2.0.4" },
    { name = "lxml", specifier = ">=6.1.0" },
    { name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
+    { name = "opendataloader-pdf", specifier = ">=2.4.7" },
+    { name = "openpyxl", specifier = ">=3.1.0" },
+    { name = "pandas", specifier = ">=2.0.0" },
    { name = "pillow", specifier = ">=12.0.0" },
    { name = "playwright", specifier = ">=1.55.0" },
    { name = "protobuf", specifier = ">=6.33.0" },
@@ -497,6 +502,7 @@ requires-dist = [
    { name = "requests", specifier = ">=2.32.5" },
    { name = "textract", specifier = "==1.5.0" },
    { name = "trafilatura", specifier = ">=2.0.0" },
+    { name = "xlrd", specifier = ">=2.0.0" },
 ]

 [[package]]
@@ -786,37 +792,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
 ]

-[[package]]
-name = "liteparse"
-version = "2.0.4"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/0d/e7/ecf68643604a59247a0a7b2f8c73bee7415ea99e0165bb32e2838ddd0d3f/liteparse-2.0.4.tar.gz", hash = "sha256:17f6119f38e80b956c1ce3dc998ea7b0a8e80777ce1f49178f2b14bb17b35a9c", size = 115487, upload-time = "2026-05-30T06:32:12.351Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/b0/4f5007a52ef13679437a892a06ea58448b825de7ea78276e19b9d7fb9dcb/liteparse-2.0.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6df1e1199ffbeb2191bb64d7fcbff6af6bdfd1592973e0ad67a82eb09d377c08", size = 13027870, upload-time = "2026-05-30T06:31:11.022Z" },
-    { url = "https://files.pythonhosted.org/packages/83/2f/c7977a2d6f376e31c8c465ee010c238e27e06cbb2c3200d63f41983e40db/liteparse-2.0.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:35a72946b965d3b6b87a602051919e7ce243da15ad143d301152fb5e8cd0f6d2", size = 13149255, upload-time = "2026-05-30T06:31:13.636Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/9d/e7f1a1b8cb14ac867b1220fdb0c87bfe07b86c69bf98578573ab37b1a103/liteparse-2.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:c1bbc8b7206b8bfbf7aabc5341d2cf851b7464641d58375bd218b4e1dd3517f9", size = 11115466, upload-time = "2026-05-30T06:31:16.201Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/2d/be89a429a6a6bc78ce8d620974a4f8fbe9f566ea3592a2f1da8dc6bdda4a/liteparse-2.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:537ab6347a384f81980e48cc181d6cd33fc6ad2b7478e3db61350076744d952e", size = 11029024, upload-time = "2026-05-30T06:31:19.11Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/c8/7429622d86bf00ceaec95bf211adf1c9a7bdf46f8c2cd806685f9c02c0f1/liteparse-2.0.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:feae0c530197130cb38f176d718eeae639d9091264aa5f954835986c59470813", size = 13028074, upload-time = "2026-05-30T06:31:21.725Z" },
-    { url = "https://files.pythonhosted.org/packages/21/d0/a97174ae281d353251994ed080c8855ea9b0b5d81a60ab3b6b065e911c49/liteparse-2.0.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2012a3a9b5a3f7e13ce34b5a770158971da43bb9d266c7c5a3ea62bdda7ca851", size = 13148977, upload-time = "2026-05-30T06:31:24.498Z" },
-    { url = "https://files.pythonhosted.org/packages/60/48/f41ebe428d8d8d70c53ddd47523baa7300c5cc96e404417d7af25578be01/liteparse-2.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:2d05d10f0d14b1beb34ef8c5e9a14d6cc966adf19f60c7ea1ec5717adc4c986f", size = 11115791, upload-time = "2026-05-30T06:31:26.961Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/58/be78c7c47147aeb1350d475336c6c2e17d5aa513be9244e9d95a170ced34/liteparse-2.0.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:87680616fae276b04ace6e5fc5e4e0c93980391b0d46c2d66d72c0742a3cb19e", size = 11026045, upload-time = "2026-05-30T06:31:29.405Z" },
-    { url = "https://files.pythonhosted.org/packages/86/1f/105ccdd9bc4608a836fe409394d68e8765e699fa7393c2f2f464c612057f/liteparse-2.0.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:414599a922aa51f567fa939183929579d1668ef74846fe25f7f46742bb31fcd8", size = 13022571, upload-time = "2026-05-30T06:31:32.321Z" },
-    { url = "https://files.pythonhosted.org/packages/58/9f/4bf4e9b112b47025ae085503fe9cbf13631673ffc41bfb864a3091285c22/liteparse-2.0.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:efdaa6b471084a1f4594574555eb6abb5f85de25f2155c8d539542239eacaa56", size = 13146871, upload-time = "2026-05-30T06:31:34.705Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/f0/bf10611e409732bd4e19f0fc0faf3194040e8e09bb75a166ee126d09b70f/liteparse-2.0.4-cp312-cp312-win_amd64.whl", hash = "sha256:fb67326ba957388214762acea35d24cf0d1230ae6a2fe1fdeaf74024e92e3c40", size = 11116682, upload-time = "2026-05-30T06:31:36.992Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/b5/02ed5fff6418fdc970688190eab4470f4f9c116f4de1e39a7deea0d9968a/liteparse-2.0.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8f02c6e0d8f71da671a3527d52d8f1e2c42fddebf81d1b4931c3d035e4ec1e6a", size = 11025231, upload-time = "2026-05-30T06:31:39.696Z" },
-    { url = "https://files.pythonhosted.org/packages/be/d7/b4633483502940d43d583f8057e0aed68b9091087a86d021f8bd7558ba0b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:924e3f669341e22e625b13d08535644d1dfd779bc6781e4ab6f6e54ea90a53d6", size = 13022754, upload-time = "2026-05-30T06:31:42.434Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/e0/3938561ad66d4a216922c8e1e6a878f63df82ce5f00f15a935f779fb7c5b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:137f169002f3abe21e3dd2e6781fbd86841096a0f3b0162afc1fd64eb21fa607", size = 13146432, upload-time = "2026-05-30T06:31:46.706Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/7f/a2017df8031677d7940ad1ce33640219aa28defae4a8171844ea8bed68ca/liteparse-2.0.4-cp313-cp313-win_amd64.whl", hash = "sha256:098fba3ecb2337f78426d9e077d1f70bc75871d4387ab8c3774b0cc5d26b890d", size = 11116383, upload-time = "2026-05-30T06:31:49.546Z" },
-    { url = "https://files.pythonhosted.org/packages/59/63/b2bb03bc30103e93c87695f63eae3ed007b08796a6cc06ea29acace54c4a/liteparse-2.0.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8aeaf821151aaaa854294f3499d64264dbea7d10e682fa9a2443f9177cd444c6", size = 11024196, upload-time = "2026-05-30T06:31:52.091Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/c1/6dedc6b4325aa8de3249694123a74bc9506e0d65a28c85aa5fad4bfdea5c/liteparse-2.0.4-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:d2efbaf7453d2bedc86db51b2b808078567817d7fc537122389b65a317927902", size = 13022936, upload-time = "2026-05-30T06:31:54.619Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/04/7e7c3a8edd01c9904b6eef76bf4a008f987a5df64b8334c61e742861ac84/liteparse-2.0.4-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:34c53d9cefa35f77dc67a19a875e6dca32b4f35006c2015a22eb30c9c810653b", size = 13146821, upload-time = "2026-05-30T06:31:57.284Z" },
-    { url = "https://files.pythonhosted.org/packages/66/f4/da191e881cad5941dc0065782497eb81027bc3f48ac0a3143deab094be33/liteparse-2.0.4-cp314-cp314-win_amd64.whl", hash = "sha256:6546ee0359dc56eebd9f45008bb59708118c234140ecf466f6c7121d9161d9e4", size = 11114558, upload-time = "2026-05-30T06:31:59.799Z" },
-    { url = "https://files.pythonhosted.org/packages/16/59/c554f376c0bdd1bf4c313ac5d77a34817740f021ab6ada9d3226a23fa4b6/liteparse-2.0.4-cp315-cp315-manylinux_2_28_aarch64.whl", hash = "sha256:7c02d0bb31cd5aefa3297ce6e58388abd6f3e109c62ac0fdeef07d8eac4b769e", size = 13023454, upload-time = "2026-05-30T06:32:02.464Z" },
-    { url = "https://files.pythonhosted.org/packages/89/96/04c595ab45162d81bc73218870d1459560428c3f40957e594a6c1c5ea2be/liteparse-2.0.4-cp315-cp315-manylinux_2_28_x86_64.whl", hash = "sha256:acdf3c76cb3215f8d389a935b6b68007fac2ffa9ce0b681dd53650b69d580521", size = 13146859, upload-time = "2026-05-30T06:32:05.175Z" },
-    { url = "https://files.pythonhosted.org/packages/97/9c/59cdd88ebc6c27312ea6cbd0a894002e78b6f8a3dead2b2bf60d7febba85/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4cf31cb3987df1190e59b73d9f10976e538ff577f41c40281fd14b84fe4f9da1", size = 13030767, upload-time = "2026-05-30T06:32:07.9Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/ae/9b85e510ddb390ed63b407851d412152b7006487d06703d931f6a0b1414e/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:a1f9cb9c24f2df0d4f71ddd66ddb474bfdec8a434ecc1428b791f83aab2a688b", size = 13152103, upload-time = "2026-05-30T06:32:10.457Z" },
-]
-
 [[package]]
 name = "lxml"
 version = "6.1.0"
@@ -1294,6 +1269,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
 ]

+[[package]]
+name = "opendataloader-pdf"
+version = "2.4.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/5a/98cd2079f2828f7886ee447eae21ee60a858930596aebcc8d275a1fe2b12/opendataloader_pdf-2.4.7.tar.gz", hash = "sha256:a16e995f2f526d706045218d9e359a31f50371a0bc0e3bb1bc15abb467c08fb7", size = 22554865, upload-time = "2026-05-27T10:04:54.285Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/14/f897eabf04eab4e6a40dce9214d921558165f3eaed68335892a5b1a004d0/opendataloader_pdf-2.4.7-py3-none-any.whl", hash = "sha256:1c359183650f4c012875010c156f13b6d3477b00762b8e3fbd8479fa03feb628", size = 22568934, upload-time = "2026-05-27T10:04:49.902Z" },
+]
+
 [[package]]
 name = "openpyxl"
 version = "3.1.5"
--- a/frontend/src/api/system/index.ts
+++ b/frontend/src/api/system/index.ts
@@ -86,6 +86,15 @@ export interface ParserEngineConfig {
  mineru_cloud_enable_table?: boolean | null
  mineru_cloud_enable_ocr?: boolean | null
  mineru_cloud_language?: string
+  // PaddleOCR-VL 自建参数
+  paddleocr_vl_endpoint?: string
+  paddleocr_vl_use_seal_recognition?: boolean | null
+  paddleocr_vl_use_chart_recognition?: boolean | null
+  // PaddleOCR-VL 云 API 参数
+  paddleocr_vl_cloud_token?: string
+  paddleocr_vl_cloud_model?: string
+  paddleocr_vl_cloud_use_seal_recognition?: boolean | null
+  paddleocr_vl_cloud_use_chart_recognition?: boolean | null
 }

 export interface ParserEnginesResponse {
--- a/frontend/src/components/doc-content.vue
+++ b/frontend/src/components/doc-content.vue
@@ -11,6 +11,7 @@ import { onMounted, ref, nextTick, onUnmounted, watch, computed } from "vue";
 import { downKnowledgeDetails, deleteGeneratedQuestion, getChunkByIdOnly, previewKnowledgeFile } from "@/api/knowledge-base/index";
 import { MessagePlugin, DialogPlugin } from "tdesign-vue-next";
 import { sanitizeHTML, safeMarkdownToHTML, createSafeImage, isValidImageURL, hydrateProtectedFileImages, isValidURL } from '@/utils/security';
+import { normalizeSpuriousTablePrefixes } from '@/utils/markdownTableNormalize';
 import { openMermaidFullscreen } from '@/utils/mermaidViewer';
 import { useI18n } from 'vue-i18n';
 import { useAuthStore } from '@/stores/auth';
@@ -283,8 +284,15 @@ let page = 1;
 let loadingChunks = false;
 let pendingRequestedPage: number | null = null;
 let pendingChunksBeforeLoad = 0;
-let doc = null;
+const CHUNK_PAGE_SIZE = 25;
+/** Scroll container for the main doc drawer (not the first .t-drawer__body on the page). */
+let docScrollEl: HTMLElement | null = null;
 let mdContentWrap = ref()
+// Drawer uses attach="body", so markdown nodes live outside mdContentWrap in the DOM.
+const docMarkdownRoot = ref<HTMLElement | null>(null)
+
+const getMarkdownRenderRoot = (): ParentNode | null =>
+  docMarkdownRoot.value ?? (mdContentWrap.value as ParentNode | null) ?? null
 let url = ref('')
 // 视图模式：chunks / merged / preview
 // file 类型默认「预览」，URL / 手动创建 默认「全文」
@@ -365,18 +373,40 @@ const mergeChunks = (chunks: any[]): string => {
  return merged;
 };

+const findDocDrawerScrollEl = (): HTMLElement | null =>
+  document.querySelector<HTMLElement>('.doc-main-drawer .t-drawer__body');
+
+const unbindDrawerScroll = () => {
+  // Nothing is bound (or it was already released): leave state untouched.
+  if (!docScrollEl) return;
+  docScrollEl.removeEventListener('scroll', handleDetailsScroll);
+  docScrollEl = null;
+};
+
+const bindDrawerScroll = () => {
+  // Release any previous listener first so the handler is never bound twice.
+  unbindDrawerScroll();
+  const target = findDocDrawerScrollEl();
+  docScrollEl = target;
+  target?.addEventListener('scroll', handleDetailsScroll, { passive: true });
+};
+
 onMounted(() => {
  loadTraceDrawerWidth();
  loadMainDrawerWidth();
  window.addEventListener('resize', onTraceDrawerWindowResize, { passive: true });
-  nextTick(() => {
-    const drawers = document.getElementsByClassName('t-drawer__body');
-    if (drawers && drawers.length > 0) {
-      doc = drawers[0];
-      doc.addEventListener('scroll', handleDetailsScroll);
-    }
-  })
-})
+});
+
+watch(() => props.visible, (visible) => {
+  if (visible) {
+    nextTick(() => {
+      bindDrawerScroll();
+      maybeLoadMoreChunks();
+    });
+  } else {
+    unbindDrawerScroll();
+  }
+});
 watch(() => props.details?.id, () => {
  page = 1;
  loadingChunks = false;
@@ -396,15 +426,16 @@ watch(() => props.details?.chunkLoading, (val) => {
    pendingRequestedPage = null;
    pendingChunksBeforeLoad = 0;
    loadingChunks = false;
+    if (props.visible) {
+      nextTick(() => maybeLoadMoreChunks());
+    }
  }
 });
 onUnmounted(() => {
  window.removeEventListener('resize', onTraceDrawerWindowResize);
  cleanupTraceDrawerResize();
  cleanupMainDrawerResize();
-  if (doc) {
-    doc.removeEventListener('scroll', handleDetailsScroll);
-  }
+  unbindDrawerScroll();
  if (audioBlobUrl.value) {
    URL.revokeObjectURL(audioBlobUrl.value);
  }
@@ -561,7 +592,10 @@ const loadAudioPreview = async () => {
 };
 const runMarkdownPostRenderPipeline = async () => {
  await nextTick();
-  const renderRoot = mdContentWrap.value as ParentNode;
+  const renderRoot = getMarkdownRenderRoot();
+  if (!renderRoot) {
+    return;
+  }
  await hydrateProtectedFileImages(renderRoot);
  const images = renderRoot?.querySelectorAll?.('img.markdown-image') as NodeListOf<HTMLImageElement> | undefined;
  if (images) {
@@ -576,26 +610,29 @@ const runMarkdownPostRenderPipeline = async () => {
  await renderMermaidDiagrams();
 };

-watch(() => props.details.md, (newVal) => {
+watch(() => props.details.md, () => {
  runMarkdownPostRenderPipeline();
-}, { immediate: true, deep: true })
+}, { immediate: true, deep: true, flush: 'post' })

 watch(() => viewMode.value, (mode) => {
  if ((mode === 'chunks' || mode === 'merged') && props.visible) {
    runMarkdownPostRenderPipeline();
+    if (mode === 'chunks') {
+      nextTick(() => maybeLoadMoreChunks());
+    }
  }
-});
+}, { flush: 'post' });

 watch(() => props.visible, (visible) => {
  if (visible && (viewMode.value === 'chunks' || viewMode.value === 'merged')) {
    runMarkdownPostRenderPipeline();
  }
-});
+}, { flush: 'post' });

 // 渲染 Mermaid 图表的函数
 const renderMermaidDiagrams = async () => {
  try {
-    const mermaidElements = mdContentWrap.value?.querySelectorAll('.mermaid');
+    const mermaidElements = getMarkdownRenderRoot()?.querySelectorAll('.mermaid');
    console.log('[Mermaid] Found mermaid elements:', mermaidElements?.length);
    if (mermaidElements && mermaidElements.length > 0) {
      await mermaid.run({
@@ -624,12 +661,13 @@ const handleMermaidClick = (e: Event) => {

 // 为 Mermaid 容器绑定点击全屏事件（绑定在 div 上，不是 SVG 上）
 const bindMermaidClickEvents = () => {
-  if (!mdContentWrap.value) {
-    console.log('[Mermaid] mdContentWrap is null');
+  const renderRoot = getMarkdownRenderRoot();
+  if (!renderRoot) {
+    console.log('[Mermaid] markdown render root is null');
    return;
  }
  // 绑定在 .mermaid div 上，而不是 SVG 上
-  const mermaidDivs = mdContentWrap.value.querySelectorAll('.mermaid');
+  const mermaidDivs = renderRoot.querySelectorAll('.mermaid');
  console.log('[Mermaid] Found mermaid divs:', mermaidDivs.length);
  mermaidDivs.forEach((div, index) => {
    const divEl = div as HTMLElement;
@@ -663,6 +701,9 @@ const processMarkdown = (markdownText) => {
  // 处理被 <p> 包裹的表格行，转换为正常的表格行，并在前后补空行
  processedText = processedText.replace(/<p>\s*(\|[\s\S]*?\|)\s*<\/p>/gi, '\n$1\n');

+  // MarkItDown 常在表格前插入空行 + 分隔行，渲染会出现多余空行
+  processedText = normalizeSpuriousTablePrefixes(processedText);
+
  // 保留表格单元格中的 <br>，不转成换行，避免打散表格；其他区域原样交给 marked 处理

  // 先预处理数学定界符，再做安全预处理
@@ -683,7 +724,8 @@ const processMarkdown = (markdownText) => {
 };
 const handleClose = () => {
  emit("closeDoc", false);
-  if (doc) doc.scrollTop = 0;
+  const scrollEl = docScrollEl || findDocDrawerScrollEl();
+  if (scrollEl) scrollEl.scrollTop = 0;
  viewMode.value = 'merged';
 };

@@ -973,19 +1015,41 @@ const downloadFile = () => {
      MessagePlugin.error(t('file.downloadFailed'));
    });
 };
+/** Emit 'getDoc' for the next chunk page unless a load is in flight or every chunk is already loaded. */
+const requestNextChunkPage = () => {
+  if (loadingChunks || props.details?.chunkLoading) return;
+  const chunkTotal = props.details?.total ?? 0;
+  const chunkLoaded = props.details?.md?.length ?? 0;
+  const lastPage = Math.ceil(chunkTotal / CHUNK_PAGE_SIZE);
+  if (chunkLoaded >= chunkTotal || chunkTotal === 0 || page + 1 > lastPage) return;
+  page++;
+  loadingChunks = true;
+  pendingRequestedPage = page;
+  pendingChunksBeforeLoad = chunkLoaded;
+  emit('getDoc', page);
+};
+
+/** Prefetch while the content cannot scroll yet: a list shorter than the drawer never emits scroll events. */
+const maybeLoadMoreChunks = () => {
+  const idle = props.visible && !loadingChunks && !props.details?.chunkLoading;
+  if (!idle) return;
+  const scroller = docScrollEl || findDocDrawerScrollEl();
+  if (!scroller) return;
+  const loadedCount = props.details?.md?.length ?? 0;
+  const totalCount = props.details?.total ?? 0;
+  if (loadedCount >= totalCount) return;
+  // 8px tolerance: content that overflows by no more than that still counts as not scrollable.
+  const fitsWithoutScroll = scroller.scrollHeight <= scroller.clientHeight + 8;
+  if (fitsWithoutScroll) requestNextChunkPage();
+};
+
 const handleDetailsScroll = () => {
-  if (doc && !loadingChunks) {
-    let pageNum = Math.ceil(props.details.total / 25);
-    const { scrollTop, scrollHeight, clientHeight } = doc;
-    if (scrollTop + clientHeight >= scrollHeight - 8) {
-      if (props.details.md.length < props.details.total && page + 1 <= pageNum) {
-        page++;
-        loadingChunks = true;
-        pendingRequestedPage = page;
-        pendingChunksBeforeLoad = props.details.md.length;
-        emit("getDoc", page);
-      }
-    }
+  if (loadingChunks || props.details?.chunkLoading) return; // skip while a chunk page is loading
+  const el = docScrollEl || findDocDrawerScrollEl();
+  if (!el) return;
+  const { scrollTop, scrollHeight, clientHeight } = el;
+  if (scrollTop + clientHeight >= scrollHeight - 8) { // viewport bottom is within 8px of the content end
+    requestNextChunkPage();
   }
 };
 </script>
@@ -1052,6 +1116,7 @@ const handleDetailsScroll = () => {
        </div>
      </t-drawer>

+      <div ref="docMarkdownRoot" class="doc-markdown-root">
      <!-- URL类型专属区域（保留：source 是真实链接，不与标题重复） -->
      <div v-if="details.type === 'url'" class="url_box">
        <span class="label">{{ $t('knowledgeBase.urlSource') }}</span>
@@ -1203,6 +1268,7 @@ const handleDetailsScroll = () => {
        <DocumentPreview :knowledgeId="details.id" :fileType="details.file_type" :fileName="details.title"
          :active="viewMode === 'preview'" />
      </div>
+      </div>

    </t-drawer>
  </div>
--- a/frontend/src/i18n/locales/en-US.ts
+++ b/frontend/src/i18n/locales/en-US.ts
@@ -951,6 +951,8 @@ export default {
      selfHostedEndpoint: 'Self-hosted Endpoint',
      formulaRecognition: 'Formula Recognition',
      tableRecognition: 'Table Recognition',
+      sealRecognition: 'Seal Recognition',
+      chartRecognition: 'Chart Recognition',
      language: 'Language',
      testConnection: 'Test Connection',
      saveConfig: 'Save Configuration',
@@ -971,6 +973,9 @@ export default {
      serverUrl: 'Server URL',
      vlmServerUrlPlaceholder: 'e.g. http://your-vllm-server:8000',
      vlmServerUrlHint: 'Required when Backend is vlm-http-client or hybrid-http-client',
+      paddleocrVlEndpointPlaceholder: 'e.g. http://your-paddleocr-vl:8080',
+      paddleocrVlEndpointHint: 'Base URL of the full PaddleOCR-VL pipeline service; no /layout-parsing suffix needed',
+      paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
    },
    storage: {
      title: 'Storage Engine',
@@ -2814,6 +2819,9 @@ export default {
          max_owned_per_user: 'Max tenants owned per user',
          default_storage_quota_gb: 'Default storage quota for new tenants (GB)',
        },
+        asynq: {
+          concurrency: 'Async task worker concurrency',
+        },
      },
      enumLabels: {
        auth: {
@@ -4240,6 +4248,14 @@ export default {
          name: 'MinerU Cloud',
          desc: 'MinerU Cloud API',
        },
+        paddleocr_vl: {
+          name: 'PaddleOCR-VL',
+          desc: 'PaddleOCR-VL self-hosted service',
+        },
+        paddleocr_vl_cloud: {
+          name: 'PaddleOCR-VL Cloud',
+          desc: 'PaddleOCR-VL Cloud API',
+        },
        weknoracloud: {
          name: 'WeKnora Cloud',
          desc: 'Document parsing via WeKnora Cloud',
--- a/frontend/src/i18n/locales/ko-KR.ts
+++ b/frontend/src/i18n/locales/ko-KR.ts
@@ -811,6 +811,8 @@ export default {
      selfHostedEndpoint: '자체 호스팅 엔드포인트',
      formulaRecognition: '수식 인식',
      tableRecognition: '표 인식',
+      sealRecognition: '인장 인식',
+      chartRecognition: '차트 인식',
      language: '언어',
      testConnection: '연결 테스트',
      saveConfig: '설정 저장',
@@ -831,6 +833,9 @@ export default {
      serverUrl: '서버 URL',
      vlmServerUrlPlaceholder: '예: http://your-vllm-server:8000',
      vlmServerUrlHint: 'Backend가 vlm-http-client 또는 hybrid-http-client인 경우 필요',
+      paddleocrVlEndpointPlaceholder: '예: http://your-paddleocr-vl:8080',
+      paddleocrVlEndpointHint: 'PaddleOCR-VL 전체 서비스(pipeline) 주소를 입력하세요. /layout-parsing 접미사는 불필요합니다',
+      paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
    },
    storage: {
      title: '스토리지 엔진',
@@ -2053,6 +2058,9 @@ export default {
          max_owned_per_user: "사용자당 최대 테넌트 수",
          default_storage_quota_gb: "신규 테넌트 기본 저장 용량 (GB)",
        },
+        asynq: {
+          concurrency: "비동기 작업 워커 동시 처리 수",
+        },
      },
      enumLabels: {
        auth: {
@@ -4302,6 +4310,14 @@ export default {
          name: 'MinerU Cloud',
          desc: 'MinerU Cloud API',
        },
+        paddleocr_vl: {
+          name: 'PaddleOCR-VL',
+          desc: 'PaddleOCR-VL 자체 호스팅 서비스',
+        },
+        paddleocr_vl_cloud: {
+          name: 'PaddleOCR-VL Cloud',
+          desc: 'PaddleOCR-VL Cloud API',
+        },
        weknoracloud: {
          name: 'WeKnora Cloud',
          desc: 'WeKnora Cloud를 통한 문서 파싱',
--- a/frontend/src/i18n/locales/ru-RU.ts
+++ b/frontend/src/i18n/locales/ru-RU.ts
@@ -866,6 +866,8 @@ export default {
      selfHostedEndpoint: 'Собственная конечная точка',
      formulaRecognition: 'Распознавание формул',
      tableRecognition: 'Распознавание таблиц',
+      sealRecognition: 'Распознавание печатей',
+      chartRecognition: 'Распознавание диаграмм',
      language: 'Язык',
      testConnection: 'Проверить с текущими параметрами',
      saveConfig: 'Сохранить конфигурацию',
@@ -882,7 +884,10 @@ export default {
      languagePlaceholder: 'напр. ch, en, ja (по умолчанию ch)',
      mineruCloudApiKeyPlaceholder: 'MinerU Cloud API Key',
      vlmLabel: 'vlm (визуальная языковая модель)',
-      mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)'
+      mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)',
+      paddleocrVlEndpointPlaceholder: 'напр. http://your-paddleocr-vl:8080',
+      paddleocrVlEndpointHint: 'Адрес полного сервиса PaddleOCR-VL (pipeline); суффикс /layout-parsing не требуется',
+      paddleocrVlCloudTokenPlaceholder: 'Токен PaddleOCR-VL AI Studio'
    },
    storage: {
      title: 'Хранилище',
@@ -1772,6 +1777,9 @@ export default {
          max_owned_per_user: 'Максимум тенантов на пользователя',
          default_storage_quota_gb: 'Квота хранилища для новых тенантов по умолчанию (ГБ)',
        },
+        asynq: {
+          concurrency: 'Параллелизм воркеров асинхронных задач',
+        },
      },
      enumLabels: {
        auth: {
@@ -3802,6 +3810,14 @@ export default {
          name: 'MinerU Cloud',
          desc: 'MinerU Cloud API',
        },
+        paddleocr_vl: {
+          name: 'PaddleOCR-VL',
+          desc: 'Самостоятельно развёрнутый сервис PaddleOCR-VL',
+        },
+        paddleocr_vl_cloud: {
+          name: 'PaddleOCR-VL Cloud',
+          desc: 'PaddleOCR-VL Cloud API',
+        },
        weknoracloud: {
          name: 'WeKnora Cloud',
          desc: 'Парсинг документов через WeKnora Cloud',
--- a/frontend/src/i18n/locales/zh-CN.ts
+++ b/frontend/src/i18n/locales/zh-CN.ts
@@ -807,6 +807,8 @@ export default {
      selfHostedEndpoint: "自建端点",
      formulaRecognition: "公式识别",
      tableRecognition: "表格识别",
+      sealRecognition: "印章识别",
+      chartRecognition: "图表识别",
      language: "语言",
      testConnection: "测试连接",
      saveConfig: "保存配置",
@@ -827,6 +829,9 @@ export default {
      serverUrl: "服务器地址",
      vlmServerUrlPlaceholder: "如 http://your-vllm-server:8000",
      vlmServerUrlHint: "当 Backend 选择 vlm-http-client 或 hybrid-http-client 时需要填写",
+      paddleocrVlEndpointPlaceholder: "如 http://your-paddleocr-vl:8080",
+      paddleocrVlEndpointHint: "填写 PaddleOCR-VL 完整服务（pipeline）地址，无需 /layout-parsing 后缀",
+      paddleocrVlCloudTokenPlaceholder: "PaddleOCR-VL 飞桨星河社区 Token",
    },
    storage: {
      title: "存储引擎",
@@ -2032,6 +2037,9 @@ export default {
          max_owned_per_user: "每用户最大租户数",
          default_storage_quota_gb: "新租户默认存储配额 (GB)",
        },
+        asynq: {
+          concurrency: "异步任务并发数",
+        },
      },
      enumLabels: {
        auth: {
@@ -4234,6 +4242,14 @@ export default {
          name: "MinerU Cloud",
          desc: "MinerU Cloud API",
        },
+        paddleocr_vl: {
+          name: "PaddleOCR-VL",
+          desc: "PaddleOCR-VL 自部署服务",
+        },
+        paddleocr_vl_cloud: {
+          name: "PaddleOCR-VL Cloud",
+          desc: "PaddleOCR-VL 云 API",
+        },
        weknoracloud: {
          name: "WeKnora Cloud",
          desc: "使用 WeKnora Cloud 进行文档解析",
--- a/frontend/src/utils/markdownTableNormalize.ts
+++ b/frontend/src/utils/markdownTableNormalize.ts
@@ -0,0 +1,129 @@
+/** Matches a GFM alignment cell (---, :---, ---:, :---:). */
+const SEPARATOR_CELL = /^:?-{3,}:?$/;
+
+/** Marker of a fenced code block: up to 3 spaces of indent, then 3+ backticks or tildes. */
+const FENCE_MARKER = /^ {0,3}(`{3,}|~{3,})/;
+
+/** Split a row on "|" while keeping backslash-escaped pipes ("\|") inside their cell. */
+function splitOnUnescapedPipes(text: string): string[] {
+  const parts: string[] = [];
+  let current = '';
+  for (let idx = 0; idx < text.length; idx += 1) {
+    const ch = text[idx];
+    if (ch === '\\' && idx + 1 < text.length) {
+      // Keep the escape pair verbatim so "\|" never acts as a cell delimiter.
+      current += ch + text[idx + 1];
+      idx += 1;
+    } else if (ch === '|') {
+      parts.push(current);
+      current = '';
+    } else {
+      current += ch;
+    }
+  }
+  parts.push(current);
+  return parts;
+}
+
+/** Trimmed cell texts of a row, without the blank fragments around the outer pipes; [] if the line does not start with "|". */
+function splitRowCells(line: string): string[] {
+  const inner = line.trim();
+  if (!inner.startsWith('|')) {
+    return [];
+  }
+  let parts = splitOnUnescapedPipes(inner);
+  if (parts.length && parts[0].trim() === '') {
+    parts = parts.slice(1);
+  }
+  if (parts.length && parts[parts.length - 1].trim() === '') {
+    parts = parts.slice(0, -1);
+  }
+  return parts.map((part) => part.trim());
+}
+
+/** A line counts as a table row when it starts with "|" and contains at least one more "|". */
+function isTableRow(line: string): boolean {
+  const stripped = line.trim();
+  return stripped.startsWith('|') && stripped.includes('|', 1);
+}
+
+/** True when the row has cells and every cell is a GFM alignment cell. */
+function isSeparatorRow(line: string): boolean {
+  const cells = splitRowCells(line);
+  return cells.length > 0 && cells.every((cell) => SEPARATOR_CELL.test(cell));
+}
+
+/** True when the row has cells and all of them are blank. */
+function isEmptyRow(line: string): boolean {
+  const cells = splitRowCells(line);
+  return cells.length > 0 && cells.every((cell) => cell === '');
+}
+
+/** Build a "| --- | --- |" separator with one column per cell of headerLine. */
+function separatorRowFor(headerLine: string): string {
+  const cells = splitRowCells(headerLine);
+  return `| ${cells.map(() => '---').join(' | ')} |`;
+}
+
+/** Drop leading empty rows and one leading separator, then ensure the second row is a separator. */
+function normalizeTableBlock(block: string[]): string[] {
+  let rows = [...block];
+  while (rows.length && isEmptyRow(rows[0])) {
+    rows.shift();
+  }
+  if (rows.length && isSeparatorRow(rows[0])) {
+    rows.shift();
+  }
+  if (rows.length >= 2 && !isSeparatorRow(rows[1])) {
+    rows = [rows[0], separatorRowFor(rows[0]), ...rows.slice(1)];
+  }
+  return rows;
+}
+
+/** True when line closes the fence opened by openFence: same character, at least as long, nothing else on the line. */
+function closesFence(line: string, openFence: string): boolean {
+  const marker = FENCE_MARKER.exec(line)?.[1];
+  return !!marker && marker[0] === openFence[0] && marker.length >= openFence.length && line.trim() === marker;
+}
+
+/**
+ * Fix MarkItDown-style tables: empty row + separator before real rows.
+ * Lines inside fenced code blocks are copied through untouched.
+ */
+export function normalizeSpuriousTablePrefixes(content: string): string {
+  const lines = content.split('\n');
+  const out: string[] = [];
+  let openFence: string | null = null;
+  let i = 0;
+  while (i < lines.length) {
+    const line = lines[i];
+    if (openFence !== null) {
+      // Inside a fence: copy verbatim and only watch for the closing marker.
+      out.push(line);
+      if (closesFence(line, openFence)) {
+        openFence = null;
+      }
+      i += 1;
+      continue;
+    }
+    const fence = FENCE_MARKER.exec(line);
+    if (fence) {
+      openFence = fence[1];
+      out.push(line);
+      i += 1;
+      continue;
+    }
+    if (!isTableRow(line)) {
+      out.push(line);
+      i += 1;
+      continue;
+    }
+    const block: string[] = [];
+    while (i < lines.length && isTableRow(lines[i])) {
+      block.push(lines[i]);
+      i += 1;
+    }
+    out.push(...normalizeTableBlock(block));
+  }
+  return out.join('\n');
+}
--- a/frontend/src/views/settings/ParserEngineSettings.vue
+++ b/frontend/src/views/settings/ParserEngineSettings.vue
@@ -311,6 +311,60 @@
            />
          </div>
        </section>
+
+        <!-- Section 3 — paddleocr_vl 自建配置 -->
+        <section v-if="currentEngine.Name === 'paddleocr_vl'" class="setting-drawer__section">
+          <h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
+
+          <div class="form-item">
+            <label class="form-label required">{{ t('settings.parser.selfHostedEndpoint') }}</label>
+            <t-input
+              v-model="config.paddleocr_vl_endpoint"
+              :placeholder="$t('settings.parser.paddleocrVlEndpointPlaceholder')"
+              clearable
+            />
+            <p class="form-desc">{{ $t('settings.parser.paddleocrVlEndpointHint') }}</p>
+          </div>
+          <div class="form-item">
+            <label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
+            <div class="form-toggles">
+              <t-checkbox v-model="config.paddleocr_vl_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
+              <t-checkbox v-model="config.paddleocr_vl_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
+            </div>
+          </div>
+        </section>
+
+        <!-- Section 3 — paddleocr_vl_cloud 云 API 配置 -->
+        <section v-if="currentEngine.Name === 'paddleocr_vl_cloud'" class="setting-drawer__section">
+          <h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
+
+          <div class="form-item">
+            <label class="form-label required">Token</label>
+            <t-input
+              v-model="config.paddleocr_vl_cloud_token"
+              type="password"
+              :placeholder="$t('settings.parser.paddleocrVlCloudTokenPlaceholder')"
+              clearable
+            >
+              <template #prefix-icon><t-icon name="lock-on" /></template>
+            </t-input>
+          </div>
+          <div class="form-item">
+            <label class="form-label">Model</label>
+            <t-input
+              v-model="config.paddleocr_vl_cloud_model"
+              placeholder="PaddleOCR-VL-1.6"
+              clearable
+            />
+          </div>
+          <div class="form-item">
+            <label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
+            <div class="form-toggles">
+              <t-checkbox v-model="config.paddleocr_vl_cloud_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
+              <t-checkbox v-model="config.paddleocr_vl_cloud_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
+            </div>
+          </div>
+        </section>
      </div>
    </SettingDrawer>
  </div>
@@ -336,7 +390,7 @@ const { t } = useI18n()
 const uiStore = useUIStore()
 const authStore = useAuthStore()

-const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud'])
+const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud', 'paddleocr_vl', 'paddleocr_vl_cloud'])

 /** 各解析引擎的项目/官方文档地址 */
 const ENGINE_DOC_LINKS: Record<string, string> = {
@@ -344,6 +398,8 @@ const ENGINE_DOC_LINKS: Record<string, string> = {
  markitdown: 'https://github.com/microsoft/markitdown',
  mineru: 'https://github.com/opendatalab/MinerU',
  mineru_cloud: 'https://mineru.net/apiManage/docs',
+  paddleocr_vl: 'https://github.com/PaddlePaddle/PaddleOCR',
+  paddleocr_vl_cloud: 'https://aistudio.baidu.com/paddleocr',
 }

 /** 解析引擎配置默认值（与 DocReader/Python 侧一致） */
@@ -363,6 +419,13 @@ const DEFAULT_PARSER_CONFIG: ParserEngineConfig = {
  mineru_cloud_enable_table: true,
  mineru_cloud_enable_ocr: true,
  mineru_cloud_language: 'ch',
+  paddleocr_vl_endpoint: '',
+  paddleocr_vl_use_seal_recognition: true,
+  paddleocr_vl_use_chart_recognition: false,
+  paddleocr_vl_cloud_token: '',
+  paddleocr_vl_cloud_model: 'PaddleOCR-VL-1.6',
+  paddleocr_vl_cloud_use_seal_recognition: true,
+  paddleocr_vl_cloud_use_chart_recognition: false,
 }

 const engines = ref<ParserEngineInfo[]>([])
@@ -407,6 +470,8 @@ const ENGINE_ORDER: Record<string, number> = {
  markitdown: 3,
  mineru: 4,
  mineru_cloud: 5,
+  paddleocr_vl: 6,
+  paddleocr_vl_cloud: 7,
 }

 const sortedEngines = computed(() => {
@@ -491,6 +556,13 @@ async function loadConfig() {
      mineru_cloud_enable_table: data?.mineru_cloud_enable_table ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_table ?? true,
      mineru_cloud_enable_ocr: data?.mineru_cloud_enable_ocr ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_ocr ?? true,
      mineru_cloud_language: data?.mineru_cloud_language ?? DEFAULT_PARSER_CONFIG.mineru_cloud_language ?? 'ch',
+      paddleocr_vl_endpoint: data?.paddleocr_vl_endpoint ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_endpoint ?? '',
+      paddleocr_vl_use_seal_recognition: data?.paddleocr_vl_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_seal_recognition ?? true,
+      paddleocr_vl_use_chart_recognition: data?.paddleocr_vl_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_chart_recognition ?? false,
+      paddleocr_vl_cloud_token: data?.paddleocr_vl_cloud_token ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_token ?? '',
+      paddleocr_vl_cloud_model: data?.paddleocr_vl_cloud_model ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_model ?? 'PaddleOCR-VL-1.6',
+      paddleocr_vl_cloud_use_seal_recognition: data?.paddleocr_vl_cloud_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_seal_recognition ?? true,
+      paddleocr_vl_cloud_use_chart_recognition: data?.paddleocr_vl_cloud_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_chart_recognition ?? false,
    }
  } catch {
    config.value = { ...DEFAULT_PARSER_CONFIG }
@@ -521,6 +593,13 @@ function buildConfigPayload(): ParserEngineConfig {
    mineru_cloud_enable_table: config.value.mineru_cloud_enable_table,
    mineru_cloud_enable_ocr: config.value.mineru_cloud_enable_ocr,
    mineru_cloud_language: config.value.mineru_cloud_language?.trim() ?? '',
+    paddleocr_vl_endpoint: config.value.paddleocr_vl_endpoint?.trim() ?? '',
+    paddleocr_vl_use_seal_recognition: config.value.paddleocr_vl_use_seal_recognition,
+    paddleocr_vl_use_chart_recognition: config.value.paddleocr_vl_use_chart_recognition,
+    paddleocr_vl_cloud_token: config.value.paddleocr_vl_cloud_token?.trim() ?? '',
+    paddleocr_vl_cloud_model: config.value.paddleocr_vl_cloud_model?.trim() ?? '',
+    paddleocr_vl_cloud_use_seal_recognition: config.value.paddleocr_vl_cloud_use_seal_recognition,
+    paddleocr_vl_cloud_use_chart_recognition: config.value.paddleocr_vl_cloud_use_chart_recognition,
  }
 }

@@ -739,7 +818,9 @@ onMounted(loadAll)
  color: #0089FF;
 }
 .engine-card--mineru .engine-card__badge,
-.engine-card--mineru_cloud .engine-card__badge {
+.engine-card--mineru_cloud .engine-card__badge,
+.engine-card--paddleocr_vl .engine-card__badge,
+.engine-card--paddleocr_vl_cloud .engine-card__badge {
  background: rgba(98, 53, 187, 0.12);
  color: #6235BB;
 }
@@ -1086,7 +1167,9 @@ onMounted(loadAll)
  color: #0089FF;
 }
 .parser-engine-drawer--mineru .setting-drawer__header-icon,
-.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon {
+.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon,
+.parser-engine-drawer--paddleocr_vl .setting-drawer__header-icon,
+.parser-engine-drawer--paddleocr_vl_cloud .setting-drawer__header-icon {
  background: rgba(98, 53, 187, 0.12);
  color: #6235BB;
 }
--- a/internal/application/service/knowledge_process.go
+++ b/internal/application/service/knowledge_process.go
@@ -3256,6 +3256,10 @@ func (s *knowledgeService) resolveDocReader(ctx context.Context, engine, fileTyp
 		return docparser.NewMinerUReader(overrides)
 	case "mineru_cloud":
 		return docparser.NewMinerUCloudReader(overrides)
+	case "paddleocr_vl":
+		return docparser.NewPaddleOCRVLReader(overrides)
+	case "paddleocr_vl_cloud":
+		return docparser.NewPaddleOCRVLCloudReader(overrides)
 	case "builtin":
 		// 明确指定使用 builtin 引擎（docreader），不使用 simple format 兜底
 		return s.documentReader
--- a/internal/application/service/system_setting.go
+++ b/internal/application/service/system_setting.go
@@ -87,6 +87,10 @@ type settingSpec struct {
 	// Description is shown in the UI under the key. Stored on the row
 	// at first write (mirrors Category).
 	Description string
+	// RequiresRestart marks keys whose value is bound at process startup
+	// (e.g. asynq worker pool size). The UI shows a restart badge; the
+	// service persists the flag on first write.
+	RequiresRestart bool
 }

 // registry pins the set of legal keys. Expanding it is a deliberate,
@@ -161,6 +165,20 @@ var registry = map[string]settingSpec{
 			"仅在创建时读取，修改后只对之后新建的租户生效，不会回写已存在的租户。" +
 			"0 或负数表示使用内置默认值 10GB。",
 	},
+	// asynq.concurrency is the asynq worker pool size (parallel in-flight
+	// tasks). Read once when the asynq server starts — changing it in the
+	// UI requires a process restart to take effect. Mirrors
+	// WEKNORA_ASYNQ_CONCURRENCY (default 16).
+	"asynq.concurrency": {
+		Type:            "int",
+		EnvName:         "WEKNORA_ASYNQ_CONCURRENCY",
+		Default:         int64(16),
+		Category:        "worker",
+		RequiresRestart: true,
+		Description: "异步任务 worker 并发数（asynq 线程池大小）。" +
+			"文档解析、嵌入等任务多为 I/O 等待，适当提高可缩短批量上传排队时间。" +
+			"修改后需重启服务进程方可生效。",
+	},
 }

 // systemSettingService wires the repository, audit log, and (P2)
@@ -655,7 +673,7 @@ func (s *systemSettingService) virtualSetting(key string, spec settingSpec) *typ
 		Category:        category,
 		Description:     spec.Description,
 		IsSecret:        false,
-		RequiresRestart: false,
+		RequiresRestart: spec.RequiresRestart,
 		LastModifiedBy:  "",
 		Enum:            spec.Enum,
 	}
@@ -817,6 +835,7 @@ func (s *systemSettingService) Update(ctx context.Context, key string, rawValue
 			category = "general"
 		}
 		description = spec.Description
+		requiresRestart = spec.RequiresRestart
 	}

 	row := &types.SystemSetting{
@@ -1142,6 +1161,14 @@ func encodeForType(declared string, rawValue any) (types.JSON, error) {
 //     400 body verbatim).
 func validateRegistryEntry(key string, rawValue any) error {
 	switch key {
+	case "asynq.concurrency":
+		n, err := coerceToPositiveInt64(rawValue)
+		if err != nil {
+			return err
+		}
+		if n <= 0 {
+			return errors.New("concurrency must be a positive integer")
+		}
 	case "ssrf.whitelist":
 		// Coerce into the same shape encodeForType produced. We don't
 		// look at the encoded JSON because that's already canonicalised
@@ -1155,6 +1182,23 @@ func validateRegistryEntry(key string, rawValue any) error {
 	return nil
 }

+// coerceToPositiveInt64 converts int / int64 / integral float64 (JSON numbers) to int64; callers check the sign.
+func coerceToPositiveInt64(rawValue any) (int64, error) {
+	switch v := rawValue.(type) {
+	case int:
+		return int64(v), nil
+	case int64:
+		return v, nil
+	case float64:
+		if v < -(1<<63) || v >= 1<<63 || v != float64(int64(v)) { // out-of-range float→int64 is implementation-defined
+			return 0, errors.New("expected integer value")
+		}
+		return int64(v), nil
+	default:
+		return 0, fmt.Errorf("expected integer, got %T", rawValue)
+	}
+}
+
 // coerceToStringSlice mirrors the input shapes accepted by
 // encodeForType for "string_list": []any of strings, []string, or a
 // comma-separated string. Returns the trimmed, empty-stripped result.
--- a/internal/infrastructure/chunker/header_tracker.go
+++ b/internal/infrastructure/chunker/header_tracker.go
@@ -36,12 +36,22 @@ var defaultHeaderHooks = []headerTrackerHook{
 // tableRowPattern matches a single Markdown table row: "| cell | cell | ... |\n"
 var tableRowPattern = regexp.MustCompile(`(?m)^\s*(?:\|[^|\n]*)+\|\s*$`)

+// markdownTableHookPriority matches DEFAULT_CONFIGS / defaultHeaderHooks table hook.
+const markdownTableHookPriority = 15
+
 // headerTracker maintains the state of active headers across split units.
 type headerTracker struct {
 	hooks         []headerTrackerHook
 	activeHeaders map[int]string // priority -> header text
 	endedHeaders  map[int]bool   // priorities that have been ended
 	pendingExtend map[int]bool   // headers with empty column names awaiting first data row
+	// pendingTableBreak is set when a table row unit ends with a paragraph break
+	// (the blank line between tables is consumed by \n\n splitting). The header
+	// stays active until the next unit is seen so we can detect a new table.
+	pendingTableBreak bool
+	// headerEndedThisUnit tells mergeUnits to flush before the current unit when a
+	// new table starts (column mismatch or pendingTableBreak + table row).
+	headerEndedThisUnit bool
 }

 func newHeaderTracker() *headerTracker {
@@ -55,6 +65,20 @@ func newHeaderTracker() *headerTracker {

 // update checks split text for header start/end markers and updates internal state.
 func (ht *headerTracker) update(split string) {
+	ht.headerEndedThisUnit = false
+
+	if ht.pendingTableBreak {
+		ht.pendingTableBreak = false
+		if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
+			if firstTableRowColumnCount(split) > 0 {
+				ht.clearTableHeader()
+				ht.headerEndedThisUnit = true
+			} else {
+				ht.clearTableHeader()
+			}
+		}
+	}
+
 	// 1. Check for header-end markers among currently active headers
 	for _, hook := range ht.hooks {
 		if _, active := ht.activeHeaders[hook.priority]; active {
@@ -66,6 +90,19 @@ func (ht *headerTracker) update(split string) {
 		}
 	}

+	// 1b. Paragraph splits consume the blank line between tables. Mark a break
+	// after "| last row |\n\n" and resolve on the next unit; also end when a new
+	// table row has a different column count than the active header.
+	if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
+		if !ht.pendingExtend[markdownTableHookPriority] {
+			if splitEndsWithParagraphBreak(split) {
+				ht.pendingTableBreak = true
+			} else {
+				ht.endTableHeaderOnColumnMismatch(split)
+			}
+		}
+	}
+
 	// 2. If a header has an empty column-name row (e.g. "||"), replace it with
 	//    a proper Markdown table header using the first data row as column names.
 	//
@@ -159,3 +196,73 @@ func extractSeparatorLine(header string) string {
 	}
 	return ""
 }
+
+func (ht *headerTracker) clearTableHeader() { // end the markdown-table header and drop its pending state
+	ht.endedHeaders[markdownTableHookPriority] = true
+	delete(ht.activeHeaders, markdownTableHookPriority)
+	delete(ht.pendingExtend, markdownTableHookPriority)
+}
+
+// endTableHeaderOnColumnMismatch ends the active markdown table header when the
+// first table row found in split has a different column count than that header.
+// It also raises headerEndedThisUnit. Nothing happens when no table header is
+// active or when either side has no countable table row.
+func (ht *headerTracker) endTableHeaderOnColumnMismatch(split string) {
+	header, active := ht.activeHeaders[markdownTableHookPriority]
+	if !active || !headerColumnMismatch(header, split) {
+		return
+	}
+	ht.clearTableHeader()
+	ht.headerEndedThisUnit = true
+}
+
+func splitEndsWithParagraphBreak(split string) bool { // true when split ends with a blank line (LF or CRLF)
+	trimmed := strings.TrimRight(split, " \t\r") // ignore trailing spaces, tabs and carriage returns
+	return strings.HasSuffix(trimmed, "\n\n") || strings.HasSuffix(trimmed, "\r\n\r\n")
+}
+
+// tableRowColumnCount returns the number of "|"-separated cells in one table
+// row. The fragment before the leading pipe and a blank fragment after the
+// trailing pipe are not counted. Lines that do not start with "|" yield 0.
+func tableRowColumnCount(line string) int {
+	row := strings.TrimSpace(line)
+	if !strings.HasPrefix(row, "|") {
+		return 0
+	}
+	cells := strings.Split(row, "|")[1:] // fragment 0 is "" because row starts with "|"
+	if last := len(cells) - 1; last >= 0 && strings.TrimSpace(cells[last]) == "" {
+		cells = cells[:last]
+	}
+	return len(cells)
+}
+
+// firstTableRowColumnCount returns the column count of the first line in text that matches tableRowPattern, or 0.
+func firstTableRowColumnCount(text string) int {
+	for _, raw := range strings.Split(text, "\n") {
+		if row := strings.TrimSpace(raw); row != "" && tableRowPattern.MatchString(row) {
+			return tableRowColumnCount(row)
+		}
+	}
+	return 0
+}
+
+// headerTableColumnCount returns the column count of the first row in header
+// that is not a delimiter row, or 0 when there is none. Only lines made purely
+// of pipes, dashes, colons and blanks are skipped, so "a---b" stays a header cell.
+func headerTableColumnCount(header string) int {
+	for _, line := range strings.Split(header, "\n") {
+		isDelimiter := strings.Contains(line, "-") && strings.Trim(line, "|-: \t\r") == ""
+		if n := tableRowColumnCount(line); n > 0 && !isDelimiter {
+			return n
+		}
+	}
+	return 0
+}
+
+// headerColumnMismatch reports whether headers and nextUnit both contain a
+// countable table row and those rows differ in column count.
+func headerColumnMismatch(headers, nextUnit string) bool {
+	want, got := headerTableColumnCount(headers), firstTableRowColumnCount(nextUnit)
+	bothTables := want > 0 && got > 0
+	return bothTables && want != got
+}
--- a/internal/infrastructure/chunker/splitter.go
+++ b/internal/infrastructure/chunker/splitter.go
@@ -450,6 +450,13 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {

 		// Update header tracking
 		ht.update(u.text)
+		// Flush at table boundary so the next table is not merged into a chunk
+		// that still carries the previous table's prepended header context.
+		if ht.headerEndedThisUnit && len(current) > 0 {
+			chunks = append(chunks, buildChunk(current, len(chunks)))
+			current = nil
+			curLen = 0
+		}
 		headers := ht.getHeaders()
 		headersLen := runeLen(headers)
 		if headersLen > chunkSize {
@@ -475,7 +482,8 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {
 				// Prepend headers if the column-name context is not already present
 				// in the overlap or the next unit being added.
 				overlapText := unitsText(current)
-				if !headerAlreadyPresent(headers, overlapText, u.text) {
+				if !headerAlreadyPresent(headers, overlapText, u.text) &&
+					!headerColumnMismatch(headers, u.text) {
 					startPos := u.start
 					if len(current) > 0 {
 						startPos = current[0].start
--- a/internal/infrastructure/chunker/splitter_test.go
+++ b/internal/infrastructure/chunker/splitter_test.go
@@ -674,6 +674,66 @@ func TestSplitText_EmptyHeaderRowPrepend(t *testing.T) {
 	}
 }

+func TestHeaderTracker_ColumnMismatchEndsTable(t *testing.T) {
+	tracker := newHeaderTracker()
+	tracker.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")
+	if got := tracker.getHeaders(); got == "" {
+		t.Fatal("expected active table header")
+	}
+	tracker.update("| Sinple | Table |\n") // 2 columns against the active 4-column header
+	if got := tracker.getHeaders(); got != "" {
+		t.Fatalf("2-col row should end 4-col table header, still active:\n%s", got)
+	}
+}
+
+func TestHeaderTracker_ParagraphBreakEndsOnNextUnit(t *testing.T) {
+	tracker := newHeaderTracker()
+	tracker.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")
+	tracker.update("| Russell Wilson | Football | High | Tacky uniform |\n\n") // row followed by a blank line
+	if tracker.getHeaders() == "" {
+		t.Fatal("paragraph break alone should not clear header yet")
+	}
+	if !tracker.pendingTableBreak {
+		t.Fatal("expected pendingTableBreak after row ending with \\n\\n")
+	}
+	tracker.update("| Sinple | Table |\n")
+	if got := tracker.getHeaders(); got != "" {
+		t.Fatalf("next table row should clear previous header, got %q", got)
+	}
+	if !tracker.headerEndedThisUnit {
+		t.Fatal("expected flush signal when new table starts after paragraph break")
+	}
+}
+
+func TestSplitText_EnTablesNoCrossTableHeader(t *testing.T) {
+	input := "## A table, with and without a header row\n\n" +
+		"| Name | Game | Fame | Blame |\n" +
+		"| --- | --- | --- | --- |\n" +
+		"| Lebron James | Basketball | Very High | Leaving Cleveland |\n" +
+		"| Ryan Braun | Baseball | Moderate | Steroids |\n" +
+		"| Russell Wilson | Football | High | Tacky uniform |\n\n" +
+		"| Sinple | Table |\n" +
+		"| Without | Header |\n\n" +
+		"| Simple  Multiparagraph | Table  Full |\n" +
+		"| Of  Paragraphs | In each  Cell. |\n"
+
+	cfg := SplitterConfig{ChunkSize: 200, ChunkOverlap: 20, Separators: []string{"\n\n", "\n", "。"}}
+	chunks := SplitText(input, cfg)
+	if n := len(chunks); n < 2 {
+		t.Fatalf("expected multiple chunks, got %d", n)
+	}
+
+	// Chunks holding rows of the second or third table must not contain the first table's header row.
+	const firstHeader = "| Name | Game | Fame | Blame |"
+	for idx, chunk := range chunks {
+		laterTable := strings.Contains(chunk.Content, "| Sinple | Table |") ||
+			strings.Contains(chunk.Content, "| Simple  Multiparagraph |")
+		if laterTable && strings.Contains(chunk.Content, firstHeader) {
+			t.Errorf("chunk[%d] must not carry table-1 header into later tables:\n%s", idx, chunk.Content)
+		}
+	}
+}
+
 func TestSplitText_MultipleTablesInDocument(t *testing.T) {
 	text := "" +
 		"第一个表格：\n\n" +
--- a/internal/infrastructure/docparser/engine_registry.go
+++ b/internal/infrastructure/docparser/engine_registry.go
@@ -30,6 +30,8 @@ func init() {
 	RegisterEngine(&weKnoraCloudEngine{})
 	RegisterEngine(&mineruEngine{})
 	RegisterEngine(&mineruCloudEngine{})
+	RegisterEngine(&paddleOCRVLEngine{})
+	RegisterEngine(&paddleOCRVLCloudEngine{})
 }

 // ---------------------------------------------------------------------------
@@ -133,6 +135,44 @@ func (e *mineruCloudEngine) CheckAvailable(_ bool, overrides map[string]string)
 	return PingMinerUCloud(apiKey)
 }

+// ---------------------------------------------------------------------------
+// paddleocr_vl — Go-native, calls a self-hosted PaddleOCR-VL pipeline service
+// ---------------------------------------------------------------------------
+
+// paddleOCRVLEngine is the registry entry for the "paddleocr_vl" engine.
+type paddleOCRVLEngine struct{}
+
+// Name returns the identifier under which the engine is registered.
+func (e *paddleOCRVLEngine) Name() string { return "paddleocr_vl" }
+
+// Description returns a short human-readable label for the engine.
+func (e *paddleOCRVLEngine) Description() string { return "PaddleOCR-VL self-hosted service" }
+
+// FileTypes lists the file extensions the engine accepts; the boolean
+// argument is ignored.
+func (e *paddleOCRVLEngine) FileTypes(_ bool) []string {
+	supported := []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
+	return supported
+}
+
+// CheckAvailable reports whether a service endpoint is configured in
+// overrides and, if so, delegates the reachability check to PingPaddleOCRVL.
+func (e *paddleOCRVLEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
+	if endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"]); endpoint != "" {
+		return PingPaddleOCRVL(endpoint)
+	}
+	return false, "PaddleOCR-VL service not configured"
+}
+
+// ---------------------------------------------------------------------------
+// paddleocr_vl_cloud — Go-native, calls the PaddleOCR-VL AI Studio cloud API
+// ---------------------------------------------------------------------------
+
+// paddleOCRVLCloudEngine is the registry entry for the "paddleocr_vl_cloud"
+// engine.
+type paddleOCRVLCloudEngine struct{}
+
+// Name returns the identifier under which the engine is registered.
+func (e *paddleOCRVLCloudEngine) Name() string { return "paddleocr_vl_cloud" }
+
+// Description returns a short human-readable label for the engine.
+func (e *paddleOCRVLCloudEngine) Description() string { return "PaddleOCR-VL Cloud API" }
+
+// FileTypes lists the file extensions the engine accepts; the boolean
+// argument is ignored.
+func (e *paddleOCRVLCloudEngine) FileTypes(_ bool) []string {
+	supported := []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
+	return supported
+}
+
+// CheckAvailable reports whether a cloud token is configured in overrides
+// and, if so, delegates the final verdict to PingPaddleOCRVLCloud.
+func (e *paddleOCRVLCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
+	if token := strings.TrimSpace(overrides["paddleocr_vl_cloud_token"]); token != "" {
+		return PingPaddleOCRVLCloud(token)
+	}
+	return false, "PaddleOCR-VL Cloud Token not configured"
+}
+
 // ---------------------------------------------------------------------------
 // ListAllEngines — merge local + remote
 // ---------------------------------------------------------------------------
--- a/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
@@ -0,0 +1,353 @@
+package docparser
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime"
+	"mime/multipart"
+	"net/http"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/Tencent/WeKnora/internal/logger"
+	"github.com/Tencent/WeKnora/internal/types"
+	"github.com/Tencent/WeKnora/internal/utils"
+)
+
+// Defaults and timing for the PaddleOCR-VL cloud reader.
+const (
+	// Job endpoint used when no "paddleocr_vl_cloud_base_url" override is set.
+	paddleOCRVLCloudDefaultBaseURL = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
+	// Model name used when no "paddleocr_vl_cloud_model" override is set.
+	paddleOCRVLCloudDefaultModel   = "PaddleOCR-VL-1.6"
+	// Pause between two job-status polls.
+	paddleOCRVLCloudPollInterval   = 5 * time.Second
+	// Overall budget for polling one job before giving up.
+	paddleOCRVLCloudTimeout        = 600 * time.Second
+)
+
+// PaddleOCRVLCloudReader calls the PaddleOCR-VL AI Studio cloud API.
+// Flow: POST /jobs (multipart) → poll GET /jobs/{id} → download result JSONL,
+// then fetch each referenced image URL.
+type PaddleOCRVLCloudReader struct {
+	// token is sent as the bearer credential in the Authorization header.
+	token    string
+	// baseURL is the job endpoint (no trailing slash); job status is read
+	// from baseURL + "/" + jobID.
+	baseURL  string
+	// model is sent as the "model" form field when a job is submitted.
+	model    string
+	// useSeal and useChart are forwarded to paddleOCRVLRecognitionParams.
+	useSeal  bool
+	useChart bool
+}
+
+// NewPaddleOCRVLCloudReader builds a cloud reader from ParserEngineOverrides.
+// The token is whitespace-trimmed, the base URL loses trailing slashes, and
+// the base URL / model fall back to the package defaults via stringOr. Seal
+// recognition defaults to on, chart recognition to off.
+func NewPaddleOCRVLCloudReader(overrides map[string]string) *PaddleOCRVLCloudReader {
+	reader := new(PaddleOCRVLCloudReader)
+
+	reader.token = strings.TrimSpace(overrides["paddleocr_vl_cloud_token"])
+
+	rawBaseURL := stringOr(overrides["paddleocr_vl_cloud_base_url"], paddleOCRVLCloudDefaultBaseURL)
+	reader.baseURL = strings.TrimRight(rawBaseURL, "/")
+
+	reader.model = stringOr(overrides["paddleocr_vl_cloud_model"], paddleOCRVLCloudDefaultModel)
+	reader.useSeal = parseBoolOr(overrides["paddleocr_vl_cloud_use_seal_recognition"], true)
+	reader.useChart = parseBoolOr(overrides["paddleocr_vl_cloud_use_chart_recognition"], false)
+
+	return reader
+}
+
+// Read parses req.FileContent through the cloud API: submit the job, poll it
+// to completion, download the JSONL result and then the images it references.
+//
+// A missing token or empty file content is reported through ReadResult.Error
+// with a nil error; failures of the remote steps are returned as wrapped
+// errors with a nil result.
+func (c *PaddleOCRVLCloudReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
+	if c.token == "" {
+		return &types.ReadResult{Error: "PaddleOCR-VL Cloud token is not configured"}, nil
+	}
+	if len(req.FileContent) == 0 {
+		return &types.ReadResult{Error: "no file content provided"}, nil
+	}
+	fileBytes := req.FileContent
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] Parsing file=%s size=%d model=%s",
+		req.FileName, len(fileBytes), c.model)
+
+	// Step 1: upload the document and obtain a job ID.
+	jobID, err := c.submitJob(ctx, req, fileBytes)
+	if err != nil {
+		return nil, fmt.Errorf("PaddleOCR-VL Cloud submit: %w", err)
+	}
+
+	// Step 2: wait for the job and get the result location.
+	resultURL, err := c.pollJob(ctx, jobID)
+	if err != nil {
+		return nil, fmt.Errorf("PaddleOCR-VL Cloud poll: %w", err)
+	}
+
+	// Step 3: pull the markdown and the image path → URL table.
+	markdown, imageURLs, err := c.fetchResults(resultURL)
+	if err != nil {
+		return nil, fmt.Errorf("PaddleOCR-VL Cloud fetch results: %w", err)
+	}
+
+	// Step 4: download referenced images, then let the shared helper adjust
+	// markdown/refs for the original input.
+	refs := c.downloadImages(markdown, imageURLs)
+	markdown, refs = ensureOriginalImageRef(req, markdown, refs)
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] Parsed successfully, markdown=%d chars, images=%d",
+		len(markdown), len(refs))
+
+	result := &types.ReadResult{}
+	result.MarkdownContent = markdown
+	result.ImageRefs = refs
+	return result, nil
+}
+
+// optionalPayload returns the recognition parameters for this reader. It
+// reuses the builder of the self-hosted engine so both engines send the same
+// settings.
+func (c *PaddleOCRVLCloudReader) optionalPayload() map[string]interface{} {
+	params := paddleOCRVLRecognitionParams(c.useSeal, c.useChart)
+	return params
+}
+
+// --- job submit ---
+
+// paddleOCRVLCloudSubmitResponse is the subset of the job-submission response
+// body that the reader decodes.
+type paddleOCRVLCloudSubmitResponse struct {
+	Data struct {
+		// JobID identifies the created job; it is appended to the base URL
+		// when polling.
+		JobID string `json:"jobId"`
+	} `json:"data"`
+	// ErrorCode / ErrorMsg are decoded from the top level of the response.
+	ErrorCode int    `json:"errorCode"`
+	ErrorMsg  string `json:"errorMsg"`
+}
+
+// submitJob uploads the document as a multipart/form-data POST to c.baseURL
+// and returns the job ID assigned by the service.
+//
+// Form fields sent: "model", "optionalPayload" (JSON-encoded recognition
+// parameters) and "file". When req.FileName is empty a placeholder name
+// "document.<ext>" is derived from req.FileType, defaulting to "pdf".
+//
+// An error is returned when the form cannot be built, the request fails, the
+// service answers with a non-200 status, the body cannot be read or decoded,
+// or the response carries no jobId.
+func (c *PaddleOCRVLCloudReader) submitJob(ctx context.Context, req *types.ReadRequest, content []byte) (string, error) {
+	optional, err := json.Marshal(c.optionalPayload())
+	if err != nil {
+		return "", fmt.Errorf("marshal optionalPayload: %w", err)
+	}
+
+	fileName := req.FileName
+	if fileName == "" {
+		ext := strings.TrimPrefix(req.FileType, ".")
+		if ext == "" {
+			ext = "pdf"
+		}
+		fileName = "document." + ext
+	}
+
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+	if err := writer.WriteField("model", c.model); err != nil {
+		return "", fmt.Errorf("write model field: %w", err)
+	}
+	if err := writer.WriteField("optionalPayload", string(optional)); err != nil {
+		return "", fmt.Errorf("write optionalPayload field: %w", err)
+	}
+	// Only the base name is sent so local directory structure is not leaked.
+	part, err := writer.CreateFormFile("file", filepath.Base(fileName))
+	if err != nil {
+		return "", fmt.Errorf("create form file: %w", err)
+	}
+	if _, err := part.Write(content); err != nil {
+		return "", fmt.Errorf("write file content: %w", err)
+	}
+	// Close writes the terminating boundary; a failure here would leave the
+	// body malformed, so it must not be ignored.
+	if err := writer.Close(); err != nil {
+		return "", fmt.Errorf("finalize multipart body: %w", err)
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL, &body)
+	if err != nil {
+		return "", fmt.Errorf("create request: %w", err)
+	}
+	httpReq.Header.Set("Authorization", "bearer "+c.token)
+	httpReq.Header.Set("Content-Type", writer.FormDataContentType())
+
+	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})
+	resp, err := client.Do(httpReq)
+	if err != nil {
+		return "", fmt.Errorf("HTTP request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// The status error takes precedence: whatever part of the body was read
+	// is still useful context for a non-200 answer.
+	respBody, readErr := io.ReadAll(resp.Body)
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("API status %d: %s", resp.StatusCode, string(respBody))
+	}
+	if readErr != nil {
+		return "", fmt.Errorf("read response body: %w", readErr)
+	}
+
+	var result paddleOCRVLCloudSubmitResponse
+	if err := json.Unmarshal(respBody, &result); err != nil {
+		return "", fmt.Errorf("decode response: %w", err)
+	}
+	if result.Data.JobID == "" {
+		// Prefer the structured error reported by the API when there is one.
+		if result.ErrorCode != 0 || result.ErrorMsg != "" {
+			return "", fmt.Errorf("API error %d: %s", result.ErrorCode, result.ErrorMsg)
+		}
+		return "", fmt.Errorf("API returned no jobId: %s", string(respBody))
+	}
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] job submitted: jobId=%s", result.Data.JobID)
+	return result.Data.JobID, nil
+}
+
+// --- polling ---
+
+// paddleOCRVLCloudPollResponse is the subset of the job-status response body
+// that the polling loop decodes.
+type paddleOCRVLCloudPollResponse struct {
+	Data struct {
+		// State is compared case-insensitively against "done" and "failed".
+		State           string `json:"state"`
+		// ErrorMsg is reported to the caller when the job has failed.
+		ErrorMsg        string `json:"errorMsg"`
+		// ExtractProgress is only used for progress logging.
+		ExtractProgress struct {
+			TotalPages     int `json:"totalPages"`
+			ExtractedPages int `json:"extractedPages"`
+		} `json:"extractProgress"`
+		ResultURL struct {
+			// JSONURL is the location of the result JSONL once the job is done.
+			JSONURL string `json:"jsonUrl"`
+		} `json:"resultUrl"`
+	} `json:"data"`
+}
+
+// pollJob polls GET {baseURL}/{jobID} until the job reports state "done" or
+// "failed", the caller's context ends, or paddleOCRVLCloudTimeout elapses.
+//
+// On success it returns the URL of the result JSONL. Transient problems
+// (transport errors, non-200 answers, unreadable or undecodable bodies) are
+// logged and retried after paddleOCRVLCloudPollInterval. A cancelled or
+// expired ctx ends the loop immediately with an error wrapping ctx.Err().
+func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (string, error) {
+	deadline := time.Now().Add(paddleOCRVLCloudTimeout)
+	pollCount := 0
+	url := c.baseURL + "/" + jobID
+	// One client serves the whole loop; there is no need to rebuild it per poll.
+	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 30 * time.Second, MaxRedirects: 5})
+
+	for time.Now().Before(deadline) {
+		// Once ctx is done every request fails immediately; without this
+		// check the loop would keep retrying (and logging) until the overall
+		// deadline instead of returning to the caller.
+		if err := ctx.Err(); err != nil {
+			return "", fmt.Errorf("polling aborted after %d polls: %w", pollCount, err)
+		}
+		pollCount++
+
+		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+		if err != nil {
+			return "", fmt.Errorf("create poll request: %w", err)
+		}
+		httpReq.Header.Set("Authorization", "bearer "+c.token)
+
+		resp, err := client.Do(httpReq)
+		if err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d failed: %v", pollCount, err)
+			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
+			continue
+		}
+		respBody, readErr := io.ReadAll(resp.Body)
+		resp.Body.Close()
+
+		if resp.StatusCode != http.StatusOK {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d status %d: %s", pollCount, resp.StatusCode, string(respBody))
+			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
+			continue
+		}
+		if readErr != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d read body: %v", pollCount, readErr)
+			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
+			continue
+		}
+
+		var pollResp paddleOCRVLCloudPollResponse
+		if err := json.Unmarshal(respBody, &pollResp); err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d decode error: %v", pollCount, err)
+			sleepCtx(ctx, paddleOCRVLCloudPollInterval)
+			continue
+		}
+
+		state := strings.ToLower(pollResp.Data.State)
+		// Log the first poll, every sixth poll and terminal states only, to
+		// keep long-running jobs from flooding the log.
+		if pollCount == 1 || pollCount%6 == 0 || state == "done" || state == "failed" {
+			logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] poll #%d: state=%s pages=%d/%d",
+				pollCount, state, pollResp.Data.ExtractProgress.ExtractedPages, pollResp.Data.ExtractProgress.TotalPages)
+		}
+
+		switch state {
+		case "done":
+			if pollResp.Data.ResultURL.JSONURL == "" {
+				return "", fmt.Errorf("state=done but no jsonUrl")
+			}
+			return pollResp.Data.ResultURL.JSONURL, nil
+		case "failed":
+			return "", fmt.Errorf("task failed: %s", pollResp.Data.ErrorMsg)
+		}
+
+		sleepCtx(ctx, paddleOCRVLCloudPollInterval)
+	}
+
+	return "", fmt.Errorf("task timed out after %d polls", pollCount)
+}
+
+// --- result parsing ---
+
+// paddleOCRVLCloudResultLine is the decoded form of one line of the result
+// JSONL file.
+type paddleOCRVLCloudResultLine struct {
+	Result struct {
+		LayoutParsingResults []struct {
+			Markdown struct {
+				// Text is the markdown of this entry.
+				Text   string            `json:"text"`
+				// Images maps an image path referenced by the markdown to
+				// the URL it is downloaded from.
+				Images map[string]string `json:"images"`
+			} `json:"markdown"`
+		} `json:"layoutParsingResults"`
+	} `json:"result"`
+}
+
+// fetchResults downloads the result JSONL at jsonlURL and merges it into one
+// markdown document plus an image path → URL table.
+//
+// Each non-empty line is decoded independently; malformed lines are logged
+// and skipped. Non-blank markdown texts are joined with a blank line between
+// them. When the same image path occurs more than once the first URL wins.
+//
+// An error is returned when the URL fails the SSRF check, the download fails,
+// the status is not 200, or the body cannot be read.
+func (c *PaddleOCRVLCloudReader) fetchResults(jsonlURL string) (string, map[string]string, error) {
+	if err := utils.ValidateURLForSSRF(jsonlURL); err != nil {
+		// %w keeps the cause inspectable, like every other error in this file.
+		return "", nil, fmt.Errorf("jsonl URL blocked by SSRF check: %w", err)
+	}
+	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 120 * time.Second, MaxRedirects: 5})
+	resp, err := client.Get(jsonlURL)
+	if err != nil {
+		return "", nil, fmt.Errorf("download jsonl: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return "", nil, fmt.Errorf("download jsonl status %d", resp.StatusCode)
+	}
+	data, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", nil, fmt.Errorf("read jsonl body: %w", err)
+	}
+
+	texts := make([]string, 0)
+	images := make(map[string]string)
+	// pageCount counts every layout result, including those whose markdown is
+	// blank, so the log line below matches the self-hosted reader's count.
+	pageCount := 0
+	for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		var parsed paddleOCRVLCloudResultLine
+		if err := json.Unmarshal([]byte(line), &parsed); err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] skip malformed jsonl line: %v", err)
+			continue
+		}
+		for _, p := range parsed.Result.LayoutParsingResults {
+			pageCount++
+			// Blank pages are dropped, but kept pages retain their original
+			// (untrimmed) text.
+			if t := strings.TrimSpace(p.Markdown.Text); t != "" {
+				texts = append(texts, p.Markdown.Text)
+			}
+			for path, u := range p.Markdown.Images {
+				if _, ok := images[path]; !ok {
+					images[path] = u
+				}
+			}
+		}
+	}
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] fetched %d page(s), images=%d", pageCount, len(images))
+	return strings.Join(texts, "\n\n"), images, nil
+}
+
+// downloadImages fetches the images listed in imagesURL (image path → URL)
+// and builds one ImageRef per markdown reference to each image.
+//
+// Images for which mineruImageOriginalRefs finds no reference in mdContent
+// are skipped without being downloaded. Failures are best-effort: a blocked
+// URL, a transport error, a non-200 status or a read error is logged and that
+// image is skipped. Because imagesURL is a map, the order of the returned
+// refs is not stable between calls.
+func (c *PaddleOCRVLCloudReader) downloadImages(mdContent string, imagesURL map[string]string) []types.ImageRef {
+	var refs []types.ImageRef
+	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})
+
+	for ipath, u := range imagesURL {
+		matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
+		if len(matchedRefs) == 0 {
+			continue
+		}
+		if err := utils.ValidateURLForSSRF(u); err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] image URL blocked %s: %v", ipath, err)
+			continue
+		}
+		resp, err := client.Get(u)
+		if err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] download image %s: %v", ipath, err)
+			continue
+		}
+		// Check the status before reading so an error page is never buffered
+		// in full just to be thrown away.
+		if resp.StatusCode != http.StatusOK {
+			resp.Body.Close()
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] read image %s status=%d err=%v", ipath, resp.StatusCode, nil)
+			continue
+		}
+		imgBytes, err := io.ReadAll(resp.Body)
+		resp.Body.Close()
+		if err != nil {
+			logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] read image %s status=%d err=%v", ipath, resp.StatusCode, err)
+			continue
+		}
+
+		// The MIME type is derived from the image path's extension, falling
+		// back to PNG when the extension is missing or unknown.
+		ext := strings.TrimPrefix(filepath.Ext(ipath), ".")
+		if ext == "" {
+			ext = "png"
+		}
+		mimeType := mime.TypeByExtension("." + ext)
+		if mimeType == "" {
+			mimeType = "image/png"
+		}
+
+		// One ImageRef per reference; all of them share the same bytes.
+		for _, originalRef := range matchedRefs {
+			refs = append(refs, types.ImageRef{
+				Filename:    ipath,
+				OriginalRef: originalRef,
+				MimeType:    mimeType,
+				ImageData:   imgBytes,
+			})
+		}
+	}
+
+	return refs
+}
+
+// PingPaddleOCRVLCloud reports whether a cloud token is configured. No
+// request is sent (the API has no lightweight health endpoint), so this is a
+// configuration check only: any non-blank token is accepted.
+func PingPaddleOCRVLCloud(token string) (bool, string) {
+	if trimmed := strings.TrimSpace(token); trimmed != "" {
+		return true, ""
+	}
+	return false, "未配置 PaddleOCR-VL Cloud Token"
+}
--- a/internal/infrastructure/docparser/paddleocr_vl_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_converter.go
@@ -0,0 +1,282 @@
+package docparser
+
+import (
+	"bytes"
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime"
+	"net/http"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/Tencent/WeKnora/internal/logger"
+	"github.com/Tencent/WeKnora/internal/types"
+	"github.com/Tencent/WeKnora/internal/utils"
+)
+
+// paddleOCRVLTimeout is the HTTP client timeout for one synchronous
+// /layout-parsing call to the self-hosted service.
+const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a while
+
+// PaddleOCRVLReader calls a self-hosted PaddleOCR-VL pipeline service
+// (the full document-parsing API, not the bare VLM inference server).
+//
+// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
+// response containing per-page markdown + inline base64 images.
+type PaddleOCRVLReader struct {
+	// endpoint is the service base URL; "/layout-parsing" is appended to it.
+	endpoint  string
+	// useSeal and useChart are forwarded to paddleOCRVLRecognitionParams.
+	useSeal   bool
+	useChart  bool
+	// useLayout, when false, sends "useLayoutDetection": false in the request.
+	useLayout bool
+}
+
+// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
+//
+// The endpoint is whitespace-trimmed and stripped of trailing slashes. The
+// trim mirrors paddleOCRVLEngine.CheckAvailable, which trims the same
+// override before probing it: without it, an endpoint saved with stray
+// whitespace would pass the availability check and then produce an invalid
+// request URL. Seal recognition and layout detection default to on, chart
+// recognition to off.
+func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
+	endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"])
+	return &PaddleOCRVLReader{
+		endpoint:  strings.TrimRight(endpoint, "/"),
+		useSeal:   parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
+		useChart:  parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
+		useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
+	}
+}
+
+// Read parses req.FileContent with a single synchronous /layout-parsing call
+// and converts the inline images of the response into ImageRefs.
+//
+// A missing endpoint or empty file content is reported through
+// ReadResult.Error with a nil error; a failed service call is returned as a
+// wrapped error with a nil result.
+func (c *PaddleOCRVLReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
+	if c.endpoint == "" {
+		return &types.ReadResult{Error: "PaddleOCR-VL endpoint is not configured"}, nil
+	}
+	if len(req.FileContent) == 0 {
+		return &types.ReadResult{Error: "no file content provided"}, nil
+	}
+	fileBytes := req.FileContent
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL] Parsing file=%s size=%d via %s",
+		req.FileName, len(fileBytes), c.endpoint)
+
+	markdown, inlineImages, err := c.callLayoutParsing(ctx, req, fileBytes)
+	if err != nil {
+		return nil, fmt.Errorf("PaddleOCR-VL layout-parsing: %w", err)
+	}
+
+	// Decode inline images first, then let the shared helper adjust
+	// markdown/refs for the original input.
+	refs, markdown := c.processImages(markdown, inlineImages)
+	markdown, refs = ensureOriginalImageRef(req, markdown, refs)
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL] Parsed successfully, markdown=%d chars, images=%d",
+		len(markdown), len(refs))
+
+	result := &types.ReadResult{}
+	result.MarkdownContent = markdown
+	result.ImageRefs = refs
+	return result, nil
+}
+
+// paddleOCRVLRecognitionParams builds the recognition / page-restructuring
+// settings sent by both engines: the self-hosted one places them at the top
+// level of the /layout-parsing body, the cloud one inside optionalPayload.
+// Sharing one builder keeps the two requests identical (table merging, title
+// re-levelling, page restructuring, ignored labels, sampling and resolution
+// settings). Only seal and chart recognition are caller-controlled.
+func paddleOCRVLRecognitionParams(useSeal, useChart bool) map[string]interface{} {
+	// Layout labels that are left out of the generated markdown.
+	ignoredLabels := []string{
+		"header", "header_image", "footer", "footer_image",
+		"number", "footnote", "aside_text",
+	}
+
+	params := make(map[string]interface{}, 18)
+
+	// Caller-controlled switches.
+	params["useChartRecognition"] = useChart
+	params["useSealRecognition"] = useSeal
+
+	// Fixed pre-processing and layout options.
+	params["markdownIgnoreLabels"] = ignoredLabels
+	params["useDocOrientationClassify"] = false
+	params["useDocUnwarping"] = false
+	params["useLayoutDetection"] = true
+	params["useOcrForImageBlock"] = false
+	params["layoutShapeMode"] = "auto"
+	params["layoutNms"] = true
+	params["promptLabel"] = "ocr"
+
+	// Document restructuring.
+	params["mergeTables"] = true
+	params["relevelTitles"] = true
+	params["restructurePages"] = true
+
+	// Sampling and resolution.
+	params["repetitionPenalty"] = 1
+	params["temperature"] = 0
+	params["topP"] = 1
+	params["minPixels"] = 147384
+	params["maxPixels"] = 2822400
+
+	return params
+}
+
+// fileTypeCode maps a request to the PaddleOCR-VL fileType field:
+// 0 = PDF, 1 = image (including TIFF).
+func fileTypeCode(req *types.ReadRequest) int {
+	ft := strings.ToLower(strings.TrimPrefix(req.FileType, "."))
+	if ft == "" {
+		ft = strings.TrimPrefix(strings.ToLower(filepath.Ext(req.FileName)), ".")
+	}
+	if ft == "pdf" {
+		return 0
+	}
+	return 1
+}
+
+// paddleOCRVLResponse mirrors the relevant fields of the PaddleX serving
+// /layout-parsing response. The service returns one entry per page.
+type paddleOCRVLResponse struct {
+	// A non-zero ErrorCode is treated as a failure by callLayoutParsing;
+	// ErrorMsg is included in the returned error.
+	ErrorCode int    `json:"errorCode"`
+	ErrorMsg  string `json:"errorMsg"`
+	Result    struct {
+		LayoutParsingResults []struct {
+			Markdown struct {
+				// Text is the markdown of this page.
+				Text   string            `json:"text"`
+				// Images maps an image path referenced by the markdown to
+				// its base64 payload (raw or as a data URI).
+				Images map[string]string `json:"images"`
+			} `json:"markdown"`
+		} `json:"layoutParsingResults"`
+	} `json:"result"`
+}
+
+// callLayoutParsing sends the document to {endpoint}/layout-parsing as a JSON
+// body (shared recognition parameters plus the base64-encoded file) and
+// merges the per-page results.
+//
+// Returns the page markdown joined with blank lines and a merged image
+// path → base64 table (first occurrence of a path wins). A response with no
+// pages is logged and returned as empty markdown with a nil error. Errors are
+// returned for marshalling/transport failures, non-200 statuses, undecodable
+// bodies and a non-zero errorCode.
+func (c *PaddleOCRVLReader) callLayoutParsing(
+	ctx context.Context, req *types.ReadRequest, content []byte,
+) (string, map[string]string, error) {
+	payload := paddleOCRVLRecognitionParams(c.useSeal, c.useChart)
+	payload["file"] = base64.StdEncoding.EncodeToString(content)
+	payload["fileType"] = fileTypeCode(req)
+	payload["visualize"] = false
+	// The shared parameters enable layout detection; only override when the
+	// reader was configured to turn it off.
+	if !c.useLayout {
+		payload["useLayoutDetection"] = false
+	}
+
+	body, err := json.Marshal(payload)
+	if err != nil {
+		return "", nil, fmt.Errorf("marshal payload: %w", err)
+	}
+
+	httpReq, err := http.NewRequestWithContext(
+		ctx, http.MethodPost, c.endpoint+"/layout-parsing", bytes.NewReader(body),
+	)
+	if err != nil {
+		return "", nil, fmt.Errorf("create request: %w", err)
+	}
+	httpReq.Header.Set("Content-Type", "application/json")
+
+	// NOTE(review): this uses a plain http.Client while PingPaddleOCRVL uses
+	// the SSRF-safe client — presumably deliberate for internal self-hosted
+	// endpoints; confirm the two are meant to differ.
+	client := &http.Client{Timeout: paddleOCRVLTimeout}
+	resp, err := client.Do(httpReq)
+	if err != nil {
+		return "", nil, fmt.Errorf("HTTP request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", nil, fmt.Errorf("read response body: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		return "", nil, fmt.Errorf("PaddleOCR-VL API status %d: %s", resp.StatusCode, string(respBody))
+	}
+
+	var result paddleOCRVLResponse
+	if err := json.Unmarshal(respBody, &result); err != nil {
+		return "", nil, fmt.Errorf("decode response: %w", err)
+	}
+	if result.ErrorCode != 0 {
+		return "", nil, fmt.Errorf("PaddleOCR-VL error %d: %s", result.ErrorCode, result.ErrorMsg)
+	}
+
+	pages := result.Result.LayoutParsingResults
+	if len(pages) == 0 {
+		// Not treated as a failure: the caller receives empty markdown.
+		logger.Errorf(context.Background(), "[PaddleOCR-VL] response has no layoutParsingResults")
+		return "", nil, nil
+	}
+
+	// Merge per-page markdown and image dicts into one document.
+	texts := make([]string, 0, len(pages))
+	images := make(map[string]string)
+	for _, p := range pages {
+		// Blank pages are dropped; kept pages retain their untrimmed text.
+		if t := strings.TrimSpace(p.Markdown.Text); t != "" {
+			texts = append(texts, p.Markdown.Text)
+		}
+		for path, data := range p.Markdown.Images {
+			if _, ok := images[path]; !ok {
+				images[path] = data
+			}
+		}
+	}
+
+	logger.Infof(context.Background(), "[PaddleOCR-VL] parsed %d page(s), images=%d", len(pages), len(images))
+	return strings.Join(texts, "\n\n"), images, nil
+}
+
+// processImages decodes the inline base64 images returned by PaddleOCR-VL and
+// builds ImageRef entries, matching them against references in the markdown.
+//
+// Images with no matching reference in mdContent are skipped, as are images
+// whose base64 payload fails to decode (logged). mdContent is returned
+// unchanged. Because imagesB64 is a map, the order of the returned refs is
+// not stable between calls.
+func (c *PaddleOCRVLReader) processImages(
+	mdContent string, imagesB64 map[string]string,
+) ([]types.ImageRef, string) {
+	var refs []types.ImageRef
+
+	for ipath, b64Str := range imagesB64 {
+		matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
+		if len(matchedRefs) == 0 {
+			continue
+		}
+
+		var imgBytes []byte
+		var ext string
+		// Data-URI form: presumably m[1] is the image subtype and m[2] the
+		// base64 payload — confirm against b64DataURIPattern's definition.
+		if m := b64DataURIPattern.FindStringSubmatch(b64Str); len(m) == 3 {
+			ext = m[1]
+			decoded, err := base64.StdEncoding.DecodeString(m[2])
+			if err != nil {
+				logger.Errorf(context.Background(), "[PaddleOCR-VL] decode base64 image %s: %v", ipath, err)
+				continue
+			}
+			imgBytes = decoded
+		} else {
+			// Raw base64 form: the extension comes from the image path,
+			// defaulting to PNG.
+			decoded, err := base64.StdEncoding.DecodeString(b64Str)
+			if err != nil {
+				logger.Errorf(context.Background(), "[PaddleOCR-VL] decode raw base64 image %s: %v", ipath, err)
+				continue
+			}
+			imgBytes = decoded
+			ext = strings.TrimPrefix(filepath.Ext(ipath), ".")
+			if ext == "" {
+				ext = "png"
+			}
+		}
+
+		// Unknown extensions fall back to image/png.
+		mimeType := mime.TypeByExtension("." + ext)
+		if mimeType == "" {
+			mimeType = "image/png"
+		}
+
+		// One ImageRef per reference; all of them share the same bytes.
+		for _, originalRef := range matchedRefs {
+			refs = append(refs, types.ImageRef{
+				Filename:    ipath,
+				OriginalRef: originalRef,
+				MimeType:    mimeType,
+				ImageData:   imgBytes,
+			})
+		}
+	}
+
+	return refs, mdContent
+}
+
+// PingPaddleOCRVL checks whether a self-hosted PaddleOCR-VL service is
+// reachable at endpoint.
+//
+// The endpoint is whitespace-trimmed and stripped of trailing slashes; a
+// blank endpoint is reported as not configured. Any HTTP response with a
+// status below 500 counts as reachable. On failure the second return value
+// carries a human-readable reason.
+func PingPaddleOCRVL(endpoint string) (bool, string) {
+	// Trim whitespace first so a blank or space-padded value is handled the
+	// same way as in paddleOCRVLEngine.CheckAvailable.
+	endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
+	if endpoint == "" {
+		return false, "未配置 PaddleOCR-VL 端点"
+	}
+	client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{
+		Timeout:      5 * time.Second,
+		MaxRedirects: 5,
+	})
+	// The pipeline only exposes POST /layout-parsing; an empty GET should still
+	// produce a routed HTTP response (e.g. 404/405) when the service is up.
+	resp, err := client.Get(endpoint + "/layout-parsing")
+	if err != nil {
+		return false, fmt.Sprintf("PaddleOCR-VL 服务不可达: %v", err)
+	}
+	resp.Body.Close()
+	if resp.StatusCode >= 500 {
+		return false, fmt.Sprintf("PaddleOCR-VL 服务返回状态 %d", resp.StatusCode)
+	}
+	return true, ""
+}
--- a/internal/router/task.go
+++ b/internal/router/task.go
@@ -120,18 +120,15 @@ func asynqRetryDelayFunc(n int, e error, t *asynq.Task) time.Duration {
 // not on local CPU).
 const defaultAsynqConcurrency = 16

-func readAsynqConcurrency() int {
-	if v := strings.TrimSpace(os.Getenv("WEKNORA_ASYNQ_CONCURRENCY")); v != "" {
-		if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
-			return parsed
+func NewAsynqServer(svc interfaces.SystemSettingService) *asynq.Server {
+	opt := getAsynqRedisClientOpt()
+	concurrency := defaultAsynqConcurrency
+	if svc != nil {
+		n := svc.GetInt(context.Background(), "asynq.concurrency", "WEKNORA_ASYNQ_CONCURRENCY", defaultAsynqConcurrency)
+		if n > 0 {
+			concurrency = int(n)
 		}
 	}
-	return defaultAsynqConcurrency
-}
-
-func NewAsynqServer() *asynq.Server {
-	opt := getAsynqRedisClientOpt()
-	concurrency := readAsynqConcurrency()
 	log.Printf("asynq server starting with concurrency=%d redis_op_timeout=%dms",
 		concurrency, readRedisOpTimeoutMs())
 	srv := asynq.NewServer(
--- a/internal/types/tenant.go
+++ b/internal/types/tenant.go
@@ -285,6 +285,24 @@ type ParserEngineConfig struct {
 	MinerUCloudEnableTable   *bool  `json:"mineru_cloud_enable_table,omitempty"`
 	MinerUCloudEnableOCR     *bool  `json:"mineru_cloud_enable_ocr,omitempty"`
 	MinerUCloudLanguage      string `json:"mineru_cloud_language,omitempty"`
+
+	// OpenDataLoader PDF (docreader engine); hybrid requires opendataloader-pdf-hybrid service.
+	ODLHybrid           string `json:"odl_hybrid,omitempty"`      // off (default), docling-fast, hancom-ai
+	ODLHybridURL        string `json:"odl_hybrid_url,omitempty"`  // e.g. http://odl-hybrid:5002
+	ODLHybridMode       string `json:"odl_hybrid_mode,omitempty"` // auto, full
+	ODLHybridFallback   *bool  `json:"odl_hybrid_fallback,omitempty"`
+	ODLMarkdownWithHTML *bool  `json:"odl_markdown_with_html,omitempty"`
+
+	// PaddleOCR-VL self-hosted pipeline service (full /layout-parsing API).
+	PaddleOCRVLEndpoint            string `json:"paddleocr_vl_endpoint,omitempty"` // e.g. http://paddleocr-vl:8080
+	PaddleOCRVLUseSealRecognition  *bool  `json:"paddleocr_vl_use_seal_recognition,omitempty"`
+	PaddleOCRVLUseChartRecognition *bool  `json:"paddleocr_vl_use_chart_recognition,omitempty"`
+
+	// PaddleOCR-VL AI Studio cloud API.
+	PaddleOCRVLCloudToken               string `json:"paddleocr_vl_cloud_token,omitempty"`
+	PaddleOCRVLCloudModel               string `json:"paddleocr_vl_cloud_model,omitempty"` // e.g. PaddleOCR-VL-1.6
+	PaddleOCRVLCloudUseSealRecognition  *bool  `json:"paddleocr_vl_cloud_use_seal_recognition,omitempty"`
+	PaddleOCRVLCloudUseChartRecognition *bool  `json:"paddleocr_vl_cloud_use_chart_recognition,omitempty"`
 }

 // ToOverridesMap returns a map suitable for ParserEngineOverrides in parse requests.
@@ -333,6 +351,42 @@ func (c *ParserEngineConfig) ToOverridesMap() map[string]string {
 	if c.MinerUCloudLanguage != "" {
 		m["mineru_cloud_language"] = c.MinerUCloudLanguage
 	}
+	if c.ODLHybrid != "" {
+		m["odl_hybrid"] = c.ODLHybrid
+	}
+	if c.ODLHybridURL != "" {
+		m["odl_hybrid_url"] = c.ODLHybridURL
+	}
+	if c.ODLHybridMode != "" {
+		m["odl_hybrid_mode"] = c.ODLHybridMode
+	}
+	if c.ODLHybridFallback != nil {
+		m["odl_hybrid_fallback"] = fmt.Sprintf("%v", *c.ODLHybridFallback)
+	}
+	if c.ODLMarkdownWithHTML != nil {
+		m["odl_markdown_with_html"] = fmt.Sprintf("%v", *c.ODLMarkdownWithHTML)
+	}
+	if c.PaddleOCRVLEndpoint != "" {
+		m["paddleocr_vl_endpoint"] = c.PaddleOCRVLEndpoint
+	}
+	if c.PaddleOCRVLUseSealRecognition != nil {
+		m["paddleocr_vl_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseSealRecognition)
+	}
+	if c.PaddleOCRVLUseChartRecognition != nil {
+		m["paddleocr_vl_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseChartRecognition)
+	}
+	if c.PaddleOCRVLCloudToken != "" {
+		m["paddleocr_vl_cloud_token"] = c.PaddleOCRVLCloudToken
+	}
+	if c.PaddleOCRVLCloudModel != "" {
+		m["paddleocr_vl_cloud_model"] = c.PaddleOCRVLCloudModel
+	}
+	if c.PaddleOCRVLCloudUseSealRecognition != nil {
+		m["paddleocr_vl_cloud_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseSealRecognition)
+	}
+	if c.PaddleOCRVLCloudUseChartRecognition != nil {
+		m["paddleocr_vl_cloud_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseChartRecognition)
+	}
 	if len(m) == 0 {
 		return nil
 	}
--- a/scripts/cloud-image/README.md
+++ b/scripts/cloud-image/README.md
@@ -91,6 +91,7 @@ WeKnora `docker-compose.yml` 大量服务是 **profile 限定**，本镜像只
 | `jaeger` | OpenTelemetry trace UI |
 | `langfuse` | 自建 Langfuse 可观测平台 |
 | `dex` | OIDC 登录 |
+| `odl-hybrid` | OpenDataLoader Docling hybrid（体积大，无预发布镜像，需 `--build`） |

 启用方式：

@@ -99,6 +100,7 @@ cd /opt/WeKnora
 docker compose --profile neo4j up -d                 # 启用 GraphRAG
 docker compose --profile langfuse up -d              # 启用自建 Langfuse
 docker compose --profile qdrant up -d                # 切换到 Qdrant
+docker compose --profile odl-hybrid up -d --build odl-hybrid  # Docling hybrid（按需）
 ```

 ---
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -72,14 +72,17 @@ show_help() {
    echo "  --dex         启动 Dex（OIDC 身份认证）"
    echo "  --langfuse    启动 Langfuse（默认已开启）"
    echo "  --no-langfuse 不启动 Langfuse"
-    echo "  --full        启动所有可选服务"
+    echo "  --odl-hybrid  启动 OpenDataLoader hybrid（Docling，镜像较大，按需启用）"
+    echo "  --full        启动所有可选服务（不含 odl-hybrid，需另加 --odl-hybrid）"
    echo ""
    echo "示例："
    echo "  $0 start                    # 启动基础服务"
    echo "  $0 start --qdrant           # 启动基础服务 + Qdrant"
    echo "  $0 start --qdrant --jaeger  # 启动基础服务 + Qdrant + Jaeger"
    echo "  $0 start --dex             # 启动基础服务 + Dex"
+    echo "  $0 start --odl-hybrid       # 启动基础服务 + OpenDataLoader hybrid"
    echo "  $0 start --full             # 启动所有服务"
+    echo "  make dev-start DEV_ARGS=--odl-hybrid   # 同上（Makefile 传参）"
    echo "  $0 app                      # 在另一个终端启动后端"
    echo "  $0 frontend                 # 在另一个终端启动前端"
 }
@@ -104,6 +107,46 @@ check_docker() {
    return 0
 }

+# 检查 .env 是否启用了 hybrid 模式（用于 --odl-hybrid 启动后重建 docreader）
+_should_enable_odl_hybrid_from_env() {
+    local hybrid="${DOCREADER_ODL_HYBRID:-off}"
+    hybrid=$(printf '%s' "$hybrid" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]')
+    case "$hybrid" in
+        off|"") return 1 ;;
+        *) return 0 ;;
+    esac
+}
+
+# Append the odl-hybrid compose profile to PROFILES and record the service in
+# ENABLED_SERVICES. Callers guard against adding it twice.
+_enable_odl_hybrid_profile() {
+    PROFILES="$PROFILES --profile odl-hybrid"
+    ENABLED_SERVICES="$ENABLED_SERVICES odl-hybrid"
+}
+
+# 等待 odl-hybrid HTTP 健康检查通过（compose 启动后服务可能仍在拉依赖）
+_wait_odl_hybrid_ready() {
+    local port="${ODL_HYBRID_PORT:-5002}"
+    local max_wait="${ODL_HYBRID_STARTUP_WAIT_SEC:-180}"
+    local waited=0
+    local interval=5
+
+    if ! command -v curl &> /dev/null; then
+        log_warning "未安装 curl，跳过 odl-hybrid 就绪等待；请手动检查 http://localhost:${port}/health"
+        return 0
+    fi
+
+    log_info "等待 odl-hybrid 就绪（最多 ${max_wait}s，首次需构建镜像: docker compose ... build odl-hybrid）..."
+    while [ "$waited" -lt "$max_wait" ]; do
+        if curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
+            log_success "odl-hybrid 已就绪 (http://localhost:${port}/health)"
+            return 0
+        fi
+        sleep "$interval"
+        waited=$((waited + interval))
+    done
+    log_warning "odl-hybrid 在 ${max_wait}s 内未就绪，请查看: docker logs WeKnora-odl-hybrid"
+    return 1
+}
+
 # 启动基础设施服务
 start_services() {
    log_info "启动开发环境基础设施服务..."
@@ -120,6 +163,11 @@ start_services() {
        log_error ".env 文件不存在，请先创建"
        return 1
    fi
+
+    set -a
+    # shellcheck source=/dev/null
+    source .env
+    set +a
    
    # 解析 profile 参数
    shift  # 移除 "start" 命令本身
@@ -127,7 +175,6 @@ start_services() {
    # 其余可选服务通过 --minio / --qdrant / --neo4j / --jaeger / --dex / --full 按需开启。
    PROFILES="--profile langfuse"
    ENABLED_SERVICES="langfuse"
-    
    while [ $# -gt 0 ]; do
        case "$1" in
            --minio)
@@ -158,6 +205,11 @@ start_services() {
                PROFILES="${PROFILES//--profile langfuse/}"
                ENABLED_SERVICES="${ENABLED_SERVICES//langfuse/}"
                ;;
+            --odl-hybrid)
+                if [[ "$ENABLED_SERVICES" != *"odl-hybrid"* ]]; then
+                    _enable_odl_hybrid_profile
+                fi
+                ;;
            --full)
                PROFILES="--profile full"
                ENABLED_SERVICES="minio qdrant neo4j jaeger dex"
@@ -169,11 +221,22 @@ start_services() {
        esac
        shift
    done
-    
-    # 启动服务
+
+    # 启动服务（odl-hybrid 单独 --build，避免每次重建 docreader）
    "$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d
-    
-    if [ $? -eq 0 ]; then
+    local compose_rc=$?
+    if [ "$compose_rc" -eq 0 ] && [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
+        log_info "构建/更新 odl-hybrid 镜像..."
+        "$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d --build odl-hybrid
+        _wait_odl_hybrid_ready || true
+        # docreader 需读取 DOCREADER_ODL_HYBRID；若刚改 .env，强制重建以注入环境变量
+        if _should_enable_odl_hybrid_from_env; then
+            log_info "重建 docreader 以应用 DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID} ..."
+            "$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml up -d --force-recreate docreader
+        fi
+    fi
+
+    if [ "$compose_rc" -eq 0 ]; then
        log_success "基础设施服务已启动"
        echo ""
        log_info "服务访问地址:"
@@ -200,6 +263,10 @@ start_services() {
        if [[ "$ENABLED_SERVICES" == *"langfuse"* ]]; then
            echo "  - Langfuse:      http://localhost:${LANGFUSE_WEB_PORT:-3000}"
        fi
+        if [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
+            echo "  - ODL Hybrid:    http://localhost:${ODL_HYBRID_PORT:-5002} (health: /health)"
+            echo "                   docreader 需 DOCREADER_ODL_HYBRID=docling-fast"
+        fi
        
        echo ""
        log_info "接下来的步骤:"