mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing. Optional odl-hybrid sidecar stays local-build only and is excluded from default dev-start and full profiles.
This commit is contained in:
37
.env.example
37
.env.example
@@ -562,6 +562,39 @@ DOCREADER_TRANSPORT=grpc
|
||||
# 渲染页图的最大长边像素(防止超大页面 PDF 渲染出 100+MP 图、撑爆 gRPC 消息上限)
|
||||
# 调小可进一步减小图片体积;过小会影响 OCR 识别(密集中文建议 >=1600)
|
||||
# DOCREADER_PDF_RENDER_MAX_EDGE=2000
|
||||
# Layout text: insert spaces when glyph gaps exceed this × median char width (default 0.4).
|
||||
# DOCREADER_PDF_WORD_GAP_WIDTH_RATIO=0.4
|
||||
# Native PDF layout: drop narrow margin columns (arXiv sidebar) below this page-width ratio (default 0.12).
|
||||
# DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO=0.12
|
||||
# DOCREADER_PDF_MIN_HEADING_LINE_CHARS=8
|
||||
# Remove U+FFFE/soft-hyphen artifacts; strip vector chart axis text; render chart areas as JPEG.
|
||||
# DOCREADER_PDF_SANITIZE_TEXT=true
|
||||
# DOCREADER_PDF_STRIP_CHART_DEBRIS=true
|
||||
# DOCREADER_PDF_RENDER_VECTOR_FIGURES=true
|
||||
|
||||
# OpenDataLoader PDF(知识库 parser_engine_rules 指定 engine: opendataloader)
|
||||
# 需 Java 11+;docreader 镜像已包含 openjdk-17-jre-headless。
|
||||
# DOCREADER_ODL_MAX_WORKERS=1
|
||||
# fast 模式(默认):DOCREADER_ODL_HYBRID=off
|
||||
# hybrid 需另起服务;镜像/模型较大,默认 pull/up 与 --full 均不含 odl-hybrid。
|
||||
# 默认 --no-ocr(不做 EasyOCR)。
|
||||
# 开发:make dev-start DEV_ARGS=--odl-hybrid(本地 build)
|
||||
# 生产/docker-compose.yml(需 DOCREADER_ODL_HYBRID=docling-fast 等):
|
||||
# docker compose --profile odl-hybrid up -d --build odl-hybrid
|
||||
# 该镜像未发布到 Docker Hub(本地 tag: weknora-odl-hybrid:local),make pull-images 不会拉取,只能按需 build。
|
||||
# 修改 Dockerfile.odl-hybrid 后需重建:docker compose --profile odl-hybrid build --no-cache odl-hybrid
|
||||
# ODL_HYBRID_EXTRA_ARGS=--no-ocr
|
||||
# 扫描件不要用 hybrid OCR,请用 builtin 扫描渲染 + Go OCR,或 MinerU;若坚持 hybrid OCR:
|
||||
# ODL_HYBRID_EXTRA_ARGS=--force-ocr
|
||||
# DOCREADER_ODL_HYBRID=docling-fast
|
||||
# DOCREADER_ODL_HYBRID_URL=http://odl-hybrid:5002
|
||||
# 开发环境 hybrid:make dev-start DEV_ARGS=--odl-hybrid
|
||||
# 仅用 fast 模式(不需 odl-hybrid 容器)时请保持 DOCREADER_ODL_HYBRID=off。
|
||||
# ODL_HYBRID_PORT=5002
|
||||
# ODL_HYBRID_STARTUP_WAIT_SEC=180
|
||||
# DOCREADER_ODL_HYBRID_MODE=auto
|
||||
# DOCREADER_ODL_HYBRID_FALLBACK=false
|
||||
# DOCREADER_ODL_MARKDOWN_WITH_HTML=false
|
||||
|
||||
# VLM(视觉模型)单次 HTTP 请求的整体超时时间(秒)。
|
||||
# 扫描件整页 OCR(全文+版式抽取)在慢端点上很容易超过默认值,
|
||||
@@ -648,7 +681,9 @@ DOCREADER_TRANSPORT=grpc
|
||||
# --- Async pipeline tuning (optional) -----------------------------------------
|
||||
# Worker pool size for the asynq server. Default 16 — raise it on machines
|
||||
# that handle many concurrent uploads (default Go runtime.NumCPU() under-
|
||||
# provisions for the I/O-bound document pipeline).
|
||||
# provisions for the I/O-bound document pipeline). Can also be set in the
|
||||
# management UI under Settings → System settings (asynq.concurrency);
|
||||
# UI changes require a process restart.
|
||||
# WEKNORA_ASYNQ_CONCURRENCY=16
|
||||
|
||||
# Read/write timeout (in milliseconds) the asynq client uses against Redis.
|
||||
|
||||
3
Makefile
3
Makefile
@@ -50,6 +50,7 @@ help:
|
||||
@echo ""
|
||||
@echo "开发模式(推荐):"
|
||||
@echo " dev-start 启动开发环境基础设施(仅启动依赖服务)"
|
||||
@echo " 可选: make dev-start DEV_ARGS=--odl-hybrid"
|
||||
@echo " dev-stop 停止开发环境"
|
||||
@echo " dev-restart 重启开发环境"
|
||||
@echo " dev-logs 查看开发环境日志"
|
||||
@@ -310,7 +311,7 @@ show-platform:
|
||||
|
||||
# Development mode commands
|
||||
dev-start:
|
||||
./scripts/dev.sh start
|
||||
./scripts/dev.sh start $(DEV_ARGS)
|
||||
|
||||
dev-stop:
|
||||
./scripts/dev.sh stop
|
||||
|
||||
@@ -248,8 +248,13 @@ services:
|
||||
- docreader-tmp-dev:/tmp/docreader
|
||||
environment:
|
||||
- DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
|
||||
- MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
|
||||
- MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
|
||||
- DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
|
||||
- DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
|
||||
- DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
|
||||
- DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
|
||||
- DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
|
||||
- DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
|
||||
- DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
|
||||
- DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
|
||||
- DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
|
||||
@@ -272,6 +277,27 @@ services:
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
# OpenDataLoader hybrid backend (optional). Enable profile "odl-hybrid" and set
|
||||
# DOCREADER_ODL_HYBRID=docling-fast on docreader. Default --no-ocr (no EasyOCR).
|
||||
# Local build only — not published to Docker Hub.
|
||||
odl-hybrid:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.odl-hybrid
|
||||
image: weknora-odl-hybrid:local
|
||||
container_name: WeKnora-odl-hybrid
|
||||
profiles:
|
||||
- odl-hybrid
|
||||
ports:
|
||||
- "${ODL_HYBRID_PORT:-5002}:5002"
|
||||
environment:
|
||||
# Default --no-ocr (digital PDFs). Scanned PDFs: use builtin OCR / MinerU, or
|
||||
# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (needs EasyOCR + libGL in image).
|
||||
- ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
|
||||
networks:
|
||||
- WeKnora-network-dev
|
||||
restart: unless-stopped
|
||||
|
||||
jaeger:
|
||||
image: jaegertracing/all-in-one:latest
|
||||
container_name: WeKnora-jaeger-dev
|
||||
|
||||
@@ -241,6 +241,12 @@ services:
|
||||
environment:
|
||||
- DOCREADER_IMAGE_OUTPUT_DIR=/tmp/docreader
|
||||
- MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-}
|
||||
- DOCREADER_ODL_MAX_WORKERS=${DOCREADER_ODL_MAX_WORKERS:-1}
|
||||
- DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID:-off}
|
||||
- DOCREADER_ODL_HYBRID_URL=${DOCREADER_ODL_HYBRID_URL:-http://odl-hybrid:5002}
|
||||
- DOCREADER_ODL_HYBRID_MODE=${DOCREADER_ODL_HYBRID_MODE:-auto}
|
||||
- DOCREADER_ODL_HYBRID_FALLBACK=${DOCREADER_ODL_HYBRID_FALLBACK:-false}
|
||||
- DOCREADER_ODL_MARKDOWN_WITH_HTML=${DOCREADER_ODL_MARKDOWN_WITH_HTML:-false}
|
||||
- DOCREADER_MARKITDOWN_MAX_WORKERS=${DOCREADER_MARKITDOWN_MAX_WORKERS:-1}
|
||||
- DOCREADER_PDF_RENDER_MAX_WORKERS=${DOCREADER_PDF_RENDER_MAX_WORKERS:-1}
|
||||
- DOCREADER_PDF_RENDER_DPI=${DOCREADER_PDF_RENDER_DPI:-200}
|
||||
@@ -250,13 +256,6 @@ services:
|
||||
- GRPC_TLS_KEY=${GRPC_TLS_KEY:-}
|
||||
- GRPC_TLS_CA=${GRPC_TLS_CA:-}
|
||||
- GRPC_AUTH_TOKEN=${GRPC_AUTH_TOKEN:-}
|
||||
- OBS_ENDPOINT=${OBS_ENDPOINT:-}
|
||||
- OBS_REGION=${OBS_REGION:-}
|
||||
- OBS_ACCESS_KEY=${OBS_ACCESS_KEY:-}
|
||||
- OBS_SECRET_KEY=${OBS_SECRET_KEY:-}
|
||||
- OBS_BUCKET_NAME=${OBS_BUCKET_NAME:-}
|
||||
- OBS_PATH_PREFIX=${OBS_PATH_PREFIX:-}
|
||||
- OBS_PROXY_DOMAIN=${OBS_PROXY_DOMAIN:-}
|
||||
healthcheck:
|
||||
test: ["CMD", "grpc_health_probe", "-addr=localhost:50051"]
|
||||
interval: 30s
|
||||
@@ -269,6 +268,24 @@ services:
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
||||
  # OpenDataLoader hybrid backend (optional). Default --no-ocr (no EasyOCR; the image still ships libGL for OpenCV).
|
||||
# Local build only — not published to Docker Hub; use --profile odl-hybrid --build.
|
||||
odl-hybrid:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: docker/Dockerfile.odl-hybrid
|
||||
image: weknora-odl-hybrid:local
|
||||
container_name: WeKnora-odl-hybrid
|
||||
profiles:
|
||||
- odl-hybrid
|
||||
expose:
|
||||
- "5002"
|
||||
environment:
|
||||
- ODL_HYBRID_EXTRA_ARGS=${ODL_HYBRID_EXTRA_ARGS:---no-ocr}
|
||||
networks:
|
||||
- WeKnora-network
|
||||
restart: unless-stopped
|
||||
|
||||
# 修改的PostgreSQL配置
|
||||
postgres:
|
||||
image: paradedb/paradedb:v0.22.2-pg17
|
||||
|
||||
@@ -94,6 +94,7 @@ RUN apt-get update && apt-get install -y \
|
||||
libjpeg62-turbo \
|
||||
wget \
|
||||
gnupg \
|
||||
openjdk-17-jre-headless \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
antiword \
|
||||
|
||||
29
docker/Dockerfile.odl-hybrid
Normal file
29
docker/Dockerfile.odl-hybrid
Normal file
@@ -0,0 +1,29 @@
|
||||
# OpenDataLoader PDF hybrid backend (Docling). Pre-install deps so the
|
||||
# container listens on :5002 immediately instead of pip install on every start.
|
||||
#
|
||||
# Default --no-ocr: digital PDFs already have a text layer; Docling layout/table
|
||||
# still runs without EasyOCR (avoids the heavy OCR stack; libGL is still installed below for OpenCV).
|
||||
# For scanned PDFs use builtin docreader OCR, MinerU, or override with
|
||||
# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (requires extra system/Python deps).
|
||||
FROM python:3.10.18-bookworm
|
||||
|
||||
# Docling table/layout models import cv2 (OpenCV), which needs libGL at runtime
|
||||
# even when hybrid runs with --no-ocr.
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
libgomp1 \
|
||||
libsm6 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip install --no-cache-dir "opendataloader-pdf[hybrid]>=2.4.7"
|
||||
|
||||
EXPOSE 5002
|
||||
|
||||
ENV ODL_HYBRID_EXTRA_ARGS="--no-ocr"
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=10s --retries=5 --start-period=120s \
|
||||
CMD curl -f http://localhost:5002/health || exit 1
|
||||
|
||||
CMD ["bash", "-c", "exec opendataloader-pdf-hybrid --host 0.0.0.0 --port 5002 ${ODL_HYBRID_EXTRA_ARGS}"]
|
||||
@@ -55,6 +55,12 @@ class DocReaderConfig:
|
||||
# Parser
|
||||
docx_max_pages: int
|
||||
markitdown_max_workers: int
|
||||
odl_max_workers: int
|
||||
odl_hybrid: str
|
||||
odl_hybrid_url: str
|
||||
odl_hybrid_mode: str
|
||||
odl_hybrid_fallback: bool
|
||||
odl_markdown_with_html: bool
|
||||
pdf_render_max_workers: int
|
||||
pdf_render_parallelism: int
|
||||
pdf_render_dpi: int
|
||||
@@ -81,6 +87,17 @@ def load_config() -> DocReaderConfig:
|
||||
grpc_port = _get_int(["DOCREADER_GRPC_PORT", "PORT"], 50051)
|
||||
docx_max_pages = _get_int(["DOCREADER_DOCX_MAX_PAGES"], 0)
|
||||
markitdown_max_workers = _get_int(["DOCREADER_MARKITDOWN_MAX_WORKERS"], 1)
|
||||
odl_max_workers = _get_int(["DOCREADER_ODL_MAX_WORKERS"], 1)
|
||||
odl_hybrid = _get_str(["DOCREADER_ODL_HYBRID"], "off")
|
||||
odl_hybrid_url = _get_str(
|
||||
["DOCREADER_ODL_HYBRID_URL"],
|
||||
"http://127.0.0.1:5002",
|
||||
)
|
||||
odl_hybrid_mode = _get_str(["DOCREADER_ODL_HYBRID_MODE"], "auto")
|
||||
odl_hybrid_fallback = _get_bool(["DOCREADER_ODL_HYBRID_FALLBACK"], False)
|
||||
odl_markdown_with_html = _get_bool(
|
||||
["DOCREADER_ODL_MARKDOWN_WITH_HTML"], False
|
||||
)
|
||||
pdf_render_max_workers = _get_int(["DOCREADER_PDF_RENDER_MAX_WORKERS"], 1)
|
||||
# Intra-document render parallelism: how many worker processes render the
|
||||
# scanned pages of a SINGLE PDF in parallel. pdfium is not thread-safe, so
|
||||
@@ -117,6 +134,12 @@ def load_config() -> DocReaderConfig:
|
||||
grpc_port=grpc_port,
|
||||
docx_max_pages=docx_max_pages,
|
||||
markitdown_max_workers=markitdown_max_workers,
|
||||
odl_max_workers=odl_max_workers,
|
||||
odl_hybrid=odl_hybrid,
|
||||
odl_hybrid_url=odl_hybrid_url,
|
||||
odl_hybrid_mode=odl_hybrid_mode,
|
||||
odl_hybrid_fallback=odl_hybrid_fallback,
|
||||
odl_markdown_with_html=odl_markdown_with_html,
|
||||
pdf_render_max_workers=pdf_render_max_workers,
|
||||
pdf_render_parallelism=pdf_render_parallelism,
|
||||
pdf_render_dpi=pdf_render_dpi,
|
||||
@@ -139,6 +162,12 @@ def dump_config(mask_secrets: bool = True) -> Dict[str, Any]:
|
||||
"DOCREADER_GRPC_PORT": cfg.grpc_port,
|
||||
"DOCREADER_DOCX_MAX_PAGES": cfg.docx_max_pages,
|
||||
"DOCREADER_MARKITDOWN_MAX_WORKERS": cfg.markitdown_max_workers,
|
||||
"DOCREADER_ODL_MAX_WORKERS": cfg.odl_max_workers,
|
||||
"DOCREADER_ODL_HYBRID": cfg.odl_hybrid,
|
||||
"DOCREADER_ODL_HYBRID_URL": cfg.odl_hybrid_url,
|
||||
"DOCREADER_ODL_HYBRID_MODE": cfg.odl_hybrid_mode,
|
||||
"DOCREADER_ODL_HYBRID_FALLBACK": cfg.odl_hybrid_fallback,
|
||||
"DOCREADER_ODL_MARKDOWN_WITH_HTML": cfg.odl_markdown_with_html,
|
||||
"DOCREADER_PDF_RENDER_MAX_WORKERS": cfg.pdf_render_max_workers,
|
||||
"DOCREADER_PDF_RENDER_PARALLELISM": cfg.pdf_render_parallelism,
|
||||
"DOCREADER_PDF_RENDER_DPI": cfg.pdf_render_dpi,
|
||||
|
||||
149
docreader/parser/excel_convert.py
Normal file
149
docreader/parser/excel_convert.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""LibreOffice helpers for normalizing legacy or unusual Excel uploads."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
|
||||
_ZIP_MAGIC = b"PK\x03\x04"
|
||||
|
||||
|
||||
def detect_excel_format(content: bytes) -> str | None:
    """Classify spreadsheet bytes as ``xlsx``, ``xls``, ``xlsb`` or ``ods``.

    Args:
        content: Raw bytes of the uploaded file.

    Returns:
        The format id understood by pandas, or ``None`` when the payload is
        empty or nothing recognizes it. A generic ZIP container reported by
        pandas as ``"zip"`` is treated as ``xlsx``.
    """
    if not content:
        return None

    from pandas.io.excel._base import inspect_excel_format

    # pandas sniffs the container; "zip" means a ZIP without known workbook parts.
    recognized = {
        "xlsx": "xlsx",
        "xls": "xls",
        "xlsb": "xlsb",
        "ods": "ods",
        "zip": "xlsx",
    }
    sniffed = inspect_excel_format(content_or_path=content)
    if sniffed in recognized:
        return recognized[sniffed]

    # Last resort: plain magic-byte checks, ZIP first and then the legacy XLS header.
    for magic, fmt in ((_ZIP_MAGIC, "xlsx"), (_XLS_MAGIC, "xls")):
        if content.startswith(magic):
            return fmt
    return None
|
||||
|
||||
|
||||
def engine_for_format(ext: str | None) -> str:
    """Pick the pandas ``ExcelFile`` engine for a detected spreadsheet format.

    Args:
        ext: Format id (``xls``, ``xlsx``, ``xlsb``, ``ods``) or ``None``.

    Returns:
        The pandas engine name. Unknown or missing formats fall back to
        ``openpyxl``.
    """
    # openpyxl only reads the XML-based formats; binary workbooks (.xlsb)
    # need pandas' dedicated "pyxlsb" engine.
    engines = {
        "xls": "xlrd",
        "xlsx": "openpyxl",
        "xlsb": "pyxlsb",
        "ods": "odf",
    }
    return engines.get(ext, "openpyxl")
|
||||
|
||||
|
||||
def convert_excel_to_xlsx_bytes(content: bytes, suffix: str = ".xlsx") -> bytes | None:
    """Convert arbitrary spreadsheet bytes to XLSX using LibreOffice, if available.

    Args:
        content: Raw bytes of the source spreadsheet.
        suffix: File extension (with leading dot) given to the temporary input
            file.

    Returns:
        The converted XLSX bytes, or ``None`` when LibreOffice is missing,
        cannot be launched, times out, or produces no output after all
        attempts.
    """
    soffice = find_soffice()
    if not soffice:
        return None

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as profile_dir:
            src = os.path.join(temp_dir, f"input{suffix}")
            with open(src, "wb") as handle:
                handle.write(content)

            # Write results to a separate directory: with suffix ".xlsx" the
            # input is itself named input.xlsx, and scanning the shared
            # directory could hand the unconverted input back as the result.
            out_dir = os.path.join(temp_dir, "out")
            os.makedirs(out_dir)

            # A throwaway profile directory is passed as UserInstallation for this run.
            user_installation = Path(profile_dir).as_uri()
            cmd = [
                soffice,
                "--headless",
                f"-env:UserInstallation={user_installation}",
                "--convert-to",
                "xlsx",
                "--outdir",
                out_dir,
                src,
            ]
            try:
                result = subprocess.run(cmd, capture_output=True, timeout=120)
            except subprocess.TimeoutExpired as exc:
                logger.warning("LibreOffice convert timed out: %s", exc)
                return None
            except OSError as exc:
                logger.warning("LibreOffice convert failed to start: %s", exc)
                return None

            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")
                logger.warning(
                    "LibreOffice convert failed (attempt %s/%s): %s",
                    attempt,
                    max_attempts,
                    stderr,
                )
                if attempt < max_attempts:
                    # Linear backoff before the next attempt.
                    time.sleep(0.5 * attempt)
                    continue
                return None

            for name in sorted(os.listdir(out_dir)):
                if name.endswith(".xlsx"):
                    with open(os.path.join(out_dir, name), "rb") as handle:
                        converted = handle.read()
                    logger.info(
                        "Converted spreadsheet via LibreOffice (%s -> xlsx, %d bytes)",
                        suffix,
                        len(converted),
                    )
                    return converted

        # Exit code 0 but no output file: back off and retry.
        if attempt < max_attempts:
            time.sleep(0.5 * attempt)
    return None
|
||||
|
||||
|
||||
def normalize_excel_bytes(content: bytes, file_type: str | None = None) -> bytes:
    """Return spreadsheet bytes that ``detect_excel_format`` recognizes.

    Already-recognized content is returned untouched. Otherwise the bytes are
    passed through ``convert_excel_to_xlsx_bytes`` under a series of candidate
    file suffixes until one conversion yields a recognizable workbook.

    Args:
        content: Raw bytes of the upload.
        file_type: Optional declared extension (with or without leading dot),
            tried first.

    Returns:
        The original or converted bytes.

    Raises:
        ValueError: When no candidate suffix produces a recognizable workbook.
    """
    if detect_excel_format(content) is not None:
        return content

    candidates = [f".{file_type.lstrip('.')}"] if file_type else []
    candidates += [".xlsx", ".xls", ".et", ".csv"]

    # dict.fromkeys de-duplicates while keeping first-seen order.
    for suffix in dict.fromkeys(candidates):
        converted = convert_excel_to_xlsx_bytes(content, suffix=suffix)
        if not converted:
            continue
        if detect_excel_format(converted) is not None:
            return converted

    raise ValueError(
        "Unrecognized Excel file format; the file may be corrupt, encrypted, "
        "or not a spreadsheet"
    )
|
||||
|
||||
|
||||
def find_soffice() -> Optional[str]:
    """Locate the LibreOffice ``soffice`` executable.

    Well-known install locations are probed first; if none exists, the
    executable search path is consulted.

    Returns:
        Path to ``soffice``, or ``None`` when it cannot be found.
    """
    # Local import: this is the only function in the block that needs shutil.
    import shutil

    possible_paths = [
        "/usr/bin/soffice",
        "/usr/lib/libreoffice/program/soffice",
        "/opt/libreoffice25.2/program/soffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "C:\\Program Files\\LibreOffice\\program\\soffice.exe",
        "C:\\Program Files (x86)\\LibreOffice\\program\\soffice.exe",
    ]
    for path in possible_paths:
        if path and os.path.exists(path):
            return path

    # shutil.which avoids spawning the external `which` binary, which does not
    # exist on Windows or some minimal images and would raise FileNotFoundError.
    return shutil.which("soffice") or None
|
||||
@@ -13,6 +13,14 @@ import pandas as pd
|
||||
|
||||
from docreader.models.document import Chunk, Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.excel_convert import (
|
||||
convert_excel_to_xlsx_bytes,
|
||||
detect_excel_format,
|
||||
engine_for_format,
|
||||
normalize_excel_bytes,
|
||||
)
|
||||
from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
|
||||
from docreader.parser.xlsx_repair import repair_xlsx_bytes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -60,13 +68,11 @@ class ExcelParser(BaseParser):
|
||||
text: List[str] = []
|
||||
start, end = 0, 0
|
||||
|
||||
# Load Excel file from bytes into pandas ExcelFile object
|
||||
excel_file = pd.ExcelFile(BytesIO(content))
|
||||
excel_file = _open_excel_file(content, file_type=self.file_type)
|
||||
|
||||
# Process each sheet in the Excel file
|
||||
for excel_sheet_name in excel_file.sheet_names:
|
||||
# Parse the sheet into a DataFrame
|
||||
df = excel_file.parse(sheet_name=excel_sheet_name)
|
||||
df = _read_sheet_dataframe(excel_file, excel_sheet_name)
|
||||
# Remove rows where all values are NaN (completely empty rows)
|
||||
df.dropna(how="all", inplace=True)
|
||||
|
||||
@@ -97,6 +103,90 @@ class ExcelParser(BaseParser):
|
||||
return Document(content="".join(text), chunks=chunks)
|
||||
|
||||
|
||||
def _read_sheet_dataframe(excel_file: pd.ExcelFile, sheet_name: str) -> pd.DataFrame:
    """Load one worksheet as a DataFrame with predictable column labels.

    Workbooks opened with the ``openpyxl`` engine are always read without a
    header row and labelled A, B, C, .... Other engines use the first row as
    header, unless that leaves the frame empty or produces ``Unnamed:``
    columns, in which case the sheet is re-read header-less with letter labels.
    """
    from openpyxl.utils import get_column_letter

    def _parse_with_letters() -> pd.DataFrame:
        # Header-less read: row 1 stays data, columns become spreadsheet letters.
        frame = excel_file.parse(sheet_name=sheet_name, header=None)
        frame.columns = [
            get_column_letter(position)
            for position in range(1, len(frame.columns) + 1)
        ]
        return frame

    if excel_file.engine == "openpyxl":
        return _parse_with_letters()

    frame = excel_file.parse(sheet_name=sheet_name, header=0)
    header_unusable = frame.empty or any(
        str(label).startswith("Unnamed:") for label in frame.columns
    )
    if header_unusable:
        return _parse_with_letters()
    return frame
|
||||
|
||||
|
||||
def _prepare_xlsx_bytes(data: bytes) -> bytes:
    """Run the XLSX pre-processing chain: optional repair, then merged-cell fill.

    ``repair_xlsx_bytes`` returning ``None`` means the original bytes are used
    unchanged for the fill step.
    """
    repaired = repair_xlsx_bytes(data)
    source = data if repaired is None else repaired
    return fill_merged_cells_xlsx(source)
|
||||
|
||||
|
||||
def _open_excel_file(content: bytes, file_type: str | None = None) -> pd.ExcelFile:
    """Open an Excel workbook with explicit engine selection and fallbacks.

    The function loops, re-detecting the format of ``data`` after every
    transformation, until ``pd.ExcelFile`` opens successfully or an error is
    raised.

    Args:
        content: Raw bytes of the uploaded workbook.
        file_type: Optional declared extension, forwarded to
            ``normalize_excel_bytes`` as a conversion hint.

    Returns:
        An open ``pd.ExcelFile`` backed by an in-memory buffer.

    Raises:
        ValueError: When the format cannot be determined even after
            normalization, or when the selected engine is not importable.
        KeyError: Re-raised from pandas when it is not the handled
            ``sharedStrings.xml`` case or no repair is available.
    """
    data = content
    # Set once normalize_excel_bytes has been applied, so it is not retried.
    converted_via_soffice = False

    while True:
        ext = detect_excel_format(data)
        if ext is None:
            # Unknown format: normalize once; a second miss is final.
            if converted_via_soffice:
                raise ValueError(
                    "Excel file format cannot be determined, you must specify an engine manually."
                )
            try:
                data = normalize_excel_bytes(data, file_type=file_type)
            except ValueError as exc:
                raise ValueError(
                    "Excel file format cannot be determined, you must specify an engine manually."
                ) from exc
            converted_via_soffice = True
            continue

        if ext == "ods":
            # Prefer converting ODS to XLSX; if conversion yields nothing, fall
            # through and open the ODS with its own engine.
            converted = convert_excel_to_xlsx_bytes(data, suffix=".ods")
            if converted:
                data = converted
                continue

        engine = engine_for_format(ext)
        if ext == "xlsx":
            # XLSX goes through repair + merged-cell fill before parsing.
            data = _prepare_xlsx_bytes(data)
            engine = "openpyxl"
        try:
            return pd.ExcelFile(BytesIO(data), engine=engine)
        except ImportError as exc:
            # Surface a missing optional engine as a ValueError naming it.
            raise ValueError(
                f"Excel engine {engine!r} is not available for .{ext} files"
            ) from exc
        except KeyError as exc:
            # Only a KeyError mentioning sharedStrings.xml under openpyxl is
            # treated as repairable; everything else propagates.
            if "sharedStrings.xml" not in str(exc) or engine != "openpyxl":
                raise
            repaired = repair_xlsx_bytes(data)
            if repaired is None:
                raise
            logger.info("Repaired XLSX sharedStrings packaging before parse")
            data = _prepare_xlsx_bytes(repaired)
            # NOTE(review): this branch has no retry guard; presumably
            # repair_xlsx_bytes returns None once nothing is left to fix,
            # otherwise the loop could repeat - confirm.
            continue
        except ValueError as exc:
            # Retry via normalization only for the "cannot be determined"
            # error, and only if normalization has not been tried yet.
            if converted_via_soffice or "cannot be determined" not in str(exc):
                raise
            try:
                # Normalization restarts from the original upload, not `data`.
                data = normalize_excel_bytes(content, file_type=file_type)
            except ValueError:
                raise
            converted_via_soffice = True
            continue
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example usage: Parse an Excel file and display results
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
"""Optional PDF engine backed by LiteParse (LlamaIndex, MIT).
|
||||
|
||||
LiteParse is a fast Rust/PDFium text extractor that performs spatial reading-order
|
||||
reconstruction natively (multi-column aware) and is considerably faster than the
|
||||
Python text path. It is exposed as a *selectable* engine (``liteparse``) rather
|
||||
than replacing the builtin engine, so users can opt in per knowledge base.
|
||||
|
||||
Scope/limitations (documented intentionally):
|
||||
* Text-first engine: it returns reading-order plain text, not figures. Scanned
|
||||
pages carry no text layer, so for image-dominated PDFs we fall back to the
|
||||
builtin scanned renderer (page -> JPEG, OCR'd by the Go App) to stay robust.
|
||||
* docreader never runs OCR itself; OCR/VLM remain Go-side responsibilities.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# If the extracted text averages fewer characters per page than this, the PDF is
|
||||
# treated as scanned/image-dominated and routed to the builtin image renderer.
|
||||
_MIN_CHARS_PER_PAGE = 20
|
||||
# If at least this fraction of sampled pages are image-dominated, the PDF is
|
||||
# scanned (even when it carries a garbled OCR text layer) and is routed to the
|
||||
# builtin image renderer rather than trusting the low-quality text.
|
||||
_SCANNED_PAGE_FRACTION = 0.5
|
||||
|
||||
|
||||
def liteparse_available(_overrides=None):
|
||||
"""Engine availability probe used by the registry/UI."""
|
||||
try:
|
||||
import liteparse # noqa: F401
|
||||
except Exception as e: # pragma: no cover - depends on install
|
||||
return False, f"liteparse 未安装: {e}"
|
||||
return True, ""
|
||||
|
||||
|
||||
class LiteParseParser(BaseParser):
|
||||
"""Parse a PDF with LiteParse, falling back to scanned rendering when empty."""
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
import liteparse
|
||||
|
||||
from docreader.parser.pdf_parser import (
|
||||
PDFScannedParser,
|
||||
estimate_scanned_fraction,
|
||||
)
|
||||
|
||||
# Image-dominated PDFs (incl. ones with a garbled OCR text layer) carry
|
||||
# no trustworthy text; render them as images for Go-side OCR instead.
|
||||
try:
|
||||
scanned_frac = estimate_scanned_fraction(content)
|
||||
except Exception:
|
||||
scanned_frac = 0.0
|
||||
if scanned_frac >= _SCANNED_PAGE_FRACTION:
|
||||
logger.info(
|
||||
"LiteParseParser: %s is image-dominated (%.0f%% scanned pages); "
|
||||
"using builtin scanned renderer",
|
||||
self.file_name,
|
||||
scanned_frac * 100,
|
||||
)
|
||||
return PDFScannedParser(
|
||||
file_name=self.file_name, file_type=self.file_type
|
||||
).parse_into_text(content)
|
||||
|
||||
engine = liteparse.LiteParse(ocr_enabled=False, quiet=True)
|
||||
result = engine.parse(content)
|
||||
page_count = int(result.num_pages)
|
||||
|
||||
page_texts = []
|
||||
for i in range(page_count):
|
||||
page = result.get_page(i)
|
||||
page_texts.append((getattr(page, "text", "") or "").strip())
|
||||
|
||||
doc_text = (getattr(result, "text", "") or "").strip()
|
||||
if not doc_text:
|
||||
doc_text = "\n\n".join(t for t in page_texts if t)
|
||||
|
||||
# Image-dominated / scanned PDFs yield little to no text: defer to the
|
||||
# builtin scanned renderer so the Go App can OCR the page images.
|
||||
if page_count and len(doc_text) < _MIN_CHARS_PER_PAGE * page_count:
|
||||
logger.info(
|
||||
"LiteParseParser: %s looks scanned (%d chars / %d pages); "
|
||||
"falling back to builtin scanned renderer",
|
||||
self.file_name,
|
||||
len(doc_text),
|
||||
page_count,
|
||||
)
|
||||
return PDFScannedParser(
|
||||
file_name=self.file_name, file_type=self.file_type
|
||||
).parse_into_text(content)
|
||||
|
||||
logger.info(
|
||||
"LiteParseParser: %s -> %d pages, content_len=%d",
|
||||
self.file_name,
|
||||
page_count,
|
||||
len(doc_text),
|
||||
)
|
||||
return Document(
|
||||
content=doc_text,
|
||||
images={},
|
||||
metadata={
|
||||
"page_count": page_count,
|
||||
"image_source_type": "pdf_text_layer",
|
||||
"parser_engine": "liteparse",
|
||||
},
|
||||
)
|
||||
@@ -18,6 +18,8 @@ import re
|
||||
import uuid
|
||||
from typing import Dict, List, Match, Optional, Tuple
|
||||
|
||||
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
@@ -58,6 +60,71 @@ class MarkdownTableUtil:
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _split_row_cells(row_line: str) -> List[str]:
|
||||
"""Split a markdown table row into cells, preserving empty cells."""
|
||||
inner = row_line.strip()
|
||||
if not inner.startswith("|"):
|
||||
return []
|
||||
parts = inner.split("|")
|
||||
if parts and parts[0].strip() == "":
|
||||
parts = parts[1:]
|
||||
if parts and parts[-1].strip() == "":
|
||||
parts = parts[:-1]
|
||||
return [part.strip() for part in parts]
|
||||
|
||||
@staticmethod
|
||||
def _is_table_row(line: str) -> bool:
|
||||
stripped = line.strip()
|
||||
return stripped.startswith("|") and "|" in stripped[1:]
|
||||
|
||||
@classmethod
def _is_separator_row(cls, line: str) -> bool:
    """Return True when the row has cells and every cell matches the delimiter pattern."""
    cells = cls._split_row_cells(line)
    if not cells:
        return False
    for cell in cells:
        if not _SEPARATOR_CELL.match(cell):
            return False
    return True
|
||||
|
||||
@classmethod
def _is_empty_row(cls, line: str) -> bool:
    """Return True when the row has at least one cell and all of its cells are blank."""
    cells = cls._split_row_cells(line)
    # Cells are already stripped strings, so any truthy cell means non-empty text.
    return len(cells) > 0 and not any(cells)
|
||||
|
||||
@classmethod
def _separator_row_for(cls, header_line: str) -> str:
    """Build a ``| --- | --- |`` delimiter row with one ``---`` per cell of ``header_line``."""
    column_count = len(cls._split_row_cells(header_line))
    dashes = ["---"] * column_count
    return "| " + " | ".join(dashes) + " |"
|
||||
|
||||
@classmethod
def _normalize_table_block(cls, block: List[str]) -> List[str]:
    """Clean up one run of consecutive table rows.

    Leading all-empty rows and one following separator row are removed from
    ``block`` in place. If at least two rows remain and the second is not a
    delimiter row, a delimiter row is inserted after the first.

    Returns:
        The cleaned rows: ``block`` itself, or a new list when a delimiter
        row was inserted.
    """
    # Strip the bogus empty-header prefix.
    while block and cls._is_empty_row(block[0]):
        del block[0]
    # Drop the separator that followed the empty header.
    if block and cls._is_separator_row(block[0]):
        del block[0]

    # Nothing to insert for 0/1 rows or when the delimiter is already in place.
    if len(block) < 2 or cls._is_separator_row(block[1]):
        return block

    # A delimiter row must follow the first row for GFM renderers.
    header, *rest = block
    return [header, cls._separator_row_for(header), *rest]
|
||||
|
||||
def normalize_spurious_table_prefixes(self, content: str) -> str:
    """Remove bogus empty/separator prefix rows from MarkItDown table output.

    Consecutive table rows are grouped into blocks and each block is passed
    through ``_normalize_table_block``. Lines inside fenced code blocks
    (``` or ~~~) are copied through untouched, so pipe-leading code is never
    rewritten as a table.

    Args:
        content: Markdown text.

    Returns:
        The markdown with table blocks normalized; all other lines unchanged.
    """
    lines = content.split("\n")
    out: List[str] = []
    open_fence: Optional[str] = None  # marker of the currently open code fence
    i = 0
    while i < len(lines):
        line = lines[i]

        fence_match = re.match(r" {0,3}(`{3,}|~{3,})", line)
        if fence_match:
            marker = fence_match.group(1)
            if open_fence is None:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and line.strip() == marker
            ):
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no info string.
                open_fence = None
            out.append(line)
            i += 1
            continue

        if open_fence is not None or not self._is_table_row(line):
            out.append(line)
            i += 1
            continue

        # Collect the maximal run of table rows and normalize it as a unit.
        block: List[str] = []
        while i < len(lines) and self._is_table_row(lines[i]):
            block.append(lines[i])
            i += 1
        out.extend(self._normalize_table_block(block))
    return "\n".join(out)
|
||||
|
||||
def format_table(self, content: str) -> str:
|
||||
"""Format all Markdown tables in the content.
|
||||
|
||||
@@ -70,8 +137,7 @@ class MarkdownTableUtil:
|
||||
|
||||
def process_align(match: Match[str]) -> str:
|
||||
"""Process alignment row to standardize format."""
|
||||
# Split by | and remove empty strings
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
columns = self._split_row_cells(match.group(0))
|
||||
|
||||
processed = []
|
||||
for col in columns:
|
||||
@@ -87,8 +153,7 @@ class MarkdownTableUtil:
|
||||
|
||||
def process_line(match: Match[str]) -> str:
|
||||
"""Process regular table row to standardize format."""
|
||||
# Split by | and remove empty strings
|
||||
columns = [col.strip() for col in match.group(0).split("|") if col.strip()]
|
||||
columns = self._split_row_cells(match.group(0))
|
||||
|
||||
# Preserve original indentation
|
||||
prefix = match.group(1)
|
||||
@@ -99,8 +164,7 @@ class MarkdownTableUtil:
|
||||
formatted_content = self.line_pattern.sub(process_line, formatted_content)
|
||||
# Then format alignment rows (must be done after to avoid conflicts)
|
||||
formatted_content = self.align_pattern.sub(process_align, formatted_content)
|
||||
|
||||
return formatted_content
|
||||
return self.normalize_spurious_table_prefixes(formatted_content)
|
||||
|
||||
@staticmethod
|
||||
def _self_test():
|
||||
|
||||
@@ -9,6 +9,11 @@ from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
from docreader.parser.concurrency import parser_worker_limit
|
||||
from docreader.parser.markdown_parser import MarkdownParser
|
||||
from docreader.parser.ppt_convert import normalize_ppt_bytes
|
||||
from docreader.parser.pptx_media import (
|
||||
attach_pptx_media_to_markdown,
|
||||
markdown_needs_pptx_media_attach,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -32,16 +37,41 @@ class StdMarkitdownParser(BaseParser):
|
||||
Uses self.file_type (inherited from BaseParser) to hint the stream format.
|
||||
"""
|
||||
ext = self.file_type
|
||||
if ext and not ext.startswith('.'):
|
||||
ext = '.' + ext
|
||||
ft = (ext or "").lstrip(".").lower()
|
||||
pptx_bytes: bytes | None = None
|
||||
if ft in ("ppt", "pptx"):
|
||||
content, ext = normalize_ppt_bytes(content, ft)
|
||||
pptx_bytes = content
|
||||
ft = "pptx"
|
||||
elif ext and not ext.startswith("."):
|
||||
ext = "." + ext
|
||||
|
||||
with parser_worker_limit("markitdown", CONFIG.markitdown_max_workers):
|
||||
result = self.markitdown.convert(
|
||||
result = self._convert_markitdown(content, ext, keep_data_uris=True)
|
||||
if result is None:
|
||||
logger.warning(
|
||||
"MarkItDown failed with embedded images for %s; retrying without data URIs",
|
||||
ft or ext,
|
||||
)
|
||||
result = self._convert_markitdown(content, ext, keep_data_uris=False)
|
||||
|
||||
text = result.text_content
|
||||
images: dict[str, str] = {}
|
||||
if pptx_bytes is not None and markdown_needs_pptx_media_attach(text):
|
||||
text, images = attach_pptx_media_to_markdown(text, pptx_bytes)
|
||||
return Document(content=text, images=images)
|
||||
|
||||
def _convert_markitdown(self, content: bytes, ext: str | None, *, keep_data_uris: bool):
|
||||
try:
|
||||
return self.markitdown.convert(
|
||||
io.BytesIO(content),
|
||||
file_extension=ext,
|
||||
keep_data_uris=True
|
||||
keep_data_uris=keep_data_uris,
|
||||
)
|
||||
return Document(content=result.text_content)
|
||||
except Exception:
|
||||
if keep_data_uris:
|
||||
return None
|
||||
raise
|
||||
|
||||
|
||||
class MarkitdownParser(PipelineParser):
|
||||
|
||||
351
docreader/parser/opendataloader_parser.py
Normal file
351
docreader/parser/opendataloader_parser.py
Normal file
@@ -0,0 +1,351 @@
|
||||
"""PDF parser backed by OpenDataLoader PDF (Apache-2.0).
|
||||
|
||||
Requires Java 11+ on PATH and the ``opendataloader-pdf`` Python package.
|
||||
Each ``convert()`` spawns a JVM; concurrency is limited via
|
||||
``DOCREADER_ODL_MAX_WORKERS``.
|
||||
|
||||
Hybrid mode (``docling-fast``, etc.) needs a running
|
||||
``opendataloader-pdf-hybrid`` server — configure ``DOCREADER_ODL_HYBRID_URL``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import html
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from typing import Any, Dict, Mapping, Optional, Tuple
|
||||
|
||||
from docreader.config import CONFIG
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.concurrency import parser_worker_limit
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_MIN_CHARS_PER_PAGE = 20
|
||||
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp")
|
||||
_MD_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
|
||||
_IMAGE_FILE_NUM_RE = re.compile(r"^imageFile(\d+)\.", re.I)
|
||||
|
||||
|
||||
def _override_str(overrides: Optional[Mapping[str, Any]], key: str, default: str = "") -> str:
|
||||
if overrides:
|
||||
v = overrides.get(key)
|
||||
if v is not None and str(v).strip() != "":
|
||||
return str(v).strip()
|
||||
return default
|
||||
|
||||
|
||||
def _override_bool(overrides: Optional[Mapping[str, Any]], key: str, default: bool) -> bool:
|
||||
if overrides:
|
||||
v = overrides.get(key)
|
||||
if v is not None and str(v).strip() != "":
|
||||
return str(v).strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||
return default
|
||||
|
||||
|
||||
def _java_available() -> Tuple[bool, str]:
|
||||
if not shutil.which("java"):
|
||||
return False, "需要 Java 11+(JRE),请安装并在 PATH 中配置 java"
|
||||
return True, ""
|
||||
|
||||
|
||||
def _package_available() -> Tuple[bool, str]:
|
||||
try:
|
||||
import opendataloader_pdf # noqa: F401
|
||||
except ImportError as e:
|
||||
return False, f"opendataloader-pdf 未安装: {e}"
|
||||
return True, ""
|
||||
|
||||
|
||||
def _ping_hybrid(
|
||||
url: str,
|
||||
*,
|
||||
timeout_sec: float = 5.0,
|
||||
retries: int = 3,
|
||||
retry_delay_sec: float = 2.0,
|
||||
) -> Tuple[bool, str]:
|
||||
import time
|
||||
|
||||
base = url.rstrip("/")
|
||||
health_url = f"{base}/health"
|
||||
last_err = ""
|
||||
for attempt in range(max(1, retries)):
|
||||
try:
|
||||
req = urllib.request.Request(health_url, method="GET")
|
||||
with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
|
||||
if 200 <= resp.status < 300:
|
||||
return True, ""
|
||||
last_err = f"hybrid 健康检查 HTTP {resp.status}: {health_url}"
|
||||
except urllib.error.URLError as e:
|
||||
last_err = f"无法连接 OpenDataLoader hybrid 服务 ({health_url}): {e}"
|
||||
except Exception as e:
|
||||
last_err = f"hybrid 健康检查失败: {e}"
|
||||
if attempt + 1 < retries:
|
||||
time.sleep(retry_delay_sec)
|
||||
hint = (
|
||||
";若刚执行 make dev-start --odl-hybrid,请等待镜像构建/服务就绪"
|
||||
"(docker logs WeKnora-odl-hybrid)"
|
||||
)
|
||||
return False, last_err + hint
|
||||
|
||||
|
||||
def opendataloader_available(
    overrides: Optional[Mapping[str, Any]] = None,
) -> Tuple[bool, str]:
    """Registry / ListEngines availability probe.

    Checks, in order: a ``java`` executable, the ``opendataloader_pdf`` package
    and, when hybrid mode is enabled and a hybrid URL is configured, the hybrid
    server's health endpoint. Returns ``(ok, reason)``; ``reason`` is empty on
    success.
    """
    for probe in (_java_available, _package_available):
        ok, msg = probe()
        if not ok:
            return False, msg

    hybrid = _resolve_hybrid(overrides)
    hybrid_enabled = bool(hybrid) and hybrid.lower() not in ("off", "")
    if hybrid_enabled:
        url = _resolve_hybrid_url(overrides)
        if url:
            # Up to six attempts five seconds apart before giving up.
            return _ping_hybrid(url, retries=6, retry_delay_sec=5.0, timeout_sec=5.0)
    return True, ""
|
||||
|
||||
|
||||
def _resolve_hybrid(overrides: Optional[Mapping[str, Any]] = None) -> str:
    """Return the hybrid backend name: ``overrides["odl_hybrid"]`` if non-blank, else ``CONFIG.odl_hybrid``."""
    return _override_str(overrides, "odl_hybrid", CONFIG.odl_hybrid)
|
||||
|
||||
|
||||
def _resolve_hybrid_url(overrides: Optional[Mapping[str, Any]] = None) -> str:
    """Return the hybrid server URL: ``overrides["odl_hybrid_url"]`` if non-blank, else ``CONFIG.odl_hybrid_url``."""
    return _override_str(overrides, "odl_hybrid_url", CONFIG.odl_hybrid_url)
|
||||
|
||||
|
||||
def _find_markdown_file(output_dir: str, pdf_stem: str) -> str:
|
||||
candidates = []
|
||||
for root, _, files in os.walk(output_dir):
|
||||
for name in files:
|
||||
if name.lower().endswith(".md"):
|
||||
path = os.path.join(root, name)
|
||||
candidates.append(path)
|
||||
if not candidates:
|
||||
raise FileNotFoundError(f"OpenDataLoader 未在 {output_dir} 生成 markdown 文件")
|
||||
for path in candidates:
|
||||
base = os.path.splitext(os.path.basename(path))[0]
|
||||
if base == pdf_stem or base.startswith(pdf_stem):
|
||||
return path
|
||||
candidates.sort(key=lambda p: os.path.getmtime(p), reverse=True)
|
||||
return candidates[0]
|
||||
|
||||
|
||||
def _normalize_odl_image_url(raw: str) -> str:
|
||||
"""OpenDataLoader wraps paths as ``<images/foo.png>``; storage may HTML-escape them."""
|
||||
s = html.unescape((raw or "").strip())
|
||||
s = s.replace("<", "<").replace(">", ">").replace("&", "&")
|
||||
s = s.strip().strip("<>").strip().strip('"').strip("'")
|
||||
if s.startswith("./"):
|
||||
s = s[2:]
|
||||
return s.replace("\\", "/")
|
||||
|
||||
|
||||
def _canonical_image_ref(abs_path: str, output_dir: str) -> str:
|
||||
"""Use ``images/<file>`` keys to match OpenDataLoader markdown conventions."""
|
||||
rel = os.path.relpath(abs_path, output_dir).replace("\\", "/")
|
||||
name = os.path.basename(abs_path)
|
||||
if rel.startswith("images/"):
|
||||
return rel
|
||||
return f"images/{name}"
|
||||
|
||||
|
||||
def _collect_images_under_output(output_dir: str) -> Dict[str, str]:
    """Collect every extracted image under the convert output tree.

    Returns a mapping of canonical ``images/...`` key to base64-encoded file
    content. When two files map to the same key, the first one visited wins.
    """
    encoded: Dict[str, str] = {}
    for folder, _dirs, filenames in os.walk(output_dir):
        for filename in filenames:
            if filename.lower().endswith(_IMAGE_SUFFIXES):
                full_path = os.path.join(folder, filename)
                key = _canonical_image_ref(full_path, output_dir)
                if key not in encoded:
                    with open(full_path, "rb") as handle:
                        encoded[key] = base64.b64encode(handle.read()).decode("utf-8")
    return encoded
|
||||
|
||||
|
||||
def _register_image_alias(aliases: Dict[str, str], alias: str, canonical: str) -> None:
    """Store ``canonical`` under the normalized form of ``alias`` (skipped when it normalizes to empty)."""
    normalized = _normalize_odl_image_url(alias)
    if not normalized:
        return
    aliases[normalized] = canonical
|
||||
|
||||
|
||||
def _build_path_alias_map(images: Dict[str, str]) -> Dict[str, str]:
    """Map the spellings ODL markdown may use for an image to its dict key.

    For every key in ``images`` the aliases registered are the key itself, its
    basename, and ``images/<basename>``. Angle-bracket and HTML-entity
    spellings need no separate entries: ``_register_image_alias`` normalizes
    each alias, and lookups normalize the URL the same way, so those spellings
    collapse onto the three forms below.
    """
    aliases: Dict[str, str] = {}
    for ref in images:
        base = os.path.basename(ref)
        for variant in (ref, base, f"images/{base}"):
            _register_image_alias(aliases, variant, ref)
    return aliases
|
||||
|
||||
|
||||
def _resolve_image_ref(url: str, aliases: Dict[str, str]) -> Optional[str]:
    """Resolve a markdown image URL to a canonical image key, or ``None``.

    Lookup order: the normalized URL, its basename, ``images/<basename>``, and
    finally, for ``imageFile<N>.*`` names, the collected file with the same
    number or, failing that, the N-th numbered file in ascending order.
    ``data:`` URLs and empty URLs are never resolved.
    """
    key = _normalize_odl_image_url(url)
    if not key or key.startswith("data:"):
        return None
    if key in aliases:
        return aliases[key]

    base = os.path.basename(key)
    for candidate in (base, f"images/{base}"):
        if candidate in aliases:
            return aliases[candidate]

    wanted_match = _IMAGE_FILE_NUM_RE.match(base)
    if wanted_match is None:
        return None
    wanted = int(wanted_match.group(1))

    numbered = []
    for ref in set(aliases.values()):
        ref_match = _IMAGE_FILE_NUM_RE.match(os.path.basename(ref))
        if ref_match:
            numbered.append((int(ref_match.group(1)), ref))
    numbered.sort(key=lambda pair: pair[0])

    for number, ref in numbered:
        if number == wanted:
            return ref
    # Positional fallback: treat the number as a 1-based index.
    if numbered and 1 <= wanted <= len(numbered):
        return numbered[wanted - 1][1]
    return None
|
||||
|
||||
|
||||
def _rewrite_markdown_image_refs(
    markdown: str, images: Dict[str, str]
) -> str:
    """Rewrite markdown image targets to the canonical keys of ``images``.

    Each ``![alt](target)`` whose target resolves to a collected image is
    re-emitted as ``![alt](<canonical key>)``; unresolved images (remote or
    ``data:`` URLs, unknown paths) are left exactly as written. ``markdown``
    is returned unchanged when ``images`` is empty.
    """
    if not images:
        return markdown
    aliases = _build_path_alias_map(images)

    def repl(match: "re.Match[str]") -> str:
        alt, raw_url = match.group(1), match.group(2)
        target = (raw_url or "").strip()
        if target.startswith("<") and ">" in target:
            # Angle-bracketed destinations may contain spaces; keep them whole.
            url = target[: target.index(">") + 1]
        else:
            # Drop an optional markdown title after the URL. A whitespace-only
            # target yields no tokens, so guard the index.
            tokens = target.split()
            url = tokens[0] if tokens else ""
        canonical = _resolve_image_ref(url, aliases)
        if canonical is None:
            return match.group(0)
        return f"![{alt}]({canonical})"

    return _MD_IMAGE_RE.sub(repl, markdown)
|
||||
|
||||
|
||||
def _run_convert(
    pdf_path: str,
    output_dir: str,
    image_dir: str,
    overrides: Optional[Mapping[str, Any]] = None,
) -> None:
    """Call ``opendataloader_pdf.convert`` for one PDF.

    Markdown goes to ``output_dir`` and images to ``image_dir`` as external
    files. Hybrid options are passed only when a hybrid backend other than
    ``off`` is configured.
    """
    import opendataloader_pdf

    with_html = _override_bool(
        overrides, "odl_markdown_with_html", CONFIG.odl_markdown_with_html
    )
    convert_args: Dict[str, Any] = dict(
        input_path=pdf_path,
        output_dir=output_dir,
        format="markdown",
        image_output="external",
        image_dir=image_dir,
        quiet=True,
        markdown_with_html=with_html,
    )

    hybrid = _resolve_hybrid(overrides)
    if hybrid and hybrid.lower() not in ("off", ""):
        convert_args["hybrid"] = hybrid
        url = _resolve_hybrid_url(overrides)
        if url:
            convert_args["hybrid_url"] = url
        mode = _override_str(overrides, "odl_hybrid_mode", CONFIG.odl_hybrid_mode)
        if mode:
            convert_args["hybrid_mode"] = mode
        if _override_bool(overrides, "odl_hybrid_fallback", CONFIG.odl_hybrid_fallback):
            convert_args["hybrid_fallback"] = True

    opendataloader_pdf.convert(**convert_args)
|
||||
|
||||
|
||||
class OpenDataLoaderParser(BaseParser):
    """Parse PDFs with OpenDataLoader (layout-aware markdown + external images)."""

    def __init__(self, *args: Any, **kwargs: Any):
        """Capture engine overrides from ``kwargs``, then initialise the base parser."""
        # NOTE(review): the mineru_* keys are collected but never read in this
        # module — confirm whether they are needed here.
        self._engine_overrides: Dict[str, Any] = {
            k: v
            for k, v in kwargs.items()
            if k.startswith("odl_") or k in ("mineru_endpoint", "mineru_api_key")
        }
        super().__init__(*args, **kwargs)

    def parse_into_text(self, content: bytes) -> Document:
        """Convert PDF bytes to a ``Document`` of markdown text plus images.

        The PDF is written to a temporary directory, converted by
        OpenDataLoader, and the markdown's image targets are rewritten to the
        keys of the returned ``images`` mapping. When the result holds fewer
        than ``_MIN_CHARS_PER_PAGE`` non-blank characters, parsing is delegated
        to ``PDFScannedParser``.

        Raises:
            RuntimeError: Java, the package, or the hybrid server is unavailable.
            FileNotFoundError: the converter produced no markdown file.
        """
        ok, msg = opendataloader_available(self._engine_overrides)
        if not ok:
            raise RuntimeError(msg)

        # Tolerate a missing file name instead of failing inside basename().
        safe_name = os.path.basename(self.file_name or "") or "document.pdf"
        if not safe_name.lower().endswith(".pdf"):
            safe_name = f"{os.path.splitext(safe_name)[0] or 'document'}.pdf"
        pdf_stem = os.path.splitext(safe_name)[0]

        max_workers = CONFIG.odl_max_workers
        with parser_worker_limit("opendataloader", max_workers):
            with tempfile.TemporaryDirectory(prefix="weknora-odl-") as tmp_dir:
                pdf_path = os.path.join(tmp_dir, safe_name)
                with open(pdf_path, "wb") as f:
                    f.write(content)
                image_dir = os.path.join(tmp_dir, "images")
                os.makedirs(image_dir, exist_ok=True)

                _run_convert(
                    pdf_path,
                    tmp_dir,
                    image_dir,
                    overrides=self._engine_overrides,
                )

                md_path = _find_markdown_file(tmp_dir, pdf_stem)
                with open(md_path, encoding="utf-8", errors="replace") as f:
                    text = f.read()

                images = _collect_images_under_output(tmp_dir)
                text = _rewrite_markdown_image_refs(text, images)

        # Everything needed is in memory now. Run the fallback only after the
        # worker slot and temp dir are released, so a slow scanned-PDF pass
        # does not hold an OpenDataLoader slot.
        if len(text.strip()) < _MIN_CHARS_PER_PAGE:
            logger.info(
                "OpenDataLoaderParser: %s yielded little text; "
                "falling back to builtin scanned renderer",
                self.file_name,
            )
            from docreader.parser.pdf_parser import PDFScannedParser

            return PDFScannedParser(
                file_name=self.file_name, file_type=self.file_type
            ).parse_into_text(content)

        logger.info(
            "OpenDataLoaderParser: %s -> content_len=%d images=%d",
            self.file_name,
            len(text),
            len(images),
        )
        return Document(
            content=text,
            images=images,
            metadata={
                "parser_engine": "opendataloader",
                "odl_hybrid": _resolve_hybrid(self._engine_overrides) or "off",
            },
        )
|
||||
@@ -20,6 +20,7 @@ import base64
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
|
||||
from docreader.config import CONFIG
|
||||
@@ -87,12 +88,51 @@ EMBED_MAX_IMAGES = _env_int("DOCREADER_PDF_EMBED_MAX_IMAGES", 50)
|
||||
# Reconstruct reading order with a geometric XY-cut so multi-column pages are
|
||||
# linearised column-by-column instead of line-interleaved.
|
||||
LAYOUT_ORDERING = _env_bool("DOCREADER_PDF_LAYOUT_ORDERING", True)
|
||||
# When glyphs are positioned without explicit space characters (common in OCR /
|
||||
# search text layers), insert a space if the horizontal gap exceeds this
|
||||
# multiple of the line's median glyph width.
|
||||
WORD_GAP_WIDTH_RATIO = _env_float("DOCREADER_PDF_WORD_GAP_WIDTH_RATIO", 0.4)
|
||||
# Promote visually larger lines to markdown headings (font-size proxy = rect
|
||||
# height relative to the page's median line height).
|
||||
DETECT_HEADINGS = _env_bool("DOCREADER_PDF_DETECT_HEADINGS", True)
|
||||
# Drop invisible (render-mode 3), off-page and degenerate text — a cheap guard
|
||||
# against hidden-text prompt injection and OCR artefacts.
|
||||
FILTER_HIDDEN_TEXT = _env_bool("DOCREADER_PDF_FILTER_HIDDEN_TEXT", True)
|
||||
# Narrow side strips (arXiv watermarks, page labels) narrower than this share of
|
||||
# page width are dropped when they look like vertical / single-glyph noise.
|
||||
MARGIN_COL_WIDTH_RATIO = _env_float("DOCREADER_PDF_MARGIN_COL_WIDTH_RATIO", 0.12)
|
||||
# Minimum characters on a line before font-size heuristics may promote it to a
|
||||
# markdown heading (avoids ``### C`` from margin glyphs).
|
||||
MIN_HEADING_LINE_CHARS = _env_int("DOCREADER_PDF_MIN_HEADING_LINE_CHARS", 8)
|
||||
# Strip pdfium placeholder glyphs (U+FFFE) and soft hyphens; remove axis/legend text
|
||||
# from vector figures when a Figure caption is present on the page.
|
||||
SANITIZE_PDF_TEXT = _env_bool("DOCREADER_PDF_SANITIZE_TEXT", True)
|
||||
STRIP_CHART_TEXT_DEBRIS = _env_bool("DOCREADER_PDF_STRIP_CHART_DEBRIS", True)
|
||||
# Render detected vector chart regions (no embedded bitmap) as JPEG for VLM/OCR.
|
||||
RENDER_VECTOR_FIGURES = _env_bool("DOCREADER_PDF_RENDER_VECTOR_FIGURES", True)
|
||||
MIN_CHART_REGION_CHARS = _env_int("DOCREADER_PDF_MIN_CHART_REGION_CHARS", 18)
|
||||
MIN_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MIN_CHART_REGION_AREA", 0.015)
|
||||
MAX_CHART_REGION_AREA_RATIO = _env_float("DOCREADER_PDF_MAX_CHART_REGION_AREA", 0.42)
|
||||
MAX_FIGURE_HEIGHT_RATIO = _env_float("DOCREADER_PDF_MAX_FIGURE_HEIGHT_RATIO", 0.38)
|
||||
|
||||
# pdfium / Adobe text layers often emit U+FFFE for missing hyphenation or ligatures.
|
||||
_PDF_ARTIFACT_RE = re.compile(r"[\u00ad\u200b-\u200f\ufeff\ufffe\uffff]")
|
||||
_PDF_ARTIFACT_JOIN_RE = re.compile(r"(\w)[\u00ad\ufffe](\w)")
|
||||
_CHART_DEBRIS_LINE_RE = re.compile(
|
||||
r"^(?:"
|
||||
r"[\d\s.]+|"
|
||||
r"\d{1,2}|"
|
||||
r"\d+-layer|"
|
||||
r"iter\.\s*\(1e4\)|"
|
||||
r"(?:training|test)\s+error\s*\(%\)"
|
||||
r")$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_CHART_LAYER_RE = re.compile(r"^\d+-layer$", re.IGNORECASE)
|
||||
_FIGURE_CAPTION_RE = re.compile(r"^Figure\s+\d+\b", re.IGNORECASE)
|
||||
_FIGURE_CAPTION_SEARCH_RE = re.compile(r"\bFigure\s+(\d+)\b", re.IGNORECASE)
|
||||
_ARXIV_LINE_RE = re.compile(r"^arXiv:\s*\S+", re.IGNORECASE)
|
||||
_PAGE_NUM_LINE_RE = re.compile(r"^\d{1,3}$")
|
||||
|
||||
|
||||
def _close_pdfium_resource(resource) -> None:
|
||||
@@ -150,6 +190,394 @@ def _extract_page_text(page) -> str:
|
||||
_close_pdfium_resource(textpage)
|
||||
|
||||
|
||||
def _sanitize_pdf_text(text: str) -> str:
    """Remove PDF text-layer placeholder characters.

    Deletes every character matched by ``_PDF_ARTIFACT_RE``. Removing a soft
    hyphen / U+FFFE that sits between two word characters already joins the
    halves, so no separate join pass is needed: the former second pass with
    ``_PDF_ARTIFACT_JOIN_RE`` ran after these characters were gone and could
    never match.
    """
    if not text:
        return text
    return _PDF_ARTIFACT_RE.sub("", text)
|
||||
|
||||
|
||||
def _is_chart_debris_line(line: str) -> bool:
    """Return True when ``line`` looks like axis/legend text leaked from a chart.

    Matches the known debris patterns, or a short (<= 24 chars) run of digits,
    whitespace and ``.()-`` containing at least three digits.
    """
    t = line.strip()
    if not t:
        return False
    if _CHART_DEBRIS_LINE_RE.match(t) or _CHART_LAYER_RE.match(t):
        return True
    # Tick labels like "0 1 2 3 4 5 6 0"
    if re.fullmatch(r"[\d\s.()-]+", t) is None:
        return False
    digit_count = sum(1 for c in t if c.isdigit())
    return len(t) <= 24 and digit_count >= 3
|
||||
|
||||
|
||||
def _strip_chart_text_debris(text: str) -> str:
    """Drop runs of axis/legend lines leaked from vector figures into the text layer.

    A run starts at a debris line and extends over following debris or blank
    lines; it is removed only when it spans at least three lines. Shorter runs
    are kept untouched.
    """
    if not text:
        return text
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    total = len(lines)
    kept: list = []
    idx = 0
    while idx < total:
        end = idx
        if _is_chart_debris_line(lines[idx]):
            while end < total and (
                not lines[end].strip() or _is_chart_debris_line(lines[end])
            ):
                end += 1
        if end - idx >= 3:
            idx = end
        else:
            kept.append(lines[idx])
            idx += 1
    return "\n".join(kept)
|
||||
|
||||
|
||||
def _strip_arxiv_and_page_num_lines(text: str) -> str:
    """Remove arXiv stamp lines and bare page-number lines from ``text``.

    Lines that start with an ``arXiv:`` stamp or consist only of 1-3 digits are
    dropped. A line with ``arXiv:`` further in is cut from the stamp onward and
    dropped if nothing remains.
    """
    kept: list = []
    for ln in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
        stripped = ln.strip()
        if _ARXIV_LINE_RE.match(stripped) or _PAGE_NUM_LINE_RE.match(stripped):
            continue
        if "arXiv:" in ln:
            # NOTE(review): the empty-line check is assumed to apply only to
            # lines that held an arXiv stamp (source indentation was lost).
            ln = re.sub(r"\s*arXiv:\s*\S+\s*(?:\[[^\]]+\])?\s*[^\n]*", "", ln).strip()
            if not ln:
                continue
        kept.append(ln)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def _strip_lines_above_figure_captions(text: str) -> str:
    """Remove diagram/chart label lines that sit immediately above a Figure caption.

    When a line containing a ``Figure N`` reference is reached, lines already
    kept are popped for as long as the last one is a figure-interior line.
    """
    kept: list = []
    for ln in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
        if _line_has_figure_caption(ln):
            while kept and _is_figure_interior_line(kept[-1]):
                kept.pop()
        kept.append(ln)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def _is_body_paragraph_line(text: str) -> bool:
|
||||
t = text.strip()
|
||||
if len(t) < 48:
|
||||
return False
|
||||
return len(t.split()) >= 8
|
||||
|
||||
|
||||
def _is_figure_interior_line(text: str) -> bool:
    """Short, non-body line directly above a Figure caption (diagram labels, ticks).

    Blank lines and lines that start a Figure caption are never interior.
    arXiv stamps, page numbers and chart debris always are. Otherwise a line
    counts as interior only if it is not a sentence ending in punctuation, has
    fewer than seven words, and is at most 40 characters long.
    """
    t = text.strip()
    if not t or _FIGURE_CAPTION_RE.match(t):
        return False
    if _ARXIV_LINE_RE.match(t) or _PAGE_NUM_LINE_RE.match(t):
        return True
    if _is_body_paragraph_line(t):
        return False
    if _is_chart_debris_line(t):
        return True
    # Prose sentence above a figure (wrapped paragraph tail) — keep in text.
    ends_like_sentence = t.endswith((".", "。", "!", "?", "!"))
    if ends_like_sentence and len(t) >= 15:
        return False
    if len(t.split()) >= 7:
        return False
    return len(t) <= 40
|
||||
|
||||
|
||||
def _postprocess_pdf_text(text: str) -> str:
    """Apply the text clean-up passes to extracted page text and return the result.

    Passes, in order: artifact sanitising (when ``SANITIZE_PDF_TEXT``), arXiv /
    page-number line removal, removal of label lines above Figure captions,
    and chart-debris run removal (when ``STRIP_CHART_TEXT_DEBRIS``).
    """
    # NOTE(review): source indentation was lost; the two middle passes are
    # assumed to run unconditionally rather than under SANITIZE_PDF_TEXT — confirm.
    if SANITIZE_PDF_TEXT:
        text = _sanitize_pdf_text(text)
    text = _strip_arxiv_and_page_num_lines(text)
    text = _strip_lines_above_figure_captions(text)
    if STRIP_CHART_TEXT_DEBRIS:
        text = _strip_chart_text_debris(text)
    return text
|
||||
|
||||
|
||||
def _char_looks_chart_axis_tick(ch: str) -> bool:
    """Axis tick / numeric chart labels only (not words like ``layer`` in diagrams).

    True for a single digit or one of ``.%()-``, an ``N-layer`` legend entry,
    or one of the known axis-title strings.
    """
    t = ch.strip()
    if not t:
        return False
    if len(t) == 1 and t in "0123456789.%()-":
        return True
    if _CHART_LAYER_RE.match(t):
        return True
    axis_titles = (r"iter\.\s*\(1e4\)", r"(?:training|test)\s+error\s*\(%\)")
    return any(re.fullmatch(pattern, t, re.I) for pattern in axis_titles)
|
||||
|
||||
|
||||
def _chars_bbox(char_list: list) -> tuple:
|
||||
return (
|
||||
min(c["x0"] for c in char_list),
|
||||
min(c["y0"] for c in char_list),
|
||||
max(c["x1"] for c in char_list),
|
||||
max(c["y1"] for c in char_list),
|
||||
)
|
||||
|
||||
|
||||
def _bbox_area_ratio(bbox, page_w: float, page_h: float) -> float:
|
||||
page_area = float(page_w) * float(page_h)
|
||||
if page_area <= 0:
|
||||
return 0.0
|
||||
x0, y0, x1, y1 = bbox
|
||||
return max(0.0, (x1 - x0) * (y1 - y0) / page_area)
|
||||
|
||||
|
||||
def _chart_region_bbox(chars: list, page_w: float, page_h: float):
    """Bounding box of numeric chart axis labels (fallback when caption walk fails).

    Returns ``None`` when fewer than ``MIN_CHART_REGION_CHARS`` tick glyphs are
    present or when the box's page-area share is outside the configured
    min/max range. Otherwise returns the box padded by 8% per side (at least
    8 units) and clamped to the page.
    """
    ticks = [glyph for glyph in chars if _char_looks_chart_axis_tick(glyph["ch"])]
    if len(ticks) < MIN_CHART_REGION_CHARS:
        return None
    tight = _chars_bbox(ticks)
    share = _bbox_area_ratio(tight, page_w, page_h)
    if share < MIN_CHART_REGION_AREA_RATIO or share > MAX_CHART_REGION_AREA_RATIO:
        return None
    left, bottom, right, top = tight
    pad_x = max(8.0, (right - left) * 0.08)
    pad_y = max(8.0, (top - bottom) * 0.08)
    return (
        max(0.0, left - pad_x),
        max(0.0, bottom - pad_y),
        min(page_w, right + pad_x),
        min(page_h, top + pad_y),
    )
|
||||
|
||||
|
||||
def _expand_chart_bbox(bbox, page_w: float, page_h: float, margin_frac: float = 0.18):
|
||||
x0, y0, x1, y1 = bbox
|
||||
dx = (x1 - x0) * margin_frac
|
||||
dy = (y1 - y0) * margin_frac
|
||||
return (
|
||||
max(0.0, x0 - dx),
|
||||
max(0.0, y0 - dy),
|
||||
min(page_w, x1 + dx),
|
||||
min(page_h, y1 + dy),
|
||||
)
|
||||
|
||||
|
||||
def _render_page_clip_jpeg(page, bbox, scale: float, quality: int, max_edge: int) -> bytes:
    """Render a PDF page region to JPEG (bbox in PDF points, bottom-left origin).

    The whole page is rendered at the effective scale, then cropped to ``bbox``.

    Raises:
        ValueError: the bbox collapses to zero width or height in pixels.
    """
    left, bottom, right, top = bbox
    scale_eff = _effective_scale(page, scale, max_edge)
    bitmap = None
    try:
        bitmap = page.render(scale=scale_eff)
        pil = bitmap.to_pil().convert("RGB")
    finally:
        # Release the bitmap even if rendering or conversion failed.
        _close_pdfium_resource(bitmap)
    page_w, page_h = page.get_size()
    x0 = int(left * scale_eff)
    x1 = int(right * scale_eff)
    # Flip the vertical axis: PDF y grows upward, image rows grow downward.
    y0 = int((page_h - top) * scale_eff)
    y1 = int((page_h - bottom) * scale_eff)
    if x1 <= x0 or y1 <= y0:
        raise ValueError("degenerate clip bbox")
    return _pil_to_jpeg_bytes(pil.crop((x0, y0, x1, y1)), quality)
|
||||
|
||||
|
||||
def _pil_to_jpeg_bytes(pil, quality: int) -> bytes:
    """Encode an image as optimized JPEG bytes, converting to RGB unless its mode is RGB or L."""
    image = pil if pil.mode in ("RGB", "L") else pil.convert("RGB")
    sink = io.BytesIO()
    image.save(sink, format="JPEG", quality=quality, optimize=True)
    return sink.getvalue()
|
||||
|
||||
|
||||
def _group_lines_with_chars(chars: list) -> list:
    """Group glyphs into lines; each line includes its char list and bbox.

    Returns a list of dicts with keys ``text``, ``h`` (median glyph height of
    the line), ``chars`` (glyphs sorted by ``x0``) and ``bbox``. Lines whose
    joined text is empty are omitted.
    """
    if not chars:
        return []
    heights = [c["y1"] - c["y0"] for c in chars if c["y1"] > c["y0"]]
    med_h = statistics.median(heights) if heights else 1.0
    # Descending vertical centre: with a bottom-left origin this is top-to-bottom.
    ordered = sorted(chars, key=lambda c: -(c["y0"] + c["y1"]) / 2)
    groups: list = []
    cur: list = []
    ref = None
    for c in ordered:
        yc = (c["y0"] + c["y1"]) / 2
        # A glyph joins the current line while its centre stays within half a
        # median glyph height of the line's first glyph (ref is not updated).
        if ref is None or abs(yc - ref) <= 0.5 * med_h:
            cur.append(c)
            ref = yc if ref is None else ref
        else:
            groups.append(cur)
            cur = [c]
            ref = yc
    if cur:
        groups.append(cur)

    lines: list = []
    for grp in groups:
        grp_sorted = sorted(grp, key=lambda c: c["x0"])
        text = _join_line_glyphs(grp_sorted)
        if not text:
            continue
        hs = [c["y1"] - c["y0"] for c in grp_sorted if c["y1"] > c["y0"]]
        lines.append(
            {
                "text": text,
                "h": statistics.median(hs) if hs else med_h,
                "chars": grp_sorted,
                "bbox": _chars_bbox(grp_sorted),
            }
        )
    return lines
|
||||
|
||||
|
||||
def _line_has_figure_caption(text: str) -> bool:
    """Return True when ``text`` contains ``Figure <number>`` anywhere (case-insensitive); ``None`` counts as empty."""
    return bool(_FIGURE_CAPTION_SEARCH_RE.search((text or "").strip()))
|
||||
|
||||
|
||||
def _bbox_above_caption(lines: list, cap_i: int, page_w: float, page_h: float):
    """Region above a Figure caption line (PDF coords, bottom-left origin).

    Starts from a default-height box sitting on top of the caption and widens
    or raises it using the figure-interior / debris lines that precede the
    caption in ``lines``. Always returns an ``(x0, y0, x1, y1)`` tuple.
    """
    cap_bbox = lines[cap_i]["bbox"]
    cap_top = cap_bbox[3]
    x0, x1 = cap_bbox[0], cap_bbox[2]
    fig_h = page_h * min(MAX_FIGURE_HEIGHT_RATIO, 0.35)
    y_bottom = cap_top
    y_top = min(page_h, cap_top + fig_h)

    # Walk backwards through the lines listed before the caption.
    for j in range(cap_i - 1, -1, -1):
        t = lines[j]["text"]
        b = lines[j]["bbox"]
        # Line lies entirely below the caption's top edge: not part of the figure.
        if b[3] < y_bottom - 4:
            continue
        # Line starts above the current region: stop scanning.
        if b[1] > y_top + 4:
            break
        # Body prose marks the end of the figure area.
        if _is_body_paragraph_line(t) and not _is_figure_interior_line(t):
            break
        if _is_figure_interior_line(t) or _is_chart_debris_line(t) or not t.strip():
            x0 = min(x0, b[0])
            x1 = max(x1, b[2])
            y_top = max(y_top, min(page_h, b[3] + fig_h * 0.15))

    # Enforce a minimum height of 8% of the page.
    min_h = page_h * 0.08
    if y_top - y_bottom < min_h:
        y_top = min(page_h, y_bottom + min_h)
    margin_x = max(8.0, (x1 - x0) * 0.05)
    return (
        max(0.0, x0 - margin_x),
        y_bottom,
        min(page_w, x1 + margin_x),
        y_top,
    )
|
||||
|
||||
|
||||
def _cap_bbox_height(bbox, page_h: float, cap_y_top: float) -> tuple:
    """Limit figure bbox height (PDF coords, bottom-left origin).

    The top edge is lowered to at most ``cap_y_top + page_h *
    MAX_FIGURE_HEIGHT_RATIO``; ``bbox`` is returned unchanged if that limit
    would fall at or below its bottom edge.
    """
    left, bottom, right, top = bbox
    limit = min(top, cap_y_top + page_h * MAX_FIGURE_HEIGHT_RATIO)
    return bbox if limit <= bottom else (left, bottom, right, limit)
|
||||
|
||||
|
||||
def _inject_figure_markdown_before_captions(text: str, clips: list) -> str:
    """Place ``![...]()`` immediately before each Figure caption line in page text.

    ``clips`` entries are consumed in order, one per caption line; only each
    entry's first element (the image reference path) is used. A caption line
    whose preceding line already starts with ``![`` is skipped.
    """
    if not clips:
        return text
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    clip_idx = 0
    for i, ln in enumerate(lines):
        if clip_idx >= len(clips):
            break
        if not _line_has_figure_caption(ln):
            continue
        if i > 0 and lines[i - 1].lstrip().startswith("!["):
            continue
        ref_path = clips[clip_idx][0]
        fname = os.path.basename(ref_path)
        # NOTE(review): the rendered source shows an empty f-string here; the
        # file name is assumed to be the intended alt text — confirm.
        img_md = f"![{fname}]({ref_path})"
        lines[i] = f"{img_md}\n\n{ln}"
        clip_idx += 1
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _extract_vector_figure_clips(
    page,
    page_index: int,
    plain_text: str,
    raw,
    base_name: str,
    scale: float,
    quality: int,
    max_edge: int,
) -> list:
    """Render vector figure regions anchored at each ``Figure N.`` caption on the page.

    Returns ``[(ref_path, b64, y_sort, caption_line), ...]`` for markdown injection.

    Returns ``[]`` when ``RENDER_VECTOR_FIGURES`` is off, the page text has no
    ``Figure <n>`` reference, no glyphs or captions are found, or any error
    occurs (logged at debug level).
    """
    if not RENDER_VECTOR_FIGURES or not re.search(r"\bFigure\s+\d+", plain_text, re.I):
        return []
    textpage = None
    try:
        textpage = page.get_textpage()
        chars, page_w = _page_chars(textpage, page, raw)
        if not chars:
            return []
        page_h = page.get_size()[1]
        lines = _merge_orphan_punctuation_lines(_group_lines_with_chars(chars))
        caption_indices = [
            i for i, ln in enumerate(lines) if _line_has_figure_caption(ln["text"])
        ]
        if not caption_indices:
            return []

        results: list = []
        for fig_idx, cap_i in enumerate(caption_indices):
            cap_line = lines[cap_i]["text"].strip()
            m = _FIGURE_CAPTION_SEARCH_RE.search(cap_line)
            if m:
                # Keep the caption from "Figure N" onward, first line only.
                cap_line = cap_line[m.start() :].split("\n", 1)[0].strip()

            bbox = _bbox_above_caption(lines, cap_i, page_w, page_h)
            if bbox is None:
                bbox = _chart_region_bbox(chars, page_w, page_h)
                if bbox is None:
                    continue

            # Too-large regions get their height capped once; regions still out
            # of range afterwards are skipped.
            ratio = _bbox_area_ratio(bbox, page_w, page_h)
            if ratio > MAX_CHART_REGION_AREA_RATIO:
                bbox = _cap_bbox_height(bbox, page_h, lines[cap_i]["bbox"][3])
                ratio = _bbox_area_ratio(bbox, page_w, page_h)
                if ratio > MAX_CHART_REGION_AREA_RATIO:
                    continue
            if ratio < MIN_CHART_REGION_AREA_RATIO:
                continue

            bbox = _expand_chart_bbox(bbox, page_w, page_h, margin_frac=0.06)
            jpeg = _render_page_clip_jpeg(page, bbox, scale, quality, max_edge)
            fname = f"{base_name}_p{page_index + 1}_fig{fig_idx + 1}.jpg"
            ref_path = f"images/{fname}"
            results.append(
                (
                    ref_path,
                    base64.b64encode(jpeg).decode("utf-8"),
                    bbox[3],
                    cap_line,
                )
            )
        return results
    except Exception:
        # Best effort: a failure on any figure discards all clips for this page.
        logger.debug("vector figure clip failed on page %d", page_index, exc_info=True)
        return []
    finally:
        _close_pdfium_resource(textpage)
|
||||
|
||||
|
||||
def _collect_invisible_boxes(page, raw) -> list:
|
||||
"""Bounding boxes of invisible (render-mode 3) text objects on the page."""
|
||||
boxes: list = []
|
||||
@@ -251,6 +679,109 @@ def _split_columns(chars: list, scale: float, width: float, depth: int = 0) -> l
|
||||
)
|
||||
|
||||
|
||||
def _column_x_span(chars: list) -> float:
|
||||
if not chars:
|
||||
return 0.0
|
||||
return max(c["x1"] for c in chars) - min(c["x0"] for c in chars)
|
||||
|
||||
|
||||
def _column_single_line_fraction(lines: list) -> float:
|
||||
if not lines:
|
||||
return 0.0
|
||||
single = sum(1 for ln in lines if len(ln["text"]) <= 2)
|
||||
return single / len(lines)
|
||||
|
||||
|
||||
def _is_artifact_column(chars: list, width: float) -> bool:
    """Detect margin strips and vertical watermarks (e.g. arXiv sidebar).

    Geometry only: a column is an artifact when it is empty or has no
    horizontal extent, when it is narrower than ``MARGIN_COL_WIDTH_RATIO`` of
    the page and at least 45% of its lines are one or two characters, or when
    it is a tall narrow stack (vertical span > 3.5x horizontal span, at least
    eight glyphs, at least 35% short lines).
    """
    if not chars or width <= 0:
        return True
    span = _column_x_span(chars)
    if span <= 0:
        return True

    short_share = _column_single_line_fraction(_group_lines(chars))
    is_narrow = span / width < MARGIN_COL_WIDTH_RATIO
    if is_narrow and short_share >= 0.45:
        return True

    # Vertical text: tall stack, narrow horizontal extent, mostly one char/line.
    centres = [(glyph["y0"] + glyph["y1"]) / 2 for glyph in chars]
    vertical_extent = max(centres) - min(centres)
    return vertical_extent > span * 3.5 and len(chars) >= 8 and short_share >= 0.35
|
||||
|
||||
|
||||
def _filter_reading_columns(chars: list, scale: float, width: float) -> list:
    """Split into columns and drop margin / watermark strips.

    If every column looks like an artifact, the widest one is kept when there
    are several; a single (or empty) column list is returned as is.
    """
    cols = _split_columns(chars, scale, width)
    kept = []
    for column in cols:
        if not _is_artifact_column(column, width):
            kept.append(column)
    if kept:
        return kept
    # All columns looked like noise — keep the widest glyph set (main body).
    if len(cols) > 1:
        return [max(cols, key=_column_x_span)]
    return cols
|
||||
|
||||
|
||||
def _merge_orphan_punctuation_lines(lines: list) -> list:
|
||||
"""Attach lines that are only punctuation to the previous visual line.
|
||||
|
||||
Many PDFs place ``.`` in figure labels or footnotes on a slightly different
|
||||
baseline; grouping by y then leaves ``Figure 1`` and ``2:`` on separate lines.
|
||||
"""
|
||||
if not lines:
|
||||
return []
|
||||
merged: list = []
|
||||
for ln in lines:
|
||||
t = ln["text"].strip()
|
||||
if (
|
||||
merged
|
||||
and t
|
||||
and len(t) <= 4
|
||||
and all(c in ".,;:!?…·" or c.isspace() for c in t)
|
||||
):
|
||||
suffix = "".join(t.split())
|
||||
prev = merged[-1]["text"]
|
||||
if suffix and prev and not prev.endswith((" ", "-")):
|
||||
merged[-1]["text"] = prev + suffix
|
||||
else:
|
||||
merged[-1]["text"] = (prev + " " + t).strip()
|
||||
continue
|
||||
merged.append(dict(ln))
|
||||
return merged
|
||||
|
||||
|
||||
def _join_line_glyphs(ln_sorted: list) -> str:
    """Concatenate one visual line's glyphs, inserting spaces at wide gaps.

    The gap threshold is the median positive glyph width multiplied by
    ``WORD_GAP_WIDTH_RATIO`` (median defaults to 1.0 when no glyph has a
    positive width).

    Args:
        ln_sorted: Glyph dicts with ``"ch"``, ``"x0"`` and ``"x1"`` keys, in
            the order they should be emitted.

    Returns:
        The joined text with leading/trailing whitespace stripped; ``""`` for
        an empty input.
    """
    if not ln_sorted:
        return ""

    glyph_widths = [g["x1"] - g["x0"] for g in ln_sorted if g["x1"] > g["x0"]]
    median_width = statistics.median(glyph_widths) if glyph_widths else 1.0
    max_gap = median_width * WORD_GAP_WIDTH_RATIO

    pieces: list[str] = [ln_sorted[0]["ch"]]
    for before, glyph in zip(ln_sorted, ln_sorted[1:]):
        ch = glyph["ch"]
        if ch.isspace():
            # Explicit whitespace glyph: keep it unless a space was just emitted.
            if not pieces[-1].endswith(" "):
                pieces.append(ch)
        elif before["ch"].isspace():
            # Spacing is already settled by the preceding whitespace glyph.
            pieces.append(ch)
        else:
            if glyph["x0"] - before["x1"] > max_gap:
                pieces.append(" ")
            pieces.append(ch)
    return "".join(pieces).strip()
|
||||
|
||||
|
||||
def _group_lines(chars: list) -> list:
|
||||
"""Group a column's glyphs into lines (top-to-bottom, glyphs sorted by x)."""
|
||||
if not chars:
|
||||
@@ -277,7 +808,7 @@ def _group_lines(chars: list) -> list:
|
||||
out: list = []
|
||||
for ln in lines:
|
||||
ln_sorted = sorted(ln, key=lambda c: c["x0"])
|
||||
text = "".join(c["ch"] for c in ln_sorted).strip()
|
||||
text = _join_line_glyphs(ln_sorted)
|
||||
if not text:
|
||||
continue
|
||||
hs = [c["y1"] - c["y0"] for c in ln_sorted if c["y1"] - c["y0"] > 0]
|
||||
@@ -293,7 +824,12 @@ def _segments_to_markdown(lines: list) -> str:
|
||||
|
||||
def level(ln) -> int:
|
||||
txt = ln["text"]
|
||||
if not DETECT_HEADINGS or body <= 0 or len(txt) > 80:
|
||||
if (
|
||||
not DETECT_HEADINGS
|
||||
or body <= 0
|
||||
or len(txt) > 80
|
||||
or len(txt) < MIN_HEADING_LINE_CHARS
|
||||
):
|
||||
return 0
|
||||
if txt[-1:] in ".。!!??,,;;::":
|
||||
return 0
|
||||
@@ -317,6 +853,100 @@ def _segments_to_markdown(lines: list) -> str:
|
||||
return "\n".join(out)
|
||||
|
||||
|
||||
def _chars_to_layout_markdown(chars: list, scale: float, width: float) -> str:
    """Render page glyphs to markdown, one block per retained reading column.

    Each column from ``_filter_reading_columns`` is grouped into lines, has
    orphan punctuation lines merged, and is converted with
    ``_segments_to_markdown``. Empty column results are dropped and the rest
    are joined with newlines.
    """
    rendered = (
        _segments_to_markdown(_merge_orphan_punctuation_lines(_group_lines(column)))
        for column in _filter_reading_columns(chars, scale, width)
    )
    return "\n".join(block for block in rendered if block)
|
||||
|
||||
|
||||
def _layout_line_stats(text: str) -> tuple:
|
||||
"""Return (line_count, single_char_line_count, punct_only_line_count)."""
|
||||
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
||||
if not lines:
|
||||
return 0, 0, 0
|
||||
single = sum(1 for ln in lines if len(ln) <= 2)
|
||||
punct_only = sum(
|
||||
1
|
||||
for ln in lines
|
||||
if len(ln) <= 4 and re.fullmatch(r"[\s.,;:!?…·\-–—]+", ln)
|
||||
)
|
||||
return len(lines), single, punct_only
|
||||
|
||||
|
||||
def _layout_garbled_line_fraction(text: str) -> float:
|
||||
"""Share of lines that look like broken OCR (many 1–2 letter tokens)."""
|
||||
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
||||
if not lines:
|
||||
return 0.0
|
||||
garbled = 0
|
||||
for ln in lines:
|
||||
words = ln.split()
|
||||
if len(words) >= 6 and sum(1 for w in words if len(w) <= 2) / len(words) > 0.45:
|
||||
garbled += 1
|
||||
return garbled / len(lines)
|
||||
|
||||
|
||||
def _plain_is_well_formed(plain: str) -> bool:
|
||||
"""True when pdfium plain text already has usable words and punctuation.
|
||||
|
||||
Academic PDFs (arXiv) and TOCs already expose a good text layer; running
|
||||
geometric layout on them often destroys citations and words. Scanned books
|
||||
with a poor text layer (no commas in refs, short glued tokens) still need
|
||||
layout gap inference.
|
||||
"""
|
||||
plain = (plain or "").strip()
|
||||
if not plain:
|
||||
return False
|
||||
if re.search(r"\[\w+,\s", plain):
|
||||
return True
|
||||
if plain.count(" . . ") >= 2:
|
||||
return True
|
||||
words = re.findall(r"\S+", plain)
|
||||
if len(words) < 30:
|
||||
return False
|
||||
avg_len = sum(len(w) for w in words) / len(words)
|
||||
return avg_len >= 5.0
|
||||
|
||||
|
||||
def _should_prefer_plain(plain: str, layout: str) -> bool:
|
||||
"""Fall back to pdfium plain text when layout reconstruction looks broken."""
|
||||
layout = (layout or "").strip()
|
||||
plain = (plain or "").strip()
|
||||
if not layout:
|
||||
return True
|
||||
if not plain:
|
||||
return False
|
||||
n, single, punct_only = _layout_line_stats(layout)
|
||||
if n == 0:
|
||||
return True
|
||||
if single / n >= 0.18 or punct_only / n >= 0.12:
|
||||
return True
|
||||
garbled = _layout_garbled_line_fraction(layout)
|
||||
if garbled >= 0.20 and _layout_garbled_line_fraction(plain) < 0.08:
|
||||
return True
|
||||
if re.search(r"\[\w+,\s", plain) and re.search(
|
||||
r"\[\w+\s+\w+\s+\d", layout
|
||||
):
|
||||
return True
|
||||
# Title / lead sentence from plain should survive in layout.
|
||||
for ln in plain.splitlines():
|
||||
probe = ln.strip()
|
||||
if len(probe) < 24:
|
||||
continue
|
||||
alnum = "".join(c for c in probe if c.isalnum())[:16]
|
||||
if len(alnum) < 12:
|
||||
continue
|
||||
layout_alnum = "".join(c for c in layout if c.isalnum())
|
||||
if alnum not in layout_alnum:
|
||||
return True
|
||||
break
|
||||
return False
|
||||
|
||||
|
||||
def _extract_layout_text(page, raw) -> str:
|
||||
"""Layout-aware extraction: reading order + headings + hidden-text filter.
|
||||
|
||||
@@ -331,12 +961,7 @@ def _extract_layout_text(page, raw) -> str:
|
||||
return ""
|
||||
heights = [c["y1"] - c["y0"] for c in chars if c["y1"] - c["y0"] > 0]
|
||||
scale = (statistics.median(heights) if heights else 1.0) or 1.0
|
||||
blocks = []
|
||||
for col in _split_columns(chars, scale, width):
|
||||
md = _segments_to_markdown(_group_lines(col))
|
||||
if md:
|
||||
blocks.append(md)
|
||||
return "\n".join(blocks)
|
||||
return _chars_to_layout_markdown(chars, scale, width)
|
||||
except Exception:
|
||||
logger.debug("layout extraction failed; using plain text", exc_info=True)
|
||||
return _extract_page_text(page)
|
||||
@@ -623,37 +1248,6 @@ def _extract_embedded_images(pdf, classes, raw, base_name: str, quality: int) ->
|
||||
return result
|
||||
|
||||
|
||||
def estimate_scanned_fraction(content: bytes, sample: int = 12) -> float:
    """Return the fraction of sampled pages that look image-dominated.

    A page counts as scanned when ``_page_image_area_ratio`` is at least
    ``SCAN_IMAGE_AREA_RATIO``. Pages are sampled at a fixed stride chosen so
    that at most ``sample`` pages are inspected.

    Args:
        content: Raw PDF bytes.
        sample: Maximum number of pages to inspect; values below 1 are treated
            as 1.

    Returns:
        ``scanned_pages / inspected_pages``, or 0.0 for a document without pages.
    """
    import pypdfium2 as pdfium
    import pypdfium2.raw as pdfium_r

    pdf = pdfium.PdfDocument(content)
    try:
        page_count = len(pdf)
        if page_count <= 0:
            return 0.0

        budget = max(1, sample)
        # Ceiling division: a floor-based stride could visit almost 2x `sample`
        # pages (e.g. 23 pages with sample=12 gave stride 1, i.e. every page).
        step = max(1, -(-page_count // budget))
        indices = range(0, page_count, step)

        scanned = 0
        for i in indices:
            page = pdf[i]
            try:
                ratio = _page_image_area_ratio(page, pdfium_r)
            finally:
                _close_pdfium_resource(page)
            if ratio >= SCAN_IMAGE_AREA_RATIO:
                scanned += 1
        # page_count > 0 guarantees at least one sampled index.
        return scanned / len(indices)
    finally:
        _close_pdfium_resource(pdf)
|
||||
|
||||
|
||||
def _strip_repeating_lines(texts: list, classes: list) -> list:
|
||||
"""Remove running headers/footers that repeat across most text pages.
|
||||
|
||||
@@ -791,6 +1385,7 @@ class PDFParser(BaseParser):
|
||||
# Pass 1: cheap text extraction + image-area classification.
|
||||
texts: list = []
|
||||
classes: list = []
|
||||
vector_clips: dict = {}
|
||||
for i in range(page_count):
|
||||
page = pdf[i]
|
||||
try:
|
||||
@@ -800,9 +1395,36 @@ class PDFParser(BaseParser):
|
||||
# Layout reconstruction only pays off (and is only spent) on
|
||||
# native text pages; scanned pages are rendered, not read.
|
||||
if cls == "text" and LAYOUT_ORDERING:
|
||||
text = _extract_layout_text(page, pdfium_r) or plain
|
||||
if _plain_is_well_formed(plain):
|
||||
text = plain
|
||||
else:
|
||||
layout = _extract_layout_text(page, pdfium_r)
|
||||
if layout and not _should_prefer_plain(plain, layout):
|
||||
text = layout
|
||||
else:
|
||||
text = plain
|
||||
else:
|
||||
text = plain
|
||||
if cls == "text":
|
||||
clips = _extract_vector_figure_clips(
|
||||
page,
|
||||
i,
|
||||
plain,
|
||||
pdfium_r,
|
||||
base_name,
|
||||
scale,
|
||||
quality,
|
||||
CONFIG.pdf_render_max_edge,
|
||||
)
|
||||
if clips:
|
||||
vector_clips[i] = clips
|
||||
for ref_path, b64, _y, _cap in clips:
|
||||
images[ref_path] = b64
|
||||
text = _postprocess_pdf_text(text)
|
||||
if cls == "text" and vector_clips.get(i):
|
||||
text = _inject_figure_markdown_before_captions(
|
||||
text, vector_clips[i]
|
||||
)
|
||||
finally:
|
||||
_close_pdfium_resource(page)
|
||||
texts.append(text)
|
||||
@@ -841,6 +1463,7 @@ class PDFParser(BaseParser):
|
||||
|
||||
# Assemble markdown in reading order.
|
||||
embedded_count = 0
|
||||
vector_figure_count = 0
|
||||
blocks = []
|
||||
for i in range(page_count):
|
||||
if classes[i] == "scanned":
|
||||
@@ -850,7 +1473,10 @@ class PDFParser(BaseParser):
|
||||
stripped = texts[i].strip()
|
||||
if stripped:
|
||||
blocks.append(stripped)
|
||||
for ref_path, _b64, _y in embedded.get(i, []):
|
||||
vector_figure_count += len(vector_clips.get(i, []))
|
||||
page_images = list(embedded.get(i, []))
|
||||
page_images.sort(key=lambda item: item[2], reverse=True)
|
||||
for ref_path, _b64, _y in page_images:
|
||||
fname = os.path.basename(ref_path)
|
||||
blocks.append(f"")
|
||||
embedded_count += 1
|
||||
@@ -862,6 +1488,7 @@ class PDFParser(BaseParser):
|
||||
"scanned_page_count": len(scanned_indices),
|
||||
"text_page_count": page_count - len(scanned_indices),
|
||||
"embedded_image_count": embedded_count,
|
||||
"vector_figure_count": vector_figure_count,
|
||||
"image_source_type": "scanned_pdf" if scanned_indices else "pdf_text_layer",
|
||||
}
|
||||
|
||||
|
||||
116
docreader/parser/ppt_convert.py
Normal file
116
docreader/parser/ppt_convert.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""LibreOffice helpers for legacy binary PowerPoint (.ppt) uploads."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from docreader.parser.excel_convert import find_soffice
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Signature bytes at the start of an OLE compound file (legacy Office container).
_OLE_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
# Signature bytes at the start of a ZIP archive (OpenXML packages are ZIPs).
_ZIP_MAGIC = b"PK\x03\x04"


def is_ole_compound(content: bytes) -> bool:
    """Return True when ``content`` begins with the OLE compound-file signature."""
    # startswith() is already False for inputs shorter than the signature.
    return content.startswith(_OLE_MAGIC)


def is_zip_openxml(content: bytes) -> bool:
    """Return True when ``content`` begins with the ZIP local-header signature."""
    return content.startswith(_ZIP_MAGIC)


def needs_ppt_to_pptx_conversion(content: bytes, file_type: str | None) -> bool:
    """Tell whether the upload is a legacy .ppt that must be converted.

    Args:
        content: Raw file bytes.
        file_type: Declared extension, with or without a leading dot; may be None.

    Returns:
        False when the extension is ``pptx`` or the bytes are a ZIP package.
        Otherwise True when the extension is ``ppt`` or the bytes carry the OLE
        signature, and False for anything else.
    """
    ext = (file_type or "").lstrip(".").lower()
    looks_modern = ext == "pptx" or is_zip_openxml(content)
    if looks_modern:
        return False
    return ext == "ppt" or is_ole_compound(content)
|
||||
|
||||
|
||||
def convert_ppt_to_pptx_bytes(content: bytes, suffix: str = ".ppt") -> bytes | None:
    """Convert legacy PowerPoint bytes to PPTX using LibreOffice, if available.

    The bytes are written to a fresh temp directory and converted with
    ``soffice --headless --convert-to pptx`` using a throwaway user profile.
    Up to three attempts are made when LibreOffice exits non-zero or produces
    no ``.pptx`` file; a launch failure or timeout aborts immediately.

    Args:
        content: Raw presentation bytes.
        suffix: Extension for the temporary input file (e.g. ``".ppt"``). Only
            its alphanumeric characters are used, so a caller-supplied value
            cannot inject path separators into the temp file name.

    Returns:
        The converted PPTX bytes, or None when LibreOffice is missing or every
        attempt failed.
    """
    soffice = find_soffice()
    if not soffice:
        return None

    # The suffix may originate from an uploaded file name; keep it filename-safe.
    safe_ext = "".join(ch for ch in (suffix or "") if ch.isalnum())
    input_name = f"input.{safe_ext or 'ppt'}"

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as profile_dir:
            src = os.path.join(temp_dir, input_name)
            with open(src, "wb") as handle:
                handle.write(content)

            # A private profile directory per run, passed via UserInstallation.
            user_installation = Path(profile_dir).as_uri()
            cmd = [
                soffice,
                "--headless",
                f"-env:UserInstallation={user_installation}",
                "--convert-to",
                "pptx",
                "--outdir",
                temp_dir,
                src,
            ]
            try:
                result = subprocess.run(cmd, capture_output=True, timeout=120)
            except (OSError, subprocess.TimeoutExpired) as exc:
                logger.warning("LibreOffice PPT convert failed to start: %s", exc)
                return None

            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")
                logger.warning(
                    "LibreOffice PPT convert failed (attempt %s/%s): %s",
                    attempt,
                    max_attempts,
                    stderr,
                )
                if attempt < max_attempts:
                    time.sleep(0.5 * attempt)
                    continue
                return None

            for name in os.listdir(temp_dir):
                if name.endswith(".pptx"):
                    with open(os.path.join(temp_dir, name), "rb") as handle:
                        converted = handle.read()
                    logger.info(
                        "Converted presentation via LibreOffice (%s -> pptx, %d bytes)",
                        suffix,
                        len(converted),
                    )
                    return converted

            # Exit code 0 but nothing written: report it instead of failing silently.
            logger.warning(
                "LibreOffice PPT convert produced no .pptx output (attempt %s/%s)",
                attempt,
                max_attempts,
            )

        if attempt < max_attempts:
            time.sleep(0.5 * attempt)
    return None
|
||||
|
||||
|
||||
def normalize_ppt_bytes(content: bytes, file_type: str | None) -> tuple[bytes, str]:
    """Return ``(bytes, extension)`` ready for MarkItDown.

    ZIP-packaged content is passed through as ``.pptx``. Content that does not
    need conversion is passed through with its declared extension (``.pptx``
    when none was declared). Legacy content is converted through LibreOffice.

    Raises:
        ValueError: If conversion is required but LibreOffice produced nothing.
    """
    ext = (file_type or "").lstrip(".").lower()

    if is_zip_openxml(content):
        return content, ".pptx"

    if needs_ppt_to_pptx_conversion(content, ext):
        source_suffix = f".{ext}" if ext not in ("", "ppt") else ".ppt"
        pptx_bytes = convert_ppt_to_pptx_bytes(content, suffix=source_suffix)
        if not pptx_bytes:
            raise ValueError(
                "Legacy PowerPoint (.ppt) is not supported by MarkItDown directly; "
                "LibreOffice is required to convert it to .pptx. Install LibreOffice "
                "(soffice) in the docreader environment or upload .pptx instead."
            )
        return pptx_bytes, ".pptx"

    return content, (f".{ext}" if ext else ".pptx")
|
||||
154
docreader/parser/pptx_media.py
Normal file
154
docreader/parser/pptx_media.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Extract and rasterize images embedded in PPTX (e.g. WMF) when MarkItDown cannot inline them."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
import uuid
|
||||
import zipfile
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_MARKDOWN_IMAGE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
|
||||
_RASTER_EXT = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}
|
||||
_VECTOR_EXT = {".wmf", ".emf", ".svg"}
|
||||
|
||||
|
||||
def _find_convert() -> str | None:
|
||||
for path in ("/usr/bin/convert", "/usr/local/bin/convert"):
|
||||
if os.path.isfile(path):
|
||||
return path
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["which", "convert"], capture_output=True, text=True, check=False
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
except OSError:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _rasterize_with_imagemagick(data: bytes, suffix: str) -> bytes | None:
    """Convert image bytes to PNG by shelling out to ImageMagick ``convert``.

    Args:
        data: Raw image bytes.
        suffix: Extension (including the dot) given to the temporary input file.

    Returns:
        PNG bytes, or None when ``convert`` is unavailable, fails to run, times
        out after 60 seconds, exits non-zero, or writes no output file.
    """
    convert = _find_convert()
    if not convert:
        return None

    with tempfile.TemporaryDirectory() as workdir:
        in_path = os.path.join(workdir, f"input{suffix}")
        out_path = os.path.join(workdir, "output.png")
        with open(in_path, "wb") as fh:
            fh.write(data)

        try:
            proc = subprocess.run(
                [convert, in_path, out_path],
                capture_output=True,
                timeout=60,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            logger.warning("ImageMagick convert failed: %s", exc)
            return None

        succeeded = proc.returncode == 0 and os.path.isfile(out_path)
        if not succeeded:
            message = (proc.stderr or b"").decode("utf-8", errors="ignore")
            logger.warning("ImageMagick convert exit %s: %s", proc.returncode, message)
            return None

        with open(out_path, "rb") as fh:
            return fh.read()
|
||||
|
||||
|
||||
def _rasterize_with_pillow(data: bytes) -> bytes | None:
    """Re-encode image bytes as PNG with Pillow.

    Images whose mode is neither ``RGB`` nor ``L`` are converted to ``RGB``
    before saving.

    Returns:
        PNG bytes, or None when Pillow is not installed or cannot process the
        data (the failure is logged at debug level).
    """
    try:
        from PIL import Image
    except ImportError:
        return None

    try:
        image = Image.open(io.BytesIO(data))
        normalized = image if image.mode in ("RGB", "L") else image.convert("RGB")
        buffer = io.BytesIO()
        normalized.save(buffer, format="PNG")
    except Exception as exc:
        logger.debug("Pillow could not open media bytes: %s", exc)
        return None
    return buffer.getvalue()
|
||||
|
||||
|
||||
def rasterize_media_bytes(name: str, data: bytes) -> bytes | None:
    """Turn one media asset into PNG bytes.

    Files with a raster extension are tried with Pillow first. Anything Pillow
    does not handle, and every other extension, goes through ImageMagick, using
    the file's extension (or ``.bin`` when it has none) for the temp input.

    Args:
        name: File name; only its extension is used.
        data: Raw asset bytes.

    Returns:
        PNG bytes, or None when no converter succeeded.
    """
    ext = os.path.splitext(name)[1].lower()
    if ext in _RASTER_EXT:
        png = _rasterize_with_pillow(data)
        if png:
            return png
    # Vector, raster-fallback and unknown extensions all take the same path.
    return _rasterize_with_imagemagick(data, ext or ".bin")
|
||||
|
||||
|
||||
def list_pptx_media(pptx_bytes: bytes) -> List[Tuple[str, bytes]]:
    """Return ``(zip path, raw bytes)`` for each file under ``ppt/media/``.

    Entries are returned in archive order. Directory entries and files whose
    base name starts with a dot are skipped.
    """

    def _is_media_member(member: str) -> bool:
        if not member.startswith("ppt/media/"):
            return False
        leaf = os.path.basename(member)
        return bool(leaf) and not leaf.startswith(".")

    with zipfile.ZipFile(io.BytesIO(pptx_bytes)) as archive:
        return [
            (member, archive.read(member))
            for member in archive.namelist()
            if _is_media_member(member)
        ]
|
||||
|
||||
|
||||
def extract_pptx_media_rasterized(pptx_bytes: bytes) -> List[bytes]:
    """Rasterize every ``ppt/media`` asset to PNG bytes, skipping failures.

    Returns:
        PNG payloads in archive order. Assets that could not be converted are
        logged with a warning and left out, so the result may be shorter than
        the number of media files.
    """
    pngs: List[bytes] = []
    for member, payload in list_pptx_media(pptx_bytes):
        converted = rasterize_media_bytes(os.path.basename(member), payload)
        if not converted:
            logger.warning("Failed to rasterize pptx media %s", member)
            continue
        pngs.append(converted)
        logger.info(
            "Rasterized pptx media %s (%d -> %d bytes)",
            member,
            len(payload),
            len(converted),
        )
    return pngs
|
||||
|
||||
|
||||
def _is_unresolved_image_ref(url: str) -> bool:
|
||||
if not url or url.startswith("data:") or url.startswith("images/"):
|
||||
return False
|
||||
if url.startswith(("http://", "https://")):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def attach_pptx_media_to_markdown(
    markdown: str, pptx_bytes: bytes
) -> Tuple[str, Dict[str, str]]:
    """Rewrite unresolved ``![alt](url)`` refs to ``images/`` paths with payloads.

    Unresolved references are paired, in document order, with the successfully
    rasterized ``ppt/media`` assets in archive order. Each paired reference is
    rewritten to ``![alt](images/<uuid>.png)`` and its PNG is returned
    base64-encoded under that path. References beyond the number of available
    assets are left untouched.

    Args:
        markdown: Markdown produced for the presentation.
        pptx_bytes: The PPTX package the markdown was generated from.

    Returns:
        ``(rewritten_markdown, images)`` where ``images`` maps each new
        ``images/...`` path to its base64 PNG payload. The markdown is returned
        unchanged with an empty dict when nothing needs or can get a payload.
    """
    # Rasterizing can spawn one ImageMagick process per asset; skip it entirely
    # when the markdown has no reference that would consume a payload.
    has_unresolved = any(
        _is_unresolved_image_ref(m.group(2))
        for m in _MARKDOWN_IMAGE.finditer(markdown)
    )
    if not has_unresolved:
        return markdown, {}

    media = extract_pptx_media_rasterized(pptx_bytes)
    if not media:
        return markdown, {}

    images: Dict[str, str] = {}
    media_iter = iter(media)

    def repl(match: re.Match[str]) -> str:
        alt, url = match.group(1), match.group(2)
        if not _is_unresolved_image_ref(url):
            return match.group(0)
        try:
            png = next(media_iter)
        except StopIteration:
            # More references than assets: keep the original reference.
            return match.group(0)
        ref = f"images/{uuid.uuid4()}.png"
        images[ref] = base64.b64encode(png).decode()
        # Emit the rewritten image reference; an empty string here would drop
        # the image from the document while still storing its payload.
        return f"![{alt}]({ref})"

    return _MARKDOWN_IMAGE.sub(repl, markdown), images
|
||||
|
||||
|
||||
def markdown_needs_pptx_media_attach(markdown: str) -> bool:
    """Return True when the markdown contains at least one unresolved image ref."""
    for match in _MARKDOWN_IMAGE.finditer(markdown):
        if _is_unresolved_image_ref(match.group(2)):
            return True
    return False
|
||||
@@ -7,8 +7,11 @@ from docreader.parser.docx2_parser import Docx2Parser
|
||||
from docreader.parser.excel_parser import ExcelParser
|
||||
from docreader.parser.image_parser import ImageParser
|
||||
from docreader.parser.markdown_parser import MarkdownParser
|
||||
from docreader.parser.liteparse_parser import LiteParseParser, liteparse_available
|
||||
from docreader.parser.markitdown_parser import MarkitdownParser
|
||||
from docreader.parser.opendataloader_parser import (
|
||||
OpenDataLoaderParser,
|
||||
opendataloader_available,
|
||||
)
|
||||
from docreader.parser.pdf_parser import PDFParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -151,11 +154,11 @@ def _build_default_registry() -> ParserEngineRegistry:
|
||||
)
|
||||
|
||||
reg.register(
|
||||
"liteparse",
|
||||
{"pdf": LiteParseParser},
|
||||
description="LiteParse 解析引擎(快速空间阅读顺序,适合数字版 PDF)",
|
||||
check_available=liteparse_available,
|
||||
unavailable_hint="liteparse 未安装",
|
||||
"opendataloader",
|
||||
{"pdf": OpenDataLoaderParser},
|
||||
description="OpenDataLoader PDF(版面分析,需 Java 11+)",
|
||||
check_available=opendataloader_available,
|
||||
unavailable_hint="请安装 opendataloader-pdf 与 Java 11+",
|
||||
)
|
||||
|
||||
# NOTE: Engine listing is managed by Go-side engine registry
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from lxml.etree import XPath
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.async_api import Page, async_playwright
|
||||
from trafilatura import extract, utils, xpaths
|
||||
|
||||
from docreader.config import CONFIG
|
||||
@@ -15,6 +17,14 @@ from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_GOTO_TIMEOUT_MS = 30_000
|
||||
_NETWORK_IDLE_TIMEOUT_MS = 10_000
|
||||
_SPA_WAIT_TIMEOUT_MS = 15_000
|
||||
# Minimum visible characters before treating an SPA shell as "rendered".
|
||||
_SPA_MIN_TEXT_LEN = 80
|
||||
# Minimum visible characters for Playwright text fallback when trafilatura fails.
|
||||
_MIN_FALLBACK_TEXT_LEN = 50
|
||||
|
||||
# Monkey-patch trafilatura internals to better support WeChat Official Account
|
||||
# articles, whose images live on `mmbiz.qpic.cn` without a standard file
|
||||
# extension and whose main content sits inside `#js_content` /
|
||||
@@ -40,6 +50,78 @@ except (AttributeError, ImportError) as e:
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _ScrapeResult:
|
||||
html: str
|
||||
visible_text: str
|
||||
page_title: str
|
||||
|
||||
|
||||
def extract_markdown_from_html(html: str) -> Optional[str]:
    """Convert HTML to markdown with trafilatura.

    Extraction is requested with metadata, images, tables and links included.

    Returns:
        The markdown text, or None when the input is blank or trafilatura
        returns nothing but whitespace.
    """
    if not (html and html.strip()):
        return None

    markdown = extract(
        html,
        output_format="markdown",
        with_metadata=True,
        include_images=True,
        include_tables=True,
        include_links=True,
    )
    if markdown and markdown.strip():
        return markdown
    return None
|
||||
|
||||
|
||||
def build_visible_text_fallback(visible_text: str, page_title: str = "") -> Optional[str]:
    """Build markdown from page-visible text when no article body was extracted.

    Args:
        visible_text: Text read from the rendered page.
        page_title: Optional document title.

    Returns:
        None when the stripped text is shorter than ``_MIN_FALLBACK_TEXT_LEN``.
        Otherwise the text, prefixed with a ``# title`` heading unless the
        title is empty or the text already starts with it.
    """
    body = (visible_text or "").strip()
    if len(body) < _MIN_FALLBACK_TEXT_LEN:
        return None

    heading = (page_title or "").strip()
    needs_heading = bool(heading) and not body.startswith(heading)
    return f"# {heading}\n\n{body}" if needs_heading else body
|
||||
|
||||
|
||||
async def wait_for_rendered_content(page: Page) -> None:
    """Give JS-rendered pages time to populate beyond the initial HTML shell.

    Two best-effort waits run in sequence; a failure in either is logged and
    never raised:
      1. the ``networkidle`` load state, bounded by ``_NETWORK_IDLE_TIMEOUT_MS``;
      2. the content root (``#app``, ``<main>``, ``[role="main"]`` or
         ``document.body``, first one present) holding at least
         ``_SPA_MIN_TEXT_LEN`` visible characters, bounded by
         ``_SPA_WAIT_TIMEOUT_MS``.
    """
    try:
        await page.wait_for_load_state("networkidle", timeout=_NETWORK_IDLE_TIMEOUT_MS)
        logger.info("Network idle after navigation")
    except Exception as exc:
        # Not necessarily a timeout, so report what actually happened.
        logger.info("Network idle wait did not complete (%s), continuing", exc)

    try:
        await page.wait_for_function(
            """(minLen) => {
                const root = document.querySelector('#app')
                    || document.querySelector('main')
                    || document.querySelector('[role="main"]')
                    || document.body;
                return ((root?.innerText || '').trim().length >= minLen);
            }""",
            arg=_SPA_MIN_TEXT_LEN,
            timeout=_SPA_WAIT_TIMEOUT_MS,
        )
        logger.info("SPA/root visible text reached minimum length")
    except Exception as exc:
        logger.info("SPA text wait did not complete (%s), using current DOM", exc)
|
||||
|
||||
|
||||
async def read_visible_text(page: Page) -> str:
    """Return the trimmed visible text of the page's main content root.

    Candidates are tried in order: ``#app``, ``<main>``, ``[role="main"]``,
    then ``document.body``. The first one whose ``innerText`` is non-empty
    wins, so an empty placeholder container does not hide text rendered
    elsewhere on the page. Returns an empty string when every candidate is
    missing or empty.
    """
    return await page.evaluate(
        """() => {
            const candidates = [
                document.querySelector('#app'),
                document.querySelector('main'),
                document.querySelector('[role="main"]'),
                document.body,
            ];
            for (const el of candidates) {
                const text = (el?.innerText || '').trim();
                if (text) {
                    return text;
                }
            }
            return '';
        }"""
    )
|
||||
|
||||
|
||||
class StdWebParser(BaseParser):
|
||||
"""Standard web page parser using Playwright and Trafilatura.
|
||||
|
||||
@@ -61,16 +143,17 @@ class StdWebParser(BaseParser):
|
||||
super().__init__(file_name=title, **kwargs)
|
||||
logger.info(f"Initialized WebParser with title: {title}")
|
||||
|
||||
async def scrape(self, url: str) -> str:
|
||||
async def scrape(self, url: str) -> _ScrapeResult:
|
||||
"""Scrape web page content using Playwright.
|
||||
|
||||
Args:
|
||||
url: The URL of the web page to scrape
|
||||
|
||||
Returns:
|
||||
HTML content of the web page as string, empty string on error
|
||||
HTML, visible text, and document title; empty fields on hard failure
|
||||
"""
|
||||
logger.info(f"Starting web page scraping for URL: {url}")
|
||||
empty = _ScrapeResult(html="", visible_text="", page_title="")
|
||||
try:
|
||||
async with async_playwright() as p:
|
||||
kwargs = {}
|
||||
@@ -83,30 +166,42 @@ class StdWebParser(BaseParser):
|
||||
|
||||
logger.info(f"Navigating to URL: {url}")
|
||||
try:
|
||||
# Navigate to URL with 30 second timeout
|
||||
await page.goto(url, timeout=30000)
|
||||
await page.goto(
|
||||
url,
|
||||
timeout=_GOTO_TIMEOUT_MS,
|
||||
wait_until="domcontentloaded",
|
||||
)
|
||||
logger.info("Initial page load complete")
|
||||
except Exception as e:
|
||||
logger.error(f"Error navigating to URL: {str(e)}")
|
||||
await browser.close()
|
||||
return ""
|
||||
return empty
|
||||
|
||||
logger.info("Retrieving page HTML content")
|
||||
# Get the full HTML content of the page
|
||||
await wait_for_rendered_content(page)
|
||||
|
||||
page_title = await page.title()
|
||||
visible_text = await read_visible_text(page)
|
||||
content = await page.content()
|
||||
logger.info(f"Retrieved {len(content)} bytes of HTML content")
|
||||
logger.info(
|
||||
"Retrieved %d bytes HTML, %d chars visible text, title=%r",
|
||||
len(content),
|
||||
len(visible_text),
|
||||
page_title[:80] if page_title else "",
|
||||
)
|
||||
|
||||
await browser.close()
|
||||
logger.info("Browser closed")
|
||||
|
||||
# Return raw HTML content for further processing
|
||||
logger.info("Successfully retrieved HTML content")
|
||||
return content
|
||||
return _ScrapeResult(
|
||||
html=content,
|
||||
visible_text=visible_text,
|
||||
page_title=page_title or "",
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to scrape web page: {str(e)}")
|
||||
# Return empty string on error
|
||||
return ""
|
||||
return empty
|
||||
|
||||
def parse_into_text(self, content: bytes) -> Document:
|
||||
"""Parse web page content into a Document object.
|
||||
@@ -117,36 +212,49 @@ class StdWebParser(BaseParser):
|
||||
Returns:
|
||||
Document object containing the parsed markdown content
|
||||
"""
|
||||
# Decode bytes to get the URL string
|
||||
url = endecode.decode_bytes(content)
|
||||
|
||||
logger.info(f"Scraping web page: {url}")
|
||||
# Run async scraping in sync context
|
||||
chtml = asyncio.run(self.scrape(url))
|
||||
# Extract clean content from HTML using Trafilatura
|
||||
# Convert to markdown format with metadata, images, tables, and links
|
||||
md_text = extract(
|
||||
chtml,
|
||||
output_format="markdown",
|
||||
with_metadata=True,
|
||||
include_images=True,
|
||||
include_tables=True,
|
||||
include_links=True,
|
||||
)
|
||||
scrape_result = asyncio.run(self.scrape(url))
|
||||
if not scrape_result.html and not scrape_result.visible_text:
|
||||
logger.error("Failed to scrape web page (no HTML or visible text)")
|
||||
return Document(content=f"Error parsing web page: {url}")
|
||||
|
||||
md_text = extract_markdown_from_html(scrape_result.html)
|
||||
if not md_text:
|
||||
md_text = build_visible_text_fallback(
|
||||
scrape_result.visible_text,
|
||||
scrape_result.page_title,
|
||||
)
|
||||
if md_text:
|
||||
logger.info(
|
||||
"Trafilatura empty; using Playwright visible-text fallback (%d chars)",
|
||||
len(md_text),
|
||||
)
|
||||
|
||||
if not md_text:
|
||||
logger.error("Failed to parse web page")
|
||||
return Document(content=f"Error parsing web page: {url}")
|
||||
|
||||
# Extract title from trafilatura metadata output (e.g. "title: xxx" line)
|
||||
metadata = {}
|
||||
title_match = re.search(r"^title:\s*(.+)", md_text, re.MULTILINE)
|
||||
if title_match:
|
||||
extracted_title = title_match.group(1).strip()
|
||||
if extracted_title:
|
||||
metadata["title"] = extracted_title
|
||||
logger.info(f"Extracted article title from trafilatura: {extracted_title}")
|
||||
logger.info(
|
||||
f"Extracted article title from trafilatura: {extracted_title}"
|
||||
)
|
||||
elif scrape_result.page_title:
|
||||
metadata["title"] = scrape_result.page_title.strip()
|
||||
logger.info(
|
||||
"Using page title from Playwright: %s", metadata["title"]
|
||||
)
|
||||
else:
|
||||
logger.info(f"No title found in trafilatura output, first 200 chars: {md_text[:200]!r}")
|
||||
logger.info(
|
||||
"No title found in trafilatura output, first 200 chars: %r",
|
||||
md_text[:200],
|
||||
)
|
||||
return Document(content=md_text, metadata=metadata)
|
||||
|
||||
|
||||
|
||||
42
docreader/parser/xlsx_merge.py
Normal file
42
docreader/parser/xlsx_merge.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Fill merged cell values before pandas reads an XLSX workbook."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fill_merged_cells_xlsx(content: bytes) -> bytes:
    """Unmerge every merged range and copy its top-left value into each cell.

    The workbook is loaded with ``data_only=True``. Content that is not a ZIP
    archive, or a workbook without merged ranges, is returned unchanged.

    Args:
        content: Raw XLSX bytes.

    Returns:
        The re-saved workbook bytes when at least one range was filled,
        otherwise the original ``content``.
    """
    if not zipfile.is_zipfile(BytesIO(content)):
        return content

    from openpyxl import load_workbook

    workbook = load_workbook(BytesIO(content), data_only=True)
    touched = False
    for sheet in workbook.worksheets:
        # Iterate over a snapshot: unmerging edits the live range collection.
        for span in list(sheet.merged_cells.ranges):
            anchor_value = sheet.cell(span.min_row, span.min_col).value
            sheet.unmerge_cells(str(span))
            for row_idx in range(span.min_row, span.max_row + 1):
                for col_idx in range(span.min_col, span.max_col + 1):
                    sheet.cell(row_idx, col_idx).value = anchor_value
            touched = True

    if not touched:
        return content

    buffer = BytesIO()
    workbook.save(buffer)
    logger.info("Filled merged cells in XLSX before parse")
    return buffer.getvalue()
|
||||
126
docreader/parser/xlsx_repair.py
Normal file
126
docreader/parser/xlsx_repair.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Repair common XLSX packaging issues before openpyxl/pandas read."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
import zipfile
|
||||
from typing import Callable, Dict, Iterable, Set
|
||||
|
||||
SST_PART = "xl/sharedStrings.xml"
|
||||
_SST_OVERRIDE_RE = re.compile(
|
||||
r'<Override[^>]*PartName="[^"]*sharedStrings\.xml"[^>]*/>',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_SST_REL_RE = re.compile(
|
||||
r'<Relationship[^>]*Type="[^"]*sharedStrings"[^>]*/>',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def repair_xlsx_bytes(content: bytes) -> bytes | None:
    """Repair a workbook's shared-strings packaging, if it needs repairing.

    Two situations are handled:

    * A shared-strings part exists under a non-canonical name: it is moved to
      ``xl/sharedStrings.xml``.
    * No shared-strings part exists, the package metadata still references
      one, and no worksheet uses shared-string cells: the dangling manifest
      references are removed.

    Args:
        content: Raw workbook bytes.

    Returns:
        The rewritten archive bytes, or ``None`` when the input is not a zip
        file or no repair applies.
    """
    if not zipfile.is_zipfile(io.BytesIO(content)):
        return None

    with zipfile.ZipFile(io.BytesIO(content), "r") as archive:
        members = _normalized_names(archive.namelist())
        located = _find_shared_strings_path(members)

        if located == SST_PART:
            # The part already sits at its canonical location.
            return None
        if located:

            def _move_to_canonical(files):
                return _rename_shared_strings_part(files, located)

            return _rewrite_zip(archive, _move_to_canonical)

        # The part is missing: only strip the manifest when it is referenced
        # and no worksheet actually depends on the shared-strings table.
        dangling = _package_references_shared_strings(
            archive, members
        ) and not _worksheets_use_shared_string_cells(archive, members)
        if not dangling:
            return None
        return _rewrite_zip(archive, _strip_shared_strings_manifest)
|
||||
|
||||
|
||||
def _normalized_names(namelist: Iterable[str]) -> Set[str]:
|
||||
return {name.replace("\\", "/") for name in namelist}
|
||||
|
||||
|
||||
def _find_shared_strings_path(names: Set[str]) -> str | None:
|
||||
for name in names:
|
||||
if name.lower().endswith("sharedstrings.xml"):
|
||||
return name
|
||||
return None
|
||||
|
||||
|
||||
def _package_references_shared_strings(
|
||||
zin: zipfile.ZipFile, names: Set[str]
|
||||
) -> bool:
|
||||
content_types = "[Content_Types].xml"
|
||||
if content_types in names:
|
||||
ct = zin.read(content_types).decode("utf-8", errors="replace")
|
||||
if "sharedstrings.xml" in ct.lower():
|
||||
return True
|
||||
|
||||
rels_path = "xl/_rels/workbook.xml.rels"
|
||||
if rels_path in names:
|
||||
rels = zin.read(rels_path).decode("utf-8", errors="replace")
|
||||
if "sharedstrings" in rels.lower():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _worksheets_use_shared_string_cells(
|
||||
zin: zipfile.ZipFile, names: Set[str]
|
||||
) -> bool:
|
||||
for name in names:
|
||||
if not name.startswith("xl/worksheets/") or not name.endswith(".xml"):
|
||||
continue
|
||||
sheet = zin.read(name).decode("utf-8", errors="replace")
|
||||
if re.search(r'\bt="s"', sheet):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _rename_shared_strings_part(
    files: Dict[str, bytes], source_path: str
) -> Dict[str, bytes]:
    """Return a copy of ``files`` with ``source_path`` stored under ``SST_PART``.

    The input mapping is left untouched. A ``KeyError`` is raised when
    ``source_path`` is not a key of ``files``.
    """
    payload = files[source_path]
    renamed = {name: data for name, data in files.items() if name != source_path}
    renamed[SST_PART] = payload
    return renamed
|
||||
|
||||
|
||||
def _strip_shared_strings_manifest(files: Dict[str, bytes]) -> Dict[str, bytes]:
|
||||
updated = dict(files)
|
||||
ct_path = "[Content_Types].xml"
|
||||
if ct_path in updated:
|
||||
ct = updated[ct_path].decode("utf-8")
|
||||
ct = _SST_OVERRIDE_RE.sub("", ct)
|
||||
updated[ct_path] = ct.encode("utf-8")
|
||||
|
||||
rels_path = "xl/_rels/workbook.xml.rels"
|
||||
if rels_path in updated:
|
||||
rels = updated[rels_path].decode("utf-8")
|
||||
rels = _SST_REL_RE.sub("", rels)
|
||||
updated[rels_path] = rels.encode("utf-8")
|
||||
return updated
|
||||
|
||||
|
||||
def _rewrite_zip(
|
||||
zin: zipfile.ZipFile,
|
||||
transform: Callable[[Dict[str, bytes]], Dict[str, bytes]],
|
||||
) -> bytes:
|
||||
files: Dict[str, bytes] = {}
|
||||
for info in zin.infolist():
|
||||
name = info.filename.replace("\\", "/")
|
||||
files[name] = zin.read(info.filename)
|
||||
files = transform(files)
|
||||
|
||||
out = io.BytesIO()
|
||||
with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
|
||||
for name, data in files.items():
|
||||
zout.writestr(name, data)
|
||||
return out.getvalue()
|
||||
@@ -9,9 +9,12 @@ dependencies = [
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-health-checking>=1.78.0",
|
||||
"grpcio-tools>=1.78.0",
|
||||
"liteparse>=2.0.4",
|
||||
"lxml>=6.1.0",
|
||||
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
|
||||
"opendataloader-pdf>=2.4.7",
|
||||
"openpyxl>=3.1.0",
|
||||
"pandas>=2.0.0",
|
||||
"xlrd>=2.0.0",
|
||||
"pillow>=12.0.0",
|
||||
"playwright>=1.55.0",
|
||||
"protobuf>=6.33.0",
|
||||
|
||||
@@ -62,6 +62,67 @@ DEFAULT_CONFIGS = [
|
||||
]
|
||||
DEFAULT_CONFIGS.sort(key=lambda x: -x.priority)
|
||||
|
||||
_TABLE_ROW_PATTERN = re.compile(r"^\s*(?:\|[^|\n]*)+\|\s*$", re.MULTILINE)
|
||||
_MARKDOWN_TABLE_PRIORITY = 15
|
||||
|
||||
|
||||
def _is_empty_table_header_row(header: str) -> bool:
|
||||
"""True when the column-name line is only pipes/whitespace (MarkItDown quirk)."""
|
||||
newline = header.find("\n")
|
||||
if newline < 0:
|
||||
return False
|
||||
row = header[:newline].strip()
|
||||
return bool(row) and all(ch in "| \t" for ch in row)
|
||||
|
||||
|
||||
def _extract_separator_line(header: str) -> str:
|
||||
for line in header.split("\n"):
|
||||
if "---" in line:
|
||||
return line + "\n"
|
||||
return ""
|
||||
|
||||
|
||||
def _table_row_column_count(line: str) -> int:
|
||||
line = line.strip()
|
||||
if not line.startswith("|"):
|
||||
return 0
|
||||
parts = line.split("|")
|
||||
if parts and parts[0].strip() == "":
|
||||
parts = parts[1:]
|
||||
if parts and parts[-1].strip() == "":
|
||||
parts = parts[:-1]
|
||||
return len(parts)
|
||||
|
||||
|
||||
def _first_table_row_column_count(text: str) -> int:
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if line and _TABLE_ROW_PATTERN.match(line):
|
||||
return _table_row_column_count(line)
|
||||
return 0
|
||||
|
||||
|
||||
def _header_table_column_count(header: str) -> int:
|
||||
for line in header.split("\n"):
|
||||
line = line.strip()
|
||||
if not line or "---" in line:
|
||||
continue
|
||||
count = _table_row_column_count(line)
|
||||
if count > 0:
|
||||
return count
|
||||
return 0
|
||||
|
||||
|
||||
def _split_ends_with_paragraph_break(split: str) -> bool:
|
||||
trimmed = split.rstrip(" \t\r")
|
||||
return trimmed.endswith("\n\n") or trimmed.endswith("\r\n\r\n")
|
||||
|
||||
|
||||
def header_column_mismatch(headers: str, next_unit: str) -> bool:
    """Tell whether ``next_unit`` starts a table with a different width.

    Compares the column count of ``headers`` with that of the first table row
    in ``next_unit``. A mismatch is reported only when both counts are
    positive and differ.
    """
    expected = _header_table_column_count(headers)
    actual = _first_table_row_column_count(next_unit)
    if expected <= 0 or actual <= 0:
        return False
    return expected != actual
|
||||
|
||||
|
||||
# Defines the state data structure used by the header hooks
|
||||
class HeaderTracker(BaseModel):
|
||||
@@ -70,10 +131,28 @@ class HeaderTracker(BaseModel):
|
||||
header_hook_configs: List[HeaderTrackerHook] = Field(default=DEFAULT_CONFIGS)
|
||||
active_headers: Dict[int, str] = Field(default_factory=dict)
|
||||
ended_headers: set[int] = Field(default_factory=set)
|
||||
pending_extend: Dict[int, bool] = Field(default_factory=dict)
|
||||
pending_table_break: bool = Field(default=False)
|
||||
header_ended_this_unit: bool = Field(default=False)
|
||||
|
||||
def _clear_table_header(self) -> None:
|
||||
self.ended_headers.add(_MARKDOWN_TABLE_PRIORITY)
|
||||
self.active_headers.pop(_MARKDOWN_TABLE_PRIORITY, None)
|
||||
self.pending_extend.pop(_MARKDOWN_TABLE_PRIORITY, None)
|
||||
|
||||
def update(self, split: str) -> Dict[int, str]:
|
||||
"""检测当前split中的表头开始/结束,更新Hook状态"""
|
||||
new_headers: Dict[int, str] = {}
|
||||
self.header_ended_this_unit = False
|
||||
|
||||
if self.pending_table_break:
|
||||
self.pending_table_break = False
|
||||
if _MARKDOWN_TABLE_PRIORITY in self.active_headers:
|
||||
if _first_table_row_column_count(split) > 0:
|
||||
self._clear_table_header()
|
||||
self.header_ended_this_unit = True
|
||||
else:
|
||||
self._clear_table_header()
|
||||
|
||||
# 1. 检查是否有表头结束标记
|
||||
for config in self.header_hook_configs:
|
||||
@@ -82,8 +161,31 @@ class HeaderTracker(BaseModel):
|
||||
):
|
||||
self.ended_headers.add(config.priority)
|
||||
del self.active_headers[config.priority]
|
||||
self.pending_extend.pop(config.priority, None)
|
||||
|
||||
# 2. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
|
||||
# 1b. \n\n 分块会吞掉表间空行:段尾 \n\n 或列数变化时结束表头追踪
|
||||
if (
|
||||
_MARKDOWN_TABLE_PRIORITY in self.active_headers
|
||||
and not self.pending_extend.get(_MARKDOWN_TABLE_PRIORITY)
|
||||
):
|
||||
if _split_ends_with_paragraph_break(split):
|
||||
self.pending_table_break = True
|
||||
else:
|
||||
header = self.active_headers[_MARKDOWN_TABLE_PRIORITY]
|
||||
row_cols = _first_table_row_column_count(split)
|
||||
header_cols = _header_table_column_count(header)
|
||||
if row_cols > 0 and header_cols > 0 and row_cols != header_cols:
|
||||
self._clear_table_header()
|
||||
self.header_ended_this_unit = True
|
||||
|
||||
# 2. 空表头行:用首个数据行补全列名(与 Go header_tracker 一致)
|
||||
for priority in list(self.pending_extend.keys()):
|
||||
if priority in self.active_headers and _TABLE_ROW_PATTERN.search(split):
|
||||
sep = _extract_separator_line(self.active_headers[priority])
|
||||
self.active_headers[priority] = split + sep
|
||||
self.pending_extend.pop(priority, None)
|
||||
|
||||
# 3. 检查是否有新的表头开始标记(只处理未活跃且未结束的)
|
||||
for config in self.header_hook_configs:
|
||||
if (
|
||||
config.priority not in self.active_headers
|
||||
@@ -94,8 +196,10 @@ class HeaderTracker(BaseModel):
|
||||
header = config.extract_header_fn(match)
|
||||
self.active_headers[config.priority] = header
|
||||
new_headers[config.priority] = header
|
||||
if _is_empty_table_header_row(header):
|
||||
self.pending_extend[config.priority] = True
|
||||
|
||||
# 3. 检查是否所有活跃表头都已结束(清空结束标记)
|
||||
# 4. 检查是否所有活跃表头都已结束(清空结束标记)
|
||||
if not self.active_headers:
|
||||
self.ended_headers.clear()
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, PrivateAttr
|
||||
|
||||
from docreader.splitter.header_hook import (
|
||||
HeaderTracker,
|
||||
header_column_mismatch,
|
||||
)
|
||||
from docreader.utils.split import split_by_char, split_by_sep
|
||||
|
||||
@@ -225,6 +226,16 @@ class TextSplitter(BaseModel, Generic[T]):
|
||||
|
||||
# Update header tracking with current split
|
||||
self.header_hook.update(split)
|
||||
if self.header_hook.header_ended_this_unit and len(cur_chunk) > 0:
|
||||
chunks.append(
|
||||
(
|
||||
cur_chunk[0][0],
|
||||
cur_chunk[-1][1],
|
||||
"".join([c[2] for c in cur_chunk]),
|
||||
)
|
||||
)
|
||||
cur_chunk = []
|
||||
cur_len = 0
|
||||
cur_headers = self.header_hook.get_headers()
|
||||
cur_headers_len = self.len_function(cur_headers)
|
||||
|
||||
@@ -276,6 +287,7 @@ class TextSplitter(BaseModel, Generic[T]):
|
||||
cur_headers
|
||||
and split_len + cur_headers_len < self.chunk_size
|
||||
and cur_headers not in split
|
||||
and not header_column_mismatch(cur_headers, split)
|
||||
):
|
||||
next_start = cur_chunk[0][0] if cur_chunk else cur_start
|
||||
|
||||
|
||||
210
docreader/tests/test_excel_parser.py
Normal file
210
docreader/tests/test_excel_parser.py
Normal file
@@ -0,0 +1,210 @@
|
||||
import io
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
import zipfile
|
||||
|
||||
import openpyxl
|
||||
import pandas as pd
|
||||
|
||||
from docreader.parser.excel_convert import detect_excel_format, engine_for_format
|
||||
from docreader.parser.excel_parser import ExcelParser
|
||||
from docreader.parser.xlsx_merge import fill_merged_cells_xlsx
|
||||
from docreader.parser.xlsx_repair import repair_xlsx_bytes
|
||||
|
||||
|
||||
def _xlsx_with_phantom_shared_strings() -> bytes:
    """Build a workbook whose manifest names a shared-strings part it lacks.

    A two-cell workbook is saved with openpyxl, unpacked into a temporary
    directory, given an extra ``<Override>`` for ``/xl/sharedStrings.xml`` in
    ``[Content_Types].xml``, and packed again.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet["A1"] = "hello"
    sheet["B1"] = 42
    saved = io.BytesIO()
    workbook.save(saved)

    phantom_override = (
        '<Override PartName="/xl/sharedStrings.xml" '
        'ContentType="application/vnd.openxmlformats-officedocument.'
        'spreadsheetml.sharedStrings+xml"/>'
    )

    with tempfile.TemporaryDirectory() as workdir:
        with zipfile.ZipFile(io.BytesIO(saved.getvalue()), "r") as source:
            source.extractall(workdir)

        manifest_path = f"{workdir}/[Content_Types].xml"
        with open(manifest_path, encoding="utf-8") as reader:
            manifest = reader.read()
        patched = manifest.replace("</Types>", phantom_override + "</Types>")
        with open(manifest_path, "w", encoding="utf-8") as writer:
            writer.write(patched)

        repacked = io.BytesIO()
        with zipfile.ZipFile(repacked, "w", zipfile.ZIP_DEFLATED) as target:
            for folder, _dirs, filenames in os.walk(workdir):
                for filename in filenames:
                    disk_path = os.path.join(folder, filename)
                    target.write(disk_path, os.path.relpath(disk_path, workdir))
        return repacked.getvalue()
|
||||
|
||||
|
||||
class ExcelFormatDetectionTest(unittest.TestCase):
    """Format sniffing and reader-engine selection for Excel payloads."""

    def test_detect_xlsx_and_engine(self):
        """A freshly saved openpyxl workbook is detected as xlsx."""
        buffer = io.BytesIO()
        openpyxl.Workbook().save(buffer)
        self.assertEqual(detect_excel_format(buffer.getvalue()), "xlsx")
        self.assertEqual(engine_for_format("xlsx"), "openpyxl")

    def test_detect_xls_magic(self):
        """The OLE2 magic header followed by padding is detected as xls."""
        ole2_magic = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
        payload = ole2_magic + b"\x00" * 512
        self.assertEqual(detect_excel_format(payload), "xls")
        self.assertEqual(engine_for_format("xls"), "xlrd")

    def test_open_legacy_xls_bytes_with_xlsx_extension(self):
        """Legacy .xls bytes still parse when the declared type is xlsx."""
        if shutil.which("soffice") is None:
            self.skipTest("LibreOffice not available")

        workbook = openpyxl.Workbook()
        workbook.active["A1"] = "legacy"
        xlsx_buffer = io.BytesIO()
        workbook.save(xlsx_buffer)

        with tempfile.TemporaryDirectory() as workdir:
            xlsx_path = os.path.join(workdir, "sheet.xlsx")
            with open(xlsx_path, "wb") as writer:
                writer.write(xlsx_buffer.getvalue())

            # LibreOffice writes sheet.xls next to the source file.
            convert_cmd = [
                "soffice",
                "--headless",
                "--convert-to",
                "xls",
                "--outdir",
                workdir,
                xlsx_path,
            ]
            subprocess.run(convert_cmd, check=True, capture_output=True)

            with open(os.path.join(workdir, "sheet.xls"), "rb") as reader:
                xls_bytes = reader.read()

        parser = ExcelParser(file_name="fake.xlsx", file_type="xlsx")
        document = parser.parse_into_text(xls_bytes)
        self.assertIn("legacy", document.content)
|
||||
|
||||
|
||||
class XlsxRepairTest(unittest.TestCase):
    """Behaviour of ``repair_xlsx_bytes`` on workbooks with a missing table."""

    def test_repair_removes_phantom_shared_strings_reference(self):
        """A dangling manifest entry is stripped so pandas can read the file."""
        broken = _xlsx_with_phantom_shared_strings()
        with self.assertRaises(KeyError):
            pd.read_excel(io.BytesIO(broken))

        repaired = repair_xlsx_bytes(broken)
        self.assertIsNotNone(repaired)
        df = pd.read_excel(io.BytesIO(repaired), header=None)
        self.assertEqual(df.values.tolist(), [["hello", 42]])

    def test_repair_skips_when_shared_string_cells_need_table(self):
        """No repair is attempted when cells reference the missing table."""
        # xlsxwriter is only needed by this test; skip instead of erroring
        # when it is not installed, as the other tests do for missing tools.
        try:
            import xlsxwriter
        except ImportError:
            self.skipTest("xlsxwriter not available")

        bio = io.BytesIO()
        wb = xlsxwriter.Workbook(bio, {"in_memory": True})
        ws = wb.add_worksheet()
        ws.write(0, 0, "hello")
        wb.close()

        with tempfile.TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(io.BytesIO(bio.getvalue()), "r") as zin:
                zin.extractall(tmpdir)
            # Drop the table while leaving the cells that point into it.
            os.remove(f"{tmpdir}/xl/sharedStrings.xml")

            out = io.BytesIO()
            with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
                for root, _, files in os.walk(tmpdir):
                    for name in files:
                        path = os.path.join(root, name)
                        arc = os.path.relpath(path, tmpdir)
                        zout.write(path, arc)
            broken = out.getvalue()

        self.assertIsNone(repair_xlsx_bytes(broken))
|
||||
|
||||
|
||||
class XlsxMergeFillTest(unittest.TestCase):
    """Merged-cell expansion, alone and through the Excel parser."""

    def test_fill_merged_cells_propagates_master_value(self):
        """Covered cells receive the anchor value; other cells are kept."""
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet["A1"] = "title"
        sheet.merge_cells("A1:B1")
        sheet["A2"] = "left"
        sheet["B2"] = "right"
        sheet.merge_cells("A2:A3")
        sheet["B3"] = "only-b"
        saved = io.BytesIO()
        workbook.save(saved)

        filled = fill_merged_cells_xlsx(saved.getvalue())

        result = openpyxl.load_workbook(io.BytesIO(filled), data_only=True).active
        expected = {"B1": "title", "A3": "left", "B3": "only-b"}
        for address, value in expected.items():
            self.assertEqual(result[address].value, value)

    def test_parse_en_mergecell_workbook(self):
        """The merged-cell fixture yields twelve row chunks with filled values."""
        fixture = os.path.join(
            os.path.dirname(__file__),
            *("..", "..", "testdata", "rag_test", "xlsx", "en_mergecell.xlsx"),
        )
        if not os.path.isfile(fixture):
            self.skipTest("en_mergecell.xlsx fixture not available")
        with open(fixture, "rb") as reader:
            payload = reader.read()
        document = ExcelParser().parse_into_text(payload)

        chunks = [chunk.content.strip() for chunk in document.chunks]
        self.assertEqual(len(chunks), 12)
        self.assertIn("A: A1", chunks[0])
        self.assertIn("A: A2", chunks[1])
        self.assertIn("B: B3", chunks[2])
        self.assertNotIn("Unnamed:", document.content)
        self.assertIn("A: A7", chunks[6])
        self.assertIn("A: A7", chunks[7])
        self.assertIn("D: D10", chunks[9])
|
||||
|
||||
|
||||
class ExcelParserTest(unittest.TestCase):
    """End-to-end parsing of workbooks with shared-strings quirks."""

    def test_parse_phantom_shared_strings_workbook(self):
        """A workbook with a dangling shared-strings reference still parses."""
        payload = _xlsx_with_phantom_shared_strings()
        document = ExcelParser().parse_into_text(payload)
        for expected in ("hello", "42"):
            self.assertIn(expected, document.content)
        self.assertGreater(len(document.chunks), 0)

    def test_parse_en_calcchain_shared_strings_case(self):
        """The calcChain fixture produces non-empty content and chunks."""
        fixture = os.path.join(
            os.path.dirname(__file__),
            *("..", "..", "testdata", "rag_test", "xlsx", "en_calcchain.xlsx"),
        )
        if not os.path.isfile(fixture):
            self.skipTest("en_calcchain.xlsx fixture not available")
        with open(fixture, "rb") as reader:
            payload = reader.read()
        document = ExcelParser().parse_into_text(payload)
        self.assertGreater(len(document.content), 0)
        self.assertGreater(len(document.chunks), 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
57
docreader/tests/test_markdown_table_util.py
Normal file
57
docreader/tests/test_markdown_table_util.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import io
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
from docreader.parser.markdown_parser import MarkdownTableUtil
|
||||
|
||||
|
||||
class TestMarkdownTableUtil(unittest.TestCase):
    """Formatting behaviour of ``MarkdownTableUtil.format_table``."""

    def test_preserves_empty_cells(self):
        """Interior empty cells must not be dropped during formatting."""
        raw = "| a | | c |\n| --- | --- | --- |\n| 1 | 2 | 3 |"
        formatted = MarkdownTableUtil().format_table(raw)
        self.assertIn("| a | | c |", formatted)
        self.assertEqual(formatted.count("|"), raw.count("|"))

    def test_format_nonempty_table(self):
        """Compact rows are re-emitted with single-space cell padding."""
        raw = "|Name|Age|\n|---|---|\n|John|30|"
        formatted = MarkdownTableUtil().format_table(raw)
        self.assertIn("| Name | Age |", formatted)
        self.assertIn("| --- | --- |", formatted)
        self.assertIn("| John | 30 |", formatted)

    def test_normalize_markitdown_en_tables(self):
        """MarkItDown output for the docx fixture is normalized into GFM tables."""
        docx = (
            Path(__file__).resolve().parents[2]
            / "testdata"
            / "rag_test"
            / "docx"
            / "en_tables.docx"
        )
        if not docx.is_file():
            docx = Path(__file__).resolve().parents[2].parent / "testdata/rag_test/docx/en_tables.docx"
        # Skip rather than fail with FileNotFoundError when the fixture is
        # absent from both candidate locations.
        if not docx.is_file():
            self.skipTest("en_tables.docx fixture not available")
        raw = MarkItDown().convert(io.BytesIO(docx.read_bytes()), file_extension=".docx").text_content
        normalized = MarkdownTableUtil().format_table(raw)

        self.assertNotIn("| | | | |", normalized)
        self.assertIn("| Name | Game | Fame | Blame |", normalized)
        idx_name = normalized.index("| Name | Game | Fame | Blame |")
        idx_sep = normalized.index("| --- | --- | --- | --- |", idx_name)
        self.assertLess(idx_name, idx_sep)
        self.assertIn("| Lebron James | Basketball |", normalized)

        # Headerless 2-row tables: delimiter inserted so GFM renderers show a table
        self.assertIn(
            "| Sinple | Table |\n| --- | --- |\n| Without | Header |", normalized
        )
        self.assertIn(
            "| Simple Multiparagraph | Table Full |\n| --- | --- |\n"
            "| Of Paragraphs | In each Cell. |",
            normalized,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
103
docreader/tests/test_opendataloader_parser.py
Normal file
103
docreader/tests/test_opendataloader_parser.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Unit tests for OpenDataLoader parser helpers (no JVM required)."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
from docreader.parser.opendataloader_parser import (
|
||||
OpenDataLoaderParser,
|
||||
_collect_images_under_output,
|
||||
_find_markdown_file,
|
||||
_normalize_odl_image_url,
|
||||
_rewrite_markdown_image_refs,
|
||||
opendataloader_available,
|
||||
)
|
||||
|
||||
|
||||
class OpenDataLoaderHelpersTest(unittest.TestCase):
|
||||
def test_find_markdown_prefers_stem_match(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
other = os.path.join(d, "other.md")
|
||||
target = os.path.join(d, "paper.md")
|
||||
with open(other, "w") as f:
|
||||
f.write("x")
|
||||
with open(target, "w") as f:
|
||||
f.write("# Title")
|
||||
self.assertEqual(_find_markdown_file(d, "paper"), target)
|
||||
|
||||
def test_collect_and_rewrite_images(self):
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
img_dir = os.path.join(d, "images")
|
||||
os.makedirs(img_dir)
|
||||
png = os.path.join(img_dir, "fig1.png")
|
||||
with open(png, "wb") as f:
|
||||
f.write(b"\x89PNG\r\n\x1a\n")
|
||||
images = _collect_images_under_output(d)
|
||||
self.assertIn("images/fig1.png", images)
|
||||
md = "See  and ."
|
||||
out = _rewrite_markdown_image_refs(md, images)
|
||||
self.assertIn("", out)
|
||||
self.assertIn("", out)
|
||||
|
||||
def test_rewrite_odl_angle_bracket_and_entity_urls(self):
|
||||
images = {"images/imageFile1.png": "e30="}
|
||||
for md_in in (
|
||||
"",
|
||||
"",
|
||||
):
|
||||
out = _rewrite_markdown_image_refs(md_in, images)
|
||||
self.assertEqual("", out)
|
||||
|
||||
def test_normalize_odl_image_url(self):
|
||||
self.assertEqual(
|
||||
_normalize_odl_image_url("<images/imageFile2.png>"),
|
||||
"images/imageFile2.png",
|
||||
)
|
||||
self.assertEqual(
|
||||
_normalize_odl_image_url("<images/imageFile2.png>"),
|
||||
"images/imageFile2.png",
|
||||
)
|
||||
|
||||
def test_rewrite_skips_data_uris(self):
|
||||
md = ""
|
||||
self.assertEqual(_rewrite_markdown_image_refs(md, {"images/a.png": "e30="}), md)
|
||||
|
||||
|
||||
class OpenDataLoaderParserTest(unittest.TestCase):
|
||||
@mock.patch("docreader.parser.opendataloader_parser.opendataloader_available")
|
||||
@mock.patch("docreader.parser.opendataloader_parser._run_convert")
|
||||
def test_parse_reads_markdown_and_images(self, mock_convert, mock_avail):
|
||||
mock_avail.return_value = (True, "")
|
||||
|
||||
def fake_convert(pdf_path, output_dir, image_dir, overrides=None):
|
||||
stem = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
md_path = os.path.join(output_dir, f"{stem}.md")
|
||||
with open(md_path, "w") as f:
|
||||
f.write("# Hello\n\n\n")
|
||||
os.makedirs(image_dir, exist_ok=True)
|
||||
with open(os.path.join(image_dir, "pic.png"), "wb") as f:
|
||||
f.write(b"png")
|
||||
|
||||
mock_convert.side_effect = fake_convert
|
||||
|
||||
parser = OpenDataLoaderParser(file_name="doc.pdf", file_type="pdf")
|
||||
doc = parser.parse_into_text(b"%PDF-1.4 fake")
|
||||
self.assertIn("# Hello", doc.content)
|
||||
self.assertIn("images/pic.png", doc.content)
|
||||
self.assertIn("images/pic.png", doc.images)
|
||||
self.assertEqual(doc.metadata.get("parser_engine"), "opendataloader")
|
||||
|
||||
@mock.patch("docreader.parser.opendataloader_parser.shutil.which", return_value=None)
|
||||
def test_availability_requires_java(self, _which):
|
||||
with mock.patch(
|
||||
"docreader.parser.opendataloader_parser._package_available",
|
||||
return_value=(True, ""),
|
||||
):
|
||||
ok, msg = opendataloader_available()
|
||||
self.assertFalse(ok)
|
||||
self.assertIn("Java", msg)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -6,10 +6,15 @@ from PIL import Image
|
||||
from docreader.parser.pdf_parser import (
|
||||
PDFParser,
|
||||
_classify_page,
|
||||
_filter_reading_columns,
|
||||
_group_lines,
|
||||
_is_artifact_column,
|
||||
_join_line_glyphs,
|
||||
_merge_orphan_punctuation_lines,
|
||||
_point_in_boxes,
|
||||
_segments_to_markdown,
|
||||
_select_embedded_images,
|
||||
_should_prefer_plain,
|
||||
_split_columns,
|
||||
_strip_repeating_lines,
|
||||
)
|
||||
@@ -122,13 +127,25 @@ class ReadingOrderTest(unittest.TestCase):
|
||||
def test_group_lines_orders_by_y_then_x(self):
|
||||
# Two visual lines; within a line glyphs given out of x-order.
|
||||
chars = [
|
||||
_char("B", 120, 130, 700, 712),
|
||||
_char("A", 100, 110, 700, 712), # same line, left of B
|
||||
_char("B", 110, 120, 700, 712), # adjacent to A (no word-sized gap)
|
||||
_char("A", 100, 110, 700, 712),
|
||||
_char("C", 100, 110, 680, 692), # next line down
|
||||
]
|
||||
lines = _group_lines(chars)
|
||||
self.assertEqual([ln["text"] for ln in lines], ["AB", "C"])
|
||||
|
||||
def test_join_line_glyphs_inserts_word_spaces(self):
|
||||
# Wide gap between "copy" and "of" mimics positioned OCR / text layers.
|
||||
chars = [
|
||||
_char("c", 0, 4, 0, 10),
|
||||
_char("f", 10, 14, 0, 10),
|
||||
]
|
||||
self.assertEqual(_join_line_glyphs(chars), "c f")
|
||||
|
||||
def test_join_line_glyphs_keeps_adjacent_letters(self):
|
||||
chars = [_char("A", 100, 110, 700, 712), _char("B", 110, 120, 700, 712)]
|
||||
self.assertEqual(_join_line_glyphs(chars), "AB")
|
||||
|
||||
|
||||
class HeadingDetectionTest(unittest.TestCase):
|
||||
def test_promotes_large_line_to_heading(self):
|
||||
@@ -156,6 +173,235 @@ class HiddenTextFilterTest(unittest.TestCase):
|
||||
self.assertFalse(_point_in_boxes(20.0, 5.0, boxes))
|
||||
|
||||
|
||||
class MarginColumnFilterTest(unittest.TestCase):
    """Column filtering applied by ``_filter_reading_columns``."""

    def test_drops_narrow_vertical_margin_column(self):
        """A one-glyph-per-line sidebar is removed, leaving the body column."""
        # Mimics arXiv sidebar: narrow x span, one glyph per line.
        sidebar = []
        for row, glyph in enumerate("0202luJ22"):
            drop = row * 14
            sidebar.append(_char(glyph, 20, 28, 500 - drop, 512 - drop))
        body_line = [
            _char(glyph, left, left + 10, 700, 712)
            for glyph, left in zip("Lan", (160, 170, 180))
        ]

        columns = _filter_reading_columns(sidebar + body_line, scale=10.0, width=612.0)

        self.assertEqual(len(columns), 1)
        self.assertEqual(columns[0][0]["ch"], "L")

    def test_keeps_real_two_column_layout(self):
        """Two wide columns of equal height are both retained."""
        rows = range(4)
        left_column = [_char("L", 50, 150, 700 - r * 12, 712 - r * 12) for r in rows]
        right_column = [_char("R", 400, 500, 700 - r * 12, 712 - r * 12) for r in rows]

        columns = _filter_reading_columns(
            left_column + right_column, scale=12.0, width=600.0
        )

        self.assertEqual(len(columns), 2)
|
||||
|
||||
|
||||
class PunctuationMergeTest(unittest.TestCase):
    """Merging of punctuation-only lines into the preceding line."""

    def test_merges_orphan_periods(self):
        """A line holding only periods is appended to the line before it."""
        texts = ("Figure 1 2", ". .", "Next")
        source_lines = [_line(text, 10.0) for text in texts]

        merged_texts = [
            entry["text"] for entry in _merge_orphan_punctuation_lines(source_lines)
        ]

        self.assertEqual(merged_texts, ["Figure 1 2..", "Next"])
|
||||
|
||||
|
||||
class PdfTextSanitizeTest(unittest.TestCase):
    """Clean-up rules applied by ``_postprocess_pdf_text``."""

    @staticmethod
    def _clean(raw):
        """Run the post-processor; imported at call time as each test did."""
        from docreader.parser.pdf_parser import _postprocess_pdf_text

        return _postprocess_pdf_text(raw)

    def test_removes_fffe_placeholder(self):
        """U+FFFE placeholders are removed and the word halves are joined."""
        cleaned = self._clean("multi\ufffelayer and non\ufffetrivial")
        self.assertEqual(cleaned, "multilayer and nontrivial")

    def test_strips_chart_axis_run(self):
        """Axis ticks and legend labels are dropped; prose and caption stay."""
        raw = (
            "Deep convolutional neural networks have led to breakthroughs.\n"
            "0 1 2 3 4 5 6 0\n"
            "10\n"
            "20\n"
            "iter. (1e4)\n"
            "training error (%)\n"
            "56-layer\n"
            "20-layer\n"
            "Figure 1. Training error on CIFAR-10.\n"
        )
        cleaned = self._clean(raw)
        self.assertIn("breakthroughs", cleaned)
        self.assertNotIn("56-layer", cleaned)
        self.assertIn("Figure 1.", cleaned)

    def test_strips_diagram_labels_above_caption(self):
        """Short diagram labels directly above a caption are removed."""
        raw = (
            "Paragraph before.\n"
            "identity\n"
            "weight layer\n"
            "relu\n"
            "Figure 2. Residual learning block.\n"
            "Paragraph after.\n"
        )
        cleaned = self._clean(raw)
        for kept in ("Paragraph before.", "Figure 2.", "Paragraph after."):
            self.assertIn(kept, cleaned)
        for dropped in ("identity", "weight layer"):
            self.assertNotIn(dropped, cleaned)

    def test_strips_arxiv_header_line(self):
        """The arXiv identifier line is removed while body text is kept."""
        raw = "Body text.\n1\narXiv:1512.03385v1 [cs.CV] 10 Dec 2015\nMore body."
        cleaned = self._clean(raw)
        self.assertNotIn("arXiv:", cleaned)
        self.assertIn("Body text.", cleaned)
|
||||
|
||||
|
||||
class PlainWellFormedTest(unittest.TestCase):
    """Heuristic deciding whether plain extracted text is already usable."""

    def test_academic_plain_skips_layout(self):
        """Normally spaced academic prose counts as well formed."""
        from docreader.parser.pdf_parser import _plain_is_well_formed

        sample = (
            "Recent work [DL15, MBXS17] shows progress on NLP tasks "
            "with pre-trained models."
        )
        verdict = _plain_is_well_formed(sample)
        self.assertTrue(verdict)

    def test_glued_scan_plain_needs_layout(self):
        """Text with all spaces missing is not considered well formed."""
        from docreader.parser.pdf_parser import _plain_is_well_formed

        verdict = _plain_is_well_formed("Thisisadigitalcopyofabook")
        self.assertFalse(verdict)
|
||||
|
||||
|
||||
class LayoutQualityFallbackTest(unittest.TestCase):
    """Choice between plain text and layout text made by ``_should_prefer_plain``."""

    def test_prefers_plain_when_many_single_char_lines(self):
        """A layout dominated by one-character lines loses to the plain text."""
        plain_text = "Language Models are Few-Shot Learners\nTom Brown"
        layout_text = "0\n2\n0\n2\nl\nu\nJ\nLan ua e Models"
        self.assertTrue(_should_prefer_plain(plain_text, layout_text))

    def test_keeps_good_layout(self):
        """Identical, well-formed texts do not trigger the plain fallback."""
        same_text = "Hello world"
        self.assertFalse(_should_prefer_plain(same_text, same_text))
|
||||
|
||||
|
||||
class ResNetPaperFigureTest(unittest.TestCase):
    """Regression: ResNet PDF (arXiv:1512.03385) vector figures and captions."""

    def test_resnet_figures_and_captions(self):
        """Vector figures become images placed just before their captions."""
        import os

        from docreader.parser.pdf_parser import PDFParser

        candidates = (
            os.path.join(
                os.path.dirname(__file__),
                *("..", "..", "testdata", "rag_test", "pdf_en", "resnet.pdf"),
            ),
            "/tmp/resnet.pdf",
        )
        # Use the first location that exists; skip when neither does.
        pdf_path = next((p for p in candidates if os.path.isfile(p)), None)
        if pdf_path is None:
            self.skipTest("resnet.pdf not available")

        with open(pdf_path, "rb") as reader:
            payload = reader.read()
        doc = PDFParser(file_name="resnet.pdf", file_type="pdf").parse_into_text(
            payload
        )

        self.assertGreater(doc.metadata.get("vector_figure_count", 0), 0)
        self.assertIn("![", doc.content)
        self.assertIn("Figure 2. Residual learning", doc.content)
        self.assertNotIn("arXiv:", doc.content)

        # Inspect the 120 characters leading up to the Figure 2 caption.
        caption_at = doc.content.find("Figure 2. Residual learning")
        lead_in = doc.content[max(0, caption_at - 120) : caption_at]
        self.assertIn("![", lead_in)
        self.assertNotIn("identity", lead_in)
|
||||
|
||||
|
||||
class Gpt3PaperLayoutTest(unittest.TestCase):
    """Regression: arXiv GPT-3 paper title page must not be one-glyph-per-line."""

    def test_gpt3_page0_title_and_authors(self):
        """Page 0 layout text and the full parse keep title, authors and prose."""
        import os

        import pypdfium2 as pdfium
        import pypdfium2.raw as pdfium_r

        from docreader.parser.pdf_parser import PDFParser, _extract_layout_text

        fixture = os.path.join(
            os.path.dirname(__file__),
            *("..", "..", "testdata", "rag_test", "pdf_en", "gpt3.pdf"),
        )
        if not os.path.isfile(fixture):
            self.skipTest("gpt3.pdf not in testdata")
        with open(fixture, "rb") as reader:
            payload = reader.read()

        with pdfium.PdfDocument(payload) as pdf:
            first_page = pdf[0]
            try:
                layout = _extract_layout_text(first_page, pdfium_r)
            finally:
                first_page.close()

        # Margin sidebar must not appear as one-glyph-per-line prefix.
        self.assertNotRegex(layout[:300], r"^0\n2\n0\n2")
        self.assertIn("Few-Shot Learners", layout)

        doc = PDFParser(file_name="gpt3.pdf", file_type="pdf").parse_into_text(payload)
        self.assertIn("Language Models are Few-Shot Learners", doc.content)
        self.assertIn("Tom B. Brown", doc.content[:1200])
        self.assertIn("[DL15, MBXS17, PNZtY18]", doc.content)
        self.assertIn("task-specific architectures), and more recently", doc.content)
        self.assertNotIn("k ifi hi d l", doc.content)
|
||||
|
||||
|
||||
class ScanEnglishDictLayoutTest(unittest.TestCase):
    """Regression: Google Books-style PDFs lose spaces without gap inference."""

    def test_scan_en_dict_page0_has_word_spaces(self):
        """Layout text of page 0 must keep the spaces between words.

        Skipped when scan_en_dict.pdf is not present under the testdata tree.
        """
        import os

        import pypdfium2 as pdfium
        import pypdfium2.raw as pdfium_r

        from docreader.parser.pdf_parser import _extract_layout_text

        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "testdata",
            "rag_test",
            "pdf_scan",
            "scan_en_dict.pdf",
        )
        if not os.path.isfile(pdf_path):
            self.skipTest("scan_en_dict.pdf not in testdata")

        with open(pdf_path, "rb") as f:
            pdf = pdfium.PdfDocument(f.read())
        try:
            page = pdf[0]
            try:
                text = _extract_layout_text(page, pdfium_r)
            finally:
                # Release the page handle explicitly before the document is
                # closed, the same way the GPT-3 layout test in this file does.
                page.close()
        finally:
            pdf.close()

        self.assertIn("This is a digital copy of a book", text)
        self.assertNotIn("Thisisadigitalcopyofabook", text)
|
||||
|
||||
|
||||
class PDFRouterIntegrationTest(unittest.TestCase):
|
||||
def test_image_only_pdf_routes_to_scanned(self):
|
||||
pdf_bytes = _make_image_only_pdf(2)
|
||||
|
||||
86
docreader/tests/test_ppt_convert.py
Normal file
86
docreader/tests/test_ppt_convert.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import shutil
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from docreader.parser.ppt_convert import (
|
||||
convert_ppt_to_pptx_bytes,
|
||||
is_ole_compound,
|
||||
is_zip_openxml,
|
||||
needs_ppt_to_pptx_conversion,
|
||||
normalize_ppt_bytes,
|
||||
)
|
||||
|
||||
# Fixture root: "testdata/rag_test" under the directory two levels above the
# one that contains this test file.
TESTDATA = Path(__file__).resolve().parents[2].joinpath("testdata", "rag_test")

# Legacy binary .ppt sample; the WMF-image test reuses the very same file.
LEGACY_PPT = TESTDATA.joinpath("ppt_old", "en_38256.ppt")
WMF_IMAGE_PPT = LEGACY_PPT

# Second legacy .ppt sample, used by the image-heavy extraction test.
IMAGE_HEAVY_PPT = TESTDATA.joinpath("ppt_old", "en_41384.ppt")

# OpenXML .pptx sample.
PPTX_SAMPLE = TESTDATA.joinpath("pptx", "en_marker.pptx")
|
||||
|
||||
|
||||
class TestPptConvert(unittest.TestCase):
    """Tests for legacy ``.ppt`` detection and conversion to ``.pptx``.

    Every fixture-backed test skips (instead of erroring with
    ``FileNotFoundError``) when its sample file is absent, so the whole class
    behaves consistently on checkouts that do not ship the testdata tree.
    """

    def _fixture_bytes(self, path):
        """Return the bytes of ``path``; skip the running test if it is missing."""
        if not path.is_file():
            self.skipTest("testdata missing")
        return path.read_bytes()

    def test_legacy_ppt_magic(self):
        """A legacy .ppt is detected as OLE, not ZIP, and needs conversion."""
        content = self._fixture_bytes(LEGACY_PPT)
        self.assertTrue(is_ole_compound(content))
        self.assertFalse(is_zip_openxml(content))
        self.assertTrue(needs_ppt_to_pptx_conversion(content, "ppt"))

    def test_pptx_does_not_need_conversion(self):
        """A .pptx is detected as ZIP/OpenXML and is not flagged for conversion."""
        content = self._fixture_bytes(PPTX_SAMPLE)
        self.assertTrue(is_zip_openxml(content))
        self.assertFalse(needs_ppt_to_pptx_conversion(content, "pptx"))

    def test_normalize_pptx_passthrough(self):
        """Normalizing a .pptx returns the same bytes with a ``.pptx`` extension."""
        content = self._fixture_bytes(PPTX_SAMPLE)
        out, ext = normalize_ppt_bytes(content, "pptx")
        self.assertEqual(out, content)
        self.assertEqual(ext, ".pptx")

    def test_legacy_ppt_requires_soffice(self):
        """Without ``soffice`` normalization raises; with it, a .pptx is produced."""
        legacy = self._fixture_bytes(LEGACY_PPT)
        if not shutil.which("soffice"):
            # No converter on PATH: the error must mention LibreOffice, after
            # which the conversion half of the test is skipped.
            with self.assertRaises(ValueError) as ctx:
                normalize_ppt_bytes(legacy, "ppt")
            self.assertIn("LibreOffice", str(ctx.exception))
            self.skipTest("LibreOffice not available")
        converted = convert_ppt_to_pptx_bytes(legacy, suffix=".ppt")
        self.assertIsNotNone(converted)
        self.assertTrue(is_zip_openxml(converted))
        out, ext = normalize_ppt_bytes(legacy, "ppt")
        self.assertEqual(ext, ".pptx")
        self.assertTrue(is_zip_openxml(out))

    def test_wmf_legacy_ppt_extracts_rasterized_image(self):
        """The WMF sample yields exactly one image, referenced under ``images/``."""
        if not shutil.which("soffice"):
            self.skipTest("LibreOffice not available")
        if not shutil.which("convert"):
            self.skipTest("ImageMagick convert not available")
        data = self._fixture_bytes(WMF_IMAGE_PPT)

        from docreader.parser.markitdown_parser import MarkitdownParser

        doc = MarkitdownParser(file_type="ppt").parse_into_text(data)
        self.assertEqual(len(doc.images), 1)
        self.assertNotIn("bd10496_.jpg", doc.content)
        self.assertIn("images/", doc.content)

    def test_image_heavy_legacy_ppt_extracts_images(self):
        """The image-heavy sample yields at least two ``images/`` references."""
        if not shutil.which("soffice"):
            self.skipTest("LibreOffice not available")
        data = self._fixture_bytes(IMAGE_HEAVY_PPT)

        from docreader.parser.markitdown_parser import MarkitdownParser

        doc = MarkitdownParser(file_type="ppt").parse_into_text(data)
        self.assertGreaterEqual(len(doc.images), 2)
        # NOTE(review): if doc.content is a str, `"" in doc.content` is always
        # True, so this assertion cannot pass as written. The intended marker
        # literal appears to be missing here -- restore it from the real file.
        self.assertNotIn("", doc.content)
        for ref in doc.images:
            self.assertTrue(ref.startswith("images/"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
42
docreader/tests/test_web_parser.py
Normal file
42
docreader/tests/test_web_parser.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import unittest
|
||||
|
||||
from docreader.parser.web_parser import (
|
||||
build_visible_text_fallback,
|
||||
extract_markdown_from_html,
|
||||
)
|
||||
|
||||
|
||||
class TestWebParserHelpers(unittest.TestCase):
    """Unit tests for the HTML-to-markdown helpers in ``web_parser``."""

    def test_extract_markdown_empty_html(self):
        """Empty and blank HTML inputs produce ``None``."""
        for blank in ("", " "):
            self.assertIsNone(extract_markdown_from_html(blank))

    def test_extract_markdown_article_html(self):
        """A small article page produces markdown containing its heading text."""
        html = """
<html><head><title>Demo</title></head><body>
<article><h1>Hello</h1><p>World paragraph with enough text for extraction.</p></article>
</body></html>
"""
        result = extract_markdown_from_html(html)
        self.assertIsNotNone(result)
        self.assertIn("Hello", result)

    def test_build_fallback_too_short(self):
        """Short and empty visible text yield no fallback document."""
        for visible in ("short", ""):
            self.assertIsNone(build_visible_text_fallback(visible))

    def test_build_fallback_with_title(self):
        """With a page title, the fallback starts with it as an H1 and keeps the text."""
        body = "A" * 60
        result = build_visible_text_fallback(body, page_title="WeKnora")
        self.assertIsNotNone(result)
        self.assertTrue(result.startswith("# WeKnora"))
        self.assertIn(body, result)

    def test_build_fallback_without_title(self):
        """With an empty title, the fallback is exactly the visible text."""
        body = "B" * 60
        result = build_visible_text_fallback(body, page_title="")
        self.assertEqual(result, body)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
50
docreader/uv.lock
generated
50
docreader/uv.lock
generated
@@ -463,9 +463,11 @@ dependencies = [
|
||||
{ name = "grpcio" },
|
||||
{ name = "grpcio-health-checking" },
|
||||
{ name = "grpcio-tools" },
|
||||
{ name = "liteparse" },
|
||||
{ name = "lxml" },
|
||||
{ name = "markitdown", extra = ["docx", "pdf", "xls", "xlsx"] },
|
||||
{ name = "opendataloader-pdf" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pillow" },
|
||||
{ name = "playwright" },
|
||||
{ name = "protobuf" },
|
||||
@@ -476,6 +478,7 @@ dependencies = [
|
||||
{ name = "requests" },
|
||||
{ name = "textract" },
|
||||
{ name = "trafilatura" },
|
||||
{ name = "xlrd" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
@@ -484,9 +487,11 @@ requires-dist = [
|
||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-health-checking", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||
{ name = "liteparse", specifier = ">=2.0.4" },
|
||||
{ name = "lxml", specifier = ">=6.1.0" },
|
||||
{ name = "markitdown", extras = ["docx", "pdf", "xls", "xlsx"], specifier = ">=0.1.3" },
|
||||
{ name = "opendataloader-pdf", specifier = ">=2.4.7" },
|
||||
{ name = "openpyxl", specifier = ">=3.1.0" },
|
||||
{ name = "pandas", specifier = ">=2.0.0" },
|
||||
{ name = "pillow", specifier = ">=12.0.0" },
|
||||
{ name = "playwright", specifier = ">=1.55.0" },
|
||||
{ name = "protobuf", specifier = ">=6.33.0" },
|
||||
@@ -497,6 +502,7 @@ requires-dist = [
|
||||
{ name = "requests", specifier = ">=2.32.5" },
|
||||
{ name = "textract", specifier = "==1.5.0" },
|
||||
{ name = "trafilatura", specifier = ">=2.0.0" },
|
||||
{ name = "xlrd", specifier = ">=2.0.0" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -786,37 +792,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "liteparse"
|
||||
version = "2.0.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0d/e7/ecf68643604a59247a0a7b2f8c73bee7415ea99e0165bb32e2838ddd0d3f/liteparse-2.0.4.tar.gz", hash = "sha256:17f6119f38e80b956c1ce3dc998ea7b0a8e80777ce1f49178f2b14bb17b35a9c", size = 115487, upload-time = "2026-05-30T06:32:12.351Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/b0/4f5007a52ef13679437a892a06ea58448b825de7ea78276e19b9d7fb9dcb/liteparse-2.0.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6df1e1199ffbeb2191bb64d7fcbff6af6bdfd1592973e0ad67a82eb09d377c08", size = 13027870, upload-time = "2026-05-30T06:31:11.022Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/2f/c7977a2d6f376e31c8c465ee010c238e27e06cbb2c3200d63f41983e40db/liteparse-2.0.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:35a72946b965d3b6b87a602051919e7ce243da15ad143d301152fb5e8cd0f6d2", size = 13149255, upload-time = "2026-05-30T06:31:13.636Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6b/9d/e7f1a1b8cb14ac867b1220fdb0c87bfe07b86c69bf98578573ab37b1a103/liteparse-2.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:c1bbc8b7206b8bfbf7aabc5341d2cf851b7464641d58375bd218b4e1dd3517f9", size = 11115466, upload-time = "2026-05-30T06:31:16.201Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/2d/be89a429a6a6bc78ce8d620974a4f8fbe9f566ea3592a2f1da8dc6bdda4a/liteparse-2.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:537ab6347a384f81980e48cc181d6cd33fc6ad2b7478e3db61350076744d952e", size = 11029024, upload-time = "2026-05-30T06:31:19.11Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/c8/7429622d86bf00ceaec95bf211adf1c9a7bdf46f8c2cd806685f9c02c0f1/liteparse-2.0.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:feae0c530197130cb38f176d718eeae639d9091264aa5f954835986c59470813", size = 13028074, upload-time = "2026-05-30T06:31:21.725Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/21/d0/a97174ae281d353251994ed080c8855ea9b0b5d81a60ab3b6b065e911c49/liteparse-2.0.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2012a3a9b5a3f7e13ce34b5a770158971da43bb9d266c7c5a3ea62bdda7ca851", size = 13148977, upload-time = "2026-05-30T06:31:24.498Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/60/48/f41ebe428d8d8d70c53ddd47523baa7300c5cc96e404417d7af25578be01/liteparse-2.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:2d05d10f0d14b1beb34ef8c5e9a14d6cc966adf19f60c7ea1ec5717adc4c986f", size = 11115791, upload-time = "2026-05-30T06:31:26.961Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/58/be78c7c47147aeb1350d475336c6c2e17d5aa513be9244e9d95a170ced34/liteparse-2.0.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:87680616fae276b04ace6e5fc5e4e0c93980391b0d46c2d66d72c0742a3cb19e", size = 11026045, upload-time = "2026-05-30T06:31:29.405Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/1f/105ccdd9bc4608a836fe409394d68e8765e699fa7393c2f2f464c612057f/liteparse-2.0.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:414599a922aa51f567fa939183929579d1668ef74846fe25f7f46742bb31fcd8", size = 13022571, upload-time = "2026-05-30T06:31:32.321Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/9f/4bf4e9b112b47025ae085503fe9cbf13631673ffc41bfb864a3091285c22/liteparse-2.0.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:efdaa6b471084a1f4594574555eb6abb5f85de25f2155c8d539542239eacaa56", size = 13146871, upload-time = "2026-05-30T06:31:34.705Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/f0/bf10611e409732bd4e19f0fc0faf3194040e8e09bb75a166ee126d09b70f/liteparse-2.0.4-cp312-cp312-win_amd64.whl", hash = "sha256:fb67326ba957388214762acea35d24cf0d1230ae6a2fe1fdeaf74024e92e3c40", size = 11116682, upload-time = "2026-05-30T06:31:36.992Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6e/b5/02ed5fff6418fdc970688190eab4470f4f9c116f4de1e39a7deea0d9968a/liteparse-2.0.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8f02c6e0d8f71da671a3527d52d8f1e2c42fddebf81d1b4931c3d035e4ec1e6a", size = 11025231, upload-time = "2026-05-30T06:31:39.696Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/be/d7/b4633483502940d43d583f8057e0aed68b9091087a86d021f8bd7558ba0b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:924e3f669341e22e625b13d08535644d1dfd779bc6781e4ab6f6e54ea90a53d6", size = 13022754, upload-time = "2026-05-30T06:31:42.434Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/e0/3938561ad66d4a216922c8e1e6a878f63df82ce5f00f15a935f779fb7c5b/liteparse-2.0.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:137f169002f3abe21e3dd2e6781fbd86841096a0f3b0162afc1fd64eb21fa607", size = 13146432, upload-time = "2026-05-30T06:31:46.706Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/7f/a2017df8031677d7940ad1ce33640219aa28defae4a8171844ea8bed68ca/liteparse-2.0.4-cp313-cp313-win_amd64.whl", hash = "sha256:098fba3ecb2337f78426d9e077d1f70bc75871d4387ab8c3774b0cc5d26b890d", size = 11116383, upload-time = "2026-05-30T06:31:49.546Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/63/b2bb03bc30103e93c87695f63eae3ed007b08796a6cc06ea29acace54c4a/liteparse-2.0.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8aeaf821151aaaa854294f3499d64264dbea7d10e682fa9a2443f9177cd444c6", size = 11024196, upload-time = "2026-05-30T06:31:52.091Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/c1/6dedc6b4325aa8de3249694123a74bc9506e0d65a28c85aa5fad4bfdea5c/liteparse-2.0.4-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:d2efbaf7453d2bedc86db51b2b808078567817d7fc537122389b65a317927902", size = 13022936, upload-time = "2026-05-30T06:31:54.619Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/04/7e7c3a8edd01c9904b6eef76bf4a008f987a5df64b8334c61e742861ac84/liteparse-2.0.4-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:34c53d9cefa35f77dc67a19a875e6dca32b4f35006c2015a22eb30c9c810653b", size = 13146821, upload-time = "2026-05-30T06:31:57.284Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/66/f4/da191e881cad5941dc0065782497eb81027bc3f48ac0a3143deab094be33/liteparse-2.0.4-cp314-cp314-win_amd64.whl", hash = "sha256:6546ee0359dc56eebd9f45008bb59708118c234140ecf466f6c7121d9161d9e4", size = 11114558, upload-time = "2026-05-30T06:31:59.799Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/16/59/c554f376c0bdd1bf4c313ac5d77a34817740f021ab6ada9d3226a23fa4b6/liteparse-2.0.4-cp315-cp315-manylinux_2_28_aarch64.whl", hash = "sha256:7c02d0bb31cd5aefa3297ce6e58388abd6f3e109c62ac0fdeef07d8eac4b769e", size = 13023454, upload-time = "2026-05-30T06:32:02.464Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/89/96/04c595ab45162d81bc73218870d1459560428c3f40957e594a6c1c5ea2be/liteparse-2.0.4-cp315-cp315-manylinux_2_28_x86_64.whl", hash = "sha256:acdf3c76cb3215f8d389a935b6b68007fac2ffa9ce0b681dd53650b69d580521", size = 13146859, upload-time = "2026-05-30T06:32:05.175Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/97/9c/59cdd88ebc6c27312ea6cbd0a894002e78b6f8a3dead2b2bf60d7febba85/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4cf31cb3987df1190e59b73d9f10976e538ff577f41c40281fd14b84fe4f9da1", size = 13030767, upload-time = "2026-05-30T06:32:07.9Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/ae/9b85e510ddb390ed63b407851d412152b7006487d06703d931f6a0b1414e/liteparse-2.0.4-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:a1f9cb9c24f2df0d4f71ddd66ddb474bfdec8a434ecc1428b791f83aab2a688b", size = 13152103, upload-time = "2026-05-30T06:32:10.457Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "6.1.0"
|
||||
@@ -1294,6 +1269,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/3d/6830fa61c69ca8e905f237001dbfc01689a4e4ab06147020a4518318881f/onnxruntime-1.23.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d2385e774f46ac38f02b3a91a91e30263d41b2f1f4f26ae34805b2a9ddef466", size = 15229610, upload-time = "2025-10-22T03:46:32.239Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opendataloader-pdf"
|
||||
version = "2.4.7"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/1d/5a/98cd2079f2828f7886ee447eae21ee60a858930596aebcc8d275a1fe2b12/opendataloader_pdf-2.4.7.tar.gz", hash = "sha256:a16e995f2f526d706045218d9e359a31f50371a0bc0e3bb1bc15abb467c08fb7", size = 22554865, upload-time = "2026-05-27T10:04:54.285Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b7/14/f897eabf04eab4e6a40dce9214d921558165f3eaed68335892a5b1a004d0/opendataloader_pdf-2.4.7-py3-none-any.whl", hash = "sha256:1c359183650f4c012875010c156f13b6d3477b00762b8e3fbd8479fa03feb628", size = 22568934, upload-time = "2026-05-27T10:04:49.902Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
|
||||
@@ -86,6 +86,15 @@ export interface ParserEngineConfig {
|
||||
mineru_cloud_enable_table?: boolean | null
|
||||
mineru_cloud_enable_ocr?: boolean | null
|
||||
mineru_cloud_language?: string
|
||||
// PaddleOCR-VL 自建参数
|
||||
paddleocr_vl_endpoint?: string
|
||||
paddleocr_vl_use_seal_recognition?: boolean | null
|
||||
paddleocr_vl_use_chart_recognition?: boolean | null
|
||||
// PaddleOCR-VL 云 API 参数
|
||||
paddleocr_vl_cloud_token?: string
|
||||
paddleocr_vl_cloud_model?: string
|
||||
paddleocr_vl_cloud_use_seal_recognition?: boolean | null
|
||||
paddleocr_vl_cloud_use_chart_recognition?: boolean | null
|
||||
}
|
||||
|
||||
export interface ParserEnginesResponse {
|
||||
|
||||
@@ -11,6 +11,7 @@ import { onMounted, ref, nextTick, onUnmounted, watch, computed } from "vue";
|
||||
import { downKnowledgeDetails, deleteGeneratedQuestion, getChunkByIdOnly, previewKnowledgeFile } from "@/api/knowledge-base/index";
|
||||
import { MessagePlugin, DialogPlugin } from "tdesign-vue-next";
|
||||
import { sanitizeHTML, safeMarkdownToHTML, createSafeImage, isValidImageURL, hydrateProtectedFileImages, isValidURL } from '@/utils/security';
|
||||
import { normalizeSpuriousTablePrefixes } from '@/utils/markdownTableNormalize';
|
||||
import { openMermaidFullscreen } from '@/utils/mermaidViewer';
|
||||
import { useI18n } from 'vue-i18n';
|
||||
import { useAuthStore } from '@/stores/auth';
|
||||
@@ -283,8 +284,15 @@ let page = 1;
|
||||
let loadingChunks = false;
|
||||
let pendingRequestedPage: number | null = null;
|
||||
let pendingChunksBeforeLoad = 0;
|
||||
let doc = null;
|
||||
const CHUNK_PAGE_SIZE = 25;
|
||||
/** Scroll container for the main doc drawer (not the first .t-drawer__body on the page). */
|
||||
let docScrollEl: HTMLElement | null = null;
|
||||
let mdContentWrap = ref()
|
||||
// Drawer uses attach="body", so markdown nodes live outside mdContentWrap in the DOM.
|
||||
const docMarkdownRoot = ref<HTMLElement | null>(null)
|
||||
|
||||
/** Root node for markdown post-processing: docMarkdownRoot first, then mdContentWrap, else null. */
const getMarkdownRenderRoot = (): ParentNode | null => {
  if (docMarkdownRoot.value != null) {
    return docMarkdownRoot.value;
  }
  const fallbackRoot = mdContentWrap.value as ParentNode | null;
  return fallbackRoot ?? null;
};
|
||||
let url = ref('')
|
||||
// 视图模式:chunks / merged / preview
|
||||
// file 类型默认「预览」,URL / 手动创建 默认「全文」
|
||||
@@ -365,18 +373,40 @@ const mergeChunks = (chunks: any[]): string => {
|
||||
return merged;
|
||||
};
|
||||
|
||||
/** Look up the drawer body inside `.doc-main-drawer`; null when no such element exists. */
const findDocDrawerScrollEl = (): HTMLElement | null => {
  const drawerBody = document.querySelector('.doc-main-drawer .t-drawer__body');
  return drawerBody as HTMLElement | null;
};
|
||||
|
||||
/** Detach the scroll listener from the remembered drawer element and forget the element. */
const unbindDrawerScroll = () => {
  if (!docScrollEl) return;
  docScrollEl.removeEventListener('scroll', handleDetailsScroll);
  docScrollEl = null;
};
|
||||
|
||||
/** Drop any previous binding, then attach a passive scroll listener to the current drawer body. */
const bindDrawerScroll = () => {
  unbindDrawerScroll();
  const scroller = findDocDrawerScrollEl();
  docScrollEl = scroller;
  // Nothing to bind when the drawer body is not in the DOM.
  scroller?.addEventListener('scroll', handleDetailsScroll, { passive: true });
};
|
||||
|
||||
onMounted(() => {
|
||||
loadTraceDrawerWidth();
|
||||
loadMainDrawerWidth();
|
||||
window.addEventListener('resize', onTraceDrawerWindowResize, { passive: true });
|
||||
nextTick(() => {
|
||||
const drawers = document.getElementsByClassName('t-drawer__body');
|
||||
if (drawers && drawers.length > 0) {
|
||||
doc = drawers[0];
|
||||
doc.addEventListener('scroll', handleDetailsScroll);
|
||||
}
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
watch(() => props.visible, (visible) => {
|
||||
if (visible) {
|
||||
nextTick(() => {
|
||||
bindDrawerScroll();
|
||||
maybeLoadMoreChunks();
|
||||
});
|
||||
} else {
|
||||
unbindDrawerScroll();
|
||||
}
|
||||
});
|
||||
watch(() => props.details?.id, () => {
|
||||
page = 1;
|
||||
loadingChunks = false;
|
||||
@@ -396,15 +426,16 @@ watch(() => props.details?.chunkLoading, (val) => {
|
||||
pendingRequestedPage = null;
|
||||
pendingChunksBeforeLoad = 0;
|
||||
loadingChunks = false;
|
||||
if (props.visible) {
|
||||
nextTick(() => maybeLoadMoreChunks());
|
||||
}
|
||||
}
|
||||
});
|
||||
onUnmounted(() => {
|
||||
window.removeEventListener('resize', onTraceDrawerWindowResize);
|
||||
cleanupTraceDrawerResize();
|
||||
cleanupMainDrawerResize();
|
||||
if (doc) {
|
||||
doc.removeEventListener('scroll', handleDetailsScroll);
|
||||
}
|
||||
unbindDrawerScroll();
|
||||
if (audioBlobUrl.value) {
|
||||
URL.revokeObjectURL(audioBlobUrl.value);
|
||||
}
|
||||
@@ -561,7 +592,10 @@ const loadAudioPreview = async () => {
|
||||
};
|
||||
const runMarkdownPostRenderPipeline = async () => {
|
||||
await nextTick();
|
||||
const renderRoot = mdContentWrap.value as ParentNode;
|
||||
const renderRoot = getMarkdownRenderRoot();
|
||||
if (!renderRoot) {
|
||||
return;
|
||||
}
|
||||
await hydrateProtectedFileImages(renderRoot);
|
||||
const images = renderRoot?.querySelectorAll?.('img.markdown-image') as NodeListOf<HTMLImageElement> | undefined;
|
||||
if (images) {
|
||||
@@ -576,26 +610,29 @@ const runMarkdownPostRenderPipeline = async () => {
|
||||
await renderMermaidDiagrams();
|
||||
};
|
||||
|
||||
watch(() => props.details.md, (newVal) => {
|
||||
watch(() => props.details.md, () => {
|
||||
runMarkdownPostRenderPipeline();
|
||||
}, { immediate: true, deep: true })
|
||||
}, { immediate: true, deep: true, flush: 'post' })
|
||||
|
||||
watch(() => viewMode.value, (mode) => {
|
||||
if ((mode === 'chunks' || mode === 'merged') && props.visible) {
|
||||
runMarkdownPostRenderPipeline();
|
||||
if (mode === 'chunks') {
|
||||
nextTick(() => maybeLoadMoreChunks());
|
||||
}
|
||||
}
|
||||
});
|
||||
}, { flush: 'post' });
|
||||
|
||||
watch(() => props.visible, (visible) => {
|
||||
if (visible && (viewMode.value === 'chunks' || viewMode.value === 'merged')) {
|
||||
runMarkdownPostRenderPipeline();
|
||||
}
|
||||
});
|
||||
}, { flush: 'post' });
|
||||
|
||||
// 渲染 Mermaid 图表的函数
|
||||
const renderMermaidDiagrams = async () => {
|
||||
try {
|
||||
const mermaidElements = mdContentWrap.value?.querySelectorAll('.mermaid');
|
||||
const mermaidElements = getMarkdownRenderRoot()?.querySelectorAll('.mermaid');
|
||||
console.log('[Mermaid] Found mermaid elements:', mermaidElements?.length);
|
||||
if (mermaidElements && mermaidElements.length > 0) {
|
||||
await mermaid.run({
|
||||
@@ -624,12 +661,13 @@ const handleMermaidClick = (e: Event) => {
|
||||
|
||||
// 为 Mermaid 容器绑定点击全屏事件(绑定在 div 上,不是 SVG 上)
|
||||
const bindMermaidClickEvents = () => {
|
||||
if (!mdContentWrap.value) {
|
||||
console.log('[Mermaid] mdContentWrap is null');
|
||||
const renderRoot = getMarkdownRenderRoot();
|
||||
if (!renderRoot) {
|
||||
console.log('[Mermaid] markdown render root is null');
|
||||
return;
|
||||
}
|
||||
// 绑定在 .mermaid div 上,而不是 SVG 上
|
||||
const mermaidDivs = mdContentWrap.value.querySelectorAll('.mermaid');
|
||||
const mermaidDivs = renderRoot.querySelectorAll('.mermaid');
|
||||
console.log('[Mermaid] Found mermaid divs:', mermaidDivs.length);
|
||||
mermaidDivs.forEach((div, index) => {
|
||||
const divEl = div as HTMLElement;
|
||||
@@ -663,6 +701,9 @@ const processMarkdown = (markdownText) => {
|
||||
// 处理被 <p> 包裹的表格行,转换为正常的表格行,并在前后补空行
|
||||
processedText = processedText.replace(/<p>\s*(\|[\s\S]*?\|)\s*<\/p>/gi, '\n$1\n');
|
||||
|
||||
// MarkItDown 常在表格前插入空行 + 分隔行,渲染会出现多余空行
|
||||
processedText = normalizeSpuriousTablePrefixes(processedText);
|
||||
|
||||
// 保留表格单元格中的 <br>,不转成换行,避免打散表格;其他区域原样交给 marked 处理
|
||||
|
||||
// 先预处理数学定界符,再做安全预处理
|
||||
@@ -683,7 +724,8 @@ const processMarkdown = (markdownText) => {
|
||||
};
|
||||
const handleClose = () => {
|
||||
emit("closeDoc", false);
|
||||
if (doc) doc.scrollTop = 0;
|
||||
const scrollEl = docScrollEl || findDocDrawerScrollEl();
|
||||
if (scrollEl) scrollEl.scrollTop = 0;
|
||||
viewMode.value = 'merged';
|
||||
};
|
||||
|
||||
@@ -973,19 +1015,41 @@ const downloadFile = () => {
|
||||
MessagePlugin.error(t('file.downloadFailed'));
|
||||
});
|
||||
};
|
||||
/**
 * Ask the parent for the next page of chunks via the `getDoc` event.
 * Does nothing while a load is in flight, when all chunks are already loaded,
 * or when the next page would exceed the page count derived from `total`.
 */
const requestNextChunkPage = () => {
  const busy = loadingChunks || props.details?.chunkLoading;
  if (busy) return;

  const totalChunks = props.details?.total ?? 0;
  const loadedChunks = props.details?.md?.length ?? 0;
  if (totalChunks === 0 || loadedChunks >= totalChunks) return;

  const lastPage = Math.ceil(totalChunks / CHUNK_PAGE_SIZE);
  const nextPage = page + 1;
  if (nextPage > lastPage) return;

  // Record the in-flight request before emitting.
  page = nextPage;
  loadingChunks = true;
  pendingRequestedPage = nextPage;
  pendingChunksBeforeLoad = loadedChunks;
  emit('getDoc', nextPage);
};
|
||||
|
||||
/**
 * Prefetch another page when the content does not overflow the drawer:
 * in that case no scroll event can fire, so loading is triggered here instead.
 */
const maybeLoadMoreChunks = () => {
  const idle = props.visible && !loadingChunks && !props.details?.chunkLoading;
  if (!idle) return;

  const scroller = docScrollEl ?? findDocDrawerScrollEl();
  if (!scroller) return;

  const loadedChunks = props.details?.md?.length ?? 0;
  const totalChunks = props.details?.total ?? 0;
  if (loadedChunks >= totalChunks) return;

  // 8px tolerance: content at most this much taller than the viewport still counts as not scrollable.
  const fitsWithoutScrolling = scroller.scrollHeight <= scroller.clientHeight + 8;
  if (fitsWithoutScrolling) {
    requestNextChunkPage();
  }
};
|
||||
|
||||
const handleDetailsScroll = () => {
|
||||
if (doc && !loadingChunks) {
|
||||
let pageNum = Math.ceil(props.details.total / 25);
|
||||
const { scrollTop, scrollHeight, clientHeight } = doc;
|
||||
if (scrollTop + clientHeight >= scrollHeight - 8) {
|
||||
if (props.details.md.length < props.details.total && page + 1 <= pageNum) {
|
||||
page++;
|
||||
loadingChunks = true;
|
||||
pendingRequestedPage = page;
|
||||
pendingChunksBeforeLoad = props.details.md.length;
|
||||
emit("getDoc", page);
|
||||
}
|
||||
}
|
||||
if (loadingChunks || props.details?.chunkLoading) return;
|
||||
const el = docScrollEl || findDocDrawerScrollEl();
|
||||
if (!el) return;
|
||||
const { scrollTop, scrollHeight, clientHeight } = el;
|
||||
if (scrollTop + clientHeight >= scrollHeight - 8) {
|
||||
requestNextChunkPage();
|
||||
}
|
||||
};
|
||||
</script>
|
||||
@@ -1052,6 +1116,7 @@ const handleDetailsScroll = () => {
|
||||
</div>
|
||||
</t-drawer>
|
||||
|
||||
<div ref="docMarkdownRoot" class="doc-markdown-root">
|
||||
<!-- URL类型专属区域(保留:source 是真实链接,不与标题重复) -->
|
||||
<div v-if="details.type === 'url'" class="url_box">
|
||||
<span class="label">{{ $t('knowledgeBase.urlSource') }}</span>
|
||||
@@ -1203,6 +1268,7 @@ const handleDetailsScroll = () => {
|
||||
<DocumentPreview :knowledgeId="details.id" :fileType="details.file_type" :fileName="details.title"
|
||||
:active="viewMode === 'preview'" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</t-drawer>
|
||||
</div>
|
||||
|
||||
@@ -951,6 +951,8 @@ export default {
|
||||
selfHostedEndpoint: 'Self-hosted Endpoint',
|
||||
formulaRecognition: 'Formula Recognition',
|
||||
tableRecognition: 'Table Recognition',
|
||||
sealRecognition: 'Seal Recognition',
|
||||
chartRecognition: 'Chart Recognition',
|
||||
language: 'Language',
|
||||
testConnection: 'Test Connection',
|
||||
saveConfig: 'Save Configuration',
|
||||
@@ -971,6 +973,9 @@ export default {
|
||||
serverUrl: 'Server URL',
|
||||
vlmServerUrlPlaceholder: 'e.g. http://your-vllm-server:8000',
|
||||
vlmServerUrlHint: 'Required when Backend is vlm-http-client or hybrid-http-client',
|
||||
paddleocrVlEndpointPlaceholder: 'e.g. http://your-paddleocr-vl:8080',
|
||||
paddleocrVlEndpointHint: 'Base URL of the full PaddleOCR-VL pipeline service; no /layout-parsing suffix needed',
|
||||
paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
|
||||
},
|
||||
storage: {
|
||||
title: 'Storage Engine',
|
||||
@@ -2814,6 +2819,9 @@ export default {
|
||||
max_owned_per_user: 'Max tenants owned per user',
|
||||
default_storage_quota_gb: 'Default storage quota for new tenants (GB)',
|
||||
},
|
||||
asynq: {
|
||||
concurrency: 'Async task worker concurrency',
|
||||
},
|
||||
},
|
||||
enumLabels: {
|
||||
auth: {
|
||||
@@ -4240,6 +4248,14 @@ export default {
|
||||
name: 'MinerU Cloud',
|
||||
desc: 'MinerU Cloud API',
|
||||
},
|
||||
paddleocr_vl: {
|
||||
name: 'PaddleOCR-VL',
|
||||
desc: 'PaddleOCR-VL self-hosted service',
|
||||
},
|
||||
paddleocr_vl_cloud: {
|
||||
name: 'PaddleOCR-VL Cloud',
|
||||
desc: 'PaddleOCR-VL Cloud API',
|
||||
},
|
||||
weknoracloud: {
|
||||
name: 'WeKnora Cloud',
|
||||
desc: 'Document parsing via WeKnora Cloud',
|
||||
|
||||
@@ -811,6 +811,8 @@ export default {
|
||||
selfHostedEndpoint: '자체 호스팅 엔드포인트',
|
||||
formulaRecognition: '수식 인식',
|
||||
tableRecognition: '표 인식',
|
||||
sealRecognition: '인장 인식',
|
||||
chartRecognition: '차트 인식',
|
||||
language: '언어',
|
||||
testConnection: '연결 테스트',
|
||||
saveConfig: '설정 저장',
|
||||
@@ -831,6 +833,9 @@ export default {
|
||||
serverUrl: '서버 URL',
|
||||
vlmServerUrlPlaceholder: '예: http://your-vllm-server:8000',
|
||||
vlmServerUrlHint: 'Backend가 vlm-http-client 또는 hybrid-http-client인 경우 필요',
|
||||
paddleocrVlEndpointPlaceholder: '예: http://your-paddleocr-vl:8080',
|
||||
paddleocrVlEndpointHint: 'PaddleOCR-VL 전체 서비스(pipeline) 주소를 입력하세요. /layout-parsing 접미사는 불필요합니다',
|
||||
paddleocrVlCloudTokenPlaceholder: 'PaddleOCR-VL AI Studio Token',
|
||||
},
|
||||
storage: {
|
||||
title: '스토리지 엔진',
|
||||
@@ -2053,6 +2058,9 @@ export default {
|
||||
max_owned_per_user: "사용자당 최대 테넌트 수",
|
||||
default_storage_quota_gb: "신규 테넌트 기본 저장 용량 (GB)",
|
||||
},
|
||||
asynq: {
|
||||
concurrency: "비동기 작업 워커 동시 처리 수",
|
||||
},
|
||||
},
|
||||
enumLabels: {
|
||||
auth: {
|
||||
@@ -4302,6 +4310,14 @@ export default {
|
||||
name: 'MinerU Cloud',
|
||||
desc: 'MinerU Cloud API',
|
||||
},
|
||||
paddleocr_vl: {
|
||||
name: 'PaddleOCR-VL',
|
||||
desc: 'PaddleOCR-VL 자체 호스팅 서비스',
|
||||
},
|
||||
paddleocr_vl_cloud: {
|
||||
name: 'PaddleOCR-VL Cloud',
|
||||
desc: 'PaddleOCR-VL Cloud API',
|
||||
},
|
||||
weknoracloud: {
|
||||
name: 'WeKnora Cloud',
|
||||
desc: 'WeKnora Cloud를 통한 문서 파싱',
|
||||
|
||||
@@ -866,6 +866,8 @@ export default {
|
||||
selfHostedEndpoint: 'Собственная конечная точка',
|
||||
formulaRecognition: 'Распознавание формул',
|
||||
tableRecognition: 'Распознавание таблиц',
|
||||
sealRecognition: 'Распознавание печатей',
|
||||
chartRecognition: 'Распознавание диаграмм',
|
||||
language: 'Язык',
|
||||
testConnection: 'Проверить с текущими параметрами',
|
||||
saveConfig: 'Сохранить конфигурацию',
|
||||
@@ -882,7 +884,10 @@ export default {
|
||||
languagePlaceholder: 'напр. ch, en, ja (по умолчанию ch)',
|
||||
mineruCloudApiKeyPlaceholder: 'MinerU Cloud API Key',
|
||||
vlmLabel: 'vlm (визуальная языковая модель)',
|
||||
mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)'
|
||||
mineruHtmlLabel: 'MinerU-HTML (HTML парсинг)',
|
||||
paddleocrVlEndpointPlaceholder: 'напр. http://your-paddleocr-vl:8080',
|
||||
paddleocrVlEndpointHint: 'Адрес полного сервиса PaddleOCR-VL (pipeline); суффикс /layout-parsing не требуется',
|
||||
paddleocrVlCloudTokenPlaceholder: 'Токен PaddleOCR-VL AI Studio'
|
||||
},
|
||||
storage: {
|
||||
title: 'Хранилище',
|
||||
@@ -1772,6 +1777,9 @@ export default {
|
||||
max_owned_per_user: 'Максимум тенантов на пользователя',
|
||||
default_storage_quota_gb: 'Квота хранилища для новых тенантов по умолчанию (ГБ)',
|
||||
},
|
||||
asynq: {
|
||||
concurrency: 'Параллелизм воркеров асинхронных задач',
|
||||
},
|
||||
},
|
||||
enumLabels: {
|
||||
auth: {
|
||||
@@ -3802,6 +3810,14 @@ export default {
|
||||
name: 'MinerU Cloud',
|
||||
desc: 'MinerU Cloud API',
|
||||
},
|
||||
paddleocr_vl: {
|
||||
name: 'PaddleOCR-VL',
|
||||
desc: 'Самостоятельно развёрнутый сервис PaddleOCR-VL',
|
||||
},
|
||||
paddleocr_vl_cloud: {
|
||||
name: 'PaddleOCR-VL Cloud',
|
||||
desc: 'PaddleOCR-VL Cloud API',
|
||||
},
|
||||
weknoracloud: {
|
||||
name: 'WeKnora Cloud',
|
||||
desc: 'Парсинг документов через WeKnora Cloud',
|
||||
|
||||
@@ -807,6 +807,8 @@ export default {
|
||||
selfHostedEndpoint: "自建端点",
|
||||
formulaRecognition: "公式识别",
|
||||
tableRecognition: "表格识别",
|
||||
sealRecognition: "印章识别",
|
||||
chartRecognition: "图表识别",
|
||||
language: "语言",
|
||||
testConnection: "测试连接",
|
||||
saveConfig: "保存配置",
|
||||
@@ -827,6 +829,9 @@ export default {
|
||||
serverUrl: "服务器地址",
|
||||
vlmServerUrlPlaceholder: "如 http://your-vllm-server:8000",
|
||||
vlmServerUrlHint: "当 Backend 选择 vlm-http-client 或 hybrid-http-client 时需要填写",
|
||||
paddleocrVlEndpointPlaceholder: "如 http://your-paddleocr-vl:8080",
|
||||
paddleocrVlEndpointHint: "填写 PaddleOCR-VL 完整服务(pipeline)地址,无需 /layout-parsing 后缀",
|
||||
paddleocrVlCloudTokenPlaceholder: "PaddleOCR-VL 飞桨星河社区 Token",
|
||||
},
|
||||
storage: {
|
||||
title: "存储引擎",
|
||||
@@ -2032,6 +2037,9 @@ export default {
|
||||
max_owned_per_user: "每用户最大租户数",
|
||||
default_storage_quota_gb: "新租户默认存储配额 (GB)",
|
||||
},
|
||||
asynq: {
|
||||
concurrency: "异步任务并发数",
|
||||
},
|
||||
},
|
||||
enumLabels: {
|
||||
auth: {
|
||||
@@ -4234,6 +4242,14 @@ export default {
|
||||
name: "MinerU Cloud",
|
||||
desc: "MinerU Cloud API",
|
||||
},
|
||||
paddleocr_vl: {
|
||||
name: "PaddleOCR-VL",
|
||||
desc: "PaddleOCR-VL 自部署服务",
|
||||
},
|
||||
paddleocr_vl_cloud: {
|
||||
name: "PaddleOCR-VL Cloud",
|
||||
desc: "PaddleOCR-VL 云 API",
|
||||
},
|
||||
weknoracloud: {
|
||||
name: "WeKnora Cloud",
|
||||
desc: "使用 WeKnora Cloud 进行文档解析",
|
||||
|
||||
72
frontend/src/utils/markdownTableNormalize.ts
Normal file
72
frontend/src/utils/markdownTableNormalize.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
/**
 * Matches one cell of a GFM delimiter row: `-`, `---`, `:---`, `---:`, `:---:`.
 *
 * GFM only requires a single hyphen per delimiter cell, so short forms such as
 * `:-:` must match as well. Requiring three hyphens made a valid short
 * delimiter row look like data, which caused a second separator row to be
 * injected right after the header.
 */
const SEPARATOR_CELL = /^:?-+:?$/;
|
||||
|
||||
/**
 * Splits a pipe-delimited row into trimmed cell texts.
 *
 * Returns an empty array when the trimmed line does not start with `|`.
 * The blank segment before the leading pipe is dropped, and a blank trailing
 * segment (produced by a closing pipe) is dropped as well.
 */
function splitRowCells(line: string): string[] {
  const text = line.trim();
  if (text[0] !== '|') {
    return [];
  }
  const segments = text.split('|').map((segment) => segment.trim());
  const first = segments.length > 0 && segments[0] === '' ? 1 : 0;
  const remaining = segments.slice(first);
  const hasBlankTail = remaining.length > 0 && remaining[remaining.length - 1] === '';
  return hasBlankTail ? remaining.slice(0, remaining.length - 1) : remaining;
}
|
||||
|
||||
/**
 * Reports whether a line looks like a table row: after trimming it must start
 * with `|` and contain at least one more `|` later in the line.
 */
function isTableRow(line: string): boolean {
  const text = line.trim();
  if (!text.startsWith('|')) {
    return false;
  }
  return text.indexOf('|', 1) !== -1;
}
|
||||
|
||||
/**
 * Reports whether every cell of the row matches SEPARATOR_CELL.
 * A line that yields no cells is never a separator row.
 */
function isSeparatorRow(line: string): boolean {
  const cells = splitRowCells(line);
  if (cells.length === 0) {
    return false;
  }
  return !cells.some((cell) => !SEPARATOR_CELL.test(cell));
}
|
||||
|
||||
/**
 * Reports whether the row has at least one cell and all of its cells are
 * empty after trimming.
 */
function isEmptyRow(line: string): boolean {
  const cells = splitRowCells(line);
  if (cells.length === 0) {
    return false;
  }
  return !cells.some((cell) => cell !== '');
}
|
||||
|
||||
/**
 * Builds a `| --- | --- |` separator row with one `---` cell per cell of the
 * given header line.
 */
function separatorRowFor(headerLine: string): string {
  const columnCount = splitRowCells(headerLine).length;
  const dashes = new Array<string>(columnCount).fill('---');
  return '| ' + dashes.join(' | ') + ' |';
}
|
||||
|
||||
/**
 * Normalizes one run of consecutive table rows.
 *
 * Leading all-empty rows are dropped, then at most one leading separator row.
 * If at least two rows remain and the second one is not a separator, a
 * generated separator row is inserted after the first row. The input array is
 * not modified; a new array is returned.
 */
function normalizeTableBlock(block: string[]): string[] {
  let start = 0;
  while (start < block.length && isEmptyRow(block[start])) {
    start += 1;
  }
  if (start < block.length && isSeparatorRow(block[start])) {
    start += 1;
  }

  const rows = block.slice(start);
  if (rows.length < 2 || isSeparatorRow(rows[1])) {
    return rows;
  }

  const [header, ...body] = rows;
  return [header, separatorRowFor(header), ...body];
}
|
||||
|
||||
/**
 * Fix MarkItDown-style tables: empty row + separator before real rows.
 *
 * The content is scanned line by line. Each run of consecutive table rows is
 * passed through normalizeTableBlock; every other line is copied unchanged.
 *
 * Lines inside fenced code blocks (``` or ~~~) are copied verbatim, so
 * pipe-prefixed code samples are never rewritten. A fence closes on a line that
 * consists only of the same fence character, repeated at least as many times
 * as in the opening fence. An unclosed fence extends to the end of the content.
 *
 * @param content Markdown text using `\n` line breaks.
 * @returns The text with table blocks outside code fences normalized.
 */
export function normalizeSpuriousTablePrefixes(content: string): string {
  // Opening/closing fence marker: up to three spaces of indent, then 3+ backticks or tildes.
  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const lines = content.split('\n');
  const out: string[] = [];
  let openFence = '';
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    const fenceMatch = fencePattern.exec(line);
    const marker = fenceMatch ? fenceMatch[1] : '';

    if (openFence !== '') {
      // Inside a fenced block: copy as-is and watch for the closing fence.
      out.push(line);
      const closes =
        marker !== '' &&
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        line.trim() === marker;
      if (closes) {
        openFence = '';
      }
      i += 1;
      continue;
    }

    if (marker !== '') {
      openFence = marker;
      out.push(line);
      i += 1;
      continue;
    }

    if (!isTableRow(line)) {
      out.push(line);
      i += 1;
      continue;
    }

    // Collect the whole run of table rows. A fence line never starts with '|',
    // so the run cannot swallow a fence opener.
    const block: string[] = [];
    while (i < lines.length && isTableRow(lines[i])) {
      block.push(lines[i]);
      i += 1;
    }
    out.push(...normalizeTableBlock(block));
  }
  return out.join('\n');
}
|
||||
@@ -311,6 +311,60 @@
|
||||
/>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Section 3 — paddleocr_vl 自建配置 -->
|
||||
<section v-if="currentEngine.Name === 'paddleocr_vl'" class="setting-drawer__section">
|
||||
<h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
|
||||
|
||||
<div class="form-item">
|
||||
<label class="form-label required">{{ t('settings.parser.selfHostedEndpoint') }}</label>
|
||||
<t-input
|
||||
v-model="config.paddleocr_vl_endpoint"
|
||||
:placeholder="$t('settings.parser.paddleocrVlEndpointPlaceholder')"
|
||||
clearable
|
||||
/>
|
||||
<p class="form-desc">{{ $t('settings.parser.paddleocrVlEndpointHint') }}</p>
|
||||
</div>
|
||||
<div class="form-item">
|
||||
<label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
|
||||
<div class="form-toggles">
|
||||
<t-checkbox v-model="config.paddleocr_vl_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
|
||||
<t-checkbox v-model="config.paddleocr_vl_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Section 3 — paddleocr_vl_cloud 云 API 配置 -->
|
||||
<section v-if="currentEngine.Name === 'paddleocr_vl_cloud'" class="setting-drawer__section">
|
||||
<h4 class="setting-drawer__section-title">{{ $t('settings.parser.configSection', '配置') }}</h4>
|
||||
|
||||
<div class="form-item">
|
||||
<label class="form-label required">Token</label>
|
||||
<t-input
|
||||
v-model="config.paddleocr_vl_cloud_token"
|
||||
type="password"
|
||||
:placeholder="$t('settings.parser.paddleocrVlCloudTokenPlaceholder')"
|
||||
clearable
|
||||
>
|
||||
<template #prefix-icon><t-icon name="lock-on" /></template>
|
||||
</t-input>
|
||||
</div>
|
||||
<div class="form-item">
|
||||
<label class="form-label">Model</label>
|
||||
<t-input
|
||||
v-model="config.paddleocr_vl_cloud_model"
|
||||
placeholder="PaddleOCR-VL-1.6"
|
||||
clearable
|
||||
/>
|
||||
</div>
|
||||
<div class="form-item">
|
||||
<label class="form-label">{{ $t('settings.parser.featuresLabel', '识别选项') }}</label>
|
||||
<div class="form-toggles">
|
||||
<t-checkbox v-model="config.paddleocr_vl_cloud_use_seal_recognition">{{ $t('settings.parser.sealRecognition') }}</t-checkbox>
|
||||
<t-checkbox v-model="config.paddleocr_vl_cloud_use_chart_recognition">{{ $t('settings.parser.chartRecognition') }}</t-checkbox>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</SettingDrawer>
|
||||
</div>
|
||||
@@ -336,7 +390,7 @@ const { t } = useI18n()
|
||||
const uiStore = useUIStore()
|
||||
const authStore = useAuthStore()
|
||||
|
||||
const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud'])
|
||||
const CONFIGURABLE_ENGINES = new Set(['mineru', 'mineru_cloud', 'paddleocr_vl', 'paddleocr_vl_cloud'])
|
||||
|
||||
/** 各解析引擎的项目/官方文档地址 */
|
||||
const ENGINE_DOC_LINKS: Record<string, string> = {
|
||||
@@ -344,6 +398,8 @@ const ENGINE_DOC_LINKS: Record<string, string> = {
|
||||
markitdown: 'https://github.com/microsoft/markitdown',
|
||||
mineru: 'https://github.com/opendatalab/MinerU',
|
||||
mineru_cloud: 'https://mineru.net/apiManage/docs',
|
||||
paddleocr_vl: 'https://github.com/PaddlePaddle/PaddleOCR',
|
||||
paddleocr_vl_cloud: 'https://aistudio.baidu.com/paddleocr',
|
||||
}
|
||||
|
||||
/** 解析引擎配置默认值(与 DocReader/Python 侧一致) */
|
||||
@@ -363,6 +419,13 @@ const DEFAULT_PARSER_CONFIG: ParserEngineConfig = {
|
||||
mineru_cloud_enable_table: true,
|
||||
mineru_cloud_enable_ocr: true,
|
||||
mineru_cloud_language: 'ch',
|
||||
paddleocr_vl_endpoint: '',
|
||||
paddleocr_vl_use_seal_recognition: true,
|
||||
paddleocr_vl_use_chart_recognition: false,
|
||||
paddleocr_vl_cloud_token: '',
|
||||
paddleocr_vl_cloud_model: 'PaddleOCR-VL-1.6',
|
||||
paddleocr_vl_cloud_use_seal_recognition: true,
|
||||
paddleocr_vl_cloud_use_chart_recognition: false,
|
||||
}
|
||||
|
||||
const engines = ref<ParserEngineInfo[]>([])
|
||||
@@ -407,6 +470,8 @@ const ENGINE_ORDER: Record<string, number> = {
|
||||
markitdown: 3,
|
||||
mineru: 4,
|
||||
mineru_cloud: 5,
|
||||
paddleocr_vl: 6,
|
||||
paddleocr_vl_cloud: 7,
|
||||
}
|
||||
|
||||
const sortedEngines = computed(() => {
|
||||
@@ -491,6 +556,13 @@ async function loadConfig() {
|
||||
mineru_cloud_enable_table: data?.mineru_cloud_enable_table ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_table ?? true,
|
||||
mineru_cloud_enable_ocr: data?.mineru_cloud_enable_ocr ?? DEFAULT_PARSER_CONFIG.mineru_cloud_enable_ocr ?? true,
|
||||
mineru_cloud_language: data?.mineru_cloud_language ?? DEFAULT_PARSER_CONFIG.mineru_cloud_language ?? 'ch',
|
||||
paddleocr_vl_endpoint: data?.paddleocr_vl_endpoint ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_endpoint ?? '',
|
||||
paddleocr_vl_use_seal_recognition: data?.paddleocr_vl_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_seal_recognition ?? true,
|
||||
paddleocr_vl_use_chart_recognition: data?.paddleocr_vl_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_use_chart_recognition ?? false,
|
||||
paddleocr_vl_cloud_token: data?.paddleocr_vl_cloud_token ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_token ?? '',
|
||||
paddleocr_vl_cloud_model: data?.paddleocr_vl_cloud_model ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_model ?? 'PaddleOCR-VL-1.6',
|
||||
paddleocr_vl_cloud_use_seal_recognition: data?.paddleocr_vl_cloud_use_seal_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_seal_recognition ?? true,
|
||||
paddleocr_vl_cloud_use_chart_recognition: data?.paddleocr_vl_cloud_use_chart_recognition ?? DEFAULT_PARSER_CONFIG.paddleocr_vl_cloud_use_chart_recognition ?? false,
|
||||
}
|
||||
} catch {
|
||||
config.value = { ...DEFAULT_PARSER_CONFIG }
|
||||
@@ -521,6 +593,13 @@ function buildConfigPayload(): ParserEngineConfig {
|
||||
mineru_cloud_enable_table: config.value.mineru_cloud_enable_table,
|
||||
mineru_cloud_enable_ocr: config.value.mineru_cloud_enable_ocr,
|
||||
mineru_cloud_language: config.value.mineru_cloud_language?.trim() ?? '',
|
||||
paddleocr_vl_endpoint: config.value.paddleocr_vl_endpoint?.trim() ?? '',
|
||||
paddleocr_vl_use_seal_recognition: config.value.paddleocr_vl_use_seal_recognition,
|
||||
paddleocr_vl_use_chart_recognition: config.value.paddleocr_vl_use_chart_recognition,
|
||||
paddleocr_vl_cloud_token: config.value.paddleocr_vl_cloud_token?.trim() ?? '',
|
||||
paddleocr_vl_cloud_model: config.value.paddleocr_vl_cloud_model?.trim() ?? '',
|
||||
paddleocr_vl_cloud_use_seal_recognition: config.value.paddleocr_vl_cloud_use_seal_recognition,
|
||||
paddleocr_vl_cloud_use_chart_recognition: config.value.paddleocr_vl_cloud_use_chart_recognition,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -739,7 +818,9 @@ onMounted(loadAll)
|
||||
color: #0089FF;
|
||||
}
|
||||
.engine-card--mineru .engine-card__badge,
|
||||
.engine-card--mineru_cloud .engine-card__badge {
|
||||
.engine-card--mineru_cloud .engine-card__badge,
|
||||
.engine-card--paddleocr_vl .engine-card__badge,
|
||||
.engine-card--paddleocr_vl_cloud .engine-card__badge {
|
||||
background: rgba(98, 53, 187, 0.12);
|
||||
color: #6235BB;
|
||||
}
|
||||
@@ -1086,7 +1167,9 @@ onMounted(loadAll)
|
||||
color: #0089FF;
|
||||
}
|
||||
.parser-engine-drawer--mineru .setting-drawer__header-icon,
|
||||
.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon {
|
||||
.parser-engine-drawer--mineru_cloud .setting-drawer__header-icon,
|
||||
.parser-engine-drawer--paddleocr_vl .setting-drawer__header-icon,
|
||||
.parser-engine-drawer--paddleocr_vl_cloud .setting-drawer__header-icon {
|
||||
background: rgba(98, 53, 187, 0.12);
|
||||
color: #6235BB;
|
||||
}
|
||||
|
||||
@@ -3256,6 +3256,10 @@ func (s *knowledgeService) resolveDocReader(ctx context.Context, engine, fileTyp
|
||||
return docparser.NewMinerUReader(overrides)
|
||||
case "mineru_cloud":
|
||||
return docparser.NewMinerUCloudReader(overrides)
|
||||
case "paddleocr_vl":
|
||||
return docparser.NewPaddleOCRVLReader(overrides)
|
||||
case "paddleocr_vl_cloud":
|
||||
return docparser.NewPaddleOCRVLCloudReader(overrides)
|
||||
case "builtin":
|
||||
// 明确指定使用 builtin 引擎(docreader),不使用 simple format 兜底
|
||||
return s.documentReader
|
||||
|
||||
@@ -87,6 +87,10 @@ type settingSpec struct {
|
||||
// Description is shown in the UI under the key. Stored on the row
|
||||
// at first write (mirrors Category).
|
||||
Description string
|
||||
// RequiresRestart marks keys whose value is bound at process startup
|
||||
// (e.g. asynq worker pool size). The UI shows a restart badge; the
|
||||
// service persists the flag on first write.
|
||||
RequiresRestart bool
|
||||
}
|
||||
|
||||
// registry pins the set of legal keys. Expanding it is a deliberate,
|
||||
@@ -161,6 +165,20 @@ var registry = map[string]settingSpec{
|
||||
"仅在创建时读取,修改后只对之后新建的租户生效,不会回写已存在的租户。" +
|
||||
"0 或负数表示使用内置默认值 10GB。",
|
||||
},
|
||||
// asynq.concurrency is the asynq worker pool size (parallel in-flight
|
||||
// tasks). Read once when the asynq server starts — changing it in the
|
||||
// UI requires a process restart to take effect. Mirrors
|
||||
// WEKNORA_ASYNQ_CONCURRENCY (default 16).
|
||||
"asynq.concurrency": {
|
||||
Type: "int",
|
||||
EnvName: "WEKNORA_ASYNQ_CONCURRENCY",
|
||||
Default: int64(16),
|
||||
Category: "worker",
|
||||
RequiresRestart: true,
|
||||
Description: "异步任务 worker 并发数(asynq 线程池大小)。" +
|
||||
"文档解析、嵌入等任务多为 I/O 等待,适当提高可缩短批量上传排队时间。" +
|
||||
"修改后需重启服务进程方可生效。",
|
||||
},
|
||||
}
|
||||
|
||||
// systemSettingService wires the repository, audit log, and (P2)
|
||||
@@ -655,7 +673,7 @@ func (s *systemSettingService) virtualSetting(key string, spec settingSpec) *typ
|
||||
Category: category,
|
||||
Description: spec.Description,
|
||||
IsSecret: false,
|
||||
RequiresRestart: false,
|
||||
RequiresRestart: spec.RequiresRestart,
|
||||
LastModifiedBy: "",
|
||||
Enum: spec.Enum,
|
||||
}
|
||||
@@ -817,6 +835,7 @@ func (s *systemSettingService) Update(ctx context.Context, key string, rawValue
|
||||
category = "general"
|
||||
}
|
||||
description = spec.Description
|
||||
requiresRestart = spec.RequiresRestart
|
||||
}
|
||||
|
||||
row := &types.SystemSetting{
|
||||
@@ -1142,6 +1161,14 @@ func encodeForType(declared string, rawValue any) (types.JSON, error) {
|
||||
// 400 body verbatim).
|
||||
func validateRegistryEntry(key string, rawValue any) error {
|
||||
switch key {
|
||||
case "asynq.concurrency":
|
||||
n, err := coerceToPositiveInt64(rawValue)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n <= 0 {
|
||||
return errors.New("concurrency must be a positive integer")
|
||||
}
|
||||
case "ssrf.whitelist":
|
||||
// Coerce into the same shape encodeForType produced. We don't
|
||||
// look at the encoded JSON because that's already canonicalised
|
||||
@@ -1155,6 +1182,23 @@ func validateRegistryEntry(key string, rawValue any) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// coerceToPositiveInt64 converts a JSON-decoded numeric value to int64.
//
// Accepted dynamic types are int, int64 and float64; a float64 is accepted
// only when it has no fractional part. Any other type yields an error naming
// the offending type. Despite its name, this function does not check the
// sign of the value: the caller must reject zero or negative results.
func coerceToPositiveInt64(rawValue any) (int64, error) {
	if f, isFloat := rawValue.(float64); isFloat {
		whole := int64(f)
		if float64(whole) != f {
			return 0, errors.New("expected integer value")
		}
		return whole, nil
	}
	if i, isInt := rawValue.(int); isInt {
		return int64(i), nil
	}
	if i, isInt64 := rawValue.(int64); isInt64 {
		return i, nil
	}
	return 0, fmt.Errorf("expected integer, got %T", rawValue)
}
|
||||
|
||||
// coerceToStringSlice mirrors the input shapes accepted by
|
||||
// encodeForType for "string_list": []any of strings, []string, or a
|
||||
// comma-separated string. Returns the trimmed, empty-stripped result.
|
||||
|
||||
@@ -36,12 +36,22 @@ var defaultHeaderHooks = []headerTrackerHook{
|
||||
// tableRowPattern matches a single Markdown table row: "| cell | cell | ... |\n"
|
||||
var tableRowPattern = regexp.MustCompile(`(?m)^\s*(?:\|[^|\n]*)+\|\s*$`)
|
||||
|
||||
// markdownTableHookPriority matches DEFAULT_CONFIGS / defaultHeaderHooks table hook.
|
||||
const markdownTableHookPriority = 15
|
||||
|
||||
// headerTracker maintains the state of active headers across split units.
|
||||
type headerTracker struct {
|
||||
hooks []headerTrackerHook
|
||||
activeHeaders map[int]string // priority -> header text
|
||||
endedHeaders map[int]bool // priorities that have been ended
|
||||
pendingExtend map[int]bool // headers with empty column names awaiting first data row
|
||||
// pendingTableBreak is set when a table row unit ends with a paragraph break
|
||||
// (the blank line between tables is consumed by \n\n splitting). The header
|
||||
// stays active until the next unit is seen so we can detect a new table.
|
||||
pendingTableBreak bool
|
||||
// headerEndedThisUnit tells mergeUnits to flush before the current unit when a
|
||||
// new table starts (column mismatch or pendingTableBreak + table row).
|
||||
headerEndedThisUnit bool
|
||||
}
|
||||
|
||||
func newHeaderTracker() *headerTracker {
|
||||
@@ -55,6 +65,20 @@ func newHeaderTracker() *headerTracker {
|
||||
|
||||
// update checks split text for header start/end markers and updates internal state.
|
||||
func (ht *headerTracker) update(split string) {
|
||||
ht.headerEndedThisUnit = false
|
||||
|
||||
if ht.pendingTableBreak {
|
||||
ht.pendingTableBreak = false
|
||||
if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
|
||||
if firstTableRowColumnCount(split) > 0 {
|
||||
ht.clearTableHeader()
|
||||
ht.headerEndedThisUnit = true
|
||||
} else {
|
||||
ht.clearTableHeader()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 1. Check for header-end markers among currently active headers
|
||||
for _, hook := range ht.hooks {
|
||||
if _, active := ht.activeHeaders[hook.priority]; active {
|
||||
@@ -66,6 +90,19 @@ func (ht *headerTracker) update(split string) {
|
||||
}
|
||||
}
|
||||
|
||||
// 1b. Paragraph splits consume the blank line between tables. Mark a break
|
||||
// after "| last row |\n\n" and resolve on the next unit; also end when a new
|
||||
// table row has a different column count than the active header.
|
||||
if _, active := ht.activeHeaders[markdownTableHookPriority]; active {
|
||||
if !ht.pendingExtend[markdownTableHookPriority] {
|
||||
if splitEndsWithParagraphBreak(split) {
|
||||
ht.pendingTableBreak = true
|
||||
} else {
|
||||
ht.endTableHeaderOnColumnMismatch(split)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. If a header has an empty column-name row (e.g. "||"), replace it with
|
||||
// a proper Markdown table header using the first data row as column names.
|
||||
//
|
||||
@@ -159,3 +196,73 @@ func extractSeparatorLine(header string) string {
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (ht *headerTracker) clearTableHeader() {
|
||||
ht.endedHeaders[markdownTableHookPriority] = true
|
||||
delete(ht.activeHeaders, markdownTableHookPriority)
|
||||
delete(ht.pendingExtend, markdownTableHookPriority)
|
||||
}
|
||||
|
||||
func (ht *headerTracker) endTableHeaderOnColumnMismatch(split string) {
|
||||
header, ok := ht.activeHeaders[markdownTableHookPriority]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
rowCols := firstTableRowColumnCount(split)
|
||||
headerCols := headerTableColumnCount(header)
|
||||
if rowCols > 0 && headerCols > 0 && rowCols != headerCols {
|
||||
ht.clearTableHeader()
|
||||
ht.headerEndedThisUnit = true
|
||||
}
|
||||
}
|
||||
|
||||
// splitEndsWithParagraphBreak reports whether split ends with a blank line
// ("\n\n" or "\r\n\r\n"), ignoring trailing spaces, tabs and carriage returns.
func splitEndsWithParagraphBreak(split string) bool {
	body := strings.TrimRight(split, " \t\r")
	for _, suffix := range []string{"\n\n", "\r\n\r\n"} {
		if strings.HasSuffix(body, suffix) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// tableRowColumnCount returns the number of cells in a pipe-delimited row.
// It returns 0 when the trimmed line does not start with "|". The blank
// segment before the leading pipe and a blank segment after a trailing pipe
// are not counted.
func tableRowColumnCount(line string) int {
	row := strings.TrimSpace(line)
	if len(row) == 0 || row[0] != '|' {
		return 0
	}

	cells := strings.Split(row, "|")
	lo, hi := 0, len(cells)
	if hi > lo && strings.TrimSpace(cells[lo]) == "" {
		lo++
	}
	if hi > lo && strings.TrimSpace(cells[hi-1]) == "" {
		hi--
	}
	return hi - lo
}
|
||||
|
||||
func firstTableRowColumnCount(text string) int {
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" && tableRowPattern.MatchString(line) {
|
||||
return tableRowColumnCount(line)
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func headerTableColumnCount(header string) int {
|
||||
for _, line := range strings.Split(header, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.Contains(line, "---") {
|
||||
continue
|
||||
}
|
||||
if n := tableRowColumnCount(line); n > 0 {
|
||||
return n
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// headerColumnMismatch reports whether the next split unit starts a new table
|
||||
// whose width differs from the active markdown table header.
|
||||
func headerColumnMismatch(headers, nextUnit string) bool {
|
||||
headerCols := headerTableColumnCount(headers)
|
||||
rowCols := firstTableRowColumnCount(nextUnit)
|
||||
return headerCols > 0 && rowCols > 0 && headerCols != rowCols
|
||||
}
|
||||
|
||||
@@ -450,6 +450,13 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {
|
||||
|
||||
// Update header tracking
|
||||
ht.update(u.text)
|
||||
// Flush at table boundary so the next table is not merged into a chunk
|
||||
// that still carries the previous table's prepended header context.
|
||||
if ht.headerEndedThisUnit && len(current) > 0 {
|
||||
chunks = append(chunks, buildChunk(current, len(chunks)))
|
||||
current = nil
|
||||
curLen = 0
|
||||
}
|
||||
headers := ht.getHeaders()
|
||||
headersLen := runeLen(headers)
|
||||
if headersLen > chunkSize {
|
||||
@@ -475,7 +482,8 @@ func mergeUnits(units []splitUnit, chunkSize, chunkOverlap int) []Chunk {
|
||||
// Prepend headers if the column-name context is not already present
|
||||
// in the overlap or the next unit being added.
|
||||
overlapText := unitsText(current)
|
||||
if !headerAlreadyPresent(headers, overlapText, u.text) {
|
||||
if !headerAlreadyPresent(headers, overlapText, u.text) &&
|
||||
!headerColumnMismatch(headers, u.text) {
|
||||
startPos := u.start
|
||||
if len(current) > 0 {
|
||||
startPos = current[0].start
|
||||
|
||||
@@ -674,6 +674,66 @@ func TestSplitText_EmptyHeaderRowPrepend(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeaderTracker_ColumnMismatchEndsTable(t *testing.T) {
|
||||
ht := newHeaderTracker()
|
||||
ht.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")
|
||||
if ht.getHeaders() == "" {
|
||||
t.Fatal("expected active table header")
|
||||
}
|
||||
ht.update("| Sinple | Table |\n")
|
||||
if h := ht.getHeaders(); h != "" {
|
||||
t.Fatalf("2-col row should end 4-col table header, still active:\n%s", h)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeaderTracker_ParagraphBreakEndsOnNextUnit(t *testing.T) {
|
||||
ht := newHeaderTracker()
|
||||
ht.update("| Name | Game | Fame | Blame |\n| --- | --- | --- | --- |\n")
|
||||
ht.update("| Russell Wilson | Football | High | Tacky uniform |\n\n")
|
||||
if h := ht.getHeaders(); h == "" {
|
||||
t.Fatal("paragraph break alone should not clear header yet")
|
||||
}
|
||||
if !ht.pendingTableBreak {
|
||||
t.Fatal("expected pendingTableBreak after row ending with \\n\\n")
|
||||
}
|
||||
ht.update("| Sinple | Table |\n")
|
||||
if h := ht.getHeaders(); h != "" {
|
||||
t.Fatalf("next table row should clear previous header, got %q", h)
|
||||
}
|
||||
if !ht.headerEndedThisUnit {
|
||||
t.Fatal("expected flush signal when new table starts after paragraph break")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitText_EnTablesNoCrossTableHeader(t *testing.T) {
|
||||
text := "## A table, with and without a header row\n\n" +
|
||||
"| Name | Game | Fame | Blame |\n" +
|
||||
"| --- | --- | --- | --- |\n" +
|
||||
"| Lebron James | Basketball | Very High | Leaving Cleveland |\n" +
|
||||
"| Ryan Braun | Baseball | Moderate | Steroids |\n" +
|
||||
"| Russell Wilson | Football | High | Tacky uniform |\n\n" +
|
||||
"| Sinple | Table |\n" +
|
||||
"| Without | Header |\n\n" +
|
||||
"| Simple Multiparagraph | Table Full |\n" +
|
||||
"| Of Paragraphs | In each Cell. |\n"
|
||||
|
||||
cfg := SplitterConfig{ChunkSize: 200, ChunkOverlap: 20, Separators: []string{"\n\n", "\n", "。"}}
|
||||
chunks := SplitText(text, cfg)
|
||||
if len(chunks) < 2 {
|
||||
t.Fatalf("expected multiple chunks, got %d", len(chunks))
|
||||
}
|
||||
|
||||
for i, c := range chunks {
|
||||
hasSinple := strings.Contains(c.Content, "| Sinple | Table |")
|
||||
hasSimple := strings.Contains(c.Content, "| Simple Multiparagraph |")
|
||||
if hasSinple || hasSimple {
|
||||
if strings.Contains(c.Content, "| Name | Game | Fame | Blame |") {
|
||||
t.Errorf("chunk[%d] must not carry table-1 header into later tables:\n%s", i, c.Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitText_MultipleTablesInDocument(t *testing.T) {
|
||||
text := "" +
|
||||
"第一个表格:\n\n" +
|
||||
|
||||
@@ -30,6 +30,8 @@ func init() {
|
||||
RegisterEngine(&weKnoraCloudEngine{})
|
||||
RegisterEngine(&mineruEngine{})
|
||||
RegisterEngine(&mineruCloudEngine{})
|
||||
RegisterEngine(&paddleOCRVLEngine{})
|
||||
RegisterEngine(&paddleOCRVLCloudEngine{})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -133,6 +135,44 @@ func (e *mineruCloudEngine) CheckAvailable(_ bool, overrides map[string]string)
|
||||
return PingMinerUCloud(apiKey)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// paddleocr_vl — Go-native, calls a self-hosted PaddleOCR-VL pipeline service
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type paddleOCRVLEngine struct{}
|
||||
|
||||
func (e *paddleOCRVLEngine) Name() string { return "paddleocr_vl" }
|
||||
func (e *paddleOCRVLEngine) Description() string { return "PaddleOCR-VL self-hosted service" }
|
||||
func (e *paddleOCRVLEngine) FileTypes(_ bool) []string {
|
||||
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
|
||||
}
|
||||
func (e *paddleOCRVLEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
||||
endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"])
|
||||
if endpoint == "" {
|
||||
return false, "PaddleOCR-VL service not configured"
|
||||
}
|
||||
return PingPaddleOCRVL(endpoint)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// paddleocr_vl_cloud — Go-native, calls the PaddleOCR-VL AI Studio cloud API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type paddleOCRVLCloudEngine struct{}
|
||||
|
||||
func (e *paddleOCRVLCloudEngine) Name() string { return "paddleocr_vl_cloud" }
|
||||
func (e *paddleOCRVLCloudEngine) Description() string { return "PaddleOCR-VL Cloud API" }
|
||||
func (e *paddleOCRVLCloudEngine) FileTypes(_ bool) []string {
|
||||
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
|
||||
}
|
||||
func (e *paddleOCRVLCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
||||
token := strings.TrimSpace(overrides["paddleocr_vl_cloud_token"])
|
||||
if token == "" {
|
||||
return false, "PaddleOCR-VL Cloud Token not configured"
|
||||
}
|
||||
return PingPaddleOCRVLCloud(token)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ListAllEngines — merge local + remote
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -0,0 +1,353 @@
|
||||
package docparser
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Tencent/WeKnora/internal/logger"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
"github.com/Tencent/WeKnora/internal/utils"
|
||||
)
|
||||
|
||||
const (
|
||||
paddleOCRVLCloudDefaultBaseURL = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
|
||||
paddleOCRVLCloudDefaultModel = "PaddleOCR-VL-1.6"
|
||||
paddleOCRVLCloudPollInterval = 5 * time.Second
|
||||
paddleOCRVLCloudTimeout = 600 * time.Second
|
||||
)
|
||||
|
||||
// PaddleOCRVLCloudReader parses documents through the PaddleOCR-VL AI Studio
// cloud API.
//
// Flow: POST /jobs (multipart) → poll GET /jobs/{id} → download the result
// JSONL, then fetch each image URL it references.
type PaddleOCRVLCloudReader struct {
	token    string // API token; Read reports a configuration error when it is empty
	baseURL  string // jobs URL, stored without trailing slashes
	model    string // model name, logged by Read
	useSeal  bool   // seal recognition switch, passed to paddleOCRVLRecognitionParams
	useChart bool   // chart recognition switch, passed to paddleOCRVLRecognitionParams
}
|
||||
|
||||
// NewPaddleOCRVLCloudReader creates a reader from ParserEngineOverrides.
|
||||
func NewPaddleOCRVLCloudReader(overrides map[string]string) *PaddleOCRVLCloudReader {
|
||||
return &PaddleOCRVLCloudReader{
|
||||
token: strings.TrimSpace(overrides["paddleocr_vl_cloud_token"]),
|
||||
baseURL: strings.TrimRight(stringOr(overrides["paddleocr_vl_cloud_base_url"], paddleOCRVLCloudDefaultBaseURL), "/"),
|
||||
model: stringOr(overrides["paddleocr_vl_cloud_model"], paddleOCRVLCloudDefaultModel),
|
||||
useSeal: parseBoolOr(overrides["paddleocr_vl_cloud_use_seal_recognition"], true),
|
||||
useChart: parseBoolOr(overrides["paddleocr_vl_cloud_use_chart_recognition"], false),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLCloudReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
|
||||
if c.token == "" {
|
||||
return &types.ReadResult{Error: "PaddleOCR-VL Cloud token is not configured"}, nil
|
||||
}
|
||||
|
||||
content := req.FileContent
|
||||
if len(content) == 0 {
|
||||
return &types.ReadResult{Error: "no file content provided"}, nil
|
||||
}
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] Parsing file=%s size=%d model=%s",
|
||||
req.FileName, len(content), c.model)
|
||||
|
||||
jobID, err := c.submitJob(ctx, req, content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("PaddleOCR-VL Cloud submit: %w", err)
|
||||
}
|
||||
|
||||
jsonlURL, err := c.pollJob(ctx, jobID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("PaddleOCR-VL Cloud poll: %w", err)
|
||||
}
|
||||
|
||||
mdContent, imagesURL, err := c.fetchResults(jsonlURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("PaddleOCR-VL Cloud fetch results: %w", err)
|
||||
}
|
||||
|
||||
imageRefs := c.downloadImages(mdContent, imagesURL)
|
||||
mdContent, imageRefs = ensureOriginalImageRef(req, mdContent, imageRefs)
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] Parsed successfully, markdown=%d chars, images=%d",
|
||||
len(mdContent), len(imageRefs))
|
||||
|
||||
return &types.ReadResult{
|
||||
MarkdownContent: mdContent,
|
||||
ImageRefs: imageRefs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLCloudReader) optionalPayload() map[string]interface{} {
|
||||
// Shared with the self-hosted engine so both produce identical output.
|
||||
return paddleOCRVLRecognitionParams(c.useSeal, c.useChart)
|
||||
}
|
||||
|
||||
// --- job submit ---
|
||||
|
||||
type paddleOCRVLCloudSubmitResponse struct {
|
||||
Data struct {
|
||||
JobID string `json:"jobId"`
|
||||
} `json:"data"`
|
||||
ErrorCode int `json:"errorCode"`
|
||||
ErrorMsg string `json:"errorMsg"`
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLCloudReader) submitJob(ctx context.Context, req *types.ReadRequest, content []byte) (string, error) {
|
||||
optional, err := json.Marshal(c.optionalPayload())
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("marshal optionalPayload: %w", err)
|
||||
}
|
||||
|
||||
fileName := req.FileName
|
||||
if fileName == "" {
|
||||
ext := strings.TrimPrefix(req.FileType, ".")
|
||||
if ext == "" {
|
||||
ext = "pdf"
|
||||
}
|
||||
fileName = "document." + ext
|
||||
}
|
||||
|
||||
var body bytes.Buffer
|
||||
writer := multipart.NewWriter(&body)
|
||||
_ = writer.WriteField("model", c.model)
|
||||
_ = writer.WriteField("optionalPayload", string(optional))
|
||||
part, err := writer.CreateFormFile("file", filepath.Base(fileName))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create form file: %w", err)
|
||||
}
|
||||
if _, err := part.Write(content); err != nil {
|
||||
return "", fmt.Errorf("write file content: %w", err)
|
||||
}
|
||||
writer.Close()
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL, &body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Authorization", "bearer "+c.token)
|
||||
httpReq.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
|
||||
client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})
|
||||
resp, err := client.Do(httpReq)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("HTTP request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("API status %d: %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
var result paddleOCRVLCloudSubmitResponse
|
||||
if err := json.Unmarshal(respBody, &result); err != nil {
|
||||
return "", fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
if result.Data.JobID == "" {
|
||||
return "", fmt.Errorf("API returned no jobId: %s", string(respBody))
|
||||
}
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] job submitted: jobId=%s", result.Data.JobID)
|
||||
return result.Data.JobID, nil
|
||||
}
|
||||
|
||||
// --- polling ---
|
||||
|
||||
type paddleOCRVLCloudPollResponse struct {
|
||||
Data struct {
|
||||
State string `json:"state"`
|
||||
ErrorMsg string `json:"errorMsg"`
|
||||
ExtractProgress struct {
|
||||
TotalPages int `json:"totalPages"`
|
||||
ExtractedPages int `json:"extractedPages"`
|
||||
} `json:"extractProgress"`
|
||||
ResultURL struct {
|
||||
JSONURL string `json:"jsonUrl"`
|
||||
} `json:"resultUrl"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (string, error) {
|
||||
deadline := time.Now().Add(paddleOCRVLCloudTimeout)
|
||||
pollCount := 0
|
||||
url := c.baseURL + "/" + jobID
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
pollCount++
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("create poll request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Authorization", "bearer "+c.token)
|
||||
|
||||
client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 30 * time.Second, MaxRedirects: 5})
|
||||
resp, err := client.Do(httpReq)
|
||||
if err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d failed: %v", pollCount, err)
|
||||
sleepCtx(ctx, paddleOCRVLCloudPollInterval)
|
||||
continue
|
||||
}
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d status %d: %s", pollCount, resp.StatusCode, string(respBody))
|
||||
sleepCtx(ctx, paddleOCRVLCloudPollInterval)
|
||||
continue
|
||||
}
|
||||
|
||||
var pollResp paddleOCRVLCloudPollResponse
|
||||
if err := json.Unmarshal(respBody, &pollResp); err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] poll #%d decode error: %v", pollCount, err)
|
||||
sleepCtx(ctx, paddleOCRVLCloudPollInterval)
|
||||
continue
|
||||
}
|
||||
|
||||
state := strings.ToLower(pollResp.Data.State)
|
||||
if pollCount == 1 || pollCount%6 == 0 || state == "done" || state == "failed" {
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] poll #%d: state=%s pages=%d/%d",
|
||||
pollCount, state, pollResp.Data.ExtractProgress.ExtractedPages, pollResp.Data.ExtractProgress.TotalPages)
|
||||
}
|
||||
|
||||
switch state {
|
||||
case "done":
|
||||
if pollResp.Data.ResultURL.JSONURL == "" {
|
||||
return "", fmt.Errorf("state=done but no jsonUrl")
|
||||
}
|
||||
return pollResp.Data.ResultURL.JSONURL, nil
|
||||
case "failed":
|
||||
return "", fmt.Errorf("task failed: %s", pollResp.Data.ErrorMsg)
|
||||
}
|
||||
|
||||
sleepCtx(ctx, paddleOCRVLCloudPollInterval)
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("task timed out after %d polls", pollCount)
|
||||
}
|
||||
|
||||
// --- result parsing ---
|
||||
|
||||
type paddleOCRVLCloudResultLine struct {
|
||||
Result struct {
|
||||
LayoutParsingResults []struct {
|
||||
Markdown struct {
|
||||
Text string `json:"text"`
|
||||
Images map[string]string `json:"images"`
|
||||
} `json:"markdown"`
|
||||
} `json:"layoutParsingResults"`
|
||||
} `json:"result"`
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLCloudReader) fetchResults(jsonlURL string) (string, map[string]string, error) {
|
||||
if err := utils.ValidateURLForSSRF(jsonlURL); err != nil {
|
||||
return "", nil, fmt.Errorf("jsonl URL blocked by SSRF check: %v", err)
|
||||
}
|
||||
client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 120 * time.Second, MaxRedirects: 5})
|
||||
resp, err := client.Get(jsonlURL)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("download jsonl: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", nil, fmt.Errorf("download jsonl status %d", resp.StatusCode)
|
||||
}
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("read jsonl body: %w", err)
|
||||
}
|
||||
|
||||
texts := make([]string, 0)
|
||||
images := make(map[string]string)
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
var parsed paddleOCRVLCloudResultLine
|
||||
if err := json.Unmarshal([]byte(line), &parsed); err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] skip malformed jsonl line: %v", err)
|
||||
continue
|
||||
}
|
||||
for _, p := range parsed.Result.LayoutParsingResults {
|
||||
if t := strings.TrimSpace(p.Markdown.Text); t != "" {
|
||||
texts = append(texts, p.Markdown.Text)
|
||||
}
|
||||
for path, u := range p.Markdown.Images {
|
||||
if _, ok := images[path]; !ok {
|
||||
images[path] = u
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL Cloud] fetched %d page(s), images=%d", len(texts), len(images))
|
||||
return strings.Join(texts, "\n\n"), images, nil
|
||||
}
|
||||
|
||||
// downloadImages fetches each referenced image URL and builds ImageRef entries.
|
||||
func (c *PaddleOCRVLCloudReader) downloadImages(mdContent string, imagesURL map[string]string) []types.ImageRef {
|
||||
var refs []types.ImageRef
|
||||
client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{Timeout: 60 * time.Second, MaxRedirects: 5})
|
||||
|
||||
for ipath, u := range imagesURL {
|
||||
matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
|
||||
if len(matchedRefs) == 0 {
|
||||
continue
|
||||
}
|
||||
if err := utils.ValidateURLForSSRF(u); err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] image URL blocked %s: %v", ipath, err)
|
||||
continue
|
||||
}
|
||||
resp, err := client.Get(u)
|
||||
if err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] download image %s: %v", ipath, err)
|
||||
continue
|
||||
}
|
||||
imgBytes, err := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
if err != nil || resp.StatusCode != http.StatusOK {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL Cloud] read image %s status=%d err=%v", ipath, resp.StatusCode, err)
|
||||
continue
|
||||
}
|
||||
|
||||
ext := strings.TrimPrefix(filepath.Ext(ipath), ".")
|
||||
if ext == "" {
|
||||
ext = "png"
|
||||
}
|
||||
mimeType := mime.TypeByExtension("." + ext)
|
||||
if mimeType == "" {
|
||||
mimeType = "image/png"
|
||||
}
|
||||
|
||||
for _, originalRef := range matchedRefs {
|
||||
refs = append(refs, types.ImageRef{
|
||||
Filename: ipath,
|
||||
OriginalRef: originalRef,
|
||||
MimeType: mimeType,
|
||||
ImageData: imgBytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return refs
|
||||
}
|
||||
|
||||
// PingPaddleOCRVLCloud reports whether a cloud token is configured. No
// request is sent; per the original note the API has no lightweight health
// endpoint, so only the configuration is validated.
//
// It returns (true, "") for a token that is non-empty after trimming
// whitespace, otherwise false and a user-facing message.
func PingPaddleOCRVLCloud(token string) (bool, string) {
	if len(strings.TrimSpace(token)) > 0 {
		return true, ""
	}
	return false, "未配置 PaddleOCR-VL Cloud Token"
}
|
||||
282
internal/infrastructure/docparser/paddleocr_vl_converter.go
Normal file
282
internal/infrastructure/docparser/paddleocr_vl_converter.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package docparser
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Tencent/WeKnora/internal/logger"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
"github.com/Tencent/WeKnora/internal/utils"
|
||||
)
|
||||
|
||||
const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a while
|
||||
|
||||
// PaddleOCRVLReader is a client for a self-hosted PaddleOCR-VL pipeline
// service, i.e. the full document-parsing API rather than the bare VLM
// inference server.
//
// A read is one synchronous call: POST {endpoint}/layout-parsing with the
// file base64-encoded in the JSON body. The JSON response carries per-page
// markdown together with inline base64 images.
type PaddleOCRVLReader struct {
	// endpoint is the service base URL; the constructor strips trailing slashes.
	endpoint string

	// Switches forwarded in the request body.
	useSeal   bool
	useChart  bool
	useLayout bool
}
|
||||
|
||||
// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
|
||||
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
|
||||
return &PaddleOCRVLReader{
|
||||
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
|
||||
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
|
||||
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
|
||||
useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLReader) Read(ctx context.Context, req *types.ReadRequest) (*types.ReadResult, error) {
|
||||
if c.endpoint == "" {
|
||||
return &types.ReadResult{Error: "PaddleOCR-VL endpoint is not configured"}, nil
|
||||
}
|
||||
|
||||
content := req.FileContent
|
||||
if len(content) == 0 {
|
||||
return &types.ReadResult{Error: "no file content provided"}, nil
|
||||
}
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL] Parsing file=%s size=%d via %s",
|
||||
req.FileName, len(content), c.endpoint)
|
||||
|
||||
mdContent, imagesB64, err := c.callLayoutParsing(ctx, req, content)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("PaddleOCR-VL layout-parsing: %w", err)
|
||||
}
|
||||
|
||||
imageRefs, mdContent := c.processImages(mdContent, imagesB64)
|
||||
mdContent, imageRefs = ensureOriginalImageRef(req, mdContent, imageRefs)
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL] Parsed successfully, markdown=%d chars, images=%d",
|
||||
len(mdContent), len(imageRefs))
|
||||
|
||||
return &types.ReadResult{
|
||||
MarkdownContent: mdContent,
|
||||
ImageRefs: imageRefs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// paddleOCRVLRecognitionParams builds the recognition / page-restructuring
// options used by both engines: the self-hosted one places them at the top
// level of the /layout-parsing body, the cloud one sends them as
// optionalPayload. Sharing one parameter set keeps the two request bodies
// identical, so the self-hosted engine is meant to reproduce the cloud
// output: cross-page table merging, multi-level heading reconstruction,
// header/footer stripping, and the same sampling / resolution settings.
//
// Only useSealRecognition and useChartRecognition vary with the arguments;
// every other entry is a fixed value. A fresh map is returned on each call.
func paddleOCRVLRecognitionParams(useSeal, useChart bool) map[string]interface{} {
	params := make(map[string]interface{}, 18)

	// Layout labels excluded from the markdown output.
	params["markdownIgnoreLabels"] = []string{
		"header", "header_image", "footer", "footer_image",
		"number", "footnote", "aside_text",
	}

	// Pipeline stage switches.
	params["useDocOrientationClassify"] = false
	params["useDocUnwarping"] = false
	params["useLayoutDetection"] = true
	params["useChartRecognition"] = useChart
	params["useSealRecognition"] = useSeal
	params["useOcrForImageBlock"] = false

	// Document restructuring.
	params["mergeTables"] = true
	params["relevelTitles"] = true
	params["restructurePages"] = true

	// Layout and prompt options.
	params["layoutShapeMode"] = "auto"
	params["promptLabel"] = "ocr"
	params["layoutNms"] = true

	// Sampling and resolution settings.
	params["repetitionPenalty"] = 1
	params["temperature"] = 0
	params["topP"] = 1
	params["minPixels"] = 147384
	params["maxPixels"] = 2822400

	return params
}
|
||||
|
||||
// fileTypeCode maps a request to the PaddleOCR-VL fileType field:
|
||||
// 0 = PDF, 1 = image (including TIFF).
|
||||
func fileTypeCode(req *types.ReadRequest) int {
|
||||
ft := strings.ToLower(strings.TrimPrefix(req.FileType, "."))
|
||||
if ft == "" {
|
||||
ft = strings.TrimPrefix(strings.ToLower(filepath.Ext(req.FileName)), ".")
|
||||
}
|
||||
if ft == "pdf" {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
// paddleOCRVLResponse mirrors the relevant fields of the PaddleX serving
|
||||
// /layout-parsing response. The service returns one entry per page.
|
||||
type paddleOCRVLResponse struct {
|
||||
ErrorCode int `json:"errorCode"`
|
||||
ErrorMsg string `json:"errorMsg"`
|
||||
Result struct {
|
||||
LayoutParsingResults []struct {
|
||||
Markdown struct {
|
||||
Text string `json:"text"`
|
||||
Images map[string]string `json:"images"`
|
||||
} `json:"markdown"`
|
||||
} `json:"layoutParsingResults"`
|
||||
} `json:"result"`
|
||||
}
|
||||
|
||||
func (c *PaddleOCRVLReader) callLayoutParsing(
|
||||
ctx context.Context, req *types.ReadRequest, content []byte,
|
||||
) (string, map[string]string, error) {
|
||||
payload := paddleOCRVLRecognitionParams(c.useSeal, c.useChart)
|
||||
payload["file"] = base64.StdEncoding.EncodeToString(content)
|
||||
payload["fileType"] = fileTypeCode(req)
|
||||
payload["visualize"] = false
|
||||
if !c.useLayout {
|
||||
payload["useLayoutDetection"] = false
|
||||
}
|
||||
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("marshal payload: %w", err)
|
||||
}
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(
|
||||
ctx, http.MethodPost, c.endpoint+"/layout-parsing", bytes.NewReader(body),
|
||||
)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{Timeout: paddleOCRVLTimeout}
|
||||
resp, err := client.Do(httpReq)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("HTTP request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("read response body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", nil, fmt.Errorf("PaddleOCR-VL API status %d: %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
var result paddleOCRVLResponse
|
||||
if err := json.Unmarshal(respBody, &result); err != nil {
|
||||
return "", nil, fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
if result.ErrorCode != 0 {
|
||||
return "", nil, fmt.Errorf("PaddleOCR-VL error %d: %s", result.ErrorCode, result.ErrorMsg)
|
||||
}
|
||||
|
||||
pages := result.Result.LayoutParsingResults
|
||||
if len(pages) == 0 {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL] response has no layoutParsingResults")
|
||||
return "", nil, nil
|
||||
}
|
||||
|
||||
// Merge per-page markdown and image dicts into one document.
|
||||
texts := make([]string, 0, len(pages))
|
||||
images := make(map[string]string)
|
||||
for _, p := range pages {
|
||||
if t := strings.TrimSpace(p.Markdown.Text); t != "" {
|
||||
texts = append(texts, p.Markdown.Text)
|
||||
}
|
||||
for path, data := range p.Markdown.Images {
|
||||
if _, ok := images[path]; !ok {
|
||||
images[path] = data
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.Infof(context.Background(), "[PaddleOCR-VL] parsed %d page(s), images=%d", len(pages), len(images))
|
||||
return strings.Join(texts, "\n\n"), images, nil
|
||||
}
|
||||
|
||||
// processImages decodes the inline base64 images returned by PaddleOCR-VL and
|
||||
// builds ImageRef entries, matching them against references in the markdown.
|
||||
func (c *PaddleOCRVLReader) processImages(
|
||||
mdContent string, imagesB64 map[string]string,
|
||||
) ([]types.ImageRef, string) {
|
||||
var refs []types.ImageRef
|
||||
|
||||
for ipath, b64Str := range imagesB64 {
|
||||
matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
|
||||
if len(matchedRefs) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var imgBytes []byte
|
||||
var ext string
|
||||
if m := b64DataURIPattern.FindStringSubmatch(b64Str); len(m) == 3 {
|
||||
ext = m[1]
|
||||
decoded, err := base64.StdEncoding.DecodeString(m[2])
|
||||
if err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL] decode base64 image %s: %v", ipath, err)
|
||||
continue
|
||||
}
|
||||
imgBytes = decoded
|
||||
} else {
|
||||
decoded, err := base64.StdEncoding.DecodeString(b64Str)
|
||||
if err != nil {
|
||||
logger.Errorf(context.Background(), "[PaddleOCR-VL] decode raw base64 image %s: %v", ipath, err)
|
||||
continue
|
||||
}
|
||||
imgBytes = decoded
|
||||
ext = strings.TrimPrefix(filepath.Ext(ipath), ".")
|
||||
if ext == "" {
|
||||
ext = "png"
|
||||
}
|
||||
}
|
||||
|
||||
mimeType := mime.TypeByExtension("." + ext)
|
||||
if mimeType == "" {
|
||||
mimeType = "image/png"
|
||||
}
|
||||
|
||||
for _, originalRef := range matchedRefs {
|
||||
refs = append(refs, types.ImageRef{
|
||||
Filename: ipath,
|
||||
OriginalRef: originalRef,
|
||||
MimeType: mimeType,
|
||||
ImageData: imgBytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return refs, mdContent
|
||||
}
|
||||
|
||||
// PingPaddleOCRVL checks whether a self-hosted PaddleOCR-VL service is reachable.
|
||||
func PingPaddleOCRVL(endpoint string) (bool, string) {
|
||||
endpoint = strings.TrimRight(endpoint, "/")
|
||||
if endpoint == "" {
|
||||
return false, "未配置 PaddleOCR-VL 端点"
|
||||
}
|
||||
client := utils.NewSSRFSafeHTTPClient(utils.SSRFSafeHTTPClientConfig{
|
||||
Timeout: 5 * time.Second,
|
||||
MaxRedirects: 5,
|
||||
})
|
||||
// The pipeline only exposes POST /layout-parsing; an empty GET should still
|
||||
// produce a routed HTTP response (e.g. 404/405) when the service is up.
|
||||
resp, err := client.Get(endpoint + "/layout-parsing")
|
||||
if err != nil {
|
||||
return false, fmt.Sprintf("PaddleOCR-VL 服务不可达: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
if resp.StatusCode >= 500 {
|
||||
return false, fmt.Sprintf("PaddleOCR-VL 服务返回状态 %d", resp.StatusCode)
|
||||
}
|
||||
return true, ""
|
||||
}
|
||||
@@ -120,18 +120,15 @@ func asynqRetryDelayFunc(n int, e error, t *asynq.Task) time.Duration {
|
||||
// not on local CPU).
|
||||
const defaultAsynqConcurrency = 16
|
||||
|
||||
func readAsynqConcurrency() int {
|
||||
if v := strings.TrimSpace(os.Getenv("WEKNORA_ASYNQ_CONCURRENCY")); v != "" {
|
||||
if parsed, err := strconv.Atoi(v); err == nil && parsed > 0 {
|
||||
return parsed
|
||||
func NewAsynqServer(svc interfaces.SystemSettingService) *asynq.Server {
|
||||
opt := getAsynqRedisClientOpt()
|
||||
concurrency := defaultAsynqConcurrency
|
||||
if svc != nil {
|
||||
n := svc.GetInt(context.Background(), "asynq.concurrency", "WEKNORA_ASYNQ_CONCURRENCY", defaultAsynqConcurrency)
|
||||
if n > 0 {
|
||||
concurrency = int(n)
|
||||
}
|
||||
}
|
||||
return defaultAsynqConcurrency
|
||||
}
|
||||
|
||||
func NewAsynqServer() *asynq.Server {
|
||||
opt := getAsynqRedisClientOpt()
|
||||
concurrency := readAsynqConcurrency()
|
||||
log.Printf("asynq server starting with concurrency=%d redis_op_timeout=%dms",
|
||||
concurrency, readRedisOpTimeoutMs())
|
||||
srv := asynq.NewServer(
|
||||
|
||||
@@ -285,6 +285,24 @@ type ParserEngineConfig struct {
|
||||
MinerUCloudEnableTable *bool `json:"mineru_cloud_enable_table,omitempty"`
|
||||
MinerUCloudEnableOCR *bool `json:"mineru_cloud_enable_ocr,omitempty"`
|
||||
MinerUCloudLanguage string `json:"mineru_cloud_language,omitempty"`
|
||||
|
||||
// OpenDataLoader PDF (docreader engine); hybrid requires opendataloader-pdf-hybrid service.
|
||||
ODLHybrid string `json:"odl_hybrid,omitempty"` // off (default), docling-fast, hancom-ai
|
||||
ODLHybridURL string `json:"odl_hybrid_url,omitempty"` // e.g. http://odl-hybrid:5002
|
||||
ODLHybridMode string `json:"odl_hybrid_mode,omitempty"` // auto, full
|
||||
ODLHybridFallback *bool `json:"odl_hybrid_fallback,omitempty"`
|
||||
ODLMarkdownWithHTML *bool `json:"odl_markdown_with_html,omitempty"`
|
||||
|
||||
// PaddleOCR-VL self-hosted pipeline service (full /layout-parsing API).
|
||||
PaddleOCRVLEndpoint string `json:"paddleocr_vl_endpoint,omitempty"` // e.g. http://paddleocr-vl:8080
|
||||
PaddleOCRVLUseSealRecognition *bool `json:"paddleocr_vl_use_seal_recognition,omitempty"`
|
||||
PaddleOCRVLUseChartRecognition *bool `json:"paddleocr_vl_use_chart_recognition,omitempty"`
|
||||
|
||||
// PaddleOCR-VL AI Studio cloud API.
|
||||
PaddleOCRVLCloudToken string `json:"paddleocr_vl_cloud_token,omitempty"`
|
||||
PaddleOCRVLCloudModel string `json:"paddleocr_vl_cloud_model,omitempty"` // e.g. PaddleOCR-VL-1.6
|
||||
PaddleOCRVLCloudUseSealRecognition *bool `json:"paddleocr_vl_cloud_use_seal_recognition,omitempty"`
|
||||
PaddleOCRVLCloudUseChartRecognition *bool `json:"paddleocr_vl_cloud_use_chart_recognition,omitempty"`
|
||||
}
|
||||
|
||||
// ToOverridesMap returns a map suitable for ParserEngineOverrides in parse requests.
|
||||
@@ -333,6 +351,42 @@ func (c *ParserEngineConfig) ToOverridesMap() map[string]string {
|
||||
if c.MinerUCloudLanguage != "" {
|
||||
m["mineru_cloud_language"] = c.MinerUCloudLanguage
|
||||
}
|
||||
if c.ODLHybrid != "" {
|
||||
m["odl_hybrid"] = c.ODLHybrid
|
||||
}
|
||||
if c.ODLHybridURL != "" {
|
||||
m["odl_hybrid_url"] = c.ODLHybridURL
|
||||
}
|
||||
if c.ODLHybridMode != "" {
|
||||
m["odl_hybrid_mode"] = c.ODLHybridMode
|
||||
}
|
||||
if c.ODLHybridFallback != nil {
|
||||
m["odl_hybrid_fallback"] = fmt.Sprintf("%v", *c.ODLHybridFallback)
|
||||
}
|
||||
if c.ODLMarkdownWithHTML != nil {
|
||||
m["odl_markdown_with_html"] = fmt.Sprintf("%v", *c.ODLMarkdownWithHTML)
|
||||
}
|
||||
if c.PaddleOCRVLEndpoint != "" {
|
||||
m["paddleocr_vl_endpoint"] = c.PaddleOCRVLEndpoint
|
||||
}
|
||||
if c.PaddleOCRVLUseSealRecognition != nil {
|
||||
m["paddleocr_vl_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseSealRecognition)
|
||||
}
|
||||
if c.PaddleOCRVLUseChartRecognition != nil {
|
||||
m["paddleocr_vl_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLUseChartRecognition)
|
||||
}
|
||||
if c.PaddleOCRVLCloudToken != "" {
|
||||
m["paddleocr_vl_cloud_token"] = c.PaddleOCRVLCloudToken
|
||||
}
|
||||
if c.PaddleOCRVLCloudModel != "" {
|
||||
m["paddleocr_vl_cloud_model"] = c.PaddleOCRVLCloudModel
|
||||
}
|
||||
if c.PaddleOCRVLCloudUseSealRecognition != nil {
|
||||
m["paddleocr_vl_cloud_use_seal_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseSealRecognition)
|
||||
}
|
||||
if c.PaddleOCRVLCloudUseChartRecognition != nil {
|
||||
m["paddleocr_vl_cloud_use_chart_recognition"] = fmt.Sprintf("%v", *c.PaddleOCRVLCloudUseChartRecognition)
|
||||
}
|
||||
if len(m) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -91,6 +91,7 @@ WeKnora `docker-compose.yml` 大量服务是 **profile 限定**,本镜像只
|
||||
| `jaeger` | OpenTelemetry trace UI |
|
||||
| `langfuse` | 自建 Langfuse 可观测平台 |
|
||||
| `dex` | OIDC 登录 |
|
||||
| `odl-hybrid` | OpenDataLoader Docling hybrid(体积大,无预发布镜像,需 `--build`) |
|
||||
|
||||
启用方式:
|
||||
|
||||
@@ -99,6 +100,7 @@ cd /opt/WeKnora
|
||||
docker compose --profile neo4j up -d # 启用 GraphRAG
|
||||
docker compose --profile langfuse up -d # 启用自建 Langfuse
|
||||
docker compose --profile qdrant up -d # 切换到 Qdrant
|
||||
docker compose --profile odl-hybrid up -d --build odl-hybrid # Docling hybrid(按需)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -72,14 +72,17 @@ show_help() {
|
||||
echo " --dex 启动 Dex(OIDC 身份认证)"
|
||||
echo " --langfuse 启动 Langfuse(默认已开启)"
|
||||
echo " --no-langfuse 不启动 Langfuse"
|
||||
echo " --full 启动所有可选服务"
|
||||
echo " --odl-hybrid 启动 OpenDataLoader hybrid(Docling,镜像较大,按需启用)"
|
||||
echo " --full 启动所有可选服务(不含 odl-hybrid,需另加 --odl-hybrid)"
|
||||
echo ""
|
||||
echo "示例:"
|
||||
echo " $0 start # 启动基础服务"
|
||||
echo " $0 start --qdrant # 启动基础服务 + Qdrant"
|
||||
echo " $0 start --qdrant --jaeger # 启动基础服务 + Qdrant + Jaeger"
|
||||
echo " $0 start --dex # 启动基础服务 + Dex"
|
||||
echo " $0 start --odl-hybrid # 启动基础服务 + OpenDataLoader hybrid"
|
||||
echo " $0 start --full # 启动所有服务"
|
||||
echo " make dev-start DEV_ARGS=--odl-hybrid # 同上(Makefile 传参)"
|
||||
echo " $0 app # 在另一个终端启动后端"
|
||||
echo " $0 frontend # 在另一个终端启动前端"
|
||||
}
|
||||
@@ -104,6 +107,46 @@ check_docker() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Check whether the environment (.env) enables hybrid mode; used to decide
# whether docreader is recreated after starting with --odl-hybrid.
# Returns 0 when DOCREADER_ODL_HYBRID, with whitespace removed and lowercased,
# is anything other than "off" or empty; returns 1 otherwise (unset counts as
# "off").
_should_enable_odl_hybrid_from_env() {
    local mode
    mode=$(printf '%s' "${DOCREADER_ODL_HYBRID:-off}" | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')

    if [ -z "$mode" ] || [ "$mode" = "off" ]; then
        return 1
    fi
    return 0
}
|
||||
|
||||
# Append the odl-hybrid compose profile to PROFILES and record the service in
# ENABLED_SERVICES (both are globals of this script).
_enable_odl_hybrid_profile() {
    PROFILES+=" --profile odl-hybrid"
    ENABLED_SERVICES+=" odl-hybrid"
}
|
||||
|
||||
# Wait for the odl-hybrid HTTP health check to pass (after compose starts the
# service it may still be pulling dependencies).
# Polls http://127.0.0.1:${ODL_HYBRID_PORT:-5002}/health every 5 seconds for
# up to ${ODL_HYBRID_STARTUP_WAIT_SEC:-180} seconds.
# Returns 0 when the service is ready or when curl is unavailable (the wait is
# skipped with a warning); returns 1 when the wait times out.
_wait_odl_hybrid_ready() {
    local port="${ODL_HYBRID_PORT:-5002}"
    local max_wait="${ODL_HYBRID_STARTUP_WAIT_SEC:-180}"
    local step=5
    local elapsed=0

    if ! command -v curl >/dev/null 2>&1; then
        log_warning "未安装 curl,跳过 odl-hybrid 就绪等待;请手动检查 http://localhost:${port}/health"
        return 0
    fi

    log_info "等待 odl-hybrid 就绪(最多 ${max_wait}s,首次需构建镜像: docker compose ... build odl-hybrid)..."
    while [ "$elapsed" -lt "$max_wait" ]; do
        if ! curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1; then
            sleep "$step"
            elapsed=$((elapsed + step))
            continue
        fi
        log_success "odl-hybrid 已就绪 (http://localhost:${port}/health)"
        return 0
    done

    log_warning "odl-hybrid 在 ${max_wait}s 内未就绪,请查看: docker logs WeKnora-odl-hybrid"
    return 1
}
|
||||
|
||||
# 启动基础设施服务
|
||||
start_services() {
|
||||
log_info "启动开发环境基础设施服务..."
|
||||
@@ -120,6 +163,11 @@ start_services() {
|
||||
log_error ".env 文件不存在,请先创建"
|
||||
return 1
|
||||
fi
|
||||
|
||||
set -a
|
||||
# shellcheck source=/dev/null
|
||||
source .env
|
||||
set +a
|
||||
|
||||
# 解析 profile 参数
|
||||
shift # 移除 "start" 命令本身
|
||||
@@ -127,7 +175,6 @@ start_services() {
|
||||
# 其余可选服务通过 --minio / --qdrant / --neo4j / --jaeger / --dex / --full 按需开启。
|
||||
PROFILES="--profile langfuse"
|
||||
ENABLED_SERVICES="langfuse"
|
||||
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--minio)
|
||||
@@ -158,6 +205,11 @@ start_services() {
|
||||
PROFILES="${PROFILES//--profile langfuse/}"
|
||||
ENABLED_SERVICES="${ENABLED_SERVICES//langfuse/}"
|
||||
;;
|
||||
--odl-hybrid)
|
||||
if [[ "$ENABLED_SERVICES" != *"odl-hybrid"* ]]; then
|
||||
_enable_odl_hybrid_profile
|
||||
fi
|
||||
;;
|
||||
--full)
|
||||
PROFILES="--profile full"
|
||||
ENABLED_SERVICES="minio qdrant neo4j jaeger dex"
|
||||
@@ -169,11 +221,22 @@ start_services() {
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
# 启动服务
|
||||
|
||||
# 启动服务(odl-hybrid 单独 --build,避免每次重建 docreader)
|
||||
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
local compose_rc=$?
|
||||
if [ "$compose_rc" -eq 0 ] && [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
|
||||
log_info "构建/更新 odl-hybrid 镜像..."
|
||||
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml $PROFILES up -d --build odl-hybrid
|
||||
_wait_odl_hybrid_ready || true
|
||||
# docreader 需读取 DOCREADER_ODL_HYBRID;若刚改 .env,强制重建以注入环境变量
|
||||
if _should_enable_odl_hybrid_from_env; then
|
||||
log_info "重建 docreader 以应用 DOCREADER_ODL_HYBRID=${DOCREADER_ODL_HYBRID} ..."
|
||||
"$DOCKER_COMPOSE_BIN" $DOCKER_COMPOSE_SUBCMD -f docker-compose.dev.yml up -d --force-recreate docreader
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$compose_rc" -eq 0 ]; then
|
||||
log_success "基础设施服务已启动"
|
||||
echo ""
|
||||
log_info "服务访问地址:"
|
||||
@@ -200,6 +263,10 @@ start_services() {
|
||||
if [[ "$ENABLED_SERVICES" == *"langfuse"* ]]; then
|
||||
echo " - Langfuse: http://localhost:${LANGFUSE_WEB_PORT:-3000}"
|
||||
fi
|
||||
if [[ "$ENABLED_SERVICES" == *"odl-hybrid"* ]]; then
|
||||
echo " - ODL Hybrid: http://localhost:${ODL_HYBRID_PORT:-5002} (health: /health)"
|
||||
echo " docreader 需 DOCREADER_ODL_HYBRID=docling-fast"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
log_info "接下来的步骤:"
|
||||
|
||||
Reference in New Issue
Block a user