Files
WeKnora/docker/Dockerfile.odl-hybrid
wizardchen ef1047bf67 feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level
settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing.
Optional odl-hybrid sidecar stays local-build only and is excluded from
default dev-start and full profiles.
2026-06-03 12:29:13 +08:00

30 lines
1.1 KiB
Docker

# OpenDataLoader PDF hybrid backend (Docling). Pre-install deps so the
# container listens on :5002 immediately instead of pip install on every start.
#
# Default --no-ocr: digital PDFs already have a text layer; Docling layout/table
# still runs without EasyOCR (skips the heavy OCR stack; libGL is still required
# because Docling's layout/table models import OpenCV).
# For scanned PDFs use builtin docreader OCR, MinerU, or override with
# ODL_HYBRID_EXTRA_ARGS="--force-ocr" (requires extra system/Python deps).
FROM python:3.10.18-bookworm

# System libraries for the hybrid backend. OpenCV (cv2) is imported by Docling's
# table/layout models and needs libGL at runtime, even when the server is
# started with --no-ocr. curl is used by the container HEALTHCHECK probe.
# update + install + list cleanup stay in one layer so the apt index is never
# stale and never shipped in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        libsm6 \
    && rm -rf /var/lib/apt/lists/*
# Install the hybrid backend at build time so the container does not run
# pip install on every start.
# NOTE(review): only a lower bound is specified, so rebuilds may resolve to a
# newer release; pin an exact version if reproducible images are required.
RUN pip install --no-cache-dir \
        "opendataloader-pdf[hybrid]>=2.4.7"
# Port the hybrid server listens on (matches --port in CMD). EXPOSE is
# documentation only; the port still has to be published at run time.
EXPOSE 5002

# Extra CLI flags appended to the server command line. Override at run time
# (e.g. "--force-ocr") to change the default of skipping OCR.
ENV ODL_HYBRID_EXTRA_ARGS="--no-ocr"

# Probe the server's /health endpoint. -f fails on HTTP errors; -sS hides curl's
# progress meter but still prints real errors, so the container health log only
# holds useful output. The start period keeps early probe failures from counting
# against the retry budget while the server is still coming up.
HEALTHCHECK --interval=30s --timeout=10s --retries=5 --start-period=120s \
    CMD curl -fsS http://localhost:5002/health || exit 1

# Exec-form CMD does no variable expansion, so bash -c is used to expand
# ${ODL_HYBRID_EXTRA_ARGS}. The variable is unquoted so that a value holding
# several flags splits into separate arguments. `exec` replaces the shell with
# the server process so it receives stop signals directly.
CMD ["bash", "-c", "exec opendataloader-pdf-hybrid --host 0.0.0.0 --port 5002 ${ODL_HYBRID_EXTRA_ARGS}"]