mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
- Added support for customizable APT mirror in the Dockerfile for the docreader service, allowing users to specify a mirror via build arguments. - Updated docker-compose.yml to pass the APT_MIRROR argument during the build process. - Modified build_images.sh script to include the APT_MIRROR argument when building the docreader image. - Updated .gitignore to exclude .cursor/ directory. This update improves flexibility in package management during the image build process.
134 lines
4.2 KiB
Docker
134 lines
4.2 KiB
Docker
# =========================
# Build stage (lightweight: document parsing + image extraction only, no OCR/VLM)
# =========================
FROM python:3.10.18-bookworm AS builder

# Optional Debian mirror base URL. When it is empty the stock apt sources are
# left untouched; otherwise both the main and the security hosts are rewritten.
ARG APT_MIRROR=""
RUN if [ -n "$APT_MIRROR" ]; then \
        sed -i \
            -e "s@http://deb.debian.org@${APT_MIRROR}@g" \
            -e "s@http://security.debian.org@${APT_MIRROR}@g" \
            /etc/apt/sources.list.d/debian.sources; \
    fi

WORKDIR /app
|
||
|
||
# Install build dependencies (packages listed alphabetically, one per line).
# The apt package lists are removed in the same layer that created them.
RUN apt-get update \
 && apt-get install -y \
        antiword \
        curl \
        gcc \
        libffi-dev \
        libgl1 \
        libglib2.0-0 \
        libjpeg-dev \
        python3-dev \
        unzip \
        wget \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*
|
||
|
||
# Install protoc 3.19.4 into /usr/local.
# If packages/ contains the archive for the target architecture it is installed
# offline; otherwise the archive is downloaded from the protobuf GitHub release.
# curl runs with --fail (-f) so an HTTP error aborts the build at the download
# step instead of saving an error page that only breaks later in unzip.
# NOTE(review): confirm that the v3.19.4 release actually ships a "linux-arm"
# archive; if it does not, 32-bit arm builds only work with a local package.
ARG TARGETARCH
COPY packages/ /app/packages/
RUN echo "检查本地protoc安装包..." && \
    case "${TARGETARCH}" in \
        "amd64") PROTOC_ARCH="x86_64" ;; \
        "arm64") PROTOC_ARCH="aarch_64" ;; \
        "arm") PROTOC_ARCH="arm" ;; \
        *) echo "Unsupported architecture for protoc: ${TARGETARCH}" && exit 1 ;; \
    esac && \
    PROTOC_PACKAGE="protoc-3.19.4-linux-${PROTOC_ARCH}.zip" && \
    if [ -f "/app/packages/${PROTOC_PACKAGE}" ]; then \
        echo "发现本地protoc安装包,将进行离线安装"; \
        unzip -o "/app/packages/${PROTOC_PACKAGE}" -d /usr/local && \
        chmod +x /usr/local/bin/protoc; \
    else \
        echo "未发现本地protoc安装包,将进行在线安装"; \
        curl -fLO "https://github.com/protocolbuffers/protobuf/releases/download/v3.19.4/${PROTOC_PACKAGE}" && \
        unzip -o "${PROTOC_PACKAGE}" -d /usr/local && \
        chmod +x /usr/local/bin/protoc && \
        rm -f "${PROTOC_PACKAGE}"; \
    fi
|
||
|
||
# Copy only the dependency manifests, then install uv and sync the locked,
# non-dev dependency set from them.
COPY docreader/pyproject.toml docreader/uv.lock ./
RUN python -m pip install --break-system-packages uv \
 && python -m uv sync --locked --no-dev
|
||
|
||
# Copy the source code together with the proto generation script.
COPY docreader/ docreader/

# Generate the protobuf code. The virtualenv's bin directory is put first on
# PATH so the script uses the grpc_tools installed in the venv.
ENV PATH="/app/.venv/bin:${PATH}"
RUN chmod +x docreader/scripts/generate_proto.sh \
 && bash docreader/scripts/generate_proto.sh
|
||
|
||
# =========================
# Runtime stage (lightweight)
# =========================
FROM python:3.10.18-bookworm AS runner

# Optional Debian mirror base URL. When it is empty the stock apt sources are
# left untouched; otherwise both the main and the security hosts are rewritten.
ARG APT_MIRROR=""
RUN if [ -n "$APT_MIRROR" ]; then \
        sed -i \
            -e "s@http://deb.debian.org@${APT_MIRROR}@g" \
            -e "s@http://security.debian.org@${APT_MIRROR}@g" \
            /etc/apt/sources.list.d/debian.sources; \
    fi

WORKDIR /app
|
||
|
||
# Install runtime dependencies (OCR/PaddleOCR-related dependencies have been
# removed). Packages are listed alphabetically, one per line; the apt package
# lists are removed in the same layer that created them.
RUN apt-get update \
 && apt-get install -y \
        antiword \
        curl \
        dpkg \
        gnupg \
        libcairo2 \
        libcups2 \
        libdbus-glib-1-2 \
        libfontconfig1 \
        libgl1 \
        libglib2.0-0 \
        libglu1-mesa \
        libjpeg62-turbo \
        libreoffice \
        libsm6 \
        libxinerama1 \
        tar \
        wget \
 && rm -rf /var/lib/apt/lists/*
|
||
|
||
# Install grpc_health_probe v0.4.24 for the target architecture.
# The release asset suffix equals TARGETARCH for every supported value, so the
# case statement only validates the architecture before the download.
ARG TARGETARCH
RUN GRPC_HEALTH_PROBE_VERSION=v0.4.24 && \
    case "${TARGETARCH}" in \
        amd64|arm64|arm) ARCH="${TARGETARCH}" ;; \
        *) echo "Unsupported architecture: ${TARGETARCH}" && exit 1 ;; \
    esac && \
    PROBE_URL="https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-${ARCH}" && \
    wget -q -O /bin/grpc_health_probe "${PROBE_URL}" && \
    chmod +x /bin/grpc_health_probe
|
||
|
||
# Copy the installed dependencies and generated code from the build stage.
# VIRTUAL_ENV names the virtualenv directory that is copied over from the builder.
ENV VIRTUAL_ENV=/app/.venv
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Put the virtualenv's executables ahead of the system ones on PATH.
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Reuse the builder's /usr/local/bin, which holds the protoc binary unpacked there.
# NOTE(review): presumably this is also where pip placed the uv launcher that CMD invokes — confirm.
COPY --from=builder /usr/local/bin /usr/local/bin
|
||
|
||
# Install the Playwright WebKit browser (used for web page parsing) together
# with its system dependencies in a single layer. The dependency installation
# goes through apt, so the package lists are removed in the same RUN to keep
# them out of the final image.
RUN python -m playwright install --with-deps webkit \
 && rm -rf /var/lib/apt/lists/*
|
||
|
||
# Project metadata and lockfile, placed in the working directory next to the sources.
COPY docreader/pyproject.toml docreader/uv.lock ./
# Service sources taken from the build stage rather than the build context
# (presumably so the protobuf code generated there is included — confirm).
COPY --from=builder /app/docreader docreader

# Create the shared temporary image directory.
RUN mkdir -p /tmp/docreader

# Expose the gRPC port.
EXPOSE 50051
|
||
|
||
# Run the Python service directly (logs go to stdout/stderr).
# Exec (JSON-array) form: nothing may follow the closing bracket, otherwise the
# line is no longer parsed as exec form and is run through a shell instead.
CMD ["uv", "run", "-m", "docreader.main"]