mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
chore: bump Go to 1.26 and slim docreader dependencies

- Bump base image in docker/Dockerfile.app from golang:1.24 to golang:1.26
  to match `go 1.26` declared in go.mod (fixes CI build failure on
  `go mod download`).
- Drop unused docreader components and their dependencies:
  - Remove `docreader/ocr/` package (paddle/vlm/dummy backends are
    unreferenced by the main flow; OCR/VLM is handled by the Go App).
  - Remove `docreader/parser/storage.py` (dead code; image persistence
    happens in the Go App via inline ImageRef bytes).
  - Remove `docreader/scripts/download_deps.py` (PaddleOCR pre-download).
  - Drop deps: paddleocr, paddlepaddle, openai, ollama, minio,
    cos-python-sdk-v5, oss2, asyncio, pypdf2, markdown, mistletoe,
    goose3, markdownify, pdfplumber, antiword, urllib3.
  - Re-lock uv.lock: 145 -> 79 packages.
- Update docreader/README.md to reflect that OCR/VLM/storage are no
  longer configured at the docreader level.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# Build stage
|
||||
FROM golang:1.24-bookworm AS builder
|
||||
FROM golang:1.26-bookworm AS builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -84,30 +84,9 @@ docreader:
|
||||
- `DOCREADER_PDF_RENDER_DPI`: 扫描 PDF 渲染 DPI(默认:200)
|
||||
- `DOCREADER_PDF_JPEG_QUALITY`: 扫描 PDF 输出 JPEG 质量(默认:90,范围会自动限制在 1-95)
|
||||
|
||||
### OCR 配置
|
||||
### OCR / VLM
|
||||
|
||||
- `OCR_BACKEND`: OCR 引擎后端,可选值:
|
||||
- `paddle`: 使用 PaddleOCR(默认)
|
||||
- `no_ocr`: 禁用 OCR 功能
|
||||
- `api`: 使用外部 OCR API
|
||||
- `OCR_API_BASE_URL`: 外部 OCR API 的基础 URL
|
||||
- `OCR_API_KEY`: 外部 OCR API 的密钥
|
||||
- `OCR_MODEL`: OCR 模型名称
|
||||
|
||||
**示例**:禁用 OCR 功能
|
||||
```yaml
|
||||
environment:
|
||||
- OCR_BACKEND=no_ocr
|
||||
```
|
||||
|
||||
### VLM(视觉语言模型)配置
|
||||
|
||||
用于图像理解和描述生成:
|
||||
|
||||
- `VLM_MODEL_BASE_URL`: VLM 模型的 API 地址
|
||||
- `VLM_MODEL_NAME`: VLM 模型名称
|
||||
- `VLM_MODEL_API_KEY`: VLM 模型的 API 密钥
|
||||
- `VLM_INTERFACE_TYPE`: 接口类型,可选值:`openai`(默认)或 `ollama`
|
||||
DocReader 自身不再内置 OCR 与 VLM 后端。扫描 PDF 会被渲染为 JPEG 图片后交由 Go App 侧调用 OCR/VLM 服务处理,相关配置请参考主项目文档。
|
||||
|
||||
### 存储配置
|
||||
|
||||
@@ -167,7 +146,7 @@ docreader:
|
||||
- MAX_FILE_SIZE_MB=50
|
||||
```
|
||||
|
||||
### 高级配置(启用 MinerU + 自定义 OCR)
|
||||
### 高级配置(启用 MinerU)
|
||||
|
||||
```yaml
|
||||
docreader:
|
||||
@@ -176,10 +155,6 @@ docreader:
|
||||
- MINIO_PUBLIC_ENDPOINT=http://192.168.1.100:9000
|
||||
- MINERU_ENDPOINT=http://mineru:8080
|
||||
- MAX_FILE_SIZE_MB=100
|
||||
- OCR_BACKEND=paddle
|
||||
- VLM_MODEL_BASE_URL=http://ollama:11434
|
||||
- VLM_MODEL_NAME=llava
|
||||
- VLM_INTERFACE_TYPE=ollama
|
||||
```
|
||||
|
||||
### 使用腾讯云 COS
|
||||
@@ -214,12 +189,7 @@ docreader:
|
||||
|
||||
### 1. DocReader 服务无法启动?
|
||||
|
||||
如果日志中出现 PaddleOCR 相关错误,可以尝试禁用 OCR:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- OCR_BACKEND=no_ocr
|
||||
```
|
||||
检查容器日志中是否存在依赖缺失或权限相关错误,必要时确认 `MINIO_ENDPOINT` / 存储相关环境变量是否正确配置。
|
||||
|
||||
### 2. 图片无法显示?
|
||||
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
import logging
|
||||
import threading
|
||||
from typing import Dict
|
||||
|
||||
from docreader.ocr.base import DummyOCRBackend, OCRBackend
|
||||
from docreader.ocr.paddle import PaddleOCRBackend
|
||||
from docreader.ocr.vlm import VLMOCRBackend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCREngine:
    """Factory that lazily builds and caches one OCR backend per backend name."""

    # Already-built backends, keyed by the lower-cased backend name.
    _instances: Dict[str, OCRBackend] = {}
    # Serialises lookups and insertions on ``_instances``.
    _lock = threading.Lock()

    @classmethod
    def get_instance(cls, backend_type: str) -> OCRBackend:
        """Return the shared backend for ``backend_type``, creating it on first use.

        Args:
            backend_type: Backend name, compared case-insensitively. ``"paddle"``
                and ``"vlm"`` select the matching backend; an empty value or any
                other name yields the dummy backend.

        Returns:
            The cached backend instance registered under the normalised name.
        """
        key = (backend_type or "dummy").lower()

        with cls._lock:
            backend = cls._instances.get(key)
            if backend is None:
                logger.info(f"Creating OCR engine instance for backend: {key}")
                factories = {"paddle": PaddleOCRBackend, "vlm": VLMOCRBackend}
                backend = factories.get(key, DummyOCRBackend)()
                cls._instances[key] = backend
            return backend
|
||||
@@ -1,31 +0,0 @@
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union
|
||||
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRBackend(ABC):
    """Abstract interface every OCR backend implements."""

    @abstractmethod
    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract the text contained in an image.

        Args:
            image: Image file path, raw image bytes, or a PIL Image object.

        Returns:
            The extracted text.
        """
|
||||
|
||||
|
||||
class DummyOCRBackend(OCRBackend):
    """Placeholder backend that performs no recognition."""

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Log a warning and return an empty string; ``image`` is not inspected."""
        logger.warning("Dummy OCR backend is used")
        return ""
|
||||
@@ -1,175 +0,0 @@
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from docreader.ocr.base import OCRBackend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PaddleOCRBackend(OCRBackend):
    """OCR backend backed by PaddleOCR, forced onto the CPU.

    Initialisation never raises: on any failure the error is logged,
    ``self.ocr`` stays ``None`` and ``predict`` returns an empty string.
    """

    def __init__(self):
        """Import Paddle, configure CPU execution and build the PaddleOCR engine."""
        self.ocr = None
        try:
            import paddle

            # Hide all GPUs and pin Paddle to the CPU device.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            paddle.device.set_device("cpu")

            self._configure_cpu_flags()

            from paddleocr import PaddleOCR

            # Text-orientation classification is enabled; "use_dilation" and
            # the "slow" score mode are marked upstream as improving accuracy.
            self.ocr = PaddleOCR(
                use_gpu=False,
                text_det_limit_type="max",
                text_det_limit_side_len=960,
                use_doc_orientation_classify=True,
                use_doc_unwarping=False,
                use_textline_orientation=True,
                text_recognition_model_name="PP-OCRv4_server_rec",
                text_detection_model_name="PP-OCRv4_server_det",
                text_det_thresh=0.3,
                text_det_box_thresh=0.6,
                text_det_unclip_ratio=1.5,
                text_rec_score_thresh=0.0,
                ocr_version="PP-OCRv4",
                lang="ch",
                show_log=False,
                use_dilation=True,
                det_db_score_mode="slow",
            )
            logger.info("PaddleOCR engine initialized successfully")

        except ImportError as e:
            logger.error(
                f"Failed to import paddleocr: {e!s}. "
                "Please install it with 'pip install paddleocr'"
            )
        except OSError as e:
            reason = str(e)
            if "Illegal instruction" not in reason and "core dumped" not in reason:
                logger.error(
                    f"Failed to initialize PaddleOCR due to OS error: {e!s}"
                )
            else:
                logger.error(
                    f"PaddlePaddle crashed due to CPU instruction set incompatibility:{e}"
                )
                logger.error(
                    "This happens when the CPU doesn't support AVX instructions. "
                    "Try install CPU-only version of PaddlePaddle, "
                    "or use a different OCR backend."
                )
        except Exception as e:
            logger.error(f"Failed to initialize PaddleOCR: {e!s}")

    @staticmethod
    def _enable_compat_mode() -> None:
        """Set the environment flags used as the 'compatibility mode'."""
        # NOTE(review): FLAGS_use_avx is set to "1" even when AVX was reported
        # missing -- confirm this value is intended.
        os.environ["FLAGS_use_avx2"] = "0"
        os.environ["FLAGS_use_avx"] = "1"

    def _configure_cpu_flags(self) -> None:
        """Probe /proc/cpuinfo for AVX on Linux and fall back to compat mode.

        Any error during the probe is logged and also results in compat mode;
        nothing is raised. Non-Linux platforms are left untouched.
        """
        try:
            if platform.system() == "Linux":
                try:
                    probe = subprocess.run(
                        ["grep", "-o", "avx", "/proc/cpuinfo"],
                        capture_output=True,
                        text=True,
                        timeout=5,
                    )
                    if "avx" not in probe.stdout.lower():
                        logger.warning(
                            "CPU does not support AVX instructions, using compatibility mode"
                        )
                        self._enable_compat_mode()
                except (
                    subprocess.TimeoutExpired,
                    FileNotFoundError,
                    subprocess.SubprocessError,
                ):
                    logger.warning(
                        "Could not detect AVX support, using compatibility mode"
                    )
                    self._enable_compat_mode()
        except Exception as e:
            logger.warning(
                f"Error detecting CPU capabilities: {e}, using compatibility mode"
            )
            self._enable_compat_mode()

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image.

        Args:
            image: Image file path, raw image bytes, or a PIL Image object.

        Returns:
            The extracted text.

        Raises:
            TypeError: If ``image`` is none of the supported types.
        """
        if isinstance(image, bytes):
            image = Image.open(io.BytesIO(image))
        elif isinstance(image, str):
            image = Image.open(image)

        if isinstance(image, Image.Image):
            return self._predict(image)
        raise TypeError("image must be a string, bytes, or PIL Image object")

    def _predict(self, image: Image.Image) -> str:
        """Run PaddleOCR on a PIL image and join the recognised lines.

        Args:
            image: PIL image; converted to RGB when it is in another mode.

        Returns:
            Recognised text fragments joined by single spaces, or an empty
            string when the engine is missing or recognition fails.
        """
        if self.ocr is None:
            logger.error("PaddleOCR engine not initialized")
            return ""

        try:
            rgb = image if image.mode == "RGB" else image.convert("RGB")

            # PaddleOCR consumes a numpy array rather than a PIL image.
            ocr_result = self.ocr.ocr(np.array(rgb), cls=False)

            ocr_text = ""
            if ocr_result and ocr_result[0]:
                fragments = []
                for line in ocr_result[0]:
                    # Each entry is indexed as line[1][0] for its text;
                    # malformed or empty entries are skipped.
                    if not (line and len(line) >= 2 and line[1]):
                        continue
                    raw = line[1][0]
                    if raw:
                        fragments.append(raw.strip())
                ocr_text = " ".join(fragments)

            logger.info(f"OCR extracted {len(ocr_text)} characters")
            return ocr_text

        except Exception as e:
            logger.error(f"OCR recognition error: {e!s}")
            return ""
|
||||
@@ -1,87 +0,0 @@
|
||||
import logging
|
||||
from typing import Union
|
||||
|
||||
from openai import OpenAI
|
||||
from PIL import Image
|
||||
|
||||
from docreader.config import CONFIG
|
||||
from docreader.ocr.base import OCRBackend
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VLMOCRBackend(OCRBackend):
    """OCR backend that asks a vision-language model to transcribe an image.

    The model is reached through an OpenAI-compatible chat-completions API.
    """

    def __init__(self):
        """Build the API client from ``CONFIG`` and prepare the OCR prompt.

        The model name, API key and base URL are read from ``CONFIG.ocr_model``,
        ``CONFIG.ocr_api_key`` and ``CONFIG.ocr_api_base_url``; this constructor
        takes no arguments.
        """
        self.model = CONFIG.ocr_model
        self.client = OpenAI(
            api_key=CONFIG.ocr_api_key,
            base_url=CONFIG.ocr_api_base_url,
            timeout=30,
        )
        self.temperature = 0.0
        self.max_tokens = 5000

        # The literals must sit inside parentheses so that they concatenate
        # into ONE prompt. Without them only the first literal is assigned and
        # the remaining ones are discarded as no-op expression statements.
        self.prompt = (
            "提取文档图片中正文的所有信息用markdown格式表示,"
            "其中页眉、页脚部分忽略,"
            "表格用html格式表达,"
            "文档中公式用latex格式表示,"
            "按照阅读顺序组织进行解析。"
        )

    def predict(self, image: Union[str, bytes, Image.Image]) -> str:
        """Extract text from an image using the configured VLM.

        Args:
            image: Image file path, raw image bytes, or a PIL Image object.

        Returns:
            The text returned by the model, or an empty string when the client
            is missing, the image cannot be encoded, or the API call fails.
        """
        if self.client is None:
            logger.error("VLM OCR client not initialized")
            return ""

        try:
            # The helper's result is embedded as the base64 payload of a data
            # URL below; an empty result means there is nothing to send.
            img_base64 = endecode.decode_image(image)
            if not img_base64:
                return ""

            logger.info(f"Calling VLM OCR API with model: {self.model}")
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                },
                            },
                            {
                                "type": "text",
                                "text": self.prompt,
                            },
                        ],
                    }
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content or ""
        except Exception as e:
            # Best effort: any API or encoding failure yields an empty result.
            logger.error(f"VLM OCR prediction error: {str(e)}")
            return ""
|
||||
@@ -1,419 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import traceback
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Optional
|
||||
|
||||
from minio import Minio
|
||||
from qcloud_cos import CosConfig, CosS3Client
|
||||
|
||||
try:
|
||||
import oss2
|
||||
except ImportError:
|
||||
oss2 = None # type: ignore[assignment]
|
||||
|
||||
from docreader.utils import endecode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cfg(storage_config: Optional[Dict], key: str, *env_keys: str, default: str = "") -> str:
|
||||
"""Read a value from storage_config dict, falling back to env vars."""
|
||||
if storage_config:
|
||||
v = storage_config.get(key, "")
|
||||
if v:
|
||||
return str(v)
|
||||
for ek in env_keys:
|
||||
v = os.environ.get(ek, "")
|
||||
if v:
|
||||
return v
|
||||
return default
|
||||
|
||||
|
||||
class Storage(ABC):
    """Interface shared by all object-storage backends."""

    @abstractmethod
    def upload_file(self, file_path: str) -> str:
        """Upload the file at ``file_path`` and return a string locating it."""

    @abstractmethod
    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload ``content`` with extension ``file_ext`` and return a string locating it."""
|
||||
|
||||
|
||||
class CosStorage(Storage):
    """Tencent Cloud COS storage implementation.

    Settings come from ``storage_config`` first and fall back to the ``COS_*``
    environment variables. If the configuration is incomplete or the client
    cannot be created, ``self.client`` is ``None`` and every upload returns an
    empty string instead of raising.
    """

    def __init__(self, storage_config: Optional[Dict] = None):
        self.storage_config = storage_config
        self.client, self.bucket_name, self.region, self.prefix = (
            self._init_cos_client()
        )

    def _init_cos_client(self):
        """Create the COS client.

        Returns:
            ``(client, bucket_name, region, prefix)``, or four ``None`` values
            when the configuration is incomplete or initialisation fails.
        """
        try:
            sc = self.storage_config
            secret_id = _cfg(sc, "access_key_id", "COS_SECRET_ID")
            secret_key = _cfg(sc, "secret_access_key", "COS_SECRET_KEY")
            region = _cfg(sc, "region", "COS_REGION")
            bucket_name = _cfg(sc, "bucket_name", "COS_BUCKET_NAME")
            appid = _cfg(sc, "app_id", "COS_APP_ID")
            prefix_raw = _cfg(sc, "path_prefix", "COS_PATH_PREFIX")
            # Normalise the prefix the same way the MinIO and OSS backends do,
            # so "a/" or "/a" cannot produce empty path segments in the key.
            prefix = prefix_raw.strip().strip("/") if prefix_raw else ""
            enable_old_domain = os.environ.get("COS_ENABLE_OLD_DOMAIN", "").lower() in ("1", "true", "yes")

            if not all([secret_id, secret_key, region, bucket_name, appid]):
                # Only whether the secret id is present is logged, not its value.
                logger.error(
                    "Incomplete COS configuration: "
                    "secret_id=%s, region=%s, bucket=%s, appid=%s",
                    bool(secret_id), region, bucket_name, appid,
                )
                return None, None, None, None

            logger.info("Initializing COS client: region=%s, bucket=%s", region, bucket_name)
            config = CosConfig(
                Appid=appid,
                Region=region,
                SecretId=secret_id,
                SecretKey=secret_key,
                EnableOldDomain=enable_old_domain,
            )
            client = CosS3Client(config)
            return client, bucket_name, region, prefix
        except Exception as e:
            logger.error("Failed to initialize COS client: %s", e)
            return None, None, None, None

    def _get_download_url(self, bucket_name, region, object_key):
        """Build the public URL of ``object_key`` on the default COS domain."""
        return f"https://{bucket_name}.cos.{region}.myqcloud.com/{object_key}"

    def _object_key(self, file_ext: str) -> str:
        """Return a fresh ``[prefix/]images/<uuid><ext>`` object key.

        Shared by both upload methods so that an empty prefix never yields a
        key that starts with "/".
        """
        name = f"images/{uuid.uuid4().hex}{file_ext}"
        return f"{self.prefix}/{name}" if self.prefix else name

    def upload_file(self, file_path: str) -> str:
        """Upload a local file; return its URL, or "" on failure."""
        try:
            if not self.client:
                return ""
            file_ext = os.path.splitext(file_path)[1]
            object_key = self._object_key(file_ext)
            self.client.upload_file(
                Bucket=self.bucket_name,
                LocalFilePath=file_path,
                Key=object_key,
            )
            file_url = self._get_download_url(self.bucket_name, self.region, object_key)
            logger.info("COS upload_file ok: %s", file_url)
            return file_url
        except Exception as e:
            logger.error("COS upload_file failed: %s", e)
            return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload raw bytes; return the object's URL, or "" on failure."""
        try:
            if not self.client:
                return ""
            object_key = self._object_key(file_ext)
            self.client.put_object(
                Bucket=self.bucket_name, Body=content, Key=object_key
            )
            file_url = self._get_download_url(self.bucket_name, self.region, object_key)
            logger.info("COS upload_bytes ok: %s", file_url)
            return file_url
        except Exception as e:
            logger.error("COS upload_bytes failed: %s", e)
            traceback.print_exc()
            return ""
|
||||
|
||||
|
||||
class MinioStorage(Storage):
    """MinIO storage implementation.

    Settings come from ``storage_config`` first and fall back to the
    ``MINIO_*`` environment variables. If the configuration is incomplete or
    the client cannot be created, ``self.client`` is ``None`` and every upload
    returns an empty string instead of raising.
    """

    def __init__(self, storage_config: Optional[Dict] = None):
        self.storage_config = storage_config
        self.client, self.bucket_name, self.use_ssl, self.endpoint, self.path_prefix = (
            self._init_minio_client()
        )

    def _init_minio_client(self):
        """Create the MinIO client, creating the bucket when it is missing.

        Returns:
            ``(client, bucket_name, use_ssl, endpoint, path_prefix)``, or five
            ``None`` values when the configuration is incomplete or
            initialisation fails.
        """
        try:
            sc = self.storage_config
            access_key = _cfg(sc, "access_key_id", "MINIO_ACCESS_KEY_ID")
            secret_key = _cfg(sc, "secret_access_key", "MINIO_SECRET_ACCESS_KEY")
            bucket_name = _cfg(sc, "bucket_name", "MINIO_BUCKET_NAME")
            path_prefix_raw = _cfg(sc, "path_prefix", "MINIO_PATH_PREFIX")
            path_prefix = path_prefix_raw.strip().strip("/") if path_prefix_raw else ""
            endpoint = _cfg(sc, "endpoint", "MINIO_ENDPOINT")
            use_ssl = os.environ.get("MINIO_USE_SSL", "").lower() in ("1", "true", "yes")

            if not all([endpoint, access_key, secret_key, bucket_name]):
                logger.error("Incomplete MinIO configuration")
                return None, None, None, None, None

            client = Minio(
                endpoint, access_key=access_key, secret_key=secret_key, secure=use_ssl
            )

            found = client.bucket_exists(bucket_name)
            if not found:
                client.make_bucket(bucket_name)
                # A newly created bucket gets an anonymous s3:GetObject policy
                # covering every object in it.
                policy = (
                    "{"
                    '"Version":"2012-10-17",'
                    '"Statement":['
                    '{"Effect":"Allow","Principal":{"AWS":["*"]},'
                    '"Action":["s3:GetObject"],'
                    '"Resource":["arn:aws:s3:::%s/*"]}'
                    "]}" % (bucket_name)
                )
                client.set_bucket_policy(bucket_name, policy)

            return client, bucket_name, use_ssl, endpoint, path_prefix
        except Exception as e:
            logger.error("Failed to initialize MinIO client: %s", e)
            return None, None, None, None, None

    def _get_download_url(self, object_key: str):
        """Return the URL of ``object_key``.

        ``MINIO_PUBLIC_ENDPOINT`` takes precedence when set; otherwise the URL
        is built from the configured endpoint and the SSL setting.
        """
        # Drop trailing slashes so a value such as "http://host:9000/" does
        # not produce "http://host:9000//bucket/...".
        public_endpoint = os.environ.get("MINIO_PUBLIC_ENDPOINT", "").rstrip("/")
        if public_endpoint:
            return f"{public_endpoint}/{self.bucket_name}/{object_key}"
        scheme = "https" if self.use_ssl else "http"
        return f"{scheme}://{self.endpoint}/{self.bucket_name}/{object_key}"

    def _object_key(self, file_ext: str) -> str:
        """Return a fresh ``[path_prefix/]images/<uuid><ext>`` object key."""
        name = f"images/{uuid.uuid4().hex}{file_ext}"
        return f"{self.path_prefix}/{name}" if self.path_prefix else name

    def upload_file(self, file_path: str) -> str:
        """Upload a local file; return its URL, or "" on failure."""
        try:
            if not self.client:
                return ""
            file_name = os.path.basename(file_path)
            object_key = self._object_key(os.path.splitext(file_name)[1])
            with open(file_path, "rb") as file_data:
                file_size = os.path.getsize(file_path)
                self.client.put_object(
                    bucket_name=self.bucket_name or "",
                    object_name=object_key,
                    data=file_data,
                    length=file_size,
                    content_type="application/octet-stream",
                )
            file_url = self._get_download_url(object_key)
            logger.info("MinIO upload_file ok: %s", file_url)
            return file_url
        except Exception as e:
            logger.error("MinIO upload_file failed: %s", e)
            return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload raw bytes; return the object's URL, or "" on failure."""
        try:
            if not self.client:
                return ""
            object_key = self._object_key(file_ext)
            self.client.put_object(
                self.bucket_name or "",
                object_key,
                data=io.BytesIO(content),
                length=len(content),
                content_type="application/octet-stream",
            )
            file_url = self._get_download_url(object_key)
            logger.info("MinIO upload_bytes ok: %s", file_url)
            return file_url
        except Exception as e:
            logger.error("MinIO upload_bytes failed: %s", e)
            traceback.print_exc()
            return ""
|
||||
|
||||
|
||||
class OssStorage(Storage):
    """Alibaba Cloud OSS storage implementation.

    When ``oss2`` is unavailable, the configuration is incomplete, or the
    bucket does not exist, ``self.bucket`` is ``None`` and every upload returns
    an empty string instead of raising.
    """

    def __init__(self, storage_config: Optional[Dict] = None):
        self.storage_config = storage_config
        self.bucket, self.endpoint, self.prefix = self._init_oss_bucket()

    def _init_oss_bucket(self):
        """Create the OSS bucket handle.

        Returns:
            ``(bucket, endpoint, prefix)``, or three ``None`` values on any
            failure.
        """
        failure = (None, None, None)

        if oss2 is None:
            logger.error("oss2 package not installed. Install with: pip install oss2")
            return failure

        try:
            cfg = self.storage_config
            access_key = _cfg(cfg, "access_key", "OSS_ACCESS_KEY")
            secret_key = _cfg(cfg, "secret_key", "OSS_SECRET_KEY")
            endpoint = _cfg(cfg, "endpoint", "OSS_ENDPOINT")
            bucket_name = _cfg(cfg, "bucket_name", "OSS_BUCKET_NAME")
            raw_prefix = _cfg(cfg, "path_prefix", "OSS_PATH_PREFIX")

            if not (access_key and secret_key and endpoint and bucket_name):
                # Only whether the access key is present is logged, not its value.
                logger.error(
                    "Incomplete OSS configuration: "
                    "access_key=%s, endpoint=%s, bucket=%s",
                    bool(access_key), endpoint, bucket_name,
                )
                return failure

            # oss2 needs a scheme on the endpoint; default to https when absent.
            has_scheme = endpoint.startswith(("http://", "https://"))
            if not has_scheme:
                endpoint = f"https://{endpoint}"

            bucket = oss2.Bucket(oss2.Auth(access_key, secret_key), endpoint, bucket_name)

            # The bucket is verified but never auto-created, to avoid
            # unintentionally creating a public-read bucket.
            try:
                bucket.get_bucket_info()
            except oss2.exceptions.NoSuchBucket:
                logger.error(
                    "OSS bucket %s does not exist. Please create it manually in the console.",
                    bucket_name,
                )
                return failure

            prefix = raw_prefix.strip().strip("/") if raw_prefix else ""
            return bucket, endpoint, prefix

        except Exception as e:
            logger.error("Failed to initialize OSS client: %s", e)
            return failure

    def _get_download_url(self, object_key: str) -> str:
        """Return the virtual-hosted style URL ``https://{bucket}.{host}/{key}``."""
        # NOTE(review): the result is always https, even when the configured
        # endpoint used http:// -- confirm this is intended.
        host = self.endpoint.removeprefix("https://").removeprefix("http://")
        return f"https://{self.bucket.bucket_name}.{host}/{object_key}"

    def _new_object_key(self, file_ext: str) -> str:
        """Return a fresh ``[prefix/]images/<uuid><ext>`` object key."""
        if self.prefix:
            return f"{self.prefix}/images/{uuid.uuid4().hex}{file_ext}"
        return f"images/{uuid.uuid4().hex}{file_ext}"

    def upload_file(self, file_path: str) -> str:
        """Upload a local file; return its URL, or "" on failure."""
        try:
            if not self.bucket:
                return ""
            extension = os.path.splitext(file_path)[1]
            key = self._new_object_key(extension)
            self.bucket.put_object_from_file(key, file_path)
            url = self._get_download_url(key)
            logger.info("OSS upload_file ok: %s", url)
            return url
        except Exception as e:
            logger.error("OSS upload_file failed: %s", e)
            return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Upload raw bytes; return the object's URL, or "" on failure."""
        try:
            if not self.bucket:
                return ""
            key = self._new_object_key(file_ext)
            self.bucket.put_object(key, content)
            url = self._get_download_url(key)
            logger.info("OSS upload_bytes ok: %s", url)
            return url
        except Exception as e:
            logger.error("OSS upload_bytes failed: %s", e)
            traceback.print_exc()
            return ""
|
||||
|
||||
|
||||
class LocalStorage(Storage):
    """Storage backend that writes to the local file system.

    Files are written below ``base_dir`` and reported as web-style URL paths
    (e.g. /files/images/uuid.jpg) so that the Go app can serve them.
    """

    def __init__(self, storage_config: Optional[Dict] = None):
        """Resolve the directories from config/env and create the image directory."""
        settings = storage_config or {}

        self.base_dir = settings.get("base_dir") or os.environ.get(
            "LOCAL_STORAGE_BASE_DIR", "/data/files"
        )

        # An optional prefix adds one directory level between base_dir and images/.
        prefix = (settings.get("path_prefix") or "").strip().strip("/")
        parts = [self.base_dir, prefix, "images"] if prefix else [self.base_dir, "images"]
        self.image_dir = os.path.join(*parts)

        self.url_prefix = settings.get("url_prefix") or os.environ.get(
            "LOCAL_STORAGE_URL_PREFIX", "/files"
        )

        os.makedirs(self.image_dir, exist_ok=True)

    def _to_url(self, fpath: str) -> str:
        """Map a stored file path to ``url_prefix`` + its path relative to ``base_dir``."""
        if not self.url_prefix:
            return fpath
        relative = os.path.relpath(fpath, self.base_dir)
        return f"{self.url_prefix}/{relative}"

    def upload_file(self, file_path: str) -> str:
        """Return ``file_path`` unchanged; nothing is copied."""
        return file_path

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Write ``content`` to a uniquely named file and return its URL."""
        target = os.path.join(self.image_dir, f"{uuid.uuid4()}{file_ext}")
        with open(target, "wb") as out:
            out.write(content)
        url = self._to_url(target)
        logger.info("Local storage saved: %s -> %s", target, url)
        return url
|
||||
|
||||
|
||||
class Base64Storage(Storage):
    """Storage backend that inlines bytes as ``data:image/...;base64`` URLs."""

    def upload_file(self, file_path: str) -> str:
        """Return ``file_path`` unchanged; nothing is uploaded."""
        return file_path

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Return a data URL whose image subtype is ``file_ext`` without leading dots."""
        subtype = file_ext.lstrip(".")
        payload = endecode.decode_image(content)
        return f"data:image/{subtype};base64,{payload}"
|
||||
|
||||
|
||||
class DummyStorage(Storage):
    """No-op storage: every upload is discarded and reported as an empty string."""

    def upload_file(self, file_path: str) -> str:
        """Ignore ``file_path`` and return ""."""
        return ""

    def upload_bytes(self, content: bytes, file_ext: str = ".png") -> str:
        """Ignore ``content`` and return ""."""
        return ""
|
||||
|
||||
|
||||
def create_storage(storage_config: Optional[Dict[str, str]] = None) -> Storage:
    """Build the storage backend selected by ``storage_config``.

    The ``provider`` entry of ``storage_config`` picks the backend (minio,
    cos, oss, local, base64). When it is missing or unspecified, the
    ``STORAGE_TYPE`` environment variable is used, defaulting to ``local``.
    Any unrecognised name yields a ``DummyStorage``.
    """
    provider = ""
    if storage_config:
        provider = str(storage_config.get("provider", "")).lower().strip()
    if provider in ("unspecified", "storage_provider_unspecified"):
        provider = ""

    storage_type = provider or os.environ.get("STORAGE_TYPE", "local").lower().strip()

    logger.info("Creating %s storage instance", storage_type)

    # Base64Storage is the only real backend constructed without the config.
    if storage_type == "base64":
        return Base64Storage()

    configurable = {
        "minio": MinioStorage,
        "cos": CosStorage,
        "oss": OssStorage,
        "local": LocalStorage,
    }
    backend_cls = configurable.get(storage_type)
    if backend_cls is None:
        return DummyStorage()
    return backend_cls(storage_config)
|
||||
@@ -5,36 +5,20 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10.18"
|
||||
dependencies = [
|
||||
"antiword>=0.1.0",
|
||||
"asyncio>=4.0.0",
|
||||
"beautifulsoup4>=4.14.2",
|
||||
"cos-python-sdk-v5>=1.9.42",
|
||||
"goose3[all]>=3.1.20",
|
||||
"grpcio>=1.76.0",
|
||||
"grpcio-health-checking>=1.76.0",
|
||||
"grpcio-tools>=1.76.0",
|
||||
"lxml>=6.1.0",
|
||||
"markdown>=3.10",
|
||||
"markdownify>=1.2.0",
|
||||
"markitdown[docx,pdf,xls,xlsx]>=0.1.3",
|
||||
"minio>=7.2.20",
|
||||
"mistletoe>=1.5.0",
|
||||
"ollama>=0.6.0",
|
||||
"openai>=2.7.1",
|
||||
"oss2>=2.19.1",
|
||||
"paddleocr>=2.10.0,<3.0.0",
|
||||
"paddlepaddle>=3.3.1,<4.0.0",
|
||||
"pdfplumber>=0.11.7",
|
||||
"pillow>=12.0.0",
|
||||
"playwright>=1.55.0",
|
||||
"protobuf>=6.33.0",
|
||||
"pydantic>=2.12.3",
|
||||
"pypdf>=6.1.3",
|
||||
"pypdf2>=3.0.1",
|
||||
"pypdfium2>=5.0.0",
|
||||
"python-docx>=1.2.0",
|
||||
"requests>=2.32.5",
|
||||
"textract==1.5.0",
|
||||
"trafilatura>=2.0.0",
|
||||
"urllib3>=2.5.0",
|
||||
]
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
# Make this script's own directory importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import ImageParser (resolved through the sys.path entry added above).
from parser.image_parser import ImageParser

# Configure logging for the script.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def init_ocr_model():
    """Instantiate PaddleOCR once so its models are downloaded and cached.

    After construction a blank white image is run through the engine as a
    smoke test.

    Raises:
        Exception: Any failure is logged and then re-raised.
    """
    try:
        logger.info("Initializing PaddleOCR model for pre-download...")

        # Constructing the engine is what triggers the model download and
        # caching; the settings are meant to match those used by the service.
        ocr = PaddleOCR(
            use_gpu=False,
            text_det_limit_type="max",
            text_det_limit_side_len=960,
            use_doc_orientation_classify=True,  # document orientation classification
            use_doc_unwarping=False,
            use_textline_orientation=True,  # text-line orientation detection
            text_recognition_model_name="PP-OCRv4_server_rec",
            text_detection_model_name="PP-OCRv4_server_det",
            text_det_thresh=0.3,
            text_det_box_thresh=0.6,
            text_det_unclip_ratio=1.5,
            text_rec_score_thresh=0.0,
            ocr_version="PP-OCRv4",
            lang="ch",
            show_log=False,
            use_dilation=True,
            det_db_score_mode="slow",
        )
        logger.info("PaddleOCR model initialization completed successfully")

        # Smoke-test the engine to make sure the models actually work.
        import numpy as np
        from PIL import Image

        # A plain white 300x100 RGB image is enough for a single test pass.
        blank = np.full((100, 300, 3), 255, dtype=np.uint8)
        blank_image = Image.fromarray(blank)

        ocr.ocr(np.array(blank_image), cls=False)
        logger.info("PaddleOCR test completed successfully")

    except Exception as e:
        logger.error(f"Failed to initialize PaddleOCR model: {e!s}")
        raise
|
||||
1726
docreader/uv.lock
generated
1726
docreader/uv.lock
generated
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user