Files
WeKnora/docreader/config.py

138 lines
4.2 KiB
Python

import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, Iterable, Optional, Tuple
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def _get_first_env(keys: Iterable[str]) -> Tuple[Optional[str], Optional[str]]:
"""Return (value, key) for the first existing env var in keys."""
for k in keys:
if k in os.environ:
return os.environ.get(k), k
return None, None
def _get_str(keys: Iterable[str], default: str = "") -> str:
v, _ = _get_first_env(keys)
return default if v is None else str(v)
def _get_int(keys: Iterable[str], default: int) -> int:
v, _ = _get_first_env(keys)
if v is None or str(v).strip() == "":
return default
try:
return int(str(v).strip())
except Exception:
return default
def _get_bool(keys: Iterable[str], default: bool) -> bool:
v, _ = _get_first_env(keys)
if v is None or str(v).strip() == "":
return default
return str(v).strip().lower() in {"1", "true", "yes", "y", "on"}
def _mask_secret(v: str) -> str:
if not v:
return ""
if len(v) <= 6:
return "***"
return f"{v[:2]}***{v[-2:]}"
@dataclass(frozen=True)
class DocReaderConfig:
# gRPC
grpc_max_workers: int
grpc_max_file_size_mb: int
grpc_port: int
# Parser
docx_max_pages: int
markitdown_max_workers: int
pdf_render_max_workers: int
pdf_render_dpi: int
pdf_jpeg_quality: int
# Proxy
external_http_proxy: str
external_https_proxy: str
# Temp image output directory (shared with Go app via volume, local mode fallback)
image_output_dir: str
def load_config() -> DocReaderConfig:
"""Load config from environment variables (lightweight version)."""
grpc_max_workers = _get_int(["DOCREADER_GRPC_MAX_WORKERS", "GRPC_MAX_WORKERS"], 4)
grpc_max_file_size_mb = (
_get_int(["DOCREADER_GRPC_MAX_FILE_SIZE_MB", "MAX_FILE_SIZE_MB"], 50)
* 1024
* 1024
)
grpc_port = _get_int(["DOCREADER_GRPC_PORT", "PORT"], 50051)
docx_max_pages = _get_int(["DOCREADER_DOCX_MAX_PAGES"], 0)
markitdown_max_workers = _get_int(["DOCREADER_MARKITDOWN_MAX_WORKERS"], 1)
pdf_render_max_workers = _get_int(["DOCREADER_PDF_RENDER_MAX_WORKERS"], 1)
pdf_render_dpi = _get_int(["DOCREADER_PDF_RENDER_DPI"], 200)
pdf_jpeg_quality = _get_int(["DOCREADER_PDF_JPEG_QUALITY"], 90)
external_http_proxy = _get_str(
["DOCREADER_EXTERNAL_HTTP_PROXY", "EXTERNAL_HTTP_PROXY"], ""
)
external_https_proxy = _get_str(
["DOCREADER_EXTERNAL_HTTPS_PROXY", "EXTERNAL_HTTPS_PROXY"], ""
)
image_output_dir = _get_str(
["DOCREADER_IMAGE_OUTPUT_DIR", "IMAGE_OUTPUT_DIR"], "/tmp/docreader"
)
return DocReaderConfig(
grpc_max_workers=grpc_max_workers,
grpc_max_file_size_mb=grpc_max_file_size_mb,
grpc_port=grpc_port,
docx_max_pages=docx_max_pages,
markitdown_max_workers=markitdown_max_workers,
pdf_render_max_workers=pdf_render_max_workers,
pdf_render_dpi=pdf_render_dpi,
pdf_jpeg_quality=pdf_jpeg_quality,
external_http_proxy=external_http_proxy,
external_https_proxy=external_https_proxy,
image_output_dir=image_output_dir,
)
CONFIG = load_config()
def dump_config(mask_secrets: bool = True) -> Dict[str, Any]:
cfg = CONFIG
d: Dict[str, Any] = {
"DOCREADER_GRPC_MAX_WORKERS": cfg.grpc_max_workers,
"DOCREADER_GRPC_MAX_FILE_SIZE_MB": cfg.grpc_max_file_size_mb,
"DOCREADER_GRPC_PORT": cfg.grpc_port,
"DOCREADER_DOCX_MAX_PAGES": cfg.docx_max_pages,
"DOCREADER_MARKITDOWN_MAX_WORKERS": cfg.markitdown_max_workers,
"DOCREADER_PDF_RENDER_MAX_WORKERS": cfg.pdf_render_max_workers,
"DOCREADER_PDF_RENDER_DPI": cfg.pdf_render_dpi,
"DOCREADER_PDF_JPEG_QUALITY": cfg.pdf_jpeg_quality,
"DOCREADER_EXTERNAL_HTTP_PROXY": cfg.external_http_proxy,
"DOCREADER_EXTERNAL_HTTPS_PROXY": cfg.external_https_proxy,
"DOCREADER_IMAGE_OUTPUT_DIR": cfg.image_output_dir,
}
return d
def print_config() -> None:
d = dump_config(mask_secrets=True)
logger.info("DocReader env/config (effective values):")
for k in sorted(d.keys()):
logger.info("%s=%s", k, d[k])