From c19d3543c8529f43f0980d8e262135497a6ce405 Mon Sep 17 00:00:00 2001 From: Li Xianggang Date: Mon, 18 May 2026 17:25:22 +0800 Subject: [PATCH] =?UTF-8?q?feat(url):=20=E6=94=AF=E6=8C=81docreader?= =?UTF-8?q?=E4=B8=8D=E4=B8=8A=E4=BC=A0=E6=9B=BF=E6=8D=A2=E7=99=BD=E5=90=8D?= =?UTF-8?q?=E5=8D=95url=E7=9A=84=E5=9B=BE=E7=89=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.example | 5 ++ .env.lite.example | 2 + docker-compose.yml | 2 + .../docparser/image_resolver.go | 73 ++++++++++++++++--- 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/.env.example b/.env.example index 75c12fcb..11235cd0 100644 --- a/.env.example +++ b/.env.example @@ -493,6 +493,11 @@ DOCREADER_TRANSPORT=grpc # Weaviate gRPC 地址(Docker 内:weaviate:50051;宿主机访问:localhost:50052) # WEAVIATE_GRPC_ADDRESS=weaviate:50051 +# 保留原始 URL 的图片域名白名单(可选,逗号分隔) +# 配置后,这些域名的图片仍会被下载和分析(OCR/字幕),但 markdown 中保留原始 URL, +# 不会被替换为对象存储的 provider:// URL。适用于内部稳定服务(如自建 MinerU)。 +# IMAGE_HOST_KEEP_URL=mineru.internal.example.com + # Weaviate 架构模式 # WEAVIATE_SCHEME=http diff --git a/.env.lite.example b/.env.lite.example index cc00dd69..f88f5c54 100644 --- a/.env.lite.example +++ b/.env.lite.example @@ -43,6 +43,8 @@ NEO4J_ENABLE=false WEKNORA_SANDBOX_MODE=disabled ENABLE_GRAPH_RAG=false DISABLE_REGISTRATION=false +# 保留原始 URL 的图片域名白名单(可选) +# IMAGE_HOST_KEEP_URL=mineru.internal.example.com # === Langfuse 可观测性(可选) === # 追踪 chat / embedding / rerank / VLM / ASR 的 prompt、响应与 token 消耗。 diff --git a/docker-compose.yml b/docker-compose.yml index 75a8e5b6..398332e4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -149,6 +149,8 @@ services: - TENANT_AES_KEY=${TENANT_AES_KEY:-} - SYSTEM_AES_KEY=${SYSTEM_AES_KEY:-} - SSRF_WHITELIST=${SSRF_WHITELIST:-} + # 保留原始 URL 的图片域名白名单(逗号分隔,不替换为 provider://) + - IMAGE_HOST_KEEP_URL=${IMAGE_HOST_KEEP_URL:-} # Always allow the optional searxng sidecar (compose service hostname); # merged on top of SSRF_WHITELIST so user overrides don't clobber it. - SSRF_WHITELIST_EXTRA=${SSRF_WHITELIST_EXTRA:-searxng} diff --git a/internal/infrastructure/docparser/image_resolver.go b/internal/infrastructure/docparser/image_resolver.go index 5adce2c6..fe84e9b4 100644 --- a/internal/infrastructure/docparser/image_resolver.go +++ b/internal/infrastructure/docparser/image_resolver.go @@ -13,6 +13,8 @@ import ( "log" "mime" "net/http" + "net/url" + "os" "path" "path/filepath" "regexp" @@ -195,6 +197,35 @@ func isProviderScheme(p string) bool { return false } +// isWhitelistedImageHost checks if the image URL's host is in the whitelist. +// Whitelisted hosts are trusted (e.g. internal MinerU service) — images are +// still downloaded for validation and OCR/caption analysis, but not uploaded +// to object storage. The markdown keeps the original URL. +// Configure via IMAGE_HOST_KEEP_URL env var (comma-separated hosts). +func isWhitelistedImageHost(rawURL string) bool { + whitelist := strings.TrimSpace(os.Getenv("IMAGE_HOST_KEEP_URL")) + if whitelist == "" { + return false + } + u, err := url.Parse(rawURL) + if err != nil || u.Host == "" { + return false + } + host := strings.ToLower(u.Host) + hostname := strings.ToLower(u.Hostname()) + for _, h := range strings.Split(whitelist, ",") { + h = strings.ToLower(strings.TrimSpace(h)) + if h == "" { + continue + } + // Exact host match (includes port) or hostname match (any port) + if host == h || hostname == h { + return true + } + } + return false +} + // --------------------------------------------------------------------------- // Helper functions for base64 image handling // --------------------------------------------------------------------------- @@ -670,10 +701,18 @@ func (r *ImageResolver) ResolveRemoteImages( continue } - // --- SSRF check (centralised entry-point with whitelist support) --- - if err := secutils.ValidateURLForSSRF(imgURL); err != nil { - log.Printf("WARN: remote image blocked by SSRF check (%v): %s", err, imgURL) - continue + // For whitelisted hosts: download to validate (mime type, icon check), + // create StoredImage for downstream OCR/caption analysis, but do NOT + // upload to storage and keep the original URL in markdown. + // The multimodal service will download from the original URL later. + whitelisted := isWhitelistedImageHost(imgURL) + + // --- SSRF check (skip for whitelisted) --- + if !whitelisted { + if err := secutils.ValidateURLForSSRF(imgURL); err != nil { + log.Printf("WARN: remote image blocked by SSRF check (%v): %s", err, imgURL) + continue + } } // --- Download --- @@ -697,12 +736,20 @@ func (r *ImageResolver) ResolveRemoteImages( ext = ".png" // safe default } - // --- Upload to storage --- - fileName := uuid.New().String() + ext - servingURL, saveErr := fileSvc.SaveBytes(ctx, data, tenantID, fileName, false) - if saveErr != nil { - log.Printf("WARN: failed to save remote image %s: %v", imgURL, saveErr) - continue + var servingURL string + if whitelisted { + // Keep the original URL — ImageMultimodalService will download it + // directly for OCR/caption analysis. + servingURL = imgURL + } else { + // --- Upload to storage --- + fileName := uuid.New().String() + ext + var saveErr error + servingURL, saveErr = fileSvc.SaveBytes(ctx, data, tenantID, fileName, false) + if saveErr != nil { + log.Printf("WARN: failed to save remote image %s: %v", imgURL, saveErr) + continue + } } images = append(images, StoredImage{ @@ -711,8 +758,10 @@ func (r *ImageResolver) ResolveRemoteImages( MimeType: mimeType, }) - // Replace URL in markdown. - markdown = markdown[:m[4]] + servingURL + markdown[m[5]:] + if !whitelisted { + // Replace URL in markdown. + markdown = markdown[:m[4]] + servingURL + markdown[m[5]:] + } processed++ }