fix(summary): preserve image caption/OCR text in document summaries

Documents whose only payload is an embedded image (e.g. a docx with a single picture) intermittently produced the refusal line "No textual content was extractable from this document." even though the vision model had successfully extracted a caption.

Three coordinated fixes:

- Clarify the summary prompt that text inside `<image_caption>` and `<image_ocr>` is first-class extracted content, not an image reference, so the model only triggers the empty-content branch when the body is genuinely textless.
- For image-dominated documents (real text < 200 runes after stripping image markup) include OCR alongside captions so screenshots and scanned figures contribute their actual content; text-heavy documents continue to use caption-only enrichment to avoid OCR noise from incidental figures.
- Add `EnrichContentCaptionAndOCR` which embeds caption + OCR text inline next to the original Markdown image link, deliberately omitting the `<image url=...>` and `<image_original>` wrapper blocks. Those wrappers carry only opaque export hashes that consume tokens and have been observed to retrigger the LLM's "image reference with no extracted text" heuristic.
2026-06-04 13:30:32 +08:00 · 2026-05-22 16:14:07 +08:00
parent 72e52f7258
commit c0e4a1d2f1
3 changed files with 108 additions and 3 deletions
--- a/config/prompt_templates/generate_summary.yaml
+++ b/config/prompt_templates/generate_summary.yaml
@@ -28,10 +28,14 @@ templates:
      - For technical documents: preserve key terms, metrics, and specific details
      - For meeting notes/reports: highlight decisions, action items, and conclusions

+      ## Handling Image-Derived Text
+      - Text wrapped in `<image_caption>...</image_caption>` or `<image_ocr>...</image_ocr>` IS extracted text content (produced by a vision model from images/figures in the document). Treat it as first-class document text and summarise it normally.
+      - A document whose only textual content comes from `<image_caption>` / `<image_ocr>` blocks is NOT empty — summarise based on those captions/OCR results.
+
      ## Empty or Insufficient Content
-      - If the user-provided content is empty, contains only image/figure references with no extracted text, or otherwise carries no substantive textual information, you MUST output exactly the single line: "No textual content was extractable from this document." and nothing else.
+      - Only when the user-provided content is genuinely empty, contains only bare image placeholders with NO inner caption/OCR text, or otherwise carries no substantive textual information, output exactly the single line: "No textual content was extractable from this document." and nothing else.
      - Do NOT fabricate a topic, do NOT guess from any other clue, and do NOT copy content from examples or unrelated sources.
-      - It is correct and expected to refuse to summarise when the content is absent. This is preferred over inventing a plausible-sounding but unsupported summary.
+      - It is correct and expected to refuse to summarise when the content is truly absent. This is preferred over inventing a plausible-sounding but unsupported summary.

      ## Language
      - Use {{language}} for all outputs
--- a/internal/application/service/knowledge_process.go
+++ b/internal/application/service/knowledge_process.go
@@ -637,6 +637,14 @@ func (s *knowledgeService) processChunks(ctx context.Context,
 // defaultMaxInputChars is the default maximum characters used as input for summary generation.
 const defaultMaxInputChars = 1024 * 24

+// imageDominatedTextThreshold is the rune count below which a document is
+// considered "image-dominated" — i.e. the body text is so sparse that we
+// should fall back to full image enrichment (caption + OCR) for the summary
+// LLM call. At or above this threshold the document has enough native text
+// that caption-only enrichment is preferable (OCR text from incidental
+// figures would otherwise add noise without contributing to the main topic).
+const imageDominatedTextThreshold = 200
+
 // errInsufficientSummaryContent signals that getSummary refused to call the
 // LLM because the document had no usable text after image markup was stripped
 // (typical for scanned PDFs where VLM OCR yielded nothing). Callers should
@@ -707,7 +715,22 @@ func (s *knowledgeService) getSummary(ctx context.Context,
 	imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs)
 	mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap)
 	if mergedImageInfo != "" {
-		chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
+		// For image-dominated documents (e.g. a docx whose only payload is a
+		// single embedded picture, or a screenshot-only file), captions alone
+		// often carry too little signal — the real content lives in OCR text.
+		// Detect that case by measuring the document's real (non-image-markup)
+		// text BEFORE enrichment, and switch to full enrichment (caption + OCR)
+		// when the body is essentially empty. Text-heavy documents stay on the
+		// caption-only path to avoid OCR noise (page headers/footers/watermarks
+		// from many figures diluting the main topic).
+		if realTextRuneCount(chunkContents) < imageDominatedTextThreshold {
+			// Caption + OCR (no URL/original wrappers — those are pure noise
+			// for the summary LLM and have been observed to trigger the
+			// "image reference with no extracted text" refusal heuristic).
+			chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo)
+		} else {
+			chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
+		}
 	}

 	// Apply length limit: sample long content to fit within maxInputChars
--- a/internal/searchutil/imageinfo.go
+++ b/internal/searchutil/imageinfo.go
@@ -338,3 +338,81 @@ func EnrichContentCaptionOnly(content string, imageInfoJSON string) string {
 	}
 	return content
 }
+
+// EnrichContentCaptionAndOCR is like EnrichContentCaptionOnly but ALSO
+// embeds OCR text: a caption/OCR block follows each Markdown image link with
+// a known URL, and images no link references are appended at the end. URL
+// and <image_original> wrappers are deliberately omitted (unlike
+// EnrichContentWithImageInfo) as noise for the summary LLM. A fallback for
+// image-dominated documents; unparsable or empty image info is a no-op.
+func EnrichContentCaptionAndOCR(content string, imageInfoJSON string) string {
+	var imageInfos []types.ImageInfo
+	if err := json.Unmarshal([]byte(imageInfoJSON), &imageInfos); err != nil || len(imageInfos) == 0 {
+		return content
+	}
+
+	byURL := make(map[string]*types.ImageInfo, 2*len(imageInfos))
+	for i := range imageInfos {
+		info := &imageInfos[i]
+		if info.URL != "" {
+			byURL[info.URL] = info
+		}
+		if info.OriginalURL != "" {
+			byURL[info.OriginalURL] = info
+		}
+	}
+
+	// Rebuild content in one pass over match offsets: replacing by match text
+	// always hits the FIRST occurrence, so an image linked twice would get
+	// both blocks after its first link and none after its second.
+	var b strings.Builder
+	seen := make(map[string]bool)
+	last := 0
+	for _, loc := range MarkdownImageRegex.FindAllStringSubmatchIndex(content, -1) {
+		// Skip empty URLs: seen[""] would hide entries with an unset URL field.
+		if len(loc) < 6 || loc[4] < 0 || loc[4] == loc[5] {
+			continue
+		}
+		imgURL := content[loc[4]:loc[5]]
+		seen[imgURL] = true
+		if info, ok := byURL[imgURL]; ok {
+			if block := buildCaptionOCRBlock(info); block != "" {
+				b.WriteString(content[last:loc[1]])
+				b.WriteString("\n" + block)
+				last = loc[1]
+			}
+		}
+	}
+	b.WriteString(content[last:])
+	content = b.String()
+
+	var extras []string
+	for i := range imageInfos {
+		if info := &imageInfos[i]; !seen[info.URL] && !seen[info.OriginalURL] {
+			if block := buildCaptionOCRBlock(info); block != "" {
+				extras = append(extras, block)
+			}
+		}
+	}
+	if len(extras) > 0 {
+		if content != "" {
+			content += "\n"
+		}
+		content += strings.Join(extras, "\n")
+	}
+	return content
+}
+
+// buildCaptionOCRBlock returns the inline caption + OCR snippet (no URL
+// wrapper) used by EnrichContentCaptionAndOCR. Empty string when the image
+// has neither caption nor OCR.
+func buildCaptionOCRBlock(img *types.ImageInfo) string {
+	var parts []string
+	if img.Caption != "" {
+		parts = append(parts, fmt.Sprintf("<image_caption>%s</image_caption>", img.Caption))
+	}
+	if img.OCRText != "" {
+		parts = append(parts, fmt.Sprintf("<image_ocr>%s</image_ocr>", img.OCRText))
+	}
+	return strings.Join(parts, "\n")
+}