fix(summary): preserve image caption/OCR text in document summaries

Documents whose only payload is an embedded image (e.g. a docx with a
single picture) intermittently produced the refusal line "No textual
content was extractable from this document." even though the vision
model had successfully extracted a caption.

Three coordinated fixes:

- Clarify in the summary prompt that text inside `<image_caption>` and
  `<image_ocr>` is first-class extracted content, not an image
  reference, so the model only triggers the empty-content branch when
  the body is genuinely textless.
- For image-dominated documents (real text < 200 runes after stripping
  image markup), include OCR alongside captions so screenshots and
  scanned figures contribute their actual content; text-heavy
  documents continue to use caption-only enrichment to avoid OCR
  noise from incidental figures.
- Add `EnrichContentCaptionAndOCR` which embeds caption + OCR text
  inline next to the original Markdown image link, deliberately
  omitting the `<image url=...>` and `<image_original>` wrapper
  blocks. Those wrappers carry only opaque export hashes that consume
  tokens and have been observed to retrigger the LLM's "image
  reference with no extracted text" heuristic.
This commit is contained in:
wizardchen
2026-05-22 16:14:07 +08:00
committed by lyingbug
parent 72e52f7258
commit c0e4a1d2f1
3 changed files with 108 additions and 3 deletions

View File

@@ -28,10 +28,14 @@ templates:
- For technical documents: preserve key terms, metrics, and specific details
- For meeting notes/reports: highlight decisions, action items, and conclusions
## Handling Image-Derived Text
- Text wrapped in `<image_caption>...</image_caption>` or `<image_ocr>...</image_ocr>` IS extracted text content (produced by a vision model from images/figures in the document). Treat it as first-class document text and summarise it normally.
- A document whose only textual content comes from `<image_caption>` / `<image_ocr>` blocks is NOT empty — summarise based on those captions/OCR results.
## Empty or Insufficient Content
- If the user-provided content is empty, contains only image/figure references with no extracted text, or otherwise carries no substantive textual information, you MUST output exactly the single line: "No textual content was extractable from this document." and nothing else.
- Only when the user-provided content is genuinely empty, contains only bare image placeholders with NO inner caption/OCR text, or otherwise carries no substantive textual information, output exactly the single line: "No textual content was extractable from this document." and nothing else.
- Do NOT fabricate a topic, do NOT guess from any other clue, and do NOT copy content from examples or unrelated sources.
- It is correct and expected to refuse to summarise when the content is absent. This is preferred over inventing a plausible-sounding but unsupported summary.
- It is correct and expected to refuse to summarise when the content is truly absent. This is preferred over inventing a plausible-sounding but unsupported summary.
## Language
- Use {{language}} for all outputs

View File

@@ -637,6 +637,14 @@ func (s *knowledgeService) processChunks(ctx context.Context,
// defaultMaxInputChars is the default maximum characters used as input for summary generation.
const defaultMaxInputChars = 1024 * 24
// imageDominatedTextThreshold is the rune count below which a document is
// considered "image-dominated" — i.e. the body text is so sparse that we
// should fall back to full image enrichment (caption + OCR) for the summary
// LLM call. Above this threshold the document has enough native text that
// caption-only enrichment is preferable (OCR text from incidental figures
// would otherwise add noise without contributing to the main topic).
const imageDominatedTextThreshold = 200
// errInsufficientSummaryContent signals that getSummary refused to call the
// LLM because the document had no usable text after image markup was stripped
// (typical for scanned PDFs where VLM OCR yielded nothing). Callers should
@@ -707,7 +715,22 @@ func (s *knowledgeService) getSummary(ctx context.Context,
imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs)
mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap)
if mergedImageInfo != "" {
chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
// For image-dominated documents (e.g. a docx whose only payload is a
// single embedded picture, or a screenshot-only file), captions alone
// often carry too little signal — the real content lives in OCR text.
// Detect that case by measuring the document's real (non-image-markup)
// text BEFORE enrichment, and switch to full enrichment (caption + OCR)
// when the body is essentially empty. Text-heavy documents stay on the
// caption-only path to avoid OCR noise (page headers/footers/watermarks
// from many figures diluting the main topic).
if realTextRuneCount(chunkContents) < imageDominatedTextThreshold {
// Caption + OCR (no URL/original wrappers — those are pure noise
// for the summary LLM and have been observed to trigger the
// "image reference with no extracted text" refusal heuristic).
chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo)
} else {
chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
}
}
// Apply length limit: sample long content to fit within maxInputChars

View File

@@ -338,3 +338,81 @@ func EnrichContentCaptionOnly(content string, imageInfoJSON string) string {
}
return content
}
// EnrichContentCaptionAndOCR is like EnrichContentCaptionOnly but ALSO
// embeds OCR text alongside captions. URL and <image_original> wrapper
// blocks are deliberately omitted (unlike EnrichContentWithImageInfo) —
// the summary LLM only needs the human-readable text, not opaque export
// hashes. Used as a fallback for image-dominated documents where caption
// alone carries too little signal.
//
// imageInfoJSON is expected to be a JSON array of types.ImageInfo. If it
// cannot be decoded, or decodes to an empty list, content is returned
// unchanged (enrichment is best-effort).
//
// For every Markdown image link in content whose URL matches an image's
// URL or OriginalURL, the caption/OCR block is inserted on a new line
// directly after THAT occurrence of the link. Images whose URLs do not
// appear in content at all have their blocks appended at the end, one per
// line.
func EnrichContentCaptionAndOCR(content string, imageInfoJSON string) string {
	var imageInfos []types.ImageInfo
	if err := json.Unmarshal([]byte(imageInfoJSON), &imageInfos); err != nil {
		// Best effort: malformed image metadata must not break the caller.
		return content
	}
	if len(imageInfos) == 0 {
		return content
	}

	// Index every image under both of its URLs so either form found in the
	// Markdown resolves to the same entry.
	imageInfoMap := make(map[string]*types.ImageInfo, 2*len(imageInfos))
	for i := range imageInfos {
		info := &imageInfos[i]
		if info.URL != "" {
			imageInfoMap[info.URL] = info
		}
		if info.OriginalURL != "" {
			imageInfoMap[info.OriginalURL] = info
		}
	}

	// Rebuild the content in a single left-to-right pass using match
	// offsets. Replacing by text (strings.Replace with n=1) would always hit
	// the first textual occurrence, so a link repeated in the document would
	// collect every block on its first occurrence and none on the others.
	locs := MarkdownImageRegex.FindAllStringSubmatchIndex(content, -1)
	processedURLs := make(map[string]bool, len(locs))
	var enriched strings.Builder
	enriched.Grow(len(content))
	copied := 0 // content[:copied] has already been written to enriched
	for _, loc := range locs {
		// loc[4]:loc[5] is capture group 2 (the URL); skip matches where
		// the group is missing or did not participate.
		if len(loc) < 6 || loc[4] < 0 {
			continue
		}
		imgURL := content[loc[4]:loc[5]]
		processedURLs[imgURL] = true
		imgInfo, found := imageInfoMap[imgURL]
		if !found {
			continue
		}
		appended := buildCaptionOCRBlock(imgInfo)
		if appended == "" {
			continue
		}
		// Emit everything up to and including the link, then the block.
		enriched.WriteString(content[copied:loc[1]])
		enriched.WriteString("\n")
		enriched.WriteString(appended)
		copied = loc[1]
	}
	enriched.WriteString(content[copied:])
	content = enriched.String()

	// linked reports whether a non-empty URL was seen as a Markdown link.
	// Empty URLs are never treated as linked, so an image lacking one of
	// its two URL fields is not skipped by accident.
	linked := func(u string) bool {
		return u != "" && processedURLs[u]
	}

	// Images that are not referenced inline still contribute their text.
	var extras []string
	for i := range imageInfos {
		info := &imageInfos[i]
		if linked(info.URL) || linked(info.OriginalURL) {
			continue
		}
		if block := buildCaptionOCRBlock(info); block != "" {
			extras = append(extras, block)
		}
	}
	if len(extras) > 0 {
		if content != "" {
			content += "\n"
		}
		content += strings.Join(extras, "\n")
	}
	return content
}
// buildCaptionOCRBlock renders the inline snippet that
// EnrichContentCaptionAndOCR places next to an image: an <image_caption>
// element when the image has a caption, followed on the next line by an
// <image_ocr> element when it has OCR text. No URL wrapper is emitted.
// The result is the empty string when both fields are empty.
func buildCaptionOCRBlock(img *types.ImageInfo) string {
	var captionTag, ocrTag string
	if img.Caption != "" {
		captionTag = fmt.Sprintf("<image_caption>%s</image_caption>", img.Caption)
	}
	if img.OCRText != "" {
		ocrTag = fmt.Sprintf("<image_ocr>%s</image_ocr>", img.OCRText)
	}

	switch {
	case captionTag == "":
		// Covers both "OCR only" and "nothing at all" (ocrTag is "" then).
		return ocrTag
	case ocrTag == "":
		return captionTag
	default:
		return captionTag + "\n" + ocrTag
	}
}