mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
fix(summary): preserve image caption/OCR text in document summaries
Documents whose only payload is an embedded image (e.g. a docx with a single picture) intermittently produced the refusal line "No textual content was extractable from this document." even though the vision model had successfully extracted a caption.

Three coordinated fixes:

- Clarify the summary prompt that text inside `<image_caption>` and `<image_ocr>` is first-class extracted content, not an image reference, so the model only triggers the empty-content branch when the body is genuinely textless.
- For image-dominated documents (real text < 200 runes after stripping image markup) include OCR alongside captions so screenshots and scanned figures contribute their actual content; text-heavy documents continue to use caption-only enrichment to avoid OCR noise from incidental figures.
- Add `EnrichContentCaptionAndOCR` which embeds caption + OCR text inline next to the original Markdown image link, deliberately omitting the `<image url=...>` and `<image_original>` wrapper blocks. Those wrappers carry only opaque export hashes that consume tokens and have been observed to retrigger the LLM's "image reference with no extracted text" heuristic.
This commit is contained in:
@@ -28,10 +28,14 @@ templates:
|
||||
- For technical documents: preserve key terms, metrics, and specific details
|
||||
- For meeting notes/reports: highlight decisions, action items, and conclusions
|
||||
|
||||
## Handling Image-Derived Text
|
||||
- Text wrapped in `<image_caption>...</image_caption>` or `<image_ocr>...</image_ocr>` IS extracted text content (produced by a vision model from images/figures in the document). Treat it as first-class document text and summarise it normally.
|
||||
- A document whose only textual content comes from `<image_caption>` / `<image_ocr>` blocks is NOT empty — summarise based on those captions/OCR results.
|
||||
|
||||
## Empty or Insufficient Content
|
||||
- If the user-provided content is empty, contains only image/figure references with no extracted text, or otherwise carries no substantive textual information, you MUST output exactly the single line: "No textual content was extractable from this document." and nothing else.
|
||||
- Only when the user-provided content is genuinely empty, contains only bare image placeholders with NO inner caption/OCR text, or otherwise carries no substantive textual information, output exactly the single line: "No textual content was extractable from this document." and nothing else.
|
||||
- Do NOT fabricate a topic, do NOT guess from any other clue, and do NOT copy content from examples or unrelated sources.
|
||||
- It is correct and expected to refuse to summarise when the content is absent. This is preferred over inventing a plausible-sounding but unsupported summary.
|
||||
- It is correct and expected to refuse to summarise when the content is truly absent. This is preferred over inventing a plausible-sounding but unsupported summary.
|
||||
|
||||
## Language
|
||||
- Use {{language}} for all outputs
|
||||
|
||||
@@ -637,6 +637,14 @@ func (s *knowledgeService) processChunks(ctx context.Context,
|
||||
// defaultMaxInputChars is the default maximum characters used as input for summary generation.
|
||||
const defaultMaxInputChars = 1024 * 24
|
||||
|
||||
// imageDominatedTextThreshold is the rune count below which a document is
|
||||
// considered "image-dominated" — i.e. the body text is so sparse that we
|
||||
// should fall back to full image enrichment (caption + OCR) for the summary
|
||||
// LLM call. Above this threshold the document has enough native text that
|
||||
// caption-only enrichment is preferable (OCR text from incidental figures
|
||||
// would otherwise add noise without contributing to the main topic).
|
||||
const imageDominatedTextThreshold = 200
|
||||
|
||||
// errInsufficientSummaryContent signals that getSummary refused to call the
|
||||
// LLM because the document had no usable text after image markup was stripped
|
||||
// (typical for scanned PDFs where VLM OCR yielded nothing). Callers should
|
||||
@@ -707,7 +715,22 @@ func (s *knowledgeService) getSummary(ctx context.Context,
|
||||
imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs)
|
||||
mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap)
|
||||
if mergedImageInfo != "" {
|
||||
chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
|
||||
// For image-dominated documents (e.g. a docx whose only payload is a
|
||||
// single embedded picture, or a screenshot-only file), captions alone
|
||||
// often carry too little signal — the real content lives in OCR text.
|
||||
// Detect that case by measuring the document's real (non-image-markup)
|
||||
// text BEFORE enrichment, and switch to full enrichment (caption + OCR)
|
||||
// when the body is essentially empty. Text-heavy documents stay on the
|
||||
// caption-only path to avoid OCR noise (page headers/footers/watermarks
|
||||
// from many figures diluting the main topic).
|
||||
if realTextRuneCount(chunkContents) < imageDominatedTextThreshold {
|
||||
// Caption + OCR (no URL/original wrappers — those are pure noise
|
||||
// for the summary LLM and have been observed to trigger the
|
||||
// "image reference with no extracted text" refusal heuristic).
|
||||
chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo)
|
||||
} else {
|
||||
chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
|
||||
}
|
||||
}
|
||||
|
||||
// Apply length limit: sample long content to fit within maxInputChars
|
||||
|
||||
@@ -338,3 +338,81 @@ func EnrichContentCaptionOnly(content string, imageInfoJSON string) string {
|
||||
}
|
||||
return content
|
||||
}
|
||||
|
||||
// EnrichContentCaptionAndOCR is like EnrichContentCaptionOnly but ALSO
|
||||
// embeds OCR text alongside captions. URL and <image_original> wrapper
|
||||
// blocks are deliberately omitted (unlike EnrichContentWithImageInfo) —
|
||||
// the summary LLM only needs the human-readable text, not opaque export
|
||||
// hashes. Used as a fallback for image-dominated documents where caption
|
||||
// alone carries too little signal.
|
||||
func EnrichContentCaptionAndOCR(content string, imageInfoJSON string) string {
|
||||
var imageInfos []types.ImageInfo
|
||||
if err := json.Unmarshal([]byte(imageInfoJSON), &imageInfos); err != nil {
|
||||
return content
|
||||
}
|
||||
if len(imageInfos) == 0 {
|
||||
return content
|
||||
}
|
||||
|
||||
imageInfoMap := make(map[string]*types.ImageInfo)
|
||||
for i := range imageInfos {
|
||||
if imageInfos[i].URL != "" {
|
||||
imageInfoMap[imageInfos[i].URL] = &imageInfos[i]
|
||||
}
|
||||
if imageInfos[i].OriginalURL != "" {
|
||||
imageInfoMap[imageInfos[i].OriginalURL] = &imageInfos[i]
|
||||
}
|
||||
}
|
||||
|
||||
matches := MarkdownImageRegex.FindAllStringSubmatch(content, -1)
|
||||
processedURLs := make(map[string]bool)
|
||||
|
||||
for _, match := range matches {
|
||||
if len(match) < 3 {
|
||||
continue
|
||||
}
|
||||
imgURL := match[2]
|
||||
processedURLs[imgURL] = true
|
||||
|
||||
imgInfo, found := imageInfoMap[imgURL]
|
||||
if !found || imgInfo == nil {
|
||||
continue
|
||||
}
|
||||
appended := buildCaptionOCRBlock(imgInfo)
|
||||
if appended == "" {
|
||||
continue
|
||||
}
|
||||
content = strings.Replace(content, match[0], match[0]+"\n"+appended, 1)
|
||||
}
|
||||
|
||||
var extras []string
|
||||
for _, imgInfo := range imageInfos {
|
||||
if processedURLs[imgInfo.URL] || processedURLs[imgInfo.OriginalURL] {
|
||||
continue
|
||||
}
|
||||
if block := buildCaptionOCRBlock(&imgInfo); block != "" {
|
||||
extras = append(extras, block)
|
||||
}
|
||||
}
|
||||
if len(extras) > 0 {
|
||||
if content != "" {
|
||||
content += "\n"
|
||||
}
|
||||
content += strings.Join(extras, "\n")
|
||||
}
|
||||
return content
|
||||
}
|
||||
|
||||
// buildCaptionOCRBlock returns the inline caption + OCR snippet (no URL
|
||||
// wrapper) used by EnrichContentCaptionAndOCR. Empty string when the image
|
||||
// has neither caption nor OCR.
|
||||
func buildCaptionOCRBlock(img *types.ImageInfo) string {
|
||||
var parts []string
|
||||
if img.Caption != "" {
|
||||
parts = append(parts, fmt.Sprintf("<image_caption>%s</image_caption>", img.Caption))
|
||||
}
|
||||
if img.OCRText != "" {
|
||||
parts = append(parts, fmt.Sprintf("<image_ocr>%s</image_ocr>", img.OCRText))
|
||||
}
|
||||
return strings.Join(parts, "\n")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user