diff --git a/config/prompt_templates/generate_summary.yaml b/config/prompt_templates/generate_summary.yaml index d6267433..a93439c2 100644 --- a/config/prompt_templates/generate_summary.yaml +++ b/config/prompt_templates/generate_summary.yaml @@ -28,10 +28,14 @@ templates: - For technical documents: preserve key terms, metrics, and specific details - For meeting notes/reports: highlight decisions, action items, and conclusions + ## Handling Image-Derived Text + - Text wrapped in `...` or `...` IS extracted text content (produced by a vision model from images/figures in the document). Treat it as first-class document text and summarise it normally. + - A document whose only textual content comes from `` / `` blocks is NOT empty — summarise based on those captions/OCR results. + ## Empty or Insufficient Content - - If the user-provided content is empty, contains only image/figure references with no extracted text, or otherwise carries no substantive textual information, you MUST output exactly the single line: "No textual content was extractable from this document." and nothing else. + - Only when the user-provided content is genuinely empty, contains only bare image placeholders with NO inner caption/OCR text, or otherwise carries no substantive textual information, output exactly the single line: "No textual content was extractable from this document." and nothing else. - Do NOT fabricate a topic, do NOT guess from any other clue, and do NOT copy content from examples or unrelated sources. - - It is correct and expected to refuse to summarise when the content is absent. This is preferred over inventing a plausible-sounding but unsupported summary. + - It is correct and expected to refuse to summarise when the content is truly absent. This is preferred over inventing a plausible-sounding but unsupported summary. ## Language - Use {{language}} for all outputs diff --git a/internal/application/service/knowledge_process.go b/internal/application/service/knowledge_process.go index da01cce5..afe46eb6 100644 --- a/internal/application/service/knowledge_process.go +++ b/internal/application/service/knowledge_process.go @@ -637,6 +637,14 @@ func (s *knowledgeService) processChunks(ctx context.Context, // defaultMaxInputChars is the default maximum characters used as input for summary generation. const defaultMaxInputChars = 1024 * 24 +// imageDominatedTextThreshold is the rune count below which a document is +// considered "image-dominated" — i.e. the body text is so sparse that we +// should fall back to full image enrichment (caption + OCR) for the summary +// LLM call. Above this threshold the document has enough native text that +// caption-only enrichment is preferable (OCR text from incidental figures +// would otherwise add noise without contributing to the main topic). +const imageDominatedTextThreshold = 200 + // errInsufficientSummaryContent signals that getSummary refused to call the // LLM because the document had no usable text after image markup was stripped // (typical for scanned PDFs where VLM OCR yielded nothing). Callers should @@ -707,7 +715,22 @@ func (s *knowledgeService) getSummary(ctx context.Context, imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs) mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap) if mergedImageInfo != "" { - chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo) + // For image-dominated documents (e.g. a docx whose only payload is a + // single embedded picture, or a screenshot-only file), captions alone + // often carry too little signal — the real content lives in OCR text. + // Detect that case by measuring the document's real (non-image-markup) + // text BEFORE enrichment, and switch to full enrichment (caption + OCR) + // when the body is essentially empty. Text-heavy documents stay on the + // caption-only path to avoid OCR noise (page headers/footers/watermarks + // from many figures diluting the main topic). + if realTextRuneCount(chunkContents) < imageDominatedTextThreshold { + // Caption + OCR (no URL/original wrappers — those are pure noise + // for the summary LLM and have been observed to trigger the + // "image reference with no extracted text" refusal heuristic). + chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo) + } else { + chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo) + } } // Apply length limit: sample long content to fit within maxInputChars diff --git a/internal/searchutil/imageinfo.go b/internal/searchutil/imageinfo.go index b526ff91..ae9f9e97 100644 --- a/internal/searchutil/imageinfo.go +++ b/internal/searchutil/imageinfo.go @@ -338,3 +338,81 @@ func EnrichContentCaptionOnly(content string, imageInfoJSON string) string { } return content } + +// EnrichContentCaptionAndOCR is like EnrichContentCaptionOnly but ALSO +// embeds OCR text alongside captions. URL and wrapper +// blocks are deliberately omitted (unlike EnrichContentWithImageInfo) — +// the summary LLM only needs the human-readable text, not opaque export +// hashes. Used as a fallback for image-dominated documents where caption +// alone carries too little signal. +func EnrichContentCaptionAndOCR(content string, imageInfoJSON string) string { + var imageInfos []types.ImageInfo + if err := json.Unmarshal([]byte(imageInfoJSON), &imageInfos); err != nil { + return content + } + if len(imageInfos) == 0 { + return content + } + + imageInfoMap := make(map[string]*types.ImageInfo) + for i := range imageInfos { + if imageInfos[i].URL != "" { + imageInfoMap[imageInfos[i].URL] = &imageInfos[i] + } + if imageInfos[i].OriginalURL != "" { + imageInfoMap[imageInfos[i].OriginalURL] = &imageInfos[i] + } + } + + matches := MarkdownImageRegex.FindAllStringSubmatch(content, -1) + processedURLs := make(map[string]bool) + + for _, match := range matches { + if len(match) < 3 { + continue + } + imgURL := match[2] + processedURLs[imgURL] = true + + imgInfo, found := imageInfoMap[imgURL] + if !found || imgInfo == nil { + continue + } + appended := buildCaptionOCRBlock(imgInfo) + if appended == "" { + continue + } + content = strings.Replace(content, match[0], match[0]+"\n"+appended, 1) + } + + var extras []string + for _, imgInfo := range imageInfos { + if processedURLs[imgInfo.URL] || processedURLs[imgInfo.OriginalURL] { + continue + } + if block := buildCaptionOCRBlock(&imgInfo); block != "" { + extras = append(extras, block) + } + } + if len(extras) > 0 { + if content != "" { + content += "\n" + } + content += strings.Join(extras, "\n") + } + return content +} + +// buildCaptionOCRBlock returns the inline caption + OCR snippet (no URL +// wrapper) used by EnrichContentCaptionAndOCR. Empty string when the image +// has neither caption nor OCR. +func buildCaptionOCRBlock(img *types.ImageInfo) string { + var parts []string + if img.Caption != "" { + parts = append(parts, fmt.Sprintf("%s", img.Caption)) + } + if img.OCRText != "" { + parts = append(parts, fmt.Sprintf("%s", img.OCRText)) + } + return strings.Join(parts, "\n") +}