diff --git a/config/prompt_templates/generate_summary.yaml b/config/prompt_templates/generate_summary.yaml
index d6267433..a93439c2 100644
--- a/config/prompt_templates/generate_summary.yaml
+++ b/config/prompt_templates/generate_summary.yaml
@@ -28,10 +28,14 @@ templates:
- For technical documents: preserve key terms, metrics, and specific details
- For meeting notes/reports: highlight decisions, action items, and conclusions
+ ## Handling Image-Derived Text
+ - Text inside image caption or image OCR blocks IS extracted text content (produced by a vision model from images/figures in the document). Treat it as first-class document text and summarise it normally.
+ - A document whose only textual content comes from image caption / OCR blocks is NOT empty — summarise based on those captions/OCR results.
+
## Empty or Insufficient Content
- - If the user-provided content is empty, contains only image/figure references with no extracted text, or otherwise carries no substantive textual information, you MUST output exactly the single line: "No textual content was extractable from this document." and nothing else.
+ - Only when the user-provided content is genuinely empty, contains only bare image placeholders with NO inner caption/OCR text, or otherwise carries no substantive textual information, output exactly the single line: "No textual content was extractable from this document." and nothing else.
- Do NOT fabricate a topic, do NOT guess from any other clue, and do NOT copy content from examples or unrelated sources.
- - It is correct and expected to refuse to summarise when the content is absent. This is preferred over inventing a plausible-sounding but unsupported summary.
+ - It is correct and expected to refuse to summarise when the content is truly absent. This is preferred over inventing a plausible-sounding but unsupported summary.
## Language
- Use {{language}} for all outputs
diff --git a/internal/application/service/knowledge_process.go b/internal/application/service/knowledge_process.go
index da01cce5..afe46eb6 100644
--- a/internal/application/service/knowledge_process.go
+++ b/internal/application/service/knowledge_process.go
@@ -637,6 +637,14 @@ func (s *knowledgeService) processChunks(ctx context.Context,
// defaultMaxInputChars is the default maximum characters used as input for summary generation.
const defaultMaxInputChars = 1024 * 24
+// imageDominatedTextThreshold is the rune count below which a document is
+// considered "image-dominated" — i.e. the body text is so sparse that we
+// should fall back to full image enrichment (caption + OCR) for the summary
+// LLM call. At or above this threshold the document has enough native text
+// that caption-only enrichment is preferable (OCR text from incidental
+// figures would otherwise add noise without contributing to the main topic).
+const imageDominatedTextThreshold = 200
+
// errInsufficientSummaryContent signals that getSummary refused to call the
// LLM because the document had no usable text after image markup was stripped
// (typical for scanned PDFs where VLM OCR yielded nothing). Callers should
@@ -707,7 +715,22 @@ func (s *knowledgeService) getSummary(ctx context.Context,
imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs)
mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap)
if mergedImageInfo != "" {
- chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
+ // For image-dominated documents (e.g. a docx whose only payload is a
+ // single embedded picture, or a screenshot-only file), captions alone
+ // often carry too little signal — the real content lives in OCR text.
+ // Detect that case by measuring the document's real (non-image-markup)
+ // text BEFORE enrichment, and switch to full enrichment (caption + OCR)
+ // when the body is essentially empty. Text-heavy documents stay on the
+ // caption-only path to avoid OCR noise (page headers/footers/watermarks
+ // from many figures diluting the main topic).
+ if realTextRuneCount(chunkContents) < imageDominatedTextThreshold {
+ // Caption + OCR (no URL/original wrappers — those are pure noise
+ // for the summary LLM and have been observed to trigger the
+ // "image reference with no extracted text" refusal heuristic).
+ chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo)
+ } else {
+ chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
+ }
}
// Apply length limit: sample long content to fit within maxInputChars
diff --git a/internal/searchutil/imageinfo.go b/internal/searchutil/imageinfo.go
index b526ff91..ae9f9e97 100644
--- a/internal/searchutil/imageinfo.go
+++ b/internal/searchutil/imageinfo.go
@@ -338,3 +338,81 @@ func EnrichContentCaptionOnly(content string, imageInfoJSON string) string {
}
return content
}
+
+// EnrichContentCaptionAndOCR is like EnrichContentCaptionOnly but ALSO
+// appends OCR text alongside captions after each markdown image reference.
+// No URL or wrapper markup is added (unlike EnrichContentWithImageInfo) —
+// the summary LLM only needs the human-readable text. Images that content
+// never references are appended at the end; content is returned unchanged
+// when imageInfoJSON decodes to no images or fails to parse.
+func EnrichContentCaptionAndOCR(content string, imageInfoJSON string) string {
+ var imageInfos []types.ImageInfo
+ if err := json.Unmarshal([]byte(imageInfoJSON), &imageInfos); err != nil {
+ return content
+ }
+ if len(imageInfos) == 0 {
+ return content
+ }
+
+ imageInfoMap := make(map[string]*types.ImageInfo)
+ for i := range imageInfos {
+ if imageInfos[i].URL != "" {
+ imageInfoMap[imageInfos[i].URL] = &imageInfos[i]
+ }
+ if imageInfos[i].OriginalURL != "" {
+ imageInfoMap[imageInfos[i].OriginalURL] = &imageInfos[i]
+ }
+ }
+
+ matches := MarkdownImageRegex.FindAllStringSubmatch(content, -1)
+ processedURLs := make(map[string]bool)
+
+ for _, match := range matches {
+ if len(match) < 3 {
+ continue
+ }
+ imgURL := match[2]
+ processedURLs[imgURL] = true
+
+ imgInfo, found := imageInfoMap[imgURL]
+ if !found || imgInfo == nil {
+ continue
+ }
+ appended := buildCaptionOCRBlock(imgInfo)
+ if appended == "" {
+ continue
+ }
+ content = strings.Replace(content, match[0], match[0]+"\n"+appended, 1) // NOTE(review): n=1 always hits the first occurrence, so a repeated identical reference stacks its block there
+ }
+
+ var extras []string
+ for _, imgInfo := range imageInfos {
+ if processedURLs[imgInfo.URL] || processedURLs[imgInfo.OriginalURL] {
+ continue
+ }
+ if block := buildCaptionOCRBlock(&imgInfo); block != "" {
+ extras = append(extras, block)
+ }
+ }
+ if len(extras) > 0 {
+ if content != "" {
+ content += "\n"
+ }
+ content += strings.Join(extras, "\n")
+ }
+ return content
+}
+
+// buildCaptionOCRBlock returns img's caption followed by its OCR text,
+// newline-separated, skipping whichever is empty (no URL wrapper is added).
+// It returns the empty string when the image has neither caption nor OCR.
+func buildCaptionOCRBlock(img *types.ImageInfo) string {
+ var parts []string
+ if img.Caption != "" {
+ parts = append(parts, fmt.Sprintf("%s", img.Caption)) // NOTE(review): "%s" alone adds no markup — confirm the format string was not meant to carry caption tags
+ }
+ if img.OCRText != "" {
+ parts = append(parts, fmt.Sprintf("%s", img.OCRText))
+ }
+ return strings.Join(parts, "\n")
+}