feat(timeline): enrich per-image multimodal subspan output

The per-image multimodal subspan only captured image_url / enable_ocr / enable_caption on input and chunk_id on output, so the trace viewer could not answer "what did THIS image actually produce?" without joining back to the chunks table. Adds to the per-image span output: - vlm_model_id (or "legacy_inline" for inline-config KBs) - image_bytes (read size) - ocr_prompt: "default" | "scanned_pdf" - ocr_chars + ocr_preview (sanitized text, capped at 200 runes) - caption_chars + caption_preview - chunks_created (count of OCR/caption child chunks) - indexed (true after BatchIndex completes) - per-step error fields (read_error / ocr_error / caption_error / skipped reason) when something fails Also adds parent_chunk_id to the span input so the trace links back to the text chunk this image hangs off — useful when a doc has hundreds of inline images and you need to know WHERE in the text this one came from. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-04 13:30:32 +08:00 · 2026-05-27 22:41:51 +08:00
parent f99bcb79e7
commit 5dc0a49a9b
1 changed files with 37 additions and 3 deletions
--- a/internal/application/service/image_multimodal.go
+++ b/internal/application/service/image_multimodal.go
@@ -146,10 +146,18 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 				"image_source_type": payload.ImageSourceType,
 				"enable_ocr":        payload.EnableOCR,
 				"enable_caption":    payload.EnableCaption,
+				"parent_chunk_id":   payload.ChunkID,
 			})
 		}
 	}

+	// Output map populated as we go — the deferred close picks it up.
+	// Captures real VLM results (model id, byte count, OCR/caption
+	// previews, downstream chunk counts) so the trace viewer can answer
+	// "what did this image actually produce?" without joining back to
+	// the chunks table.
+	imgOut := types.JSONMap{}
+
 	// finalize-once semantics: on success we always decrement the parent's
 	// pending counter. On failure we only decrement when this is the last
 	// asynq retry, so a permanently-failing single image cannot leave the
@@ -164,9 +172,7 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 		// UI whether THIS specific image worked.
 		if imgSpan != nil {
 			if handleErr == nil {
-				tracker.EndSpan(ctx, imgSpan, types.JSONMap{
-					"chunk_id": payload.ChunkID,
-				})
+				tracker.EndSpan(ctx, imgSpan, imgOut)
 			} else if isFinalAsynqAttempt(ctx) {
 				tracker.FailSpan(ctx, imgSpan,
 					"MULTIMODAL_VLM_FAILED",
@@ -188,6 +194,17 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 		handleErr = fmt.Errorf("resolve VLM: %w", err)
 		return handleErr
 	}
+	// Capture the resolved VLM model id (or "legacy_inline" for the
+	// legacy inline-config path) so the trace shows WHICH model handled
+	// this image. Without this, debugging "VLM is slow" requires a
+	// separate hop to the KB config.
+	if kb, kbErr := s.kbService.GetKnowledgeBaseByIDOnly(ctx, payload.KnowledgeBaseID); kbErr == nil && kb != nil {
+		if id := strings.TrimSpace(kb.VLMConfig.ModelID); id != "" {
+			imgOut["vlm_model_id"] = id
+		} else {
+			imgOut["vlm_model_id"] = "legacy_inline"
+		}
+	}

 	// Read image bytes. A provider:// URL must be resolved via FileService —
 	// it must NEVER be handed to the HTTP downloader (which would fail with
@@ -196,8 +213,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 	imgBytes, readErr := s.readImageBytes(ctx, payload)
 	if readErr != nil {
 		logger.Errorf(ctx, "[ImageMultimodal] Skip unreadable image %s: %v", payload.ImageURL, readErr)
+		imgOut["skipped"] = "unreadable_image"
+		imgOut["read_error"] = readErr.Error()
 		return nil
 	}
+	imgOut["image_bytes"] = len(imgBytes)

 	imageInfo := types.ImageInfo{
 		URL:         payload.ImageURL,
@@ -209,17 +229,25 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 		if payload.ImageSourceType == "scanned_pdf" {
 			prompt = vlmOCRScannedPDFPrompt
 			logger.Infof(ctx, "[ImageMultimodal] Using scanned PDF prompt for OCR: %s", payload.ImageURL)
+			imgOut["ocr_prompt"] = "scanned_pdf"
+		} else {
+			imgOut["ocr_prompt"] = "default"
 		}

 		ocrText, ocrErr := vlmModel.Predict(ctx, [][]byte{imgBytes}, prompt)
 		if ocrErr != nil {
 			logger.Warnf(ctx, "[ImageMultimodal] OCR failed for %s: %v", payload.ImageURL, ocrErr)
+			imgOut["ocr_error"] = ocrErr.Error()
 		} else {
 			ocrText = sanitizeOCRText(ocrText)
 			if ocrText != "" {
 				imageInfo.OCRText = ocrText
+				imgOut["ocr_chars"] = len([]rune(ocrText))
+				imgOut["ocr_preview"] = previewText(ocrText, 200)
 			} else {
 				logger.Warnf(ctx, "[ImageMultimodal] OCR returned empty/invalid content for %s, discarded", payload.ImageURL)
+				imgOut["ocr_chars"] = 0
+				imgOut["ocr_skipped"] = "empty_or_invalid"
 			}
 		}
 	}
@@ -227,8 +255,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 	caption, capErr := vlmModel.Predict(ctx, [][]byte{imgBytes}, vlmCaptionPrompt)
 	if capErr != nil {
 		logger.Warnf(ctx, "[ImageMultimodal] Caption failed for %s: %v", payload.ImageURL, capErr)
+		imgOut["caption_error"] = capErr.Error()
 	} else if caption != "" {
 		imageInfo.Caption = caption
+		imgOut["caption_chars"] = len([]rune(caption))
+		imgOut["caption_preview"] = previewText(caption, 200)
 	}

 	// Build child chunks for OCR and caption results
@@ -268,9 +299,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
 			UpdatedAt:       time.Now(),
 		})
 	}
+	imgOut["chunks_created"] = len(newChunks)

 	if len(newChunks) == 0 {
 		// Deferred finalize will count this image on success.
+		imgOut["skipped"] = "no_extracted_content"
 		return nil
 	}

@@ -286,6 +319,7 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e

 	// Index chunks so they can be retrieved
 	s.indexChunks(ctx, payload, newChunks)
+	imgOut["indexed"] = true

 	// Enqueue question generation for the caption/OCR content if KB has it enabled.
 	// During initial processChunks, question generation is skipped for image-type