mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
feat(timeline): enrich per-image multimodal subspan output
The per-image multimodal subspan only captured image_url / enable_ocr / enable_caption on input and chunk_id on output, so the trace viewer could not answer "what did THIS image actually produce?" without joining back to the chunks table. Adds to the per-image span output: - vlm_model_id (or "legacy_inline" for inline-config KBs) - image_bytes (read size) - ocr_prompt: "default" | "scanned_pdf" - ocr_chars + ocr_preview (sanitized text, capped at 200 runes) - caption_chars + caption_preview - chunks_created (count of OCR/caption child chunks) - indexed (true after BatchIndex completes) - per-step error fields (read_error / ocr_error / caption_error / skipped reason) when something fails Also adds parent_chunk_id to the span input so the trace links back to the text chunk this image hangs off — useful when a doc has hundreds of inline images and you need to know WHERE in the text this one came from. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -146,10 +146,18 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
"image_source_type": payload.ImageSourceType,
|
||||
"enable_ocr": payload.EnableOCR,
|
||||
"enable_caption": payload.EnableCaption,
|
||||
"parent_chunk_id": payload.ChunkID,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Output map populated as we go — the deferred close picks it up.
|
||||
// Captures real VLM results (model id, byte count, OCR/caption
|
||||
// previews, downstream chunk counts) so the trace viewer can answer
|
||||
// "what did this image actually produce?" without joining back to
|
||||
// the chunks table.
|
||||
imgOut := types.JSONMap{}
|
||||
|
||||
// finalize-once semantics: on success we always decrement the parent's
|
||||
// pending counter. On failure we only decrement when this is the last
|
||||
// asynq retry, so a permanently-failing single image cannot leave the
|
||||
@@ -164,9 +172,7 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
// UI whether THIS specific image worked.
|
||||
if imgSpan != nil {
|
||||
if handleErr == nil {
|
||||
tracker.EndSpan(ctx, imgSpan, types.JSONMap{
|
||||
"chunk_id": payload.ChunkID,
|
||||
})
|
||||
tracker.EndSpan(ctx, imgSpan, imgOut)
|
||||
} else if isFinalAsynqAttempt(ctx) {
|
||||
tracker.FailSpan(ctx, imgSpan,
|
||||
"MULTIMODAL_VLM_FAILED",
|
||||
@@ -188,6 +194,17 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
handleErr = fmt.Errorf("resolve VLM: %w", err)
|
||||
return handleErr
|
||||
}
|
||||
// Capture the resolved VLM model id (or "legacy_inline" for the
|
||||
// legacy inline-config path) so the trace shows WHICH model handled
|
||||
// this image. Without this, debugging "VLM is slow" requires a
|
||||
// separate hop to the KB config.
|
||||
if kb, kbErr := s.kbService.GetKnowledgeBaseByIDOnly(ctx, payload.KnowledgeBaseID); kbErr == nil && kb != nil {
|
||||
if id := strings.TrimSpace(kb.VLMConfig.ModelID); id != "" {
|
||||
imgOut["vlm_model_id"] = id
|
||||
} else {
|
||||
imgOut["vlm_model_id"] = "legacy_inline"
|
||||
}
|
||||
}
|
||||
|
||||
// Read image bytes. A provider:// URL must be resolved via FileService —
|
||||
// it must NEVER be handed to the HTTP downloader (which would fail with
|
||||
@@ -196,8 +213,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
imgBytes, readErr := s.readImageBytes(ctx, payload)
|
||||
if readErr != nil {
|
||||
logger.Errorf(ctx, "[ImageMultimodal] Skip unreadable image %s: %v", payload.ImageURL, readErr)
|
||||
imgOut["skipped"] = "unreadable_image"
|
||||
imgOut["read_error"] = readErr.Error()
|
||||
return nil
|
||||
}
|
||||
imgOut["image_bytes"] = len(imgBytes)
|
||||
|
||||
imageInfo := types.ImageInfo{
|
||||
URL: payload.ImageURL,
|
||||
@@ -209,17 +229,25 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
if payload.ImageSourceType == "scanned_pdf" {
|
||||
prompt = vlmOCRScannedPDFPrompt
|
||||
logger.Infof(ctx, "[ImageMultimodal] Using scanned PDF prompt for OCR: %s", payload.ImageURL)
|
||||
imgOut["ocr_prompt"] = "scanned_pdf"
|
||||
} else {
|
||||
imgOut["ocr_prompt"] = "default"
|
||||
}
|
||||
|
||||
ocrText, ocrErr := vlmModel.Predict(ctx, [][]byte{imgBytes}, prompt)
|
||||
if ocrErr != nil {
|
||||
logger.Warnf(ctx, "[ImageMultimodal] OCR failed for %s: %v", payload.ImageURL, ocrErr)
|
||||
imgOut["ocr_error"] = ocrErr.Error()
|
||||
} else {
|
||||
ocrText = sanitizeOCRText(ocrText)
|
||||
if ocrText != "" {
|
||||
imageInfo.OCRText = ocrText
|
||||
imgOut["ocr_chars"] = len([]rune(ocrText))
|
||||
imgOut["ocr_preview"] = previewText(ocrText, 200)
|
||||
} else {
|
||||
logger.Warnf(ctx, "[ImageMultimodal] OCR returned empty/invalid content for %s, discarded", payload.ImageURL)
|
||||
imgOut["ocr_chars"] = 0
|
||||
imgOut["ocr_skipped"] = "empty_or_invalid"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -227,8 +255,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
caption, capErr := vlmModel.Predict(ctx, [][]byte{imgBytes}, vlmCaptionPrompt)
|
||||
if capErr != nil {
|
||||
logger.Warnf(ctx, "[ImageMultimodal] Caption failed for %s: %v", payload.ImageURL, capErr)
|
||||
imgOut["caption_error"] = capErr.Error()
|
||||
} else if caption != "" {
|
||||
imageInfo.Caption = caption
|
||||
imgOut["caption_chars"] = len([]rune(caption))
|
||||
imgOut["caption_preview"] = previewText(caption, 200)
|
||||
}
|
||||
|
||||
// Build child chunks for OCR and caption results
|
||||
@@ -268,9 +299,11 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
UpdatedAt: time.Now(),
|
||||
})
|
||||
}
|
||||
imgOut["chunks_created"] = len(newChunks)
|
||||
|
||||
if len(newChunks) == 0 {
|
||||
// Deferred finalize will count this image on success.
|
||||
imgOut["skipped"] = "no_extracted_content"
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -286,6 +319,7 @@ func (s *ImageMultimodalService) Handle(ctx context.Context, task *asynq.Task) e
|
||||
|
||||
// Index chunks so they can be retrieved
|
||||
s.indexChunks(ctx, payload, newChunks)
|
||||
imgOut["indexed"] = true
|
||||
|
||||
// Enqueue question generation for the caption/OCR content if KB has it enabled.
|
||||
// During initial processChunks, question generation is skipped for image-type
|
||||
|
||||
Reference in New Issue
Block a user