feat: enhance multimodal image processing and session management

- Updated configuration to support three tasks in the assistant's prompt: rewriting questions, classifying intent for knowledge base retrieval, and analyzing attached images.
- Implemented backend functionality to update image descriptions in user messages, ensuring that image context is preserved across conversation history.
- Modified session handling to return complete session data after updates, improving the accuracy of session information provided to users.
- Enhanced the chat pipeline to include detailed image descriptions and manage image captions effectively, enriching user interactions with visual content.

These improvements significantly enhance the application's multimodal capabilities, providing a more robust and user-friendly experience when handling image-based queries.
2026-06-04 13:30:32 +08:00 · 2026-03-12 13:17:47 +08:00
parent bc2cb96d70
commit 4eed96f901
15 changed files with 319 additions and 65 deletions
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -29,9 +29,10 @@ conversation:
  enable_query_expansion: true
  enable_rerank: true
  rewrite_prompt_system: |
-    You are an intelligent assistant that performs TWO tasks on the user's question:
+    You are an intelligent assistant that performs THREE tasks on the user's question:
    1. Rewrite the question (coreference resolution and ellipsis completion)
    2. Classify whether the question requires knowledge base retrieval
+    3. Analyze attached images (when present)

    ## Task 1: Rewriting Goals
    Based on the conversation history, rewrite the current user question:
@@ -44,30 +45,32 @@ conversation:

    ## Task 2: Intent Classification
    Determine if the question requires knowledge base retrieval.
-    - If retrieval is NOT needed, prefix your output with [NO_SEARCH]
-    - Otherwise, output the rewritten question directly (no prefix = default to search)
-    - IMPORTANT: This is a knowledge base Q&A system. Default to SEARCH. Only add [NO_SEARCH] when you are very confident the question has nothing to do with the knowledge base.
+    - Output a boolean field `skip_kb_search` instead of any prefix marker.
+    - IMPORTANT: This is a knowledge base Q&A system. Default to search (`skip_kb_search=false`).
+    - Set `skip_kb_search=true` only when you are very confident retrieval is unnecessary.

-    When to output [NO_SEARCH]:
+    When to set skip_kb_search=true:
    - Pure greetings, thanks, or farewell with no question ("谢谢", "你好", "再见")
    - Requests to summarize or manipulate the previous conversation itself ("总结一下我们的对话")
    - Pure image understanding with NO intent to search documents: describing, summarizing, translating, or extracting content from the image itself ("这张图片是什么", "描述一下图片内容", "帮我翻译图中文字", "图里的表格数据是什么", "帮我识别一下这张图")
    - Follow-up questions that clearly refer to previous conversation content (especially previously uploaded images) and can be answered from dialogue context directly ("第一张图再详细描述一下", "第二张门上的字是什么意思", "这个再展开讲讲")

-    When NOT to output [NO_SEARCH] (common mistakes to avoid):
+    When NOT to set skip_kb_search=true (common mistakes to avoid):
    - User uploads an image and asks to FIND/SEARCH related content in knowledge base → SEARCH (the image content is the search query)
    - User asks how to fix/solve/handle something shown in the image ("这个报错怎么解决") → SEARCH (knowledge base may have solutions)
    - User asks a question that implies comparing image content against documents → SEARCH

    ## Task 3: Image Analysis (only when images are attached)
-    If the user's message includes images, after the rewritten question, add a separator "---" on a new line, then provide a concise description of the image content. This description is used as a text fallback for models that cannot process images directly.
-    If the image contains OCR text, also extract the key text information (OCR) from the image and include it in the description.
+    If the user's message includes images, provide a detailed description in `image_description`, including objects, scene, layout, relationships, and any visible key details.
+    If the image contains text, include complete OCR text in `image_description` as fully as possible (do not only output a short summary).
+    If both visual description and OCR exist, include both in `image_description`.
+    If there are no images or truly no useful visual/text information, set `image_description` to an empty string.

    ## Output Format
-    - Text only (needs search): rewritten question
-    - Text only (no search): [NO_SEARCH] rewritten question
-    - With images (needs search): rewritten question\n---\nimage description (including extracted OCR if present)
-    - With images (no search): [NO_SEARCH] rewritten question\n---\nimage description (including extracted OCR if present)
+    You MUST output ONLY a single JSON object.
+    Do NOT output markdown, code fences, explanations, or any extra text.
+    JSON schema:
+    {"rewrite_query":"string","skip_kb_search":true|false,"image_description":"string"}

  rewrite_prompt_user: |
    ## Conversation History
@@ -76,7 +79,7 @@ conversation:
    ## User Question to Rewrite
    {{query}}

-    ## Rewritten Question
+    ## JSON Output
  keywords_extraction_prompt: |
    # Role
    You are a professional keyword extraction assistant. Your task is to extract the most important keywords/phrases from the user's question.
--- a/internal/application/repository/message.go
+++ b/internal/application/repository/message.go
@@ -235,6 +235,16 @@ func (r *messageRepository) GetKnowledgeIDsBySessionID(
 	return knowledgeIDs, nil
 }

+// UpdateMessageImages overwrites the images column of a single message.
+//
+// The row is matched on both id and session_id, so a message ID that belongs
+// to a different session is left untouched. The write uses GORM's
+// single-column Update("images", ...) on the Message model rather than a
+// struct-based Updates call, so the column is named explicitly.
+//
+// It returns the error reported by GORM. No rows-affected check is made here.
+func (r *messageRepository) UpdateMessageImages(ctx context.Context, sessionID, messageID string, images types.MessageImages) error {
+	return r.db.WithContext(ctx).
+		Model(&types.Message{}).
+		Where("id = ? AND session_id = ?", messageID, sessionID).
+		Update("images", images).Error
+}
+
 // UpdateMessageKnowledgeID updates the knowledge_id field for a message
 func (r *messageRepository) UpdateMessageKnowledgeID(
 	ctx context.Context, messageID string, knowledgeID string,
--- a/internal/application/repository/session.go
+++ b/internal/application/repository/session.go
@@ -43,7 +43,7 @@ func (r *sessionRepository) Get(ctx context.Context, tenantID uint64, id string)
 // GetByTenantID retrieves all sessions for a tenant
 func (r *sessionRepository) GetByTenantID(ctx context.Context, tenantID uint64) ([]*types.Session, error) {
 	var sessions []*types.Session
-	err := r.db.WithContext(ctx).Where("tenant_id = ?", tenantID).Order("created_at DESC").Find(&sessions).Error
+	err := r.db.WithContext(ctx).Where("tenant_id = ?", tenantID).Order("updated_at DESC").Find(&sessions).Error
 	if err != nil {
 		return nil, err
 	}
@@ -66,7 +66,7 @@ func (r *sessionRepository) GetPagedByTenantID(
 	// Then query the paginated data
 	err = r.db.WithContext(ctx).
 		Where("tenant_id = ?", tenantID).
-		Order("created_at DESC").
+		Order("updated_at DESC").
 		Offset(page.Offset()).
 		Limit(page.Limit()).
 		Find(&sessions).Error
@@ -80,7 +80,14 @@ func (r *sessionRepository) GetPagedByTenantID(
 // Update updates a session
 func (r *sessionRepository) Update(ctx context.Context, session *types.Session) error {
 	session.UpdatedAt = time.Now()
-	return r.db.WithContext(ctx).Where("tenant_id = ?", session.TenantID).Save(session).Error
+	return r.db.WithContext(ctx).
+		Model(&types.Session{}).
+		Where("tenant_id = ? AND id = ?", session.TenantID, session.ID).
+		Updates(map[string]interface{}{
+			"title":       session.Title,
+			"description": session.Description,
+			"updated_at":  session.UpdatedAt,
+		}).Error
 }

 // Delete deletes a session
--- a/internal/application/service/chat_pipline/common.go
+++ b/internal/application/service/chat_pipline/common.go
@@ -78,6 +78,19 @@ func prepareMessagesWithHistory(chatManage *types.ChatManage) []chat.Message {
 	return chatMessages
 }

+// extractImageCaptions collects the non-empty Caption values from images, in
+// slice order, and joins them with a newline.
+//
+// It returns "" when images is empty or no image carries a caption. The
+// history loaders in this package append the result to a user turn's query so
+// that earlier image descriptions are visible to the model.
+func extractImageCaptions(images types.MessageImages) string {
+	var parts []string
+	for _, img := range images {
+		// Images without a caption contribute nothing, not even a blank line.
+		if img.Caption != "" {
+			parts = append(parts, img.Caption)
+		}
+	}
+	return strings.Join(parts, "\n")
+}
+
 // renderSystemPromptPlaceholders replaces placeholders in system prompt
 // Supported placeholders:
 //   - {{current_time}} -> current time in RFC3339 format
--- a/internal/application/service/chat_pipline/into_chat_message.go
+++ b/internal/application/service/chat_pipline/into_chat_message.go
@@ -90,8 +90,8 @@ func (p *PluginIntoChatMessage) OnEvent(ctx context.Context,
 				})
 			}
 		}
-		if chatManage.ImageOCRText != "" && !chatManage.ChatModelSupportsVision {
-			userContent += "\n\n[用户上传图片内容]\n" + chatManage.ImageOCRText
+		if chatManage.ImageDescription != "" && !chatManage.ChatModelSupportsVision {
+			userContent += "\n\n[用户上传图片内容]\n" + chatManage.ImageDescription
 		}
 		chatManage.UserContent = userContent
 		pipelineInfo(ctx, "IntoChatMessage", "skip_template_no_search", map[string]interface{}{
@@ -148,16 +148,19 @@ func (p *PluginIntoChatMessage) OnEvent(ctx context.Context,

 	// Append image description as text fallback only when the chat model cannot
 	// process images directly. Vision-capable models see images via MultiContent.
-	if chatManage.ImageOCRText != "" && !chatManage.ChatModelSupportsVision {
-		userContent += "\n\n[用户上传图片内容]\n" + chatManage.ImageOCRText
+	if chatManage.ImageDescription != "" && !chatManage.ChatModelSupportsVision {
+		userContent += "\n\n[用户上传图片内容]\n" + chatManage.ImageDescription
 	}

 	// Set formatted content back to chat management
 	chatManage.UserContent = userContent
 	pipelineInfo(ctx, "IntoChatMessage", "output", map[string]interface{}{
-		"session_id":       chatManage.SessionID,
-		"user_content_len": len(chatManage.UserContent),
-		"faq_priority":     chatManage.FAQPriorityEnabled,
+		"session_id":                 chatManage.SessionID,
+		"user_content_len":           len(chatManage.UserContent),
+		"faq_priority":               chatManage.FAQPriorityEnabled,
+		"skip_kb_search":             chatManage.SkipKBSearch,
+		"image_description":          chatManage.ImageDescription,
+		"chat_model_supports_vision": chatManage.ChatModelSupportsVision,
 	})
 	return next()
 }
--- a/internal/application/service/chat_pipline/load_history.go
+++ b/internal/application/service/chat_pipline/load_history.go
@@ -83,9 +83,11 @@ func (p *PluginLoadHistory) OnEvent(ctx context.Context,
 			h = &types.History{}
 		}
 		if message.Role == "user" {
-			// User message as query
 			h.Query = message.Content
 			h.CreateAt = message.CreatedAt
+			if desc := extractImageCaptions(message.Images); desc != "" {
+				h.Query += "\n\n[用户上传图片内容]\n" + desc
+			}
 		} else {
 			// System message as answer, while removing thinking process
 			h.Answer = regThink.ReplaceAllString(message.Content, "")
--- a/internal/application/service/chat_pipline/rewrite.go
+++ b/internal/application/service/chat_pipline/rewrite.go
@@ -4,6 +4,7 @@ package chatpipline

 import (
 	"context"
+	"encoding/json"
 	"regexp"
 	"slices"
 	"sort"
@@ -34,6 +35,12 @@ const (
 	noSearchPrefix = "[NO_SEARCH]"
 )

+// rewriteOutput is the parsed form of the rewrite model's structured (JSON)
+// reply.
+type rewriteOutput struct {
+	// RewriteQuery is the rewritten user question.
+	RewriteQuery     string
+	// SkipKBSearch is true when knowledge base retrieval should be skipped.
+	SkipKBSearch     bool
+	// ImageDescription is the description of attached images, with OCR text
+	// merged in when the model returned it under a separate key.
+	ImageDescription string
+}
+
 // NewPluginRewrite creates a new query rewriting plugin instance
 // Also registers the plugin with the event manager
 func NewPluginRewrite(eventManager *EventManager,
@@ -183,16 +190,53 @@ func (p *PluginRewrite) OnEvent(ctx context.Context,

 	// --- Parse structured output ---
 	p.parseRewriteOutput(chatManage, response.Content)
+
+	// Persist image description back to the user message so that future turns
+	// can see it when loading conversation history.
+	if chatManage.ImageDescription != "" && chatManage.UserMessageID != "" {
+		p.updateUserMessageImageCaption(ctx, chatManage)
+	}
+
 	pipelineInfo(ctx, "Rewrite", "output", map[string]interface{}{
 		"session_id":      chatManage.SessionID,
 		"rewrite_query":   chatManage.RewriteQuery,
 		"skip_kb_search":  chatManage.SkipKBSearch,
-		"has_image_desc":  chatManage.ImageOCRText != "",
+		"has_image_desc":  chatManage.ImageDescription != "",
 		"original_output": response.Content,
 	})
 	return next()
 }

+// updateUserMessageImageCaption stores chatManage.ImageDescription as the
+// Caption of the first image on the stored user message, so that later turns
+// pick it up when conversation history is loaded.
+//
+// The update is best-effort: a failure to load or save the message is reported
+// through pipelineWarn and otherwise ignored, and nothing is written when the
+// stored message has no images.
+//
+// NOTE(review): only Images[0] receives the description; when several images
+// are attached the other captions are left as they were — confirm this is
+// intended.
+func (p *PluginRewrite) updateUserMessageImageCaption(ctx context.Context, chatManage *types.ChatManage) {
+	// Reload the stored message so the existing image entries are preserved.
+	msg, err := p.messageService.GetMessage(ctx, chatManage.SessionID, chatManage.UserMessageID)
+	if err != nil {
+		pipelineWarn(ctx, "Rewrite", "get_user_message", map[string]interface{}{
+			"session_id":      chatManage.SessionID,
+			"user_message_id": chatManage.UserMessageID,
+			"error":           err.Error(),
+		})
+		return
+	}

+	// Nothing to caption: the message was stored without images.
+	if len(msg.Images) == 0 {
+		return
+	}

+	msg.Images[0].Caption = chatManage.ImageDescription

+	// Persist through UpdateMessageImages, which writes only the images column.
+	// NOTE(review): the original rationale was that GORM's struct-based Updates
+	// may skip this column — not verifiable from this file.
+	if err := p.messageService.UpdateMessageImages(ctx, chatManage.SessionID, chatManage.UserMessageID, msg.Images); err != nil {
+		pipelineWarn(ctx, "Rewrite", "update_image_caption", map[string]interface{}{
+			"session_id":      chatManage.SessionID,
+			"user_message_id": chatManage.UserMessageID,
+			"error":           err.Error(),
+		})
+	}
+}
+
 // loadHistory fetches and processes conversation history for rewrite context.
 func (p *PluginRewrite) loadHistory(ctx context.Context, chatManage *types.ChatManage) []*types.History {
 	history, err := p.messageService.GetRecentMessagesBySession(ctx, chatManage.SessionID, 20)
@@ -212,6 +256,9 @@ func (p *PluginRewrite) loadHistory(ctx context.Context, chatManage *types.ChatM
 		if message.Role == "user" {
 			h.Query = message.Content
 			h.CreateAt = message.CreatedAt
+			if desc := extractImageCaptions(message.Images); desc != "" {
+				h.Query += "\n\n[用户上传图片内容]\n" + desc
+			}
 		} else {
 			h.Answer = reg.ReplaceAllString(message.Content, "")
 			h.KnowledgeReferences = message.KnowledgeReferences
@@ -307,11 +354,14 @@ func (p *PluginRewrite) buildPrompts(chatManage *types.ChatManage, historyList [
 	}
 	// Strengthen context inheritance in multi-turn conversation:
 	// for follow-up questions that clearly refer to previous turns (especially
-	// uploaded-image understanding), prefer NO_SEARCH over KB retrieval.
+	// uploaded-image understanding), prefer skipping KB retrieval.
 	systemPrompt += "\n\n## Additional Context Inheritance Guidance\n" +
-		"- If the current question is a follow-up to previous conversation content (especially previously uploaded images) and can be answered by that context, you MUST classify it as NO_SEARCH.\n" +
+		"- If the current question is a follow-up to previous conversation content (especially previously uploaded images) and can be answered by that context, you MUST set skip_kb_search=true.\n" +
 		"- Examples: “第一张图再详细描述一下”, “第二张门上的字是什么意思”, “这个再展开讲讲”.\n" +
-		"- In these follow-up cases, output with [NO_SEARCH] prefix."
+		"- You MUST output ONLY a single JSON object (no markdown/code fences, no extra text) with schema: {\"rewrite_query\":\"...\",\"skip_kb_search\":true|false,\"image_description\":\"...\"}.\n" +
+		"- `skip_kb_search` field is REQUIRED and must be explicit true/false.\n" +
+		"- For images, `image_description` MUST include a detailed visual description and complete OCR text. Keep all key details; do not overly summarize.\n" +
+		"- image_description should be empty string only when there are no images or truly no readable visual text/details."

 	conversationText := formatConversationHistory(historyList)
 	currentTime := time.Now().Format("2006-01-02 15:04:05")
@@ -333,27 +383,34 @@ func (p *PluginRewrite) buildPrompts(chatManage *types.ChatManage, historyList [
 //
 // Expected formats:
 //
-//	Text only:  "[NO_SEARCH] rewritten question"  or  "rewritten question"
-//	With images: "[NO_SEARCH]\nrewritten question\n---\nimage description"
+//	Preferred: {"rewrite_query":"...","skip_kb_search":false,"image_description":"..."}
+//	Legacy fallback:
+//	  - Text only:  "[NO_SEARCH] rewritten question"  or  "rewritten question"
+//	  - With images: "[NO_SEARCH]\nrewritten question\n---\nimage description"
 func (p *PluginRewrite) parseRewriteOutput(chatManage *types.ChatManage, raw string) {
 	content := strings.TrimSpace(raw)
 	if content == "" {
 		return
 	}

-	// 1. Parse intent marker
+	if output, ok := parseStructuredRewriteOutput(content); ok {
+		if rewrite := strings.TrimSpace(output.RewriteQuery); rewrite != "" {
+			chatManage.RewriteQuery = rewrite
+		}
+		chatManage.SkipKBSearch = output.SkipKBSearch
+		chatManage.ImageDescription = strings.TrimSpace(output.ImageDescription)
+		return
+	}
+
+	// Legacy fallback parsing for older prompts/models.
 	if strings.HasPrefix(content, noSearchPrefix) {
 		chatManage.SkipKBSearch = true
 		content = strings.TrimSpace(strings.TrimPrefix(content, noSearchPrefix))
 	}

-	// 2. Split rewritten query and image description.
-	// Be tolerant to model output variants like:
-	// - "query\n---\nimage_desc" (expected)
-	// - "query---\nimage_desc"   (missing leading newline before separator)
 	if m := rewriteImageSepPattern.FindStringSubmatch(content); len(m) == 3 {
 		chatManage.RewriteQuery = strings.TrimSpace(m[1])
-		chatManage.ImageOCRText = strings.TrimSpace(m[2])
+		chatManage.ImageDescription = strings.TrimSpace(m[2])
 		return
 	}
 	if content != "" {
@@ -361,6 +418,128 @@ func (p *PluginRewrite) parseRewriteOutput(chatManage *types.ChatManage, raw str
 	}
 }

+// parseStructuredRewriteOutput tries to decode raw as the JSON object produced
+// by the rewrite prompt.
+//
+// The trimmed text is parsed as-is first. If that fails, the span from the
+// first "{" to the last "}" is parsed instead, which tolerates code fences or
+// prose around the object. The boolean result is false when raw is blank, when
+// no such span exists, or when neither attempt parses.
+func parseStructuredRewriteOutput(raw string) (rewriteOutput, bool) {
+	content := strings.TrimSpace(raw)
+	if content == "" {
+		return rewriteOutput{}, false
+	}

+	// NOTE(review): out is never assigned; it only supplies the zero value for
+	// the final failure return.
+	var out rewriteOutput
+	if parsed, ok := parseStructuredRewriteOutputJSON(content); ok {
+		return parsed, true
+	}

+	// Be tolerant to occasional markdown wrappers or extra prose.
+	start := strings.Index(content, "{")
+	end := strings.LastIndex(content, "}")
+	if start < 0 || end <= start {
+		return rewriteOutput{}, false
+	}
+	candidate := content[start : end+1]
+	if parsed, ok := parseStructuredRewriteOutputJSON(candidate); ok {
+		return parsed, true
+	}
+	return out, false
+}
+
+// parseStructuredRewriteOutputJSON decodes content as a JSON object and maps
+// its fields onto a rewriteOutput, accepting several alternative key names for
+// each field.
+//
+// It returns false only when json.Unmarshal into a map fails. An object that
+// contains none of the recognised keys still yields a zero-valued
+// rewriteOutput and true.
+//
+// NOTE(review): because such an object counts as a successful parse,
+// parseRewriteOutput returns without reaching its legacy fallback for those
+// replies — confirm that is acceptable.
+func parseStructuredRewriteOutputJSON(content string) (rewriteOutput, bool) {
+	var obj map[string]json.RawMessage
+	if err := json.Unmarshal([]byte(content), &obj); err != nil {
+		return rewriteOutput{}, false
+	}

+	out := rewriteOutput{
+		RewriteQuery: strings.TrimSpace(firstStringField(obj,
+			"rewrite_query", "rewritten_query", "query", "question")),
+	}

+	// Support common variants and semantic inversion for need_search.
+	// The "skip" style keys take precedence; the "need" style keys are only
+	// consulted when none of them holds a usable boolean, and are negated.
+	if v, ok := firstBoolField(obj, "skip_kb_search", "skip_search", "no_search"); ok {
+		out.SkipKBSearch = v
+	} else if v, ok := firstBoolField(obj, "need_search", "requires_search"); ok {
+		out.SkipKBSearch = !v
+	}

+	// The description and the OCR text may arrive under separate keys; they
+	// are merged into the single ImageDescription field.
+	desc := strings.TrimSpace(firstStringField(obj,
+		"image_description", "image_desc", "image_text", "image_ocr_text", "description"))
+	ocr := strings.TrimSpace(firstStringField(obj,
+		"ocr_text", "ocr", "full_ocr", "image_ocr", "ocr_content"))
+	combined, set := mergeImageDescAndOCR(desc, ocr)
+	if set {
+		out.ImageDescription = combined
+	}

+	return out, true
+}
+
+// firstStringField returns the value of the first key in keys that is present
+// in obj and decodes as a JSON string.
+//
+// Keys that are missing, have an empty raw value, or fail to decode as a
+// string are skipped. A key whose value is the empty string counts as a match,
+// so later keys are not consulted in that case. It returns "" when no key
+// qualifies.
+func firstStringField(obj map[string]json.RawMessage, keys ...string) string {
+	for _, key := range keys {
+		raw, ok := obj[key]
+		if !ok || len(raw) == 0 {
+			continue
+		}

+		var s string
+		if err := json.Unmarshal(raw, &s); err == nil {
+			return s
+		}
+	}
+	return ""
+}
+
+// firstBoolField returns the boolean held by the first key in keys that is
+// present in obj and that parseBoolJSON can interpret.
+//
+// The second result reports whether such a key was found; when it is false the
+// first result is always false. Keys that are missing, have an empty raw
+// value, or hold an uninterpretable value are skipped.
+func firstBoolField(obj map[string]json.RawMessage, keys ...string) (bool, bool) {
+	for _, key := range keys {
+		raw, ok := obj[key]
+		if !ok || len(raw) == 0 {
+			continue
+		}
+		if v, ok := parseBoolJSON(raw); ok {
+			return v, true
+		}
+	}
+	return false, false
+}
+
+// parseBoolJSON interprets a raw JSON value as a boolean.
+//
+// Accepted forms, tried in this order:
+//   - a JSON boolean;
+//   - a JSON string equal (case-insensitively, after trimming whitespace) to
+//     "true", "1", "yes", "y" or to "false", "0", "no", "n";
+//   - a JSON number, where any non-zero value is true.
+//
+// The second result is false when raw matches none of these forms, including
+// a string with any other content.
+func parseBoolJSON(raw json.RawMessage) (bool, bool) {
+	var b bool
+	if err := json.Unmarshal(raw, &b); err == nil {
+		return b, true
+	}

+	// Models sometimes quote the flag, e.g. "true" or "no".
+	var s string
+	if err := json.Unmarshal(raw, &s); err == nil {
+		switch strings.ToLower(strings.TrimSpace(s)) {
+		case "true", "1", "yes", "y":
+			return true, true
+		case "false", "0", "no", "n":
+			return false, true
+		}
+	}

+	// Numeric flags: zero is false, anything else is true.
+	var n float64
+	if err := json.Unmarshal(raw, &n); err == nil {
+		return n != 0, true
+	}

+	return false, false
+}
+
+// mergeImageDescAndOCR combines a visual description and OCR text into one
+// string.
+//
+// Results:
+//   - both empty: ("", false);
+//   - only one non-empty: that value;
+//   - desc already contains ocr as a substring: desc alone;
+//   - otherwise: desc, a blank line, an "[OCR]" marker line, then ocr.
+//
+// The second result is true whenever at least one input is non-empty. The
+// inputs are used as given; no trimming is done here.
+func mergeImageDescAndOCR(desc, ocr string) (string, bool) {
+	if desc == "" && ocr == "" {
+		return "", false
+	}
+	if desc == "" {
+		return ocr, true
+	}
+	if ocr == "" {
+		return desc, true
+	}
+	// Avoid repeating OCR text the description already includes verbatim.
+	if strings.Contains(desc, ocr) {
+		return desc, true
+	}
+	return desc + "\n\n[OCR]\n" + ocr, true
+}
+
 // formatConversationHistory formats conversation history for prompt template
 func formatConversationHistory(historyList []*types.History) string {
 	if len(historyList) == 0 {
--- a/internal/application/service/message.go
+++ b/internal/application/service/message.go
@@ -237,6 +237,11 @@ func (s *messageService) UpdateMessage(ctx context.Context, message *types.Messa
 	return nil
 }

+// UpdateMessageImages replaces the stored images of the message identified by
+// sessionID and messageID. It delegates directly to the message repository and
+// returns that call's error unchanged.
+func (s *messageService) UpdateMessageImages(ctx context.Context, sessionID, messageID string, images types.MessageImages) error {
+	return s.messageRepo.UpdateMessageImages(ctx, sessionID, messageID, images)
+}
+
 // DeleteMessage removes a message from a session, also cleaning up its Knowledge entry in the chat history KB.
 func (s *messageService) DeleteMessage(ctx context.Context, sessionID string, messageID string) error {
 	logger.Info(ctx, "Start deleting message")
--- a/internal/application/service/session.go
+++ b/internal/application/service/session.go
@@ -523,7 +523,7 @@ func (s *sessionService) KnowledgeQA(
 	eventBus *event.EventBus,
 	customAgent *types.CustomAgent,
 	enableMemory bool,
-	imageURLs []string, imageOCRText string,
+	imageURLs []string, imageDescription string, userMessageID string,
 ) error {
 	logger.Infof(
 		ctx,
@@ -768,8 +768,9 @@ func (s *sessionService) KnowledgeQA(
 		FAQDirectAnswerThreshold: faqDirectAnswerThreshold,
 		FAQScoreBoost:            faqScoreBoost,
 		// Image support
+		UserMessageID:           userMessageID,
 		Images:                  imageURLs,
-		ImageOCRText:            imageOCRText,
+		ImageDescription:        imageDescription,
 		VLMModelID:              vlmModelID,
 		ChatModelSupportsVision: chatModelSupportsVision,
 	}
@@ -783,8 +784,8 @@ func (s *sessionService) KnowledgeQA(
 		// For pure chat, UserContent is the Query (since INTO_CHAT_MESSAGE is skipped)
 		// Only append image text description for non-vision models; vision models see images directly
 		userContent := query
-		if imageOCRText != "" && !chatModelSupportsVision {
-			userContent += "\n\n[用户上传图片内容]\n" + imageOCRText
+		if imageDescription != "" && !chatModelSupportsVision {
+			userContent += "\n\n[用户上传图片内容]\n" + imageDescription
 		}
 		chatManage.UserContent = userContent

@@ -1391,8 +1392,9 @@ func (s *sessionService) AgentQA(
 	customAgent *types.CustomAgent,
 	knowledgeBaseIDs []string,
 	knowledgeIDs []string,
-	imageURLs []string, imageOCRText string,
+	imageURLs []string, imageDescription string, userMessageID string,
 ) error {
+	_ = userMessageID // reserved for future use (AgentQA pipeline differs from KnowledgeQA)
 	sessionID := session.ID
 	sessionJSON, err := json.Marshal(session)
 	if err != nil {
@@ -1624,9 +1626,9 @@ func (s *sessionService) AgentQA(
 	if agentModelSupportsVision && len(imageURLs) > 0 {
 		agentImageURLs = imageURLs
 		logger.Infof(ctx, "Agent model supports vision, passing %d image(s) directly", len(agentImageURLs))
-	} else if imageOCRText != "" {
-		agentQuery = query + "\n\n[用户上传图片内容]\n" + imageOCRText
-		logger.Infof(ctx, "Agent model does not support vision, appending image OCR text (%d chars)", len(imageOCRText))
+	} else if imageDescription != "" {
+		agentQuery = query + "\n\n[用户上传图片内容]\n" + imageDescription
+		logger.Infof(ctx, "Agent model does not support vision, appending image description (%d chars)", len(imageDescription))
 	}

 	// Execute agent with streaming (asynchronously)
@@ -1762,9 +1764,11 @@ func (s *sessionService) handleModelFallback(ctx context.Context, chatManage *ty
 	}

 	// Start streaming response
-	responseChan, err := chatModel.ChatStream(ctx, []chat.Message{
-		{Role: "user", Content: promptContent},
-	}, opt)
+	userMsg := chat.Message{Role: "user", Content: promptContent}
+	if chatManage.ChatModelSupportsVision && len(chatManage.Images) > 0 {
+		userMsg.Images = chatManage.Images
+	}
+	responseChan, err := chatModel.ChatStream(ctx, []chat.Message{userMsg}, opt)
 	if err != nil {
 		logger.Errorf(ctx, "Failed to start streaming fallback response: %v, falling back to fixed response", err)
 		s.handleFixedFallback(ctx, chatManage)
@@ -1781,10 +1785,17 @@ func (s *sessionService) handleModelFallback(ctx context.Context, chatManage *ty
 	go s.consumeFallbackStream(ctx, chatManage, responseChan)
 }

-// renderFallbackPrompt renders the fallback prompt template with Query variable
+// renderFallbackPrompt renders the fallback prompt template with query and image context.
 func (s *sessionService) renderFallbackPrompt(ctx context.Context, chatManage *types.ChatManage) (string, error) {
-	// Use simple string replacement instead of Go template
-	result := strings.ReplaceAll(chatManage.FallbackPrompt, "{{query}}", chatManage.Query)
+	query := chatManage.Query
+	if rq := strings.TrimSpace(chatManage.RewriteQuery); rq != "" {
+		query = rq
+	}
+	result := strings.ReplaceAll(chatManage.FallbackPrompt, "{{query}}", query)
+
+	if chatManage.ImageDescription != "" && !chatManage.ChatModelSupportsVision {
+		result += "\n\n[用户上传图片内容]\n" + chatManage.ImageDescription
+	}
 	return result, nil
 }

--- a/internal/handler/session/handler.go
+++ b/internal/handler/session/handler.go
@@ -260,11 +260,19 @@ func (h *Handler) UpdateSession(c *gin.Context) {
 		return
 	}

+	// Reload session from database to return complete timestamps and stored fields
+	updatedSession, err := h.sessionService.GetSession(ctx, id)
+	if err != nil {
+		logger.ErrorWithFields(ctx, err, nil)
+		c.Error(errors.NewInternalServerError(err.Error()))
+		return
+	}
+
 	// Return updated session
 	logger.Infof(ctx, "Session updated successfully, ID: %s", id)
 	c.JSON(http.StatusOK, gin.H{
 		"success": true,
-		"data":    session,
+		"data":    updatedSession,
 	})
 }

--- a/internal/handler/session/helpers.go
+++ b/internal/handler/session/helpers.go
@@ -164,9 +164,9 @@ func createAgentQueryEvent(sessionID, assistantMessageID string) interfaces.Stre
 	}
 }

-// createUserMessage creates a user message
-func (h *Handler) createUserMessage(ctx context.Context, sessionID, query, requestID string, mentionedItems types.MentionedItems, images types.MessageImages) error {
-	_, err := h.messageService.CreateMessage(ctx, &types.Message{
+// createUserMessage creates a user message and returns the created message.
+func (h *Handler) createUserMessage(ctx context.Context, sessionID, query, requestID string, mentionedItems types.MentionedItems, images types.MessageImages) (*types.Message, error) {
+	return h.messageService.CreateMessage(ctx, &types.Message{
 		SessionID:      sessionID,
 		Role:           "user",
 		Content:        query,
@@ -176,7 +176,6 @@ func (h *Handler) createUserMessage(ctx context.Context, sessionID, query, reque
 		MentionedItems: mentionedItems,
 		Images:         images,
 	})
-	return err
 }

 // createAssistantMessage creates an assistant message
--- a/internal/handler/session/qa.go
+++ b/internal/handler/session/qa.go
@@ -35,6 +35,7 @@ type qaRequestContext struct {
 	mentionedItems    types.MentionedItems
 	effectiveTenantID uint64            // when using shared agent, tenant ID for model/KB/MCP resolution; 0 = use context tenant
 	images            []ImageAttachment // Uploaded images with analysis text
+	userMessageID     string            // Created user message ID (populated after createUserMessage)
 }

 // parseQARequest parses and validates a QA request, returns the request context
@@ -403,10 +404,12 @@ func (h *Handler) executeNormalModeQA(reqCtx *qaRequestContext, generateTitle bo
 	sessionID := reqCtx.sessionID

 	// Create user message
-	if err := h.createUserMessage(ctx, sessionID, reqCtx.query, reqCtx.requestID, reqCtx.mentionedItems, convertImageAttachments(reqCtx.images)); err != nil {
+	userMsg, err := h.createUserMessage(ctx, sessionID, reqCtx.query, reqCtx.requestID, reqCtx.mentionedItems, convertImageAttachments(reqCtx.images))
+	if err != nil {
 		reqCtx.c.Error(errors.NewInternalServerError(err.Error()))
 		return
 	}
+	reqCtx.userMessageID = userMsg.ID

 	// Create assistant message
 	if _, err := h.createAssistantMessage(ctx, reqCtx.assistantMessage); err != nil {
@@ -515,7 +518,7 @@ func (h *Handler) executeNormalModeQA(reqCtx *qaRequestContext, generateTitle bo
 			})
 		}

-		imageURLs, imageOCRText := extractImageURLsAndOCRText(reqCtx.images)
+		imageURLs, imageDescription := extractImageURLsAndOCRText(reqCtx.images)
 		err := h.sessionService.KnowledgeQA(
 			streamCtx.asyncCtx,
 			reqCtx.session,
@@ -528,7 +531,8 @@ func (h *Handler) executeNormalModeQA(reqCtx *qaRequestContext, generateTitle bo
 			streamCtx.eventBus,
 			reqCtx.customAgent,
 			reqCtx.enableMemory,
-			imageURLs, imageOCRText,
+			imageURLs, imageDescription,
+			reqCtx.userMessageID,
 		)
 		if err != nil {
 			logger.ErrorWithFields(streamCtx.asyncCtx, err, nil)
@@ -571,10 +575,12 @@ func (h *Handler) executeAgentModeQA(reqCtx *qaRequestContext) {
 	}

 	// Create user message
-	if err := h.createUserMessage(ctx, sessionID, reqCtx.query, reqCtx.requestID, reqCtx.mentionedItems, convertImageAttachments(reqCtx.images)); err != nil {
+	userMsg, err := h.createUserMessage(ctx, sessionID, reqCtx.query, reqCtx.requestID, reqCtx.mentionedItems, convertImageAttachments(reqCtx.images))
+	if err != nil {
 		reqCtx.c.Error(errors.NewInternalServerError(err.Error()))
 		return
 	}
+	reqCtx.userMessageID = userMsg.ID

 	// Create assistant message
 	assistantMessagePtr, err := h.createAssistantMessage(ctx, reqCtx.assistantMessage)
@@ -635,7 +641,7 @@ func (h *Handler) executeAgentModeQA(reqCtx *qaRequestContext) {
 			})
 		}

-		imageURLs, imageOCRText := extractImageURLsAndOCRText(reqCtx.images)
+		imageURLs, imageDescription := extractImageURLsAndOCRText(reqCtx.images)
 		err := h.sessionService.AgentQA(
 			streamCtx.asyncCtx,
 			reqCtx.session,
@@ -646,7 +652,8 @@ func (h *Handler) executeAgentModeQA(reqCtx *qaRequestContext) {
 			reqCtx.customAgent,
 			reqCtx.knowledgeBaseIDs,
 			reqCtx.knowledgeIDs,
-			imageURLs, imageOCRText,
+			imageURLs, imageDescription,
+			reqCtx.userMessageID,
 		)
 		if err != nil {
 			logger.ErrorWithFields(streamCtx.asyncCtx, err, nil)
--- a/internal/types/chat_manage.go
+++ b/internal/types/chat_manage.go
@@ -62,8 +62,9 @@ type ChatManage struct {
 	FAQScoreBoost            float64 `json:"-"` // Score multiplier for FAQ results

 	// Image support for multimodal chat
+	UserMessageID           string   `json:"-"` // User message ID for updating image captions after rewrite
 	Images                  []string `json:"-"` // Image URLs for MultiContent in current user message
-	ImageOCRText            string   `json:"-"` // Image description/OCR text generated by VLM (used as fallback for non-vision models)
+	ImageDescription        string   `json:"-"` // Image description (visual details + OCR text) generated by VLM (used as fallback for non-vision models)
 	VLMModelID              string   `json:"-"` // Agent-configured VLM model ID for image analysis
 	ChatModelSupportsVision bool     `json:"-"` // Whether the chat model accepts multimodal/image input
 	SkipKBSearch            bool     `json:"-"` // Set by rewrite intent classification: true = skip KB retrieval
@@ -136,8 +137,9 @@ func (c *ChatManage) Clone() *ChatManage {
 		FAQPriorityEnabled:       c.FAQPriorityEnabled,
 		FAQDirectAnswerThreshold: c.FAQDirectAnswerThreshold,
 		FAQScoreBoost:            c.FAQScoreBoost,
+		UserMessageID:            c.UserMessageID,
 		Images:                   append([]string(nil), c.Images...),
-		ImageOCRText:             c.ImageOCRText,
+		ImageDescription:         c.ImageDescription,
 		VLMModelID:               c.VLMModelID,
 		ChatModelSupportsVision:  c.ChatModelSupportsVision,
 		SkipKBSearch:             c.SkipKBSearch,
--- a/internal/types/interfaces/message.go
+++ b/internal/types/interfaces/message.go
@@ -29,6 +29,9 @@ type MessageService interface {
 	// UpdateMessage updates a message
 	UpdateMessage(ctx context.Context, message *types.Message) error

+	// UpdateMessageImages updates only the images JSONB column for a message.
+	UpdateMessageImages(ctx context.Context, sessionID, messageID string, images types.MessageImages) error
+
 	// DeleteMessage deletes a message
 	DeleteMessage(ctx context.Context, sessionID string, id string) error

@@ -66,6 +69,8 @@ type MessageRepository interface {
 	) ([]*types.Message, error)
 	// UpdateMessage updates a message
 	UpdateMessage(ctx context.Context, message *types.Message) error
+	// UpdateMessageImages updates only the images JSONB column for a message
+	UpdateMessageImages(ctx context.Context, sessionID, messageID string, images types.MessageImages) error
 	// DeleteMessage deletes a message
 	DeleteMessage(ctx context.Context, sessionID string, id string) error
 	// GetFirstMessageOfUser gets the first message of a user
--- a/internal/types/interfaces/session.go
+++ b/internal/types/interfaces/session.go
@@ -44,7 +44,7 @@ type SessionService interface {
 		session *types.Session, query string, knowledgeBaseIDs []string, knowledgeIDs []string,
 		assistantMessageID string, summaryModelID string, webSearchEnabled bool, eventBus *event.EventBus,
 		customAgent *types.CustomAgent, enableMemory bool,
-		imageURLs []string, imageOCRText string,
+		imageURLs []string, imageDescription string, userMessageID string,
 	) error
 	// KnowledgeQAByEvent performs knowledge-based question answering by event
 	KnowledgeQAByEvent(ctx context.Context, chatManage *types.ChatManage, eventList []types.EventType) error
@@ -66,7 +66,7 @@ type SessionService interface {
 		customAgent *types.CustomAgent,
 		knowledgeBaseIDs []string,
 		knowledgeIDs []string,
-		imageURLs []string, imageOCRText string,
+		imageURLs []string, imageDescription string, userMessageID string,
 	) error
 	// ClearContext clears the LLM context for a session
 	ClearContext(ctx context.Context, sessionID string) error