Files
WeKnora/internal/application/service/knowledge_process.go
wizardchen 4a0a151d41 fix(knowledge): drain finalizing counter on all terminal subtask exits
The finalizing subtask counter (introduced when wiki ingest was counted)
could leak slots, leaving a fully-parsed doc stuck in "finalizing" until
the housekeeping sweep wrongly marked it "failed".

- wiki ingest: a doc skipped in map (knowledge deleted / no chunks /
  insufficient text) produced no docResult and was not a failedOp, so
  neither the success nor the dead-letter drain fired. Drain the slot on
  that terminal skip path.
- summary & question: the drain was keyed on the span-error variable,
  which assumes "err != nil => asynq will retry". Several branches set
  that variable yet `return nil` (insufficient text content, KB/knowledge
  fetch failures) - terminal, no retry - so the drain was skipped. Key
  the drain on the value actually returned to asynq (named retErr)
  instead, so terminal nil-returns drain and only retried errors wait
  for the final attempt.

Also fix the trace panel header flashing "已完成" mid-wiki: the latest
attempt's root span closes while async post-pipeline subspans keep
running, so trace.status read terminal while the row was still
"finalizing". Prefer parse_status on the latest attempt while it is
non-terminal so the panel header, LIVE badge and doc card agree, and add
the "finalizing" status label to all locales.
2026-05-29 19:32:13 +08:00

3022 lines
112 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"slices"
"sort"
"strings"
"time"
"github.com/Tencent/WeKnora/internal/application/service/retriever"
werrors "github.com/Tencent/WeKnora/internal/errors"
"github.com/Tencent/WeKnora/internal/infrastructure/chunker"
"github.com/Tencent/WeKnora/internal/infrastructure/docparser"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/models/chat"
"github.com/Tencent/WeKnora/internal/models/embedding"
"github.com/Tencent/WeKnora/internal/searchutil"
"github.com/Tencent/WeKnora/internal/tracing"
"github.com/Tencent/WeKnora/internal/tracing/langfuse"
"github.com/Tencent/WeKnora/internal/types"
"github.com/Tencent/WeKnora/internal/types/interfaces"
secutils "github.com/Tencent/WeKnora/internal/utils"
"github.com/google/uuid"
"github.com/hibiken/asynq"
"go.opentelemetry.io/otel/attribute"
)
func (s *knowledgeService) cloneKnowledge(
ctx context.Context,
src *types.Knowledge,
targetKB *types.KnowledgeBase,
) (err error) {
if src.ParseStatus != "completed" {
logger.GetLogger(ctx).WithField("knowledge_id", src.ID).Errorf("MoveKnowledge parse status is not completed")
return nil
}
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
dst := &types.Knowledge{
ID: uuid.New().String(),
TenantID: targetKB.TenantID,
KnowledgeBaseID: targetKB.ID,
Type: src.Type,
Channel: src.Channel,
Title: src.Title,
Description: src.Description,
Source: src.Source,
ParseStatus: "processing",
EnableStatus: "disabled",
EmbeddingModelID: targetKB.EmbeddingModelID,
FileName: src.FileName,
FileType: src.FileType,
FileSize: src.FileSize,
FileHash: src.FileHash,
FilePath: src.FilePath,
StorageSize: src.StorageSize,
Metadata: src.Metadata,
}
defer func() {
if err != nil {
dst.ParseStatus = "failed"
dst.ErrorMessage = err.Error()
_ = s.repo.UpdateKnowledge(ctx, dst)
logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge failed to move knowledge")
} else {
dst.ParseStatus = "completed"
dst.EnableStatus = "enabled"
_ = s.repo.UpdateKnowledge(ctx, dst)
logger.GetLogger(ctx).WithField("knowledge_id", dst.ID).Infof("MoveKnowledge move knowledge successfully")
}
}()
if err = s.repo.CreateKnowledge(ctx, dst); err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge create knowledge failed")
return
}
tenantInfo.StorageUsed += dst.StorageSize
if err = s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, dst.StorageSize); err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("MoveKnowledge update tenant storage used failed")
return
}
if err = s.CloneChunk(ctx, src, dst); err != nil {
logger.GetLogger(ctx).WithField("knowledge_id", dst.ID).
WithField("error", err).Errorf("MoveKnowledge move chunks failed")
return
}
return
}
// processDocumentFromPassage handles asynchronous processing of text passages
func (s *knowledgeService) processDocumentFromPassage(ctx context.Context,
kb *types.KnowledgeBase, knowledge *types.Knowledge, passage []string,
) {
// Update status to processing
knowledge.ParseStatus = "processing"
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
return
}
// Convert passages to chunks
chunks := make([]types.ParsedChunk, 0, len(passage))
start, end := 0, 0
for i, p := range passage {
if p == "" {
continue
}
end += len([]rune(p))
chunks = append(chunks, types.ParsedChunk{
Content: p,
Seq: i,
Start: start,
End: end,
})
start = end
}
// Process and store chunks
var opts ProcessChunksOptions
if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled {
opts.EnableQuestionGeneration = true
opts.QuestionCount = kb.QuestionGenerationConfig.QuestionCount
if opts.QuestionCount <= 0 {
opts.QuestionCount = 3
}
}
s.processChunks(ctx, kb, knowledge, chunks, opts)
}
// ProcessChunksOptions contains options for processing chunks
type ProcessChunksOptions struct {
EnableQuestionGeneration bool
QuestionCount int
EnableMultimodel bool
StoredImages []docparser.StoredImage
// ParentChunks holds parent chunk data when parent-child chunking is enabled.
// When set, the chunks passed to processChunks are child chunks, and each
// child's ParentIndex references an entry in this slice.
ParentChunks []types.ParsedParentChunk
Metadata map[string]string
}
// finalizeIndexedKnowledgeState makes a document retrievable as soon as chunks
// and indexes are persisted (enable_status=enabled), but it deliberately does
// NOT mark the row completed when enrichment is still expected. Whenever the
// document still has work to fan out — pending multimodal image tasks, or text
// chunks that feed summary/question/graph generation — parse_status stays
// "processing" so KnowledgePostProcess remains the single authority that drives
// processing → finalizing → completed. Marking the row completed here would make
// post-process hit its "non-processing status" guard and skip the summary
// fan-out, stranding summary_status on "pending" forever.
func finalizeIndexedKnowledgeState(
knowledge *types.Knowledge,
totalStorageSize int64,
textChunkCount int,
hasPendingMultimodal bool,
now time.Time,
) {
if hasPendingMultimodal || textChunkCount > 0 {
knowledge.ParseStatus = types.ParseStatusProcessing
knowledge.SummaryStatus = types.SummaryStatusNone
} else {
// No text chunks and no pending multimodal work: there is nothing for
// post-process to enrich, so complete immediately.
knowledge.ParseStatus = types.ParseStatusCompleted
knowledge.SummaryStatus = types.SummaryStatusNone
}
knowledge.EnableStatus = "enabled"
knowledge.StorageSize = totalStorageSize
knowledge.ProcessedAt = &now
knowledge.UpdatedAt = now
}
// buildSplitterConfig creates a SplitterConfig with fallbacks from a KnowledgeBase.
// Defaults mirror chunker.DefaultChunkSize / DefaultChunkOverlap so behavior is
// identical whether callers come through this path or invoke the chunker
// directly with a zero-value config.
func buildSplitterConfig(kb *types.KnowledgeBase) chunker.SplitterConfig {
chunkCfg := chunker.SplitterConfig{
ChunkSize: kb.ChunkingConfig.ChunkSize,
ChunkOverlap: kb.ChunkingConfig.ChunkOverlap,
Separators: kb.ChunkingConfig.Separators,
Strategy: kb.ChunkingConfig.Strategy,
TokenLimit: kb.ChunkingConfig.TokenLimit,
Languages: kb.ChunkingConfig.Languages,
}
if chunkCfg.ChunkSize <= 0 {
chunkCfg.ChunkSize = chunker.DefaultChunkSize
}
if chunkCfg.ChunkOverlap <= 0 {
chunkCfg.ChunkOverlap = chunker.DefaultChunkOverlap
}
if len(chunkCfg.Separators) == 0 {
chunkCfg.Separators = []string{"\n\n", "\n", "。"}
}
return chunkCfg
}
// buildParentChildConfigs derives parent and child SplitterConfig from ChunkingConfig.
// The base config (already validated with defaults) is used for separators.
func buildParentChildConfigs(cc types.ChunkingConfig, base chunker.SplitterConfig) (parent, child chunker.SplitterConfig) {
parentSize := cc.ParentChunkSize
if parentSize <= 0 {
parentSize = 4096
}
childSize := cc.ChildChunkSize
if childSize <= 0 {
childSize = 384
}
parent = chunker.SplitterConfig{
ChunkSize: parentSize,
ChunkOverlap: base.ChunkOverlap, // reuse configured overlap for parents
Separators: base.Separators,
}
child = chunker.SplitterConfig{
ChunkSize: childSize,
ChunkOverlap: childSize / 5, // ~20% overlap for child chunks
Separators: base.Separators,
}
return
}
// processChunks processes chunks and creates embeddings for knowledge content
func (s *knowledgeService) processChunks(ctx context.Context,
kb *types.KnowledgeBase, knowledge *types.Knowledge, chunks []types.ParsedChunk,
opts ...ProcessChunksOptions,
) {
// Get options
var options ProcessChunksOptions
if len(opts) > 0 {
options = opts[0]
}
ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.processChunks")
defer span.End()
span.SetAttributes(
attribute.Int("tenant_id", int(knowledge.TenantID)),
attribute.String("knowledge_base_id", knowledge.KnowledgeBaseID),
attribute.String("knowledge_id", knowledge.ID),
attribute.String("embedding_model_id", kb.EmbeddingModelID),
attribute.Int("chunk_count", len(chunks)),
)
// Check if knowledge is being deleted/cancelled before processing.
// Both statuses short-circuit identically here — there's nothing to clean
// up yet so the branch is purely "stop early".
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "Knowledge aborted (%s), skipping chunk processing: %s", status, knowledge.ID)
span.AddEvent("aborted: knowledge " + status)
return
}
// Get embedding model for vectorization — only needed when vector/keyword indexing is enabled
var embeddingModel embedding.Embedder
if kb.NeedsEmbeddingModel() {
var err error
embeddingModel, err = s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID)
if err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks get embedding model failed")
span.RecordError(err)
return
}
} else {
logger.Infof(ctx, "Vector/keyword indexing disabled for KB %s, skipping embedding model", kb.ID)
}
// 幂等性处理清理旧的chunks和索引数据避免重复数据
logger.Infof(ctx, "Cleaning up existing chunks and index data for knowledge: %s", knowledge.ID)
// 删除旧的chunks
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
logger.Warnf(ctx, "Failed to delete existing chunks (may not exist): %v", err)
// 不返回错误,继续处理(可能没有旧数据)
}
// 删除旧的索引数据 — only when vector/keyword indexing is enabled
tenantInfo := ctx.Value(types.TenantInfoContextKey).(*types.Tenant)
retrieveEngine, err := retriever.CreateRetrieveEngineForKB(
ctx, s.retrieveEngine, s.ownership, tenantInfo.ID, kb.VectorStoreID)
if err == nil && embeddingModel != nil {
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, []string{knowledge.ID}, embeddingModel.GetDimensions(), knowledge.Type); err != nil {
logger.Warnf(ctx, "Failed to delete existing index data (may not exist): %v", err)
// 不返回错误,继续处理(可能没有旧数据)
} else {
logger.Infof(ctx, "Successfully deleted existing index data for knowledge: %s", knowledge.ID)
}
}
// 删除知识图谱数据(如果存在)
namespace := types.NameSpace{KnowledgeBase: knowledge.KnowledgeBaseID, Knowledge: knowledge.ID}
if err := s.graphEngine.DelGraph(ctx, []types.NameSpace{namespace}); err != nil {
logger.Warnf(ctx, "Failed to delete existing graph data (may not exist): %v", err)
// 不返回错误,继续处理
}
logger.Infof(ctx, "Cleanup completed, starting to process new chunks")
// ========== DocReader 解析结果日志 ==========
logger.Infof(ctx, "[DocReader] ========== 解析结果概览 ==========")
logger.Infof(ctx, "[DocReader] 知识ID: %s, 知识库ID: %s", knowledge.ID, knowledge.KnowledgeBaseID)
logger.Infof(ctx, "[DocReader] 总Chunk数量: %d", len(chunks))
// 统计图片信息
totalImages := 0
chunksWithImages := 0
for _, chunkData := range chunks {
if len(chunkData.Images) > 0 {
chunksWithImages++
totalImages += len(chunkData.Images)
}
}
logger.Infof(ctx, "[DocReader] 包含图片的Chunk数: %d, 总图片数: %d", chunksWithImages, totalImages)
// 打印每个Chunk的详细信息
for idx, chunkData := range chunks {
contentPreview := chunkData.Content
if len(contentPreview) > 200 {
contentPreview = contentPreview[:200] + "..."
}
logger.Infof(ctx, "[DocReader] Chunk #%d (seq=%d): 内容长度=%d, 图片数=%d, 范围=[%d-%d]",
idx, chunkData.Seq, len(chunkData.Content), len(chunkData.Images), chunkData.Start, chunkData.End)
logger.Debugf(ctx, "[DocReader] Chunk #%d 内容预览: %s", idx, contentPreview)
// 打印图片详细信息
for imgIdx, img := range chunkData.Images {
logger.Infof(ctx, "[DocReader] 图片 #%d: URL=%s", imgIdx, img.URL)
logger.Infof(ctx, "[DocReader] 图片 #%d: OriginalURL=%s", imgIdx, img.OriginalURL)
if img.Caption != "" {
captionPreview := img.Caption
if len(captionPreview) > 100 {
captionPreview = captionPreview[:100] + "..."
}
logger.Infof(ctx, "[DocReader] 图片 #%d: Caption=%s", imgIdx, captionPreview)
}
if img.OCRText != "" {
ocrPreview := img.OCRText
if len(ocrPreview) > 100 {
ocrPreview = ocrPreview[:100] + "..."
}
logger.Infof(ctx, "[DocReader] 图片 #%d: OCRText=%s", imgIdx, ocrPreview)
}
logger.Infof(ctx, "[DocReader] 图片 #%d: 位置=[%d-%d]", imgIdx, img.Start, img.End)
}
}
logger.Infof(ctx, "[DocReader] ========== 解析结果概览结束 ==========")
// Create chunk objects from proto chunks
maxSeq := 0
// 统计图片相关的子Chunk数量用于扩展insertChunks的容量
imageChunkCount := 0
for _, chunkData := range chunks {
if len(chunkData.Images) > 0 {
// 为每个图片的OCR和Caption分别创建一个Chunk
imageChunkCount += len(chunkData.Images) * 2
}
if int(chunkData.Seq) > maxSeq {
maxSeq = int(chunkData.Seq)
}
}
// === Parent-Child Chunking: create parent chunks first ===
hasParentChild := len(options.ParentChunks) > 0
var parentDBChunks []*types.Chunk // indexed by ParsedParentChunk position
if hasParentChild {
parentDBChunks = make([]*types.Chunk, len(options.ParentChunks))
for i, pc := range options.ParentChunks {
parentDBChunks[i] = &types.Chunk{
ID: uuid.New().String(),
TenantID: knowledge.TenantID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
Content: pc.Content,
ChunkIndex: pc.Seq,
IsEnabled: true,
CreatedAt: time.Now(),
UpdatedAt: time.Now(),
StartAt: pc.Start,
EndAt: pc.End,
ChunkType: types.ChunkTypeParentText,
}
}
// Set prev/next links for parent chunks
for i := range parentDBChunks {
if i > 0 {
parentDBChunks[i-1].NextChunkID = parentDBChunks[i].ID
parentDBChunks[i].PreChunkID = parentDBChunks[i-1].ID
}
}
logger.Infof(ctx, "Created %d parent chunks for parent-child strategy", len(parentDBChunks))
}
// 重新分配容量考虑图片相关的Chunk + parent chunks
parentCount := len(options.ParentChunks)
insertChunks := make([]*types.Chunk, 0, len(chunks)+imageChunkCount+parentCount)
// Add parent chunks first (they go into DB but NOT into the vector index)
if hasParentChild {
insertChunks = append(insertChunks, parentDBChunks...)
}
for idx, chunkData := range chunks {
if strings.TrimSpace(chunkData.Content) == "" {
continue
}
// 创建主文本Chunk
textChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: knowledge.TenantID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
Content: chunkData.Content,
ContextHeader: chunkData.ContextHeader,
ChunkIndex: int(chunkData.Seq),
IsEnabled: true,
CreatedAt: time.Now(),
UpdatedAt: time.Now(),
StartAt: int(chunkData.Start),
EndAt: int(chunkData.End),
ChunkType: types.ChunkTypeText,
}
// Wire up ParentChunkID for child chunks
if hasParentChild && chunkData.ParentIndex >= 0 && chunkData.ParentIndex < len(parentDBChunks) {
textChunk.ParentChunkID = parentDBChunks[chunkData.ParentIndex].ID
}
chunks[idx].ChunkID = textChunk.ID
insertChunks = append(insertChunks, textChunk)
}
// Sort chunks by index for proper ordering
sort.Slice(insertChunks, func(i, j int) bool {
return insertChunks[i].ChunkIndex < insertChunks[j].ChunkIndex
})
// 仅为文本类型的Chunk设置前后关系child chunks only, parents already linked above
textChunks := make([]*types.Chunk, 0, len(chunks))
for _, chunk := range insertChunks {
if chunk.ChunkType == types.ChunkTypeText && chunk.ParentChunkID != "" {
// This is a child chunk in parent-child mode
textChunks = append(textChunks, chunk)
} else if chunk.ChunkType == types.ChunkTypeText && !hasParentChild {
// Normal flat chunk (no parent-child mode)
textChunks = append(textChunks, chunk)
}
}
// 设置文本Chunk之间的前后关系 (skip if parent-child, children don't need prev/next links)
if !hasParentChild {
for i, chunk := range textChunks {
if i > 0 {
textChunks[i-1].NextChunkID = chunk.ID
}
if i < len(textChunks)-1 {
textChunks[i+1].PreChunkID = chunk.ID
}
}
}
// Check if knowledge is being deleted/cancelled before writing chunks.
// Nothing has been persisted yet, so both branches just bail.
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "Knowledge aborted (%s), skipping chunk write: %s", status, knowledge.ID)
span.AddEvent("aborted: knowledge " + status + " before saving")
return
}
// Save chunks to database — ALWAYS, regardless of indexing strategy.
// Chunks are needed for wiki generation, graph extraction, and summary generation
// even when vector/keyword indexing is disabled.
span.AddEvent("create chunks")
s.beginStage(ctx, knowledge.ID, types.StageChunking, types.JSONMap{
"chunks_planned": len(insertChunks),
})
if err := s.chunkService.CreateChunks(ctx, insertChunks); err != nil {
knowledge.ParseStatus = types.ParseStatusFailed
knowledge.ErrorMessage = err.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
span.RecordError(err)
s.failStage(ctx, knowledge.ID, types.StageChunking,
werrors.ErrCodeChunkingFailed, "create chunks failed", err)
return
}
totalChunkChars := 0
for _, c := range insertChunks {
totalChunkChars += len(c.Content)
}
s.endStage(ctx, knowledge.ID, types.StageChunking, types.JSONMap{
"chunks_written": len(insertChunks),
"total_text_chars": totalChunkChars,
})
// Create index information and perform vector indexing — only when vector/keyword is enabled.
// Chunks are ALWAYS saved to DB (above) because wiki and graph need them even without vector indexing.
var totalStorageSize int64
if kb.NeedsEmbeddingModel() && embeddingModel != nil {
embedInput := types.JSONMap{
"chunks_to_embed": len(textChunks),
"model_id": kb.EmbeddingModelID,
}
if dim := embeddingModel.GetDimensions(); dim > 0 {
embedInput["dim"] = dim
}
s.beginStage(ctx, knowledge.ID, types.StageEmbedding, embedInput)
// Create index information — only for child/flat chunks, NOT parent chunks.
// Parent chunks are stored for context retrieval but do not need vector embeddings.
// Prepend the document title to improve semantic alignment between
// question-style queries and statement-style chunk content.
indexInfoList := make([]*types.IndexInfo, 0, len(textChunks))
titlePrefix := ""
if t := strings.TrimSpace(knowledge.Title); t != "" {
titlePrefix = t + "\n"
}
for _, chunk := range textChunks {
// chunk.EmbeddingContent prepends ContextHeader (heading breadcrumb)
// when the chunker populated it during Tier-1 splitting; falls back
// to plain Content otherwise. Title prefix sits outermost.
indexContent := titlePrefix + chunk.EmbeddingContent()
indexInfoList = append(indexInfoList, &types.IndexInfo{
Content: indexContent,
SourceID: chunk.ID,
SourceType: types.ChunkSourceType,
ChunkID: chunk.ID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
IsEnabled: true,
})
}
// Calculate storage size required for embeddings
span.AddEvent("estimate storage size")
totalStorageSize = retrieveEngine.EstimateStorageSize(ctx, embeddingModel, indexInfoList)
if tenantInfo.StorageQuota > 0 {
// Re-fetch tenant storage information
tenantInfo, err = s.tenantRepo.GetTenantByID(ctx, tenantInfo.ID)
if err != nil {
knowledge.ParseStatus = types.ParseStatusFailed
knowledge.ErrorMessage = err.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
span.RecordError(err)
return
}
// Check if there's enough storage quota available
if tenantInfo.StorageUsed+totalStorageSize > tenantInfo.StorageQuota {
knowledge.ParseStatus = types.ParseStatusFailed
knowledge.ErrorMessage = "存储空间不足"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
span.RecordError(errors.New("storage quota exceeded"))
return
}
}
// Check again before batch indexing (heavy operation).
// deleting → row is going away anyway, drop the chunks we just wrote.
// cancelled → user wants to keep what was already persisted, just stop.
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "Knowledge aborted (%s) before indexing: %s", status, knowledge.ID)
if status == types.ParseStatusDeleting {
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
logger.Warnf(ctx, "Failed to cleanup chunks after deletion detected: %v", err)
}
}
span.AddEvent("aborted: knowledge " + status + " before indexing")
return
}
span.AddEvent("batch index")
err = retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfoList)
if err != nil {
knowledge.ParseStatus = types.ParseStatusFailed
knowledge.ErrorMessage = err.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
// delete failed chunks
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
logger.Errorf(ctx, "Delete chunks failed: %v", err)
}
// delete index
if err := retrieveEngine.DeleteByKnowledgeIDList(
ctx, []string{knowledge.ID}, embeddingModel.GetDimensions(), kb.Type,
); err != nil {
logger.Errorf(ctx, "Delete index failed: %v", err)
}
span.RecordError(err)
// Map vector store / embedding rate-limit errors to a
// stable code so the UI can offer "retry later" hints.
code := werrors.ErrCodeVectorStoreWriteFailed
if isLikelyRateLimitError(err) {
code = werrors.ErrCodeEmbeddingRateLimit
}
s.failStage(ctx, knowledge.ID, types.StageEmbedding,
code, "batch index failed", err)
return
}
logger.GetLogger(ctx).Infof("processChunks batch index successfully, with %d index", len(indexInfoList))
s.endStage(ctx, knowledge.ID, types.StageEmbedding, types.JSONMap{
"vectors_written": len(indexInfoList),
"storage_bytes": totalStorageSize,
})
// Final check before marking as completed.
// deleting → drop chunks+index we just wrote.
// cancelled → keep persisted data; the row stays in cancelled status
// and downstream stages skip via the entry guards.
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "Knowledge aborted (%s) after indexing: %s", status, knowledge.ID)
if status == types.ParseStatusDeleting {
if err := s.chunkService.DeleteChunksByKnowledgeID(ctx, knowledge.ID); err != nil {
logger.Warnf(ctx, "Failed to cleanup chunks after deletion detected: %v", err)
}
if err := retrieveEngine.DeleteByKnowledgeIDList(ctx, []string{knowledge.ID}, embeddingModel.GetDimensions(), kb.Type); err != nil {
logger.Warnf(ctx, "Failed to cleanup index after deletion detected: %v", err)
}
}
span.AddEvent("aborted: knowledge " + status + " during processing")
return
}
} else {
logger.Infof(ctx, "Vector/keyword indexing disabled for KB %s, skipping BatchIndex", kb.ID)
s.skipStage(ctx, knowledge.ID, types.StageEmbedding, "skipped")
}
// Check if this document has extracted images that will be processed asynchronously
isImage := IsImageType(knowledge.FileType)
isVideo := IsVideoType(knowledge.FileType)
pendingMultimodal := isImage && options.EnableMultimodel && len(options.StoredImages) > 0
pendingPDFMultimodal := !isImage && !isVideo && options.EnableMultimodel && len(options.StoredImages) > 0
now := time.Now()
finalizeIndexedKnowledgeState(
knowledge,
totalStorageSize,
len(textChunks),
pendingMultimodal || pendingPDFMultimodal,
now,
)
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update knowledge failed")
}
// Enqueue multimodal tasks for images (async, non-blocking)
if options.EnableMultimodel && len(options.StoredImages) > 0 {
s.beginStage(ctx, knowledge.ID, types.StageMultimodal, types.JSONMap{
"image_count": len(options.StoredImages),
"enable_ocr": true,
"enable_caption": true,
})
s.enqueueImageMultimodalTasks(ctx, knowledge, kb, options.StoredImages, chunks, options.Metadata)
} else {
s.skipStage(ctx, knowledge.ID, types.StageMultimodal, "skipped")
// If there are no multimodal tasks, enqueue the post process task immediately
lang, _ := types.LanguageFromContext(ctx)
postProcessPayload := types.KnowledgePostProcessPayload{
TenantID: knowledge.TenantID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
Language: lang,
Attempt: attemptFromCtx(ctx),
}
langfuse.InjectTracing(ctx, &postProcessPayload)
payloadBytes, err := json.Marshal(postProcessPayload)
if err == nil {
task := asynq.NewTask(types.TypeKnowledgePostProcess, payloadBytes, asynq.Queue("default"), asynq.MaxRetry(3))
if _, err := s.task.Enqueue(task); err != nil {
logger.Errorf(ctx, "Failed to enqueue knowledge post process task: %v", err)
} else {
logger.Infof(ctx, "Enqueued knowledge post process task for %s", knowledge.ID)
}
} else {
logger.Errorf(ctx, "Failed to marshal knowledge post process payload: %v", err)
}
}
// Update tenant's storage usage
tenantInfo.StorageUsed += totalStorageSize
if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, totalStorageSize); err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update tenant storage used failed")
}
logger.GetLogger(ctx).Infof("processChunks successfully")
}
// defaultMaxInputChars is the default maximum characters used as input for summary generation.
const defaultMaxInputChars = 1024 * 24
// imageDominatedTextThreshold is the rune count below which a document is
// considered "image-dominated" — i.e. the body text is so sparse that we
// should fall back to full image enrichment (caption + OCR) for the summary
// LLM call. Above this threshold the document has enough native text that
// caption-only enrichment is preferable (OCR text from incidental figures
// would otherwise add noise without contributing to the main topic).
const imageDominatedTextThreshold = 200
// errInsufficientSummaryContent signals that getSummary refused to call the
// LLM because the document had no usable text after image markup was stripped
// (typical for scanned PDFs where VLM OCR yielded nothing). Callers should
// mark the knowledge's summary as failed instead of falling back to the first
// chunk's raw content (which would just be a bare image reference).
var errInsufficientSummaryContent = errors.New("insufficient text content for summary generation")
// checkSufficientSummaryContent returns errInsufficientSummaryContent if the
// given content does not carry enough real text (after stripping image markup)
// for an LLM summary call, and logs a warning at the call site. Returns nil
// when the content passes the threshold.
//
// Extracted so the threshold gate can be unit-tested without standing up the
// full ProcessSummaryGeneration dependency graph.
func checkSufficientSummaryContent(ctx context.Context, knowledgeID, content string) error {
realTextLen := realTextRuneCount(content)
if realTextLen < minTextContentRunes {
logger.GetLogger(ctx).Warnf(
"summary content check: knowledge %s has insufficient text after stripping image markup (real_text_runes=%d, min=%d); skipping LLM call",
knowledgeID, realTextLen, minTextContentRunes,
)
return errInsufficientSummaryContent
}
return nil
}
// getSummary generates a summary for knowledge content using an AI model
func (s *knowledgeService) getSummary(ctx context.Context,
summaryModel chat.Chat, knowledge *types.Knowledge, chunks []*types.Chunk,
) (string, error) {
// Get knowledge info from the first chunk
if len(chunks) == 0 {
return "", fmt.Errorf("no chunks provided for summary generation")
}
// Determine max input chars from config
maxInputChars := defaultMaxInputChars
if s.config.Conversation.Summary != nil && s.config.Conversation.Summary.MaxInputChars > 0 {
maxInputChars = s.config.Conversation.Summary.MaxInputChars
}
// Sort chunks by StartAt for proper concatenation
sortedChunks := make([]*types.Chunk, len(chunks))
copy(sortedChunks, chunks)
sort.Slice(sortedChunks, func(i, j int) bool {
return sortedChunks[i].StartAt < sortedChunks[j].StartAt
})
// Concatenate original chunk contents by StartAt offset to reconstruct the
// document, then enrich with image info in a second pass. Enrichment must
// happen AFTER concatenation because StartAt is based on original document
// offsets — enriched (longer) content would break the positioning.
chunkContents := ""
for _, chunk := range sortedChunks {
runes := []rune(chunkContents)
if chunk.StartAt <= len(runes) {
chunkContents = string(runes[:chunk.StartAt]) + chunk.Content
} else {
chunkContents = chunkContents + chunk.Content
}
}
// Collect image_info from image_ocr/image_caption children and enrich
chunkIDs := make([]string, len(sortedChunks))
for i, c := range sortedChunks {
chunkIDs[i] = c.ID
}
imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, knowledge.TenantID, chunkIDs)
mergedImageInfo := searchutil.MergeImageInfoJSON(imageInfoMap)
if mergedImageInfo != "" {
// For image-dominated documents (e.g. a docx whose only payload is a
// single embedded picture, or a screenshot-only file), captions alone
// often carry too little signal — the real content lives in OCR text.
// Detect that case by measuring the document's real (non-image-markup)
// text BEFORE enrichment, and switch to full enrichment (caption + OCR)
// when the body is essentially empty. Text-heavy documents stay on the
// caption-only path to avoid OCR noise (page headers/footers/watermarks
// from many figures diluting the main topic).
if realTextRuneCount(chunkContents) < imageDominatedTextThreshold {
// Caption + OCR (no URL/original wrappers — those are pure noise
// for the summary LLM and have been observed to trigger the
// "image reference with no extracted text" refusal heuristic).
chunkContents = searchutil.EnrichContentCaptionAndOCR(chunkContents, mergedImageInfo)
} else {
chunkContents = searchutil.EnrichContentCaptionOnly(chunkContents, mergedImageInfo)
}
}
// Apply length limit: sample long content to fit within maxInputChars
chunkContents = sampleLongContent(chunkContents, maxInputChars)
logger.GetLogger(ctx).Infof("getSummary: content length=%d chars (max=%d) for knowledge %s",
len([]rune(chunkContents)), maxInputChars, knowledge.ID)
// Bail out before the LLM call when there is not enough actual text to
// summarise. We deliberately do not pass filename/file-type metadata to the
// LLM: scanned PDFs frequently carry filenames like "MX5280.pdf" (the
// scanner model), and feeding that to the model would invite it to
// hallucinate a scanner manual instead of admitting the document had no
// extractable text.
if err := checkSufficientSummaryContent(ctx, knowledge.ID, chunkContents); err != nil {
return "", err
}
// Pass the raw chunk text to the LLM with no filename / file-type framing.
contentWithMetadata := chunkContents
// Determine max output tokens from config
maxTokens := 2048
if s.config.Conversation.Summary != nil && s.config.Conversation.Summary.MaxCompletionTokens > 0 {
maxTokens = s.config.Conversation.Summary.MaxCompletionTokens
}
// Generate summary using AI model
summaryPrompt := types.RenderPromptPlaceholders(s.config.Conversation.GenerateSummaryPrompt, types.PlaceholderValues{
"language": types.LanguageNameFromContext(ctx),
})
thinking := false
summary, err := summaryModel.Chat(ctx, []chat.Message{
{
Role: "system",
Content: summaryPrompt,
},
{
Role: "user",
Content: contentWithMetadata,
},
}, &chat.ChatOptions{
Temperature: 0.3,
MaxTokens: maxTokens,
Thinking: &thinking,
})
if err != nil {
logger.GetLogger(ctx).WithField("error", err).Errorf("GetSummary failed")
return "", err
}
logger.GetLogger(ctx).WithField("summary", summary.Content).Infof("GetSummary success")
return summary.Content, nil
}
// sampleLongContent returns content that fits within maxChars.
// For short content (≤ maxChars), it is returned as-is.
// For long content, it samples: head (60%), tail (20%), and evenly-spaced middle (20%),
// joined by "[...content omitted...]" markers so the LLM knows content was skipped.
func sampleLongContent(content string, maxChars int) string {
runes := []rune(content)
if len(runes) <= maxChars {
return content
}
const omitMarker = "\n\n[...content omitted...]\n\n"
omitRunes := len([]rune(omitMarker))
// Reserve space for two omit markers (head→middle, middle→tail)
usable := maxChars - 2*omitRunes
if usable < 100 {
// Fallback: just truncate
return string(runes[:maxChars])
}
headLen := usable * 60 / 100
tailLen := usable * 20 / 100
midLen := usable - headLen - tailLen
head := string(runes[:headLen])
tail := string(runes[len(runes)-tailLen:])
// Sample middle portion: take a contiguous block from the center of the document
midStart := len(runes)/2 - midLen/2
if midStart < headLen {
midStart = headLen
}
midEnd := midStart + midLen
if midEnd > len(runes)-tailLen {
midEnd = len(runes) - tailLen
midStart = midEnd - midLen
if midStart < headLen {
midStart = headLen
}
}
middle := string(runes[midStart:midEnd])
return head + omitMarker + middle + omitMarker + tail
}
// ProcessSummaryGeneration handles async summary generation task
func (s *knowledgeService) ProcessSummaryGeneration(ctx context.Context, t *asynq.Task) (retErr error) {
var payload types.SummaryGenerationPayload
if err := json.Unmarshal(t.Payload(), &payload); err != nil {
logger.Errorf(ctx, "Failed to unmarshal summary generation payload: %v", err)
return nil // Don't retry on unmarshal error
}
logger.Infof(ctx, "Processing summary generation for knowledge: %s", payload.KnowledgeID)
// Set tenant and language context
ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID)
if payload.Language != "" {
ctx = context.WithValue(ctx, types.LanguageContextKey, payload.Language)
}
// A newer attempt (re-upload / edit / reparse) has superseded this one:
// skip before opening the span or registering the FinalizeSubtask defer
// so we neither read stale chunks nor decrement the new attempt's counter.
if attemptSuperseded(ctx, s.tracker(), payload.KnowledgeID, payload.Attempt) {
logger.Infof(ctx, "summary: attempt %d superseded for %s, skipping stale enrichment",
payload.Attempt, payload.KnowledgeID)
return nil
}
// Open a subspan under the parent attempt's postprocess stage so the
// trace surface shows the real summary-generation duration (LLM call
// + chunk write + index) instead of just the upstream's enqueue time.
// Closes via the deferred handler below — every return path lands in
// the defer, including the early returns ahead.
span := s.beginPostprocessSubspan(ctx, payload.KnowledgeID, payload.Attempt, "postprocess.summary",
types.JSONMap{
"language": payload.Language,
})
var summaryErr error
summaryOut := types.JSONMap{}
defer func() {
// Decrement the parent's enrichment counter on terminal exit.
// "Terminal" is keyed on the value RETURNED to asynq, not on
// summaryErr: several branches record a failure on the span
// (summaryErr != nil) yet deliberately `return nil` so asynq does
// NOT retry (e.g. insufficient text content, KB/knowledge fetch
// failures). Those are terminal and must drain — keying on
// summaryErr would skip them and leave the row stuck in
// "finalizing". When we DO return an error asynq will retry, so
// we only drain on the final attempt.
if (retErr == nil || isFinalAsynqAttempt(ctx)) && payload.KnowledgeID != "" {
if _, _, ferr := s.repo.FinalizeSubtask(ctx, payload.KnowledgeID); ferr != nil {
logger.Warnf(ctx, "summary: FinalizeSubtask failed for %s: %v", payload.KnowledgeID, ferr)
}
}
if span == nil {
return
}
if summaryErr != nil {
s.failPostprocessSubspan(ctx, span, "SUMMARY_FAILED", summaryErr.Error(), summaryErr)
} else {
s.endPostprocessSubspan(ctx, span, summaryOut)
}
}()
// Get knowledge base
kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID)
if err != nil {
logger.Errorf(ctx, "Failed to get knowledge base: %v", err)
summaryErr = err
return nil
}
// Capture the resolved model id on the span output the moment we
// know it — debugging "summary stage took 60s" benefits hugely from
// seeing WHICH chat model was actually used (kb config drift, fall-
// throughs to a slow upstream, etc.).
summaryOut["model_id"] = kb.SummaryModelID
if kb.SummaryModelID == "" {
logger.Warn(ctx, "Knowledge base summary model ID is empty, skipping summary generation")
summaryOut["skipped"] = "no_summary_model"
return nil
}
// Get knowledge
knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID)
if err != nil {
logger.Errorf(ctx, "Failed to get knowledge: %v", err)
summaryErr = err
return nil
}
// Short-circuit when the user cancelled parsing or the row is being deleted.
if knowledge != nil {
switch knowledge.ParseStatus {
case types.ParseStatusCancelled, types.ParseStatusDeleting:
logger.Infof(ctx, "Summary generation: knowledge aborted (%s), skipping: %s",
knowledge.ParseStatus, payload.KnowledgeID)
summaryOut["skipped"] = "knowledge_" + knowledge.ParseStatus
return nil
}
}
// Update summary status to processing
knowledge.SummaryStatus = types.SummaryStatusProcessing
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Warnf(ctx, "Failed to update summary status to processing: %v", err)
}
// Helper function to mark summary as failed
markSummaryFailed := func() {
knowledge.SummaryStatus = types.SummaryStatusFailed
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Warnf(ctx, "Failed to update summary status to failed: %v", err)
}
}
// Get text chunks for this knowledge
chunks, err := s.chunkService.ListChunksByKnowledgeID(ctx, payload.KnowledgeID)
if err != nil {
logger.Errorf(ctx, "Failed to get chunks: %v", err)
markSummaryFailed()
summaryErr = err
return nil
}
// Filter text chunks only
textChunks := make([]*types.Chunk, 0)
for _, chunk := range chunks {
if chunk.ChunkType == types.ChunkTypeText {
textChunks = append(textChunks, chunk)
}
}
summaryOut["text_chunks"] = len(textChunks)
if len(textChunks) == 0 {
logger.Infof(ctx, "No text chunks found for knowledge: %s", payload.KnowledgeID)
// Mark as completed since there's nothing to summarize
knowledge.SummaryStatus = types.SummaryStatusCompleted
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
summaryOut["skipped"] = "no_text_chunks"
return nil
}
// Sort chunks by ChunkIndex for proper ordering
sort.Slice(textChunks, func(i, j int) bool {
return textChunks[i].ChunkIndex < textChunks[j].ChunkIndex
})
// Initialize chat model for summary
chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID)
if err != nil {
logger.Errorf(ctx, "Failed to get chat model: %v", err)
markSummaryFailed()
summaryErr = err
return fmt.Errorf("failed to get chat model: %w", err)
}
// Generate summary
summary, err := s.getSummary(ctx, chatModel, knowledge, textChunks)
if err != nil {
logger.Errorf(ctx, "Failed to generate summary for knowledge %s: %v", payload.KnowledgeID, err)
// Surface the underlying LLM/IO error on the span so the trace UI
// can explain "why did this stage take 60s and then fall back?"
// without forcing the operator to grep worker logs. We also capture
// the error type to disambiguate timeouts from upstream HTTP errors
// (deadline exceeded vs unexpected EOF vs 5xx, etc.).
summaryOut["error"] = previewText(err.Error(), 500)
summaryOut["error_type"] = fmt.Sprintf("%T", err)
// For the insufficient-content case (scanned PDF without OCR, etc.)
// we deliberately do NOT fall back to the first chunk's raw content,
// since that chunk is typically just a bare markdown image reference
// and surfacing it in the description is misleading.
if errors.Is(err, errInsufficientSummaryContent) {
knowledge.Description = ""
knowledge.SummaryStatus = types.SummaryStatusFailed
knowledge.UpdatedAt = time.Now()
if updateErr := s.repo.UpdateKnowledge(ctx, knowledge); updateErr != nil {
logger.Errorf(ctx, "Failed to mark summary as failed: %v", updateErr)
summaryErr = updateErr
return fmt.Errorf("failed to update knowledge: %w", updateErr)
}
summaryOut["fallback"] = "insufficient_content"
summaryErr = err
return nil
}
// For other errors (LLM API issues etc.), fall back to the first chunk.
if len(textChunks) > 0 {
summary = textChunks[0].Content
if len(summary) > 500 {
runes := []rune(summary)
if len(runes) > 500 {
summary = string(runes[:500])
}
}
summaryOut["fallback"] = "first_chunk"
}
}
// Update knowledge description
knowledge.Description = summary
knowledge.SummaryStatus = types.SummaryStatusCompleted
knowledge.UpdatedAt = time.Now()
summaryOut["summary_chars"] = len([]rune(summary))
// Preview the generated summary on the span output so the trace
// viewer can show "this is what the LLM produced" at a glance,
// without hopping to the knowledge-detail page. Capped to keep
// span rows compact.
summaryOut["summary_preview"] = previewText(summary, 240)
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Errorf(ctx, "Failed to update knowledge description: %v", err)
summaryErr = err
return fmt.Errorf("failed to update knowledge: %w", err)
}
// Create summary chunk and index it — only when RAG indexing is enabled.
// Wiki-only KBs don't need summary chunks in the vector index.
if strings.TrimSpace(summary) != "" && kb.NeedsEmbeddingModel() {
// Get max chunk index
maxChunkIndex := 0
for _, chunk := range chunks {
if chunk.ChunkIndex > maxChunkIndex {
maxChunkIndex = chunk.ChunkIndex
}
}
// Embed only the LLM-generated summary in the indexed chunk.
// We deliberately omit knowledge.FileName here: filenames are an
// unreliable signal (e.g. "MX5280.pdf" for a scanned legal letter)
// and surfacing them in retrieved RAG context can re-introduce the
// hallucination vector this branch is meant to close.
summaryChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: knowledge.TenantID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
Content: fmt.Sprintf("# Summary\n%s", summary),
ChunkIndex: maxChunkIndex + 1,
IsEnabled: true,
CreatedAt: time.Now(),
UpdatedAt: time.Now(),
StartAt: 0,
EndAt: 0,
ChunkType: types.ChunkTypeSummary,
ParentChunkID: textChunks[0].ID,
}
// Save summary chunk
if err := s.chunkService.CreateChunks(ctx, []*types.Chunk{summaryChunk}); err != nil {
logger.Errorf(ctx, "Failed to create summary chunk: %v", err)
summaryErr = err
return fmt.Errorf("failed to create summary chunk: %w", err)
}
// Index summary chunk
tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID)
if err != nil {
logger.Errorf(ctx, "Failed to get tenant info: %v", err)
summaryErr = err
return fmt.Errorf("failed to get tenant info: %w", err)
}
ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo)
retrieveEngine, err := retriever.CreateRetrieveEngineForKB(
ctx, s.retrieveEngine, s.ownership, tenantInfo.ID, kb.VectorStoreID)
if err != nil {
logger.Errorf(ctx, "Failed to init retrieve engine: %v", err)
summaryErr = err
return fmt.Errorf("failed to init retrieve engine: %w", err)
}
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID)
if err != nil {
logger.Errorf(ctx, "Failed to get embedding model: %v", err)
summaryErr = err
return fmt.Errorf("failed to get embedding model: %w", err)
}
indexInfo := []*types.IndexInfo{{
Content: summaryChunk.Content,
SourceID: summaryChunk.ID,
SourceType: types.ChunkSourceType,
ChunkID: summaryChunk.ID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
IsEnabled: true,
}}
if err := retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfo); err != nil {
logger.Errorf(ctx, "Failed to index summary chunk: %v", err)
summaryErr = err
return fmt.Errorf("failed to index summary chunk: %w", err)
}
logger.Infof(ctx, "Successfully created and indexed summary chunk for knowledge: %s", payload.KnowledgeID)
summaryOut["summary_chunk_indexed"] = true
}
logger.Infof(ctx, "Successfully generated summary for knowledge: %s", payload.KnowledgeID)
summaryOut["status"] = "completed"
return nil
}
// ProcessQuestionGeneration handles async question generation task
func (s *knowledgeService) ProcessQuestionGeneration(ctx context.Context, t *asynq.Task) (retErr error) {
taskStartedAt := time.Now()
retryCount, _ := asynq.GetRetryCount(ctx)
maxRetry, _ := asynq.GetMaxRetry(ctx)
var payload types.QuestionGenerationPayload
exitStatus := "success"
totalChunks := 0
totalTextChunks := 0
emptyContentChunks := 0
llmCallAttempts := 0
llmCallSuccess := 0
llmCallFailed := 0
llmCallEmpty := 0
generatedQuestionsTotal := 0
chunkMetadataSetFailed := 0
chunkUpdateFailed := 0
indexEntriesPrepared := 0
indexBatchAttempted := false
indexBatchSucceeded := false
// Sample question + model id surfaced on the span output so the
// trace viewer can answer "what did the LLM actually produce?" and
// "which model did it run on?" without joining back to the chunk
// store. Captured the first time we see a non-empty question batch.
var sampleQuestion string
var resolvedModelID string
// Postprocess subspan for the trace viewer. Opened lazily after we
// unmarshal the payload (so we have payload.Attempt) and closed in
// the defer below alongside the stats log so the span output mirrors
// what we already log to stdout.
var qSpan *Span
var qErr error
// Set when a newer attempt supersedes this run; suppresses the
// FinalizeSubtask decrement so a stale task can't drain the new
// attempt's counter.
superseded := false
// Decrement enrichment counter on terminal exit. Keyed on the value
// RETURNED to asynq (retErr), not qErr: some branches record a span
// failure (qErr != nil) yet `return nil` so asynq won't retry (KB /
// knowledge fetch failures); those are terminal and must drain.
// Keying on qErr would skip them and strand the row in "finalizing".
// When we return an error asynq retries, so we only drain on the
// final attempt. Runs AFTER the stats-log defer below — defers
// unwind LIFO, so this one declared first executes last.
defer func() {
if !superseded && (retErr == nil || isFinalAsynqAttempt(ctx)) && payload.KnowledgeID != "" {
if _, _, ferr := s.repo.FinalizeSubtask(ctx, payload.KnowledgeID); ferr != nil {
logger.Warnf(ctx, "question: FinalizeSubtask failed for %s: %v", payload.KnowledgeID, ferr)
}
}
}()
defer func() {
logger.Infof(
ctx,
"Question generation stats: knowledge=%s kb=%s retry=%d/%d status=%s elapsed=%s chunks(total=%d,text=%d,empty_text=%d) llm(attempt=%d,success=%d,empty=%d,failed=%d) generated_questions=%d chunk_update_failed=%d metadata_set_failed=%d index(prepared=%d,attempted=%v,succeeded=%v)",
payload.KnowledgeID,
payload.KnowledgeBaseID,
retryCount,
maxRetry,
exitStatus,
time.Since(taskStartedAt).Round(time.Millisecond),
totalChunks,
totalTextChunks,
emptyContentChunks,
llmCallAttempts,
llmCallSuccess,
llmCallEmpty,
llmCallFailed,
generatedQuestionsTotal,
chunkUpdateFailed,
chunkMetadataSetFailed,
indexEntriesPrepared,
indexBatchAttempted,
indexBatchSucceeded,
)
if qSpan != nil {
out := types.JSONMap{
"status": exitStatus,
"total_chunks": totalChunks,
"text_chunks": totalTextChunks,
"empty_content_chunks": emptyContentChunks,
"llm_attempts": llmCallAttempts,
"llm_success": llmCallSuccess,
"llm_empty": llmCallEmpty,
"llm_failed": llmCallFailed,
"questions_generated": generatedQuestionsTotal,
"chunk_update_failed": chunkUpdateFailed,
"metadata_set_failed": chunkMetadataSetFailed,
"index_entries_prepared": indexEntriesPrepared,
"index_batch_attempted": indexBatchAttempted,
"index_batch_succeeded": indexBatchSucceeded,
"retry": retryCount,
"max_retry": maxRetry,
}
// Surface the resolved model id and a sample question on the
// span output. These help debugging "why is question generation
// slow" — both questions ("which model was hit?") and ("what
// did it produce?") are hard to answer from logs alone.
if resolvedModelID != "" {
out["model_id"] = resolvedModelID
}
if sampleQuestion != "" {
out["sample_question"] = sampleQuestion
}
// Treat any non-success exitStatus as a failed run; the
// existing stats-string already enumerates them. qErr stays
// optional for callers that want to surface a Go error.
if exitStatus != "success" || qErr != nil {
msg := exitStatus
var detailErr error = qErr
if qErr != nil {
msg = qErr.Error()
}
s.failPostprocessSubspan(ctx, qSpan, "QUESTION_FAILED", msg, detailErr)
} else {
s.endPostprocessSubspan(ctx, qSpan, out)
}
}
}()
if err := json.Unmarshal(t.Payload(), &payload); err != nil {
exitStatus = "invalid_payload"
logger.Errorf(ctx, "Failed to unmarshal question generation payload: %v", err)
return nil // Don't retry on unmarshal error
}
logger.Infof(ctx, "Processing question generation for knowledge: %s", payload.KnowledgeID)
// A newer attempt has superseded this one: skip before opening the span
// so we don't read stale chunks. superseded suppresses the counter
// decrement in the defer above; qSpan stays nil so the stats defer no-ops.
if attemptSuperseded(ctx, s.tracker(), payload.KnowledgeID, payload.Attempt) {
superseded = true
exitStatus = "superseded"
logger.Infof(ctx, "question: attempt %d superseded for %s, skipping stale enrichment",
payload.Attempt, payload.KnowledgeID)
return nil
}
// Open the postprocess.question subspan now that we have payload.Attempt.
// Closes via the defer above.
qSpan = s.beginPostprocessSubspan(ctx, payload.KnowledgeID, payload.Attempt, "postprocess.question",
types.JSONMap{
"question_count": payload.QuestionCount,
"language": payload.Language,
})
// Set tenant context
ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID)
if payload.Language != "" {
ctx = context.WithValue(ctx, types.LanguageContextKey, payload.Language)
}
if strings.TrimSpace(s.config.Conversation.GenerateQuestionsPrompt) == "" {
exitStatus = "prompt_not_configured"
logger.Errorf(ctx, "GenerateQuestionsPrompt is empty: configure conversation.generate_questions_prompt_id")
qErr = fmt.Errorf("generate questions prompt not configured")
return qErr
}
// Get knowledge base
kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID)
if err != nil {
exitStatus = "kb_not_found"
logger.Errorf(ctx, "Failed to get knowledge base: %v", err)
qErr = err
return nil
}
// Get knowledge
knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID)
if err != nil {
exitStatus = "knowledge_not_found"
logger.Errorf(ctx, "Failed to get knowledge: %v", err)
qErr = err
return nil
}
// Short-circuit when the user cancelled parsing or the row is being deleted.
if knowledge != nil {
switch knowledge.ParseStatus {
case types.ParseStatusCancelled, types.ParseStatusDeleting:
exitStatus = "knowledge_" + knowledge.ParseStatus
logger.Infof(ctx, "Question generation: knowledge aborted (%s), skipping: %s",
knowledge.ParseStatus, payload.KnowledgeID)
return nil
}
}
// Get text chunks for this knowledge
chunks, err := s.chunkService.ListChunksByKnowledgeID(ctx, payload.KnowledgeID)
if err != nil {
exitStatus = "list_chunks_failed"
logger.Errorf(ctx, "Failed to get chunks: %v", err)
return nil
}
totalChunks = len(chunks)
// Filter text chunks only
textChunks := make([]*types.Chunk, 0)
for _, chunk := range chunks {
if chunk.ChunkType == types.ChunkTypeText {
textChunks = append(textChunks, chunk)
}
}
totalTextChunks = len(textChunks)
if len(textChunks) == 0 {
exitStatus = "no_text_chunks"
logger.Infof(ctx, "No text chunks found for knowledge: %s", payload.KnowledgeID)
return nil
}
// Sort chunks by StartAt for context building
sort.Slice(textChunks, func(i, j int) bool {
return textChunks[i].StartAt < textChunks[j].StartAt
})
// Initialize chat model
chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID)
if err != nil {
exitStatus = "get_chat_model_failed"
logger.Errorf(ctx, "Failed to get chat model: %v", err)
return fmt.Errorf("failed to get chat model: %w", err)
}
resolvedModelID = kb.SummaryModelID
// Initialize embedding model and retrieval engine
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID)
if err != nil {
exitStatus = "get_embedding_model_failed"
logger.Errorf(ctx, "Failed to get embedding model: %v", err)
return fmt.Errorf("failed to get embedding model: %w", err)
}
tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID)
if err != nil {
exitStatus = "get_tenant_failed"
logger.Errorf(ctx, "Failed to get tenant info: %v", err)
return fmt.Errorf("failed to get tenant info: %w", err)
}
ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo)
retrieveEngine, err := retriever.CreateRetrieveEngineForKB(
ctx, s.retrieveEngine, s.ownership, tenantInfo.ID, kb.VectorStoreID)
if err != nil {
exitStatus = "init_retrieve_engine_failed"
logger.Errorf(ctx, "Failed to init retrieve engine: %v", err)
return fmt.Errorf("failed to init retrieve engine: %w", err)
}
questionCount := payload.QuestionCount
if questionCount <= 0 {
questionCount = 3
}
if questionCount > 10 {
questionCount = 10
}
// Collect image info for all text chunks so question generation can
// see caption / OCR text instead of bare image links.
textChunkIDs := make([]string, len(textChunks))
for i, c := range textChunks {
textChunkIDs[i] = c.ID
}
imageInfoMap := searchutil.CollectImageInfoByChunkIDs(ctx, s.chunkRepo, payload.TenantID, textChunkIDs)
enrichContent := func(chunk *types.Chunk) string {
if info, ok := imageInfoMap[chunk.ID]; ok && info != "" {
return searchutil.EnrichContentWithImageInfo(chunk.Content, info)
}
return chunk.Content
}
// Generate questions for each chunk with context
var indexInfoList []*types.IndexInfo
for i, chunk := range textChunks {
if strings.TrimSpace(chunk.Content) == "" {
emptyContentChunks++
continue
}
// Build context from adjacent chunks
var prevContent, nextContent string
if i > 0 {
prevContent = enrichContent(textChunks[i-1])
}
if i < len(textChunks)-1 {
nextContent = enrichContent(textChunks[i+1])
}
llmCallAttempts++
questions, err := s.generateQuestionsWithContext(ctx, chatModel, enrichContent(chunk), prevContent, nextContent, knowledge.Title, questionCount)
if err != nil {
llmCallFailed++
logger.Warnf(ctx, "Failed to generate questions for chunk %s: %v", chunk.ID, err)
continue
}
if len(questions) == 0 {
llmCallEmpty++
continue
}
llmCallSuccess++
generatedQuestionsTotal += len(questions)
if sampleQuestion == "" && len(questions) > 0 {
sampleQuestion = previewText(questions[0], 200)
}
// Update chunk metadata with unique IDs for each question
generatedQuestions := make([]types.GeneratedQuestion, len(questions))
for j, question := range questions {
questionID := fmt.Sprintf("q%d", time.Now().UnixNano()+int64(j))
generatedQuestions[j] = types.GeneratedQuestion{
ID: questionID,
Question: question,
}
}
meta := &types.DocumentChunkMetadata{
GeneratedQuestions: generatedQuestions,
}
if err := chunk.SetDocumentMetadata(meta); err != nil {
chunkMetadataSetFailed++
logger.Warnf(ctx, "Failed to set document metadata for chunk %s: %v", chunk.ID, err)
continue
}
// Update chunk in database
if err := s.chunkService.UpdateChunk(ctx, chunk); err != nil {
chunkUpdateFailed++
logger.Warnf(ctx, "Failed to update chunk %s: %v", chunk.ID, err)
continue
}
// Create index entries for generated questions
for _, gq := range generatedQuestions {
sourceID := fmt.Sprintf("%s-%s", chunk.ID, gq.ID)
indexInfoList = append(indexInfoList, &types.IndexInfo{
Content: gq.Question,
SourceID: sourceID,
SourceType: types.ChunkSourceType,
ChunkID: chunk.ID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: knowledge.KnowledgeBaseID,
IsEnabled: true,
})
}
logger.Debugf(ctx, "Generated %d questions for chunk %s", len(questions), chunk.ID)
}
indexEntriesPrepared = len(indexInfoList)
// Index generated questions
if len(indexInfoList) > 0 {
indexBatchAttempted = true
if err := retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfoList); err != nil {
exitStatus = "index_questions_failed"
logger.Errorf(ctx, "Failed to index generated questions: %v", err)
return fmt.Errorf("failed to index questions: %w", err)
}
indexBatchSucceeded = true
logger.Infof(ctx, "Successfully indexed %d generated questions for knowledge: %s", len(indexInfoList), payload.KnowledgeID)
}
return nil
}
// generateQuestionsWithContext generates questions for a chunk with surrounding context
func (s *knowledgeService) generateQuestionsWithContext(ctx context.Context,
chatModel chat.Chat, content, prevContent, nextContent, docName string, questionCount int,
) ([]string, error) {
if content == "" || questionCount <= 0 {
return nil, nil
}
prompt := strings.TrimSpace(s.config.Conversation.GenerateQuestionsPrompt)
if prompt == "" {
return nil, fmt.Errorf("generate questions prompt not configured")
}
// Build context section
var contextSection string
if prevContent != "" || nextContent != "" {
contextSection = "<surrounding_context>\n"
if prevContent != "" {
contextSection += fmt.Sprintf("<preceding_content>\n%s\n\n</preceding_content>\n\n", prevContent)
}
if nextContent != "" {
contextSection += fmt.Sprintf("<following_content>\n%s\n\n</following_content>\n\n", nextContent)
}
contextSection += "</surrounding_context>\n\n"
}
langName := types.LanguageNameFromContext(ctx)
prompt = types.RenderPromptPlaceholders(prompt, types.PlaceholderValues{
"question_count": fmt.Sprintf("%d", questionCount),
"content": content,
"context": contextSection,
"doc_name": docName,
"language": langName,
})
thinking := false
response, err := chatModel.Chat(ctx, []chat.Message{
{
Role: "user",
Content: prompt,
},
}, &chat.ChatOptions{
Temperature: 0.7,
MaxTokens: 512,
Thinking: &thinking,
})
if err != nil {
return nil, fmt.Errorf("failed to generate questions: %w", err)
}
// Parse response
lines := strings.Split(response.Content, "\n")
questions := make([]string, 0, questionCount)
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" {
continue
}
line = strings.TrimLeft(line, "0123456789.-*) ")
line = strings.TrimSpace(line)
if line != "" && len(line) > 5 {
questions = append(questions, line)
if len(questions) >= questionCount {
break
}
}
}
return questions, nil
}
// ReparseKnowledge deletes existing document content and re-parses the knowledge asynchronously.
// This method reuses the logic from UpdateManualKnowledge for resource cleanup and async parsing.
func (s *knowledgeService) ReparseKnowledge(ctx context.Context, knowledgeID string) (*types.Knowledge, error) {
logger.Info(ctx, "Start re-parsing knowledge")
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
existing, err := s.repo.GetKnowledgeByID(ctx, tenantID, knowledgeID)
if err != nil {
logger.Errorf(ctx, "Failed to load knowledge: %v", err)
return nil, err
}
// Allocate a fresh span tree attempt up front. Doing this BEFORE
// the cleanup + enqueue means: (a) the UI immediately sees a new
// attempt with all five stages back to "pending" instead of the
// previous run's "failed" badge lingering; (b) the worker's
// fallback path won't double-allocate when payload.Attempt is
// already set on the queued task.
reparseAttempt := 0
if root, n, err := s.tracker().OpenAttempt(ctx, existing.ID, ""); err == nil && root != nil {
reparseAttempt = n
} else if err != nil {
logger.Warnf(ctx, "[Reparse] OpenAttempt failed for %s: %v (will fall back in worker)", existing.ID, err)
}
// Get knowledge base configuration
kb, err := s.kbService.GetKnowledgeBaseByID(ctx, existing.KnowledgeBaseID)
if err != nil {
logger.Errorf(ctx, "Failed to get knowledge base for reparse: %v", err)
return nil, err
}
// Keep wiki's pending queue consistent across both manual and non-manual
// paths. The destructive work (swapping old wiki contributions for new)
// happens asynchronously inside mapOneDocument — see its oldPageSlugs
// handling — once post-process re-enqueues wiki ingest. All we need to
// do here is stop any stale pending ingest op from firing against the
// pre-reparse chunk set.
if kb != nil && kb.IsWikiEnabled() {
s.prepareWikiForReparse(ctx, existing)
}
// For manual knowledge, use async manual processing (cleanup + re-indexing in worker)
if existing.IsManual() {
meta, metaErr := existing.ManualMetadata()
if metaErr != nil || meta == nil {
logger.Errorf(ctx, "Failed to get manual metadata for reparse: %v", metaErr)
return nil, werrors.NewBadRequestError("无法获取手工知识内容")
}
existing.ParseStatus = "pending"
existing.EnableStatus = "disabled"
existing.Description = ""
existing.ProcessedAt = nil
existing.EmbeddingModelID = kb.EmbeddingModelID
// Reset the enrichment counter so a leftover value from a
// previous attempt (e.g. cancelled before all subtasks decremented)
// cannot block the new finalizing transition later.
existing.PendingSubtasksCount = 0
if err := s.repo.UpdateKnowledge(ctx, existing); err != nil {
logger.Errorf(ctx, "Failed to update knowledge status before reparse: %v", err)
return nil, err
}
if err := s.enqueueManualProcessing(ctx, existing, meta.Content, true); err != nil {
logger.Errorf(ctx, "Failed to enqueue manual reparse task: %v", err)
existing.ParseStatus = "failed"
existing.ErrorMessage = "Failed to enqueue processing task"
s.repo.UpdateKnowledge(ctx, existing)
}
return existing, nil
}
// For non-manual knowledge, cleanup synchronously then enqueue document processing
logger.Infof(ctx, "Cleaning up existing resources for knowledge: %s", knowledgeID)
if err := s.cleanupKnowledgeResources(ctx, existing); err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"knowledge_id": knowledgeID,
})
return nil, err
}
// Step 2: Update knowledge status and metadata
existing.ParseStatus = "pending"
existing.EnableStatus = "disabled"
existing.Description = ""
existing.ProcessedAt = nil
existing.EmbeddingModelID = kb.EmbeddingModelID
// Reset the enrichment counter so a leftover value from a previous
// attempt cannot block the new finalizing transition later.
existing.PendingSubtasksCount = 0
if err := s.repo.UpdateKnowledge(ctx, existing); err != nil {
logger.Errorf(ctx, "Failed to update knowledge status before reparse: %v", err)
return nil, err
}
// Step 3: Trigger async re-parsing based on knowledge type
logger.Infof(ctx, "Knowledge status updated, scheduling async reparse, ID: %s, Type: %s", existing.ID, existing.Type)
// For file-based knowledge, enqueue document processing task
if existing.FilePath != "" {
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
// Determine multimodal setting
enableMultimodel := kb.IsMultimodalEnabled()
// Check question generation config
enableQuestionGeneration := false
questionCount := 3 // default
if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled {
enableQuestionGeneration = true
if kb.QuestionGenerationConfig.QuestionCount > 0 {
questionCount = kb.QuestionGenerationConfig.QuestionCount
}
}
lang, _ := types.LanguageFromContext(ctx)
taskPayload := types.DocumentProcessPayload{
TenantID: tenantID,
KnowledgeID: existing.ID,
KnowledgeBaseID: existing.KnowledgeBaseID,
FilePath: existing.FilePath,
FileName: existing.FileName,
FileType: getFileType(existing.FileName),
EnableMultimodel: enableMultimodel,
EnableQuestionGeneration: enableQuestionGeneration,
QuestionCount: questionCount,
Language: lang,
Attempt: reparseAttempt,
}
langfuse.InjectTracing(ctx, &taskPayload)
payloadBytes, err := json.Marshal(taskPayload)
if err != nil {
logger.Errorf(ctx, "Failed to marshal reparse task payload: %v", err)
return existing, nil
}
task := asynq.NewTask(
types.TypeDocumentProcess,
payloadBytes,
documentProcessTaskOptions(s.config, asynq.MaxRetry(3))...,
)
info, err := s.task.Enqueue(task)
if err != nil {
logger.Errorf(ctx, "Failed to enqueue reparse task: %v", err)
return existing, nil
}
logger.Infof(ctx, "Enqueued reparse task: id=%s queue=%s knowledge_id=%s", info.ID, info.Queue, existing.ID)
// For data tables (csv, xlsx, xls), also enqueue summary task
if slices.Contains([]string{"csv", "xlsx", "xls"}, getFileType(existing.FileName)) {
NewDataTableSummaryTask(ctx, s.task, tenantID, existing.ID, kb.SummaryModelID, kb.EmbeddingModelID)
}
return existing, nil
}
// For file-URL-based knowledge, enqueue document processing task with FileURL field
if existing.Type == "file_url" && existing.Source != "" {
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
enableMultimodel := kb.IsMultimodalEnabled()
// Check question generation config
enableQuestionGeneration := false
questionCount := 3
if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled {
enableQuestionGeneration = true
if kb.QuestionGenerationConfig.QuestionCount > 0 {
questionCount = kb.QuestionGenerationConfig.QuestionCount
}
}
lang, _ := types.LanguageFromContext(ctx)
taskPayload := types.DocumentProcessPayload{
TenantID: tenantID,
KnowledgeID: existing.ID,
KnowledgeBaseID: existing.KnowledgeBaseID,
FileURL: existing.Source,
FileName: existing.FileName,
FileType: existing.FileType,
EnableMultimodel: enableMultimodel,
EnableQuestionGeneration: enableQuestionGeneration,
QuestionCount: questionCount,
Language: lang,
Attempt: reparseAttempt,
}
langfuse.InjectTracing(ctx, &taskPayload)
payloadBytes, err := json.Marshal(taskPayload)
if err != nil {
logger.Errorf(ctx, "Failed to marshal file URL reparse task payload: %v", err)
return existing, nil
}
task := asynq.NewTask(
types.TypeDocumentProcess,
payloadBytes,
documentProcessTaskOptions(s.config)...,
)
info, err := s.task.Enqueue(task)
if err != nil {
logger.Errorf(ctx, "Failed to enqueue file URL reparse task: %v", err)
return existing, nil
}
logger.Infof(ctx, "Enqueued file URL reparse task: id=%s queue=%s knowledge_id=%s", info.ID, info.Queue, existing.ID)
return existing, nil
}
// For URL-based knowledge, enqueue URL processing task
if existing.Type == "url" && existing.Source != "" {
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
enableMultimodel := kb.IsMultimodalEnabled()
// Check question generation config
enableQuestionGeneration := false
questionCount := 3
if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled {
enableQuestionGeneration = true
if kb.QuestionGenerationConfig.QuestionCount > 0 {
questionCount = kb.QuestionGenerationConfig.QuestionCount
}
}
lang, _ := types.LanguageFromContext(ctx)
taskPayload := types.DocumentProcessPayload{
TenantID: tenantID,
KnowledgeID: existing.ID,
KnowledgeBaseID: existing.KnowledgeBaseID,
URL: existing.Source,
EnableMultimodel: enableMultimodel,
EnableQuestionGeneration: enableQuestionGeneration,
QuestionCount: questionCount,
Language: lang,
Attempt: reparseAttempt,
}
langfuse.InjectTracing(ctx, &taskPayload)
payloadBytes, err := json.Marshal(taskPayload)
if err != nil {
logger.Errorf(ctx, "Failed to marshal URL reparse task payload: %v", err)
return existing, nil
}
task := asynq.NewTask(
types.TypeDocumentProcess,
payloadBytes,
documentProcessTaskOptions(s.config, asynq.MaxRetry(3))...,
)
info, err := s.task.Enqueue(task)
if err != nil {
logger.Errorf(ctx, "Failed to enqueue URL reparse task: %v", err)
return existing, nil
}
logger.Infof(ctx, "Enqueued URL reparse task: id=%s queue=%s knowledge_id=%s", info.ID, info.Queue, existing.ID)
return existing, nil
}
logger.Warnf(ctx, "Knowledge %s has no parseable content (no file, URL, or manual content)", knowledgeID)
return existing, nil
}
// CancelKnowledgeParse marks an in-progress parse as cancelled by the user.
//
// Semantics (kept aligned with the existing deleting path, but partial work
// is preserved instead of cleaned up):
// - parse_status is set to "cancelled"; partial chunks/index already written
// to the database remain on disk. The user can re-trigger parsing via the
// existing ReparseKnowledge API, which overwrites status back to pending.
// - Any in-flight worker reads the new status at its next checkpoint and
// bails (see processChunks / ProcessDocument / downstream handlers).
// - The asynq inspector (if available) dequeues pending / scheduled / retry
// tasks for this knowledge_id across the default / critical / low queues
// and signals active workers to stop. Lite mode (no Redis) skips the
// dequeue step — the checkpoint-based abort is the only stop signal there.
// - Idempotent: re-calling on an already-cancelled row is a no-op.
//
// Errors:
// - ParseStatusCompleted / ParseStatusFailed: the parse has already finished.
// - ParseStatusDeleting: a delete is in progress; cancel cannot supersede it.
func (s *knowledgeService) CancelKnowledgeParse(
ctx context.Context, knowledgeID string,
) (*types.Knowledge, error) {
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
existing, err := s.repo.GetKnowledgeByID(ctx, tenantID, knowledgeID)
if err != nil {
logger.Errorf(ctx, "CancelKnowledgeParse: failed to load knowledge: %v", err)
return nil, err
}
if existing == nil {
return nil, werrors.NewNotFoundError("knowledge not found")
}
switch existing.ParseStatus {
case types.ParseStatusCancelled:
// Idempotent — still attempt the dequeue in case earlier calls
// raced an enqueue, but skip the row update / span close path.
s.dequeueKnowledgeTasks(ctx, knowledgeID)
return existing, nil
case types.ParseStatusCompleted, types.ParseStatusFailed:
return nil, werrors.NewBadRequestError("解析已结束,无法取消")
case types.ParseStatusDeleting:
return nil, werrors.NewBadRequestError("知识正在删除中,无法取消解析")
case types.ParseStatusPending, types.ParseStatusProcessing, types.ParseStatusFinalizing:
// Cancellable. `finalizing` is the post-process fan-out window
// where graph-extract / summary / question subtasks are still
// running; cancel here stops the LLM cost they would burn.
default:
// Unknown status — let it through but log. Should never happen
// outside test fixtures or hand-edited rows.
logger.Warnf(ctx, "CancelKnowledgeParse: unexpected status %q for %s, proceeding",
existing.ParseStatus, knowledgeID)
}
// Flip the row to cancelled and zero the enrichment counter in one
// update so a late subtask FinalizeSubtask call can't race-promote
// the row back to completed. Persisted partial data is left in
// place — the user can reuse it on the next reparse attempt.
now := time.Now()
if err := s.repo.UpdateKnowledgeColumns(ctx, existing.ID, map[string]interface{}{
"parse_status": types.ParseStatusCancelled,
"error_message": "用户已取消解析",
"pending_subtasks_count": 0,
"updated_at": now,
}); err != nil {
logger.Errorf(ctx, "CancelKnowledgeParse: failed to mark knowledge cancelled: %v", err)
return nil, err
}
existing.ParseStatus = types.ParseStatusCancelled
existing.ErrorMessage = "用户已取消解析"
existing.PendingSubtasksCount = 0
existing.UpdatedAt = now
logger.Infof(ctx, "Knowledge %s marked as cancelled by user", knowledgeID)
// Close the active attempt span tree so the UI stops showing "进行中"
// for the cancelled run. AbortAttempt cascade-cancels every still-
// running descendant (multimodal per-image, postprocess subtasks,
// graph chunks) BEFORE closing the root, otherwise the trace
// viewer would leave those striped/running bars hanging forever
// because workers exit via their abort-guard without ever calling
// FailSpan on their own subspan. Best-effort: nil tracker / missing
// attempt no-ops.
if attempt := s.tracker().LatestAttempt(ctx, knowledgeID); attempt > 0 {
s.tracker().AbortAttempt(ctx, knowledgeID, attempt,
"USER_CANCELLED", "用户已取消解析", "用户已取消解析")
}
// Best-effort dequeue. Failures here don't block the cancel — the
// downstream tasks will still self-abort at their entry guards.
s.dequeueKnowledgeTasks(ctx, knowledgeID)
// Wiki ingest lives in its own per-KB pending queue (task_pending_ops)
// rather than asynq, so dequeueKnowledgeTasks above can't see it.
// Mirror the deletion path's scrub so a cancelled knowledge doesn't
// get picked up by the next 30s batch and burn a wiki LLM call on a
// doc the user already abandoned. The in-flight worker would skip it
// at isWikiKnowledgeAborted anyway, but scrubbing avoids waking the
// batch in the first place.
s.scrubWikiPendingIngest(ctx, existing.KnowledgeBaseID, knowledgeID, "cancel")
return existing, nil
}
// dequeueKnowledgeTasks asks the task inspector to remove any queued
// tasks for this knowledge and signal active workers to stop. Safe to
// call when the inspector is a no-op (Lite mode).
func (s *knowledgeService) dequeueKnowledgeTasks(ctx context.Context, knowledgeID string) {
if s.taskInspector == nil {
return
}
if _, _, err := s.taskInspector.CancelTasksForKnowledge(ctx, knowledgeID); err != nil {
logger.Warnf(ctx, "CancelKnowledgeParse: dequeue best-effort failed for %s: %v", knowledgeID, err)
}
}
func (s *knowledgeService) updateChunkVector(ctx context.Context, kbID string, chunks []*types.Chunk) error {
// Get embedding model from knowledge base
sourceKB, err := s.kbService.GetKnowledgeBaseByID(ctx, kbID)
if err != nil {
return err
}
embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, sourceKB.EmbeddingModelID)
if err != nil {
return err
}
// Initialize composite retrieve engine from tenant configuration
indexInfo := make([]*types.IndexInfo, 0, len(chunks))
ids := make([]string, 0, len(chunks))
for _, chunk := range chunks {
if chunk.KnowledgeBaseID != kbID {
logger.Warnf(ctx, "Knowledge base ID mismatch: %s != %s", chunk.KnowledgeBaseID, kbID)
continue
}
indexInfo = append(indexInfo, &types.IndexInfo{
Content: chunk.Content,
SourceID: chunk.ID,
SourceType: types.ChunkSourceType,
ChunkID: chunk.ID,
KnowledgeID: chunk.KnowledgeID,
KnowledgeBaseID: chunk.KnowledgeBaseID,
IsEnabled: true,
})
ids = append(ids, chunk.ID)
}
retrieveEngine, err := retriever.CreateRetrieveEngineForKB(
ctx, s.retrieveEngine, s.ownership, types.MustTenantIDFromContext(ctx), sourceKB.VectorStoreID)
if err != nil {
return err
}
// Delete old vector representation of the chunk
err = retrieveEngine.DeleteByChunkIDList(ctx, ids, embeddingModel.GetDimensions(), sourceKB.Type)
if err != nil {
return err
}
// Index updated chunk content with new vector representation
err = retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfo)
if err != nil {
return err
}
return nil
}
func (s *knowledgeService) UpdateImageInfo(
ctx context.Context,
knowledgeID string,
chunkID string,
imageInfo string,
) error {
var images []*types.ImageInfo
if err := json.Unmarshal([]byte(imageInfo), &images); err != nil {
logger.Errorf(ctx, "Failed to unmarshal image info: %v", err)
return err
}
if len(images) != 1 {
logger.Warnf(ctx, "Expected exactly one image info, got %d", len(images))
return nil
}
image := images[0]
// Retrieve all chunks with the given parent chunk ID
chunk, err := s.chunkService.GetChunkByID(ctx, chunkID)
if err != nil {
logger.Errorf(ctx, "Failed to get chunk: %v", err)
return err
}
chunk.ImageInfo = imageInfo
tenantID := ctx.Value(types.TenantIDContextKey).(uint64)
chunkChildren, err := s.chunkService.ListChunkByParentID(ctx, tenantID, chunkID)
if err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"parent_chunk_id": chunkID,
"tenant_id": tenantID,
})
return err
}
logger.Infof(ctx, "Found %d chunks with parent chunk ID: %s", len(chunkChildren), chunkID)
// Iterate through each chunk and update its content based on the image information
updateChunk := []*types.Chunk{chunk}
var addChunk []*types.Chunk
// Track whether we've found OCR and caption child chunks for this image
hasOCRChunk := false
hasCaptionChunk := false
for i, child := range chunkChildren {
// Skip chunks that are not image types
var cImageInfo []*types.ImageInfo
err = json.Unmarshal([]byte(child.ImageInfo), &cImageInfo)
if err != nil {
logger.Warnf(ctx, "Failed to unmarshal image %s info: %v", child.ID, err)
continue
}
if len(cImageInfo) == 0 {
continue
}
if cImageInfo[0].OriginalURL != image.OriginalURL {
logger.Warnf(ctx, "Skipping chunk ID: %s, image URL mismatch: %s != %s",
child.ID, cImageInfo[0].OriginalURL, image.OriginalURL)
continue
}
// Mark that we've found chunks for this image
switch child.ChunkType {
case types.ChunkTypeImageCaption:
hasCaptionChunk = true
// Update caption if it has changed
if image.Caption != cImageInfo[0].Caption {
child.Content = image.Caption
child.ImageInfo = imageInfo
updateChunk = append(updateChunk, chunkChildren[i])
}
case types.ChunkTypeImageOCR:
hasOCRChunk = true
// Update OCR if it has changed
if image.OCRText != cImageInfo[0].OCRText {
child.Content = image.OCRText
child.ImageInfo = imageInfo
updateChunk = append(updateChunk, chunkChildren[i])
}
}
}
// Create a new caption chunk if it doesn't exist and we have caption data
if !hasCaptionChunk && image.Caption != "" {
captionChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: tenantID,
KnowledgeID: chunk.KnowledgeID,
KnowledgeBaseID: chunk.KnowledgeBaseID,
Content: image.Caption,
ChunkType: types.ChunkTypeImageCaption,
ParentChunkID: chunk.ID,
ImageInfo: imageInfo,
}
addChunk = append(addChunk, captionChunk)
logger.Infof(ctx, "Created new caption chunk ID: %s for image URL: %s", captionChunk.ID, image.OriginalURL)
}
// Create a new OCR chunk if it doesn't exist and we have OCR data
if !hasOCRChunk && image.OCRText != "" {
ocrChunk := &types.Chunk{
ID: uuid.New().String(),
TenantID: tenantID,
KnowledgeID: chunk.KnowledgeID,
KnowledgeBaseID: chunk.KnowledgeBaseID,
Content: image.OCRText,
ChunkType: types.ChunkTypeImageOCR,
ParentChunkID: chunk.ID,
ImageInfo: imageInfo,
}
addChunk = append(addChunk, ocrChunk)
logger.Infof(ctx, "Created new OCR chunk ID: %s for image URL: %s", ocrChunk.ID, image.OriginalURL)
}
logger.Infof(ctx, "Updated %d chunks out of %d total chunks", len(updateChunk), len(chunkChildren)+1)
if len(addChunk) > 0 {
err := s.chunkService.CreateChunks(ctx, addChunk)
if err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"add_chunk_size": len(addChunk),
})
return err
}
}
// Update the chunks
for _, c := range updateChunk {
err := s.chunkService.UpdateChunk(ctx, c)
if err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"chunk_id": c.ID,
"knowledge_id": c.KnowledgeID,
})
return err
}
}
// Update the chunk vector
err = s.updateChunkVector(ctx, chunk.KnowledgeBaseID, append(updateChunk, addChunk...))
if err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"chunk_id": chunk.ID,
"knowledge_id": chunk.KnowledgeID,
})
return err
}
// Update the knowledge file hash
knowledge, err := s.repo.GetKnowledgeByID(ctx, tenantID, knowledgeID)
if err != nil {
logger.Errorf(ctx, "Failed to get knowledge: %v", err)
return err
}
fileHash := calculateStr(knowledgeID, knowledge.FileHash, imageInfo)
knowledge.FileHash = fileHash
err = s.repo.UpdateKnowledge(ctx, knowledge)
if err != nil {
logger.Warnf(ctx, "Failed to update knowledge file hash: %v", err)
}
logger.Infof(ctx, "Updated chunk successfully, chunk ID: %s, knowledge ID: %s", chunk.ID, chunk.KnowledgeID)
return nil
}
// ProcessManualUpdate handles Asynq manual knowledge update tasks.
// It performs cleanup of old indexes/chunks (when NeedCleanup is true) and re-indexes the content.
func (s *knowledgeService) ProcessManualUpdate(ctx context.Context, t *asynq.Task) error {
var payload types.ManualProcessPayload
if err := json.Unmarshal(t.Payload(), &payload); err != nil {
logger.Errorf(ctx, "failed to unmarshal manual process task payload: %v", err)
return nil
}
ctx = logger.WithRequestID(ctx, payload.RequestId)
ctx = logger.WithField(ctx, "manual_process", payload.KnowledgeID)
ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID)
tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID)
if err != nil {
logger.Errorf(ctx, "ProcessManualUpdate: failed to get tenant: %v", err)
return nil
}
ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo)
knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID)
if err != nil {
logger.Errorf(ctx, "ProcessManualUpdate: failed to get knowledge: %v", err)
return nil
}
if knowledge == nil {
logger.Warnf(ctx, "ProcessManualUpdate: knowledge not found: %s", payload.KnowledgeID)
return nil
}
// Skip if already completed or being deleted
if knowledge.ParseStatus == types.ParseStatusCompleted {
logger.Infof(ctx, "ProcessManualUpdate: already completed, skipping: %s", payload.KnowledgeID)
return nil
}
if knowledge.ParseStatus == types.ParseStatusDeleting {
logger.Infof(ctx, "ProcessManualUpdate: being deleted, skipping: %s", payload.KnowledgeID)
return nil
}
if knowledge.ParseStatus == types.ParseStatusCancelled {
logger.Infof(ctx, "ProcessManualUpdate: cancelled by user, skipping: %s", payload.KnowledgeID)
return nil
}
kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID)
if err != nil {
logger.Errorf(ctx, "ProcessManualUpdate: failed to get knowledge base: %v", err)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("failed to get knowledge base: %v", err)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
// Re-check abort status right before marking processing — see the same
// note in ProcessDocument for the cancel race this guards.
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "ProcessManualUpdate: knowledge aborted (%s), skipping: %s", status, knowledge.ID)
return nil
}
// Update status to processing
knowledge.ParseStatus = "processing"
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Errorf(ctx, "ProcessManualUpdate: failed to update status to processing: %v", err)
return nil
}
// Allocate a fresh span-tracking attempt for this manual (re)index.
// Without it attemptFromCtx stays 0, so processChunks drops all stage
// spans and KnowledgePostProcess falls back to LatestAttempt — piling
// this run's summary/wiki subspans onto the previous attempt's trace.
attempt := 0
if root, n, err := s.tracker().OpenAttempt(ctx, knowledge.ID, payload.LangfuseTraceID); err == nil && root != nil {
attempt = n
} else if err != nil {
logger.Warnf(ctx, "ProcessManualUpdate: OpenAttempt failed for %s: %v", knowledge.ID, err)
}
ctx = withAttempt(ctx, attempt)
// Cleanup old resources (indexes, chunks, graph) for update operations
if payload.NeedCleanup {
if err := s.cleanupKnowledgeResources(ctx, knowledge); err != nil {
logger.ErrorWithFields(ctx, err, map[string]interface{}{
"knowledge_id": payload.KnowledgeID,
})
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("failed to cleanup old resources: %v", err)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
}
// Run manual processing (image resolution + chunking + embedding) synchronously within the worker
s.triggerManualProcessing(ctx, kb, knowledge, payload.Content, true)
return nil
}
// ProcessDocument handles Asynq document processing tasks
func (s *knowledgeService) ProcessDocument(ctx context.Context, t *asynq.Task) error {
var payload types.DocumentProcessPayload
if err := json.Unmarshal(t.Payload(), &payload); err != nil {
logger.Errorf(ctx, "failed to unmarshal document process task payload: %v", err)
return nil
}
ctx = logger.WithRequestID(ctx, payload.RequestId)
ctx = logger.WithField(ctx, "document_process", payload.KnowledgeID)
ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID)
if payload.Language != "" {
ctx = context.WithValue(ctx, types.LanguageContextKey, payload.Language)
}
// 获取任务重试信息,用于判断是否是最后一次重试
retryCount, _ := asynq.GetRetryCount(ctx)
maxRetry, _ := asynq.GetMaxRetry(ctx)
isLastRetry := retryCount >= maxRetry
tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID)
if err != nil {
logger.Errorf(ctx, "failed to get tenant: %v", err)
return nil
}
ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo)
logger.Infof(ctx, "Processing document task: knowledge_id=%s, file_path=%s, retry=%d/%d",
payload.KnowledgeID, payload.FilePath, retryCount, maxRetry)
// 幂等性检查获取knowledge记录
knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID)
if err != nil {
logger.Errorf(ctx, "failed to get knowledge: %v", err)
return nil
}
if knowledge == nil {
return nil
}
// 检查是否正在删除 / 已被用户取消 - 如果是则直接退出
if knowledge.ParseStatus == types.ParseStatusDeleting {
logger.Infof(ctx, "Knowledge is being deleted, aborting processing: %s", payload.KnowledgeID)
return nil
}
if knowledge.ParseStatus == types.ParseStatusCancelled {
logger.Infof(ctx, "Knowledge cancelled by user, aborting processing: %s", payload.KnowledgeID)
return nil
}
// 检查任务状态 - 幂等性处理
if knowledge.ParseStatus == types.ParseStatusCompleted {
logger.Infof(ctx, "Document already completed, skipping: %s", payload.KnowledgeID)
return nil // 幂等:已完成的任务直接返回
}
if knowledge.ParseStatus == types.ParseStatusFailed {
// 检查是否可恢复(例如:超时、临时错误等)
// 对于不可恢复的错误,直接返回
logger.Warnf(
ctx,
"Document processing previously failed: %s, error: %s",
payload.KnowledgeID,
knowledge.ErrorMessage,
)
// 这里可以根据错误类型判断是否可恢复,暂时允许重试
}
// 检查是否有部分处理有chunks但状态不是completed
if knowledge.ParseStatus != "completed" && knowledge.ParseStatus != "pending" &&
knowledge.ParseStatus != "processing" {
// 状态异常,记录日志但继续处理
logger.Warnf(ctx, "Unexpected parse status: %s for knowledge: %s", knowledge.ParseStatus, payload.KnowledgeID)
}
// 获取知识库信息
kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID)
if err != nil {
logger.Errorf(ctx, "failed to get knowledge base: %v", err)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("failed to get knowledge base: %v", err)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
// Re-check abort status right before flipping to "processing" — closes
// the race where the user cancels between the entry guard above and
// this write (otherwise the worker would overwrite cancelled→processing
// and downstream checkpoints would treat the run as live).
if aborted, status := s.isKnowledgeAborted(ctx, knowledge.TenantID, knowledge.ID); aborted {
logger.Infof(ctx, "Knowledge aborted (%s) before marking processing: %s", status, knowledge.ID)
return nil
}
knowledge.ParseStatus = "processing"
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Errorf(ctx, "failed to update knowledge status to processing: %v", err)
return nil
}
// Resolve the attempt for span tracking. The enqueue site sets
// payload.Attempt to a fresh number for the initial parse and to
// max+1 for each user-initiated reparse. Asynq retries within a
// single user action keep the same payload (so retries record
// onto the same attempt). For payloads predating this code we
// fall back to OpenAttempt.
attempt := payload.Attempt
if attempt <= 0 {
if root, n, err := s.tracker().OpenAttempt(ctx, knowledge.ID, payload.LangfuseTraceID); err == nil && root != nil {
attempt = n
}
}
ctx = withAttempt(ctx, attempt)
// 检查多模态配置(仅对文件导入)
if payload.FilePath != "" && !payload.EnableMultimodel && IsImageType(payload.FileType) {
logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
WithField("error", ErrImageNotParse).Errorf("processDocument image without enable multimodel")
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = ErrImageNotParse.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
// 检查音频ASR配置仅对文件导入
if payload.FilePath != "" && IsAudioType(payload.FileType) && !kb.ASRConfig.IsASREnabled() {
logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
Errorf("processDocument audio without ASR model configured")
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "上传音频文件需要设置ASR语音识别模型"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
// 视频文件不再支持入库解析
if payload.FilePath != "" && IsVideoType(payload.FileType) {
logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID).
Errorf("processDocument video not supported")
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "暂不支持视频文件"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
// New pipeline: convert -> store images -> chunk -> vectorize -> multimodal tasks
var convertResult *types.ReadResult
var chunks []types.ParsedChunk
if payload.FileURL != "" {
// file_url import: SSRF re-check (防 DNS 重绑定), download, persist, then delegate to convert()
if err := secutils.ValidateURLForSSRF(payload.FileURL); err != nil {
logger.Errorf(ctx, "File URL rejected for SSRF protection in ProcessDocument: %s, err: %v", payload.FileURL, err)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "File URL is not allowed for security reasons"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
resolvedFileName := payload.FileName
resolvedFileType := payload.FileType
contentBytes, err := downloadFileFromURL(ctx, payload.FileURL, &resolvedFileName, &resolvedFileType)
if err != nil {
logger.Errorf(ctx, "Failed to download file from URL: %s, error: %v", payload.FileURL, err)
if isLastRetry {
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = err.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
}
return fmt.Errorf("failed to download file from URL: %w", err)
}
if resolvedFileType != "" && !allowedFileURLExtensions[strings.ToLower(resolvedFileType)] {
logger.Errorf(ctx, "Unsupported file type resolved from file URL: %s", resolvedFileType)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("unsupported file type: %s", resolvedFileType)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
if resolvedFileName != "" && knowledge.FileName == "" {
knowledge.FileName = resolvedFileName
}
if resolvedFileType != "" && knowledge.FileType == "" {
knowledge.FileType = resolvedFileType
s.repo.UpdateKnowledge(ctx, knowledge)
}
fileSvc := s.resolveFileService(ctx, kb)
filePath, err := fileSvc.SaveBytes(ctx, contentBytes, payload.TenantID, resolvedFileName, true)
if err != nil {
if isLastRetry {
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = err.Error()
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
}
return fmt.Errorf("failed to save downloaded file: %w", err)
}
payload.FilePath = filePath
payload.FileName = resolvedFileName
payload.FileType = resolvedFileType
convertResult, err = s.convert(ctx, payload, kb, knowledge, isLastRetry)
if err != nil {
return err
}
if convertResult == nil {
return nil
}
} else if payload.URL != "" {
// URL import
convertResult, err = s.convert(ctx, payload, kb, knowledge, isLastRetry)
if err != nil {
return err
}
if convertResult == nil {
return nil
}
// Update knowledge title from extracted page title if not already set
if knowledge.Title == "" || knowledge.Title == payload.URL {
if extractedTitle := convertResult.Metadata["title"]; extractedTitle != "" {
knowledge.Title = extractedTitle
knowledge.UpdatedAt = time.Now()
if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil {
logger.Warnf(ctx, "Failed to update knowledge title from extracted page title: %v", err)
} else {
logger.Infof(ctx, "Updated knowledge title to extracted page title: %s", extractedTitle)
}
}
}
} else if len(payload.Passages) > 0 {
// Text passage import - direct chunking, no conversion needed
passageChunks := make([]types.ParsedChunk, 0, len(payload.Passages))
start, end := 0, 0
for i, p := range payload.Passages {
if p == "" {
continue
}
end += len([]rune(p))
passageChunks = append(passageChunks, types.ParsedChunk{
Content: p,
Seq: i,
Start: start,
End: end,
})
start = end
}
passageOpts := ProcessChunksOptions{
EnableQuestionGeneration: payload.EnableQuestionGeneration,
QuestionCount: payload.QuestionCount,
}
s.processChunks(ctx, kb, knowledge, passageChunks, passageOpts)
return nil
} else {
// File import
convertResult, err = s.convert(ctx, payload, kb, knowledge, isLastRetry)
if err != nil {
return err
}
if convertResult == nil {
return nil
}
}
// Step 1.5: ASR transcription for audio files
if convertResult != nil && convertResult.IsAudio && len(convertResult.AudioData) > 0 {
if !kb.ASRConfig.IsASREnabled() {
logger.Error(ctx, "Audio file detected but ASR is not configured")
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "ASR model is not configured for audio transcription"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
logger.Infof(ctx, "[ASR] Starting audio transcription for knowledge %s, audio size=%d bytes",
knowledge.ID, len(convertResult.AudioData))
asrModel, err := s.modelService.GetASRModel(ctx, kb.ASRConfig.ModelID)
if err != nil {
logger.Errorf(ctx, "[ASR] Failed to get ASR model: %v", err)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("failed to get ASR model: %v", err)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
return nil
}
transcriptionResult, err := asrModel.Transcribe(ctx, convertResult.AudioData, knowledge.FileName)
if err != nil {
logger.Errorf(ctx, "[ASR] Transcription failed: %v", err)
if isLastRetry {
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = fmt.Sprintf("audio transcription failed: %v", err)
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
}
return fmt.Errorf("audio transcription failed: %w", err)
}
var transcribedText string
if transcriptionResult != nil {
transcribedText = transcriptionResult.Text
}
if transcribedText == "" {
logger.Warn(ctx, "[ASR] Transcription returned empty text")
transcribedText = "[No speech detected in audio file]"
}
logger.Infof(ctx, "[ASR] Transcription completed, text length=%d", len(transcribedText))
// Replace the audio placeholder with the transcribed text
convertResult.MarkdownContent = transcribedText
convertResult.IsAudio = false
convertResult.AudioData = nil
}
// Step 2: Store images and update markdown references
var storedImages []docparser.StoredImage
if s.imageResolver != nil && convertResult != nil {
fileSvc := s.resolveFileService(ctx, kb)
tenantID, _ := ctx.Value(types.TenantIDContextKey).(uint64)
updatedMarkdown, images, resolveErr := s.imageResolver.ResolveAndStore(ctx, convertResult, fileSvc, tenantID)
if resolveErr != nil {
logger.Warnf(ctx, "Image resolution partially failed: %v", resolveErr)
}
if updatedMarkdown != "" {
convertResult.MarkdownContent = updatedMarkdown
}
storedImages = images
// Resolve remote http(s) images (e.g. markdown external URLs) → download + upload to storage.
// ResolveAndStore handles inline bytes and base64; ResolveRemoteImages handles http/https URLs.
updatedContent, remoteImages, remoteErr := s.imageResolver.ResolveRemoteImages(ctx, convertResult.MarkdownContent, fileSvc, tenantID)
if remoteErr != nil {
logger.Warnf(ctx, "Remote image resolution partially failed: %v", remoteErr)
}
if len(remoteImages) > 0 {
logger.Infof(ctx, "Resolved %d remote images for knowledge %s", len(remoteImages), knowledge.ID)
convertResult.MarkdownContent = updatedContent
storedImages = append(storedImages, remoteImages...)
}
logger.Infof(ctx, "Resolved %d total images for knowledge %s", len(storedImages), knowledge.ID)
}
// Step 3: Split into chunks using Go chunker
chunkCfg := buildSplitterConfig(kb)
processOpts := ProcessChunksOptions{
EnableQuestionGeneration: payload.EnableQuestionGeneration,
QuestionCount: payload.QuestionCount,
EnableMultimodel: payload.EnableMultimodel,
StoredImages: storedImages,
}
if convertResult != nil {
processOpts.Metadata = convertResult.Metadata
}
if kb.ChunkingConfig.EnableParentChild {
parentCfg, childCfg := buildParentChildConfigs(kb.ChunkingConfig, chunkCfg)
pcResult := chunker.SplitParentChild(convertResult.MarkdownContent, parentCfg, childCfg)
chunks = make([]types.ParsedChunk, len(pcResult.Children))
for i, c := range pcResult.Children {
chunks[i] = types.ParsedChunk{
Content: c.Content,
ContextHeader: c.ContextHeader,
Seq: c.Seq,
Start: c.Start,
End: c.End,
ParentIndex: c.ParentIndex,
}
}
parentChunks := make([]types.ParsedParentChunk, len(pcResult.Parents))
for i, p := range pcResult.Parents {
parentChunks[i] = types.ParsedParentChunk{Content: p.Content, Seq: p.Seq, Start: p.Start, End: p.End}
}
processOpts.ParentChunks = parentChunks
logger.Infof(ctx, "Split document into %d parent + %d child chunks for knowledge %s",
len(pcResult.Parents), len(pcResult.Children), knowledge.ID)
} else {
splitChunks := chunker.Split(convertResult.MarkdownContent, chunkCfg)
chunks = make([]types.ParsedChunk, len(splitChunks))
for i, c := range splitChunks {
chunks[i] = types.ParsedChunk{
Content: c.Content,
ContextHeader: c.ContextHeader,
Seq: c.Seq,
Start: c.Start,
End: c.End,
}
}
logger.Infof(ctx, "Split document into %d chunks for knowledge %s", len(chunks), knowledge.ID)
}
// Step 4: Process chunks (vectorize + index + enqueue async tasks)
s.processChunks(ctx, kb, knowledge, chunks, processOpts)
return nil
}
// convert handles both file and URL reading using a unified ReadRequest.
func (s *knowledgeService) convert(
ctx context.Context,
payload types.DocumentProcessPayload,
kb *types.KnowledgeBase,
knowledge *types.Knowledge,
isLastRetry bool,
) (*types.ReadResult, error) {
// Stage tracking: docreader. Mark the stage as running here so the
// timeline reflects "DocReader" the moment a worker picks the task
// up — before that, the stage stays "pending" from the initial
// upload. Failure/skip transitions are emitted at the specific
// failure points below; success is emitted at the bottom.
docInput := types.JSONMap{
"file_name": payload.FileName,
"file_type": payload.FileType,
"is_url": payload.URL != "",
}
if payload.URL != "" {
docInput["url"] = payload.URL
}
s.beginStage(ctx, knowledge.ID, types.StageDocReader, docInput)
isURL := payload.URL != ""
fileType := payload.FileType
overrides := s.getParserEngineOverridesFromContext(ctx)
if isURL {
if err := secutils.ValidateURLForSSRF(payload.URL); err != nil {
logger.Errorf(ctx, "URL rejected for SSRF protection: %s, err: %v", payload.URL, err)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "URL is not allowed for security reasons"
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
s.failStage(ctx, knowledge.ID, types.StageDocReader,
werrors.ErrCodeDocReaderParseFailed, "URL rejected for security reasons", err)
return nil, nil
}
}
parserEngine := kb.ChunkingConfig.ResolveParserEngine(fileType)
if isURL {
parserEngine = kb.ChunkingConfig.ResolveParserEngine("url")
}
logger.Infof(ctx, "[convert] kb=%s fileType=%s isURL=%v engine=%q rules=%+v",
kb.ID, fileType, isURL, parserEngine, kb.ChunkingConfig.ParserEngineRules)
var reader interfaces.DocReader = s.resolveDocReader(ctx, parserEngine, fileType, isURL, overrides)
if reader == nil {
logger.Errorf(ctx, "[convert] no doc reader for kb=%s knowledge=%s fileType=%s engine=%q isURL=%v",
kb.ID, knowledge.ID, fileType, parserEngine, isURL)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = "Document parsing service is not configured. Please use text/paragraph import or set DOCREADER_ADDR."
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
s.failStage(ctx, knowledge.ID, types.StageDocReader,
werrors.ErrCodeDocReaderUnavailable, knowledge.ErrorMessage, nil)
return nil, nil
}
req := &types.ReadRequest{
URL: payload.URL,
Title: knowledge.Title,
ParserEngine: parserEngine,
RequestID: payload.RequestId,
ParserEngineOverrides: overrides,
}
if !isURL {
fileReader, err := s.resolveFileServiceForPath(ctx, kb, payload.FilePath).GetFile(ctx, payload.FilePath)
if err != nil {
s.failStage(ctx, knowledge.ID, types.StageDocReader,
werrors.ErrCodeDocReaderParseFailed, "failed to get file", err)
return s.failKnowledge(ctx, knowledge, isLastRetry, "failed to get file: %v", err)
}
defer fileReader.Close()
contentBytes, err := io.ReadAll(fileReader)
if err != nil {
s.failStage(ctx, knowledge.ID, types.StageDocReader,
werrors.ErrCodeDocReaderParseFailed, "failed to read file", err)
return s.failKnowledge(ctx, knowledge, isLastRetry, "failed to read file: %v", err)
}
req.FileContent = contentBytes
req.FileName = payload.FileName
req.FileType = fileType
}
result, err := s.callDocReaderWithTimeout(ctx, reader, req)
if err != nil {
// Distinguish DocReader timeout (a knowable user-facing
// failure) from generic read errors so the UI can suggest
// "split this large file" specifically when relevant.
code := werrors.ErrCodeDocReaderParseFailed
if errors.Is(err, context.DeadlineExceeded) || strings.Contains(err.Error(), "docreader call timeout") {
code = werrors.ErrCodeDocReaderTimeout
}
s.failStage(ctx, knowledge.ID, types.StageDocReader,
code, "document read failed", err)
return s.failKnowledge(ctx, knowledge, isLastRetry, "document read failed: %v", err)
}
if result.Error != "" {
logger.Errorf(ctx, "[convert] parser returned error kb=%s knowledge=%s file=%q type=%s engine=%q: %s",
kb.ID, knowledge.ID, req.FileName, fileType, parserEngine, result.Error)
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = result.Error
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
s.failStage(ctx, knowledge.ID, types.StageDocReader,
werrors.ErrCodeDocReaderParseFailed, result.Error, nil)
return nil, nil
}
docOutput := types.JSONMap{
"text_length": len(result.MarkdownContent),
"images_found": len(result.ImageRefs),
"is_audio": result.IsAudio,
}
if pages := result.Metadata["pages"]; pages != "" {
docOutput["pages"] = pages
}
s.endStage(ctx, knowledge.ID, types.StageDocReader, docOutput)
return result, nil
}
// callDocReaderWithTimeout wraps the DocReader RPC in a child context whose
// deadline is min(parent_deadline, DocReaderCallTimeout). Without this cap,
// a hung docreader (network partition, GC pause, OCR runaway) silently
// burns the whole DocumentProcessTimeout budget and pins a worker for hours
// — the #1 cause of "knowledge stuck in processing" reports.
//
// On timeout we annotate the error so retries / dead-letter consumers can
// distinguish "docreader was slow" from "docreader returned an error".
func (s *knowledgeService) callDocReaderWithTimeout(
ctx context.Context, reader interfaces.DocReader, req *types.ReadRequest,
) (*types.ReadResult, error) {
timeout := 30 * time.Minute
if s.config != nil && s.config.KnowledgeBase != nil && s.config.KnowledgeBase.DocReaderCallTimeout > 0 {
timeout = s.config.KnowledgeBase.DocReaderCallTimeout
}
callCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
start := time.Now()
result, err := reader.Read(callCtx, req)
elapsed := time.Since(start)
if err != nil {
// Promote DeadlineExceeded into a clearer message; retain underlying
// error via %w so errors.Is(callCtx.Err(), context.DeadlineExceeded)
// still works for upstream classification.
if errors.Is(callCtx.Err(), context.DeadlineExceeded) && !errors.Is(ctx.Err(), context.DeadlineExceeded) {
logger.Errorf(ctx, "[convert] docreader call timed out after %s (limit %s) for %q",
elapsed, timeout, req.FileName)
return nil, fmt.Errorf("docreader call timeout after %s: %w", timeout, err)
}
return nil, err
}
logger.Infof(ctx, "[convert] docreader call ok in %s for %q", elapsed, req.FileName)
return result, nil
}
// isLikelyRateLimitError performs a fuzzy classification of an error as a
// rate-limit / quota / backpressure failure. We only need a hint — the
// caller maps to one of two error_codes so the UI can offer "retry later"
// vs. "fix configuration" advice. False positives are harmless (the
// detail is preserved in error_detail anyway).
func isLikelyRateLimitError(err error) bool {
if err == nil {
return false
}
msg := strings.ToLower(err.Error())
for _, needle := range []string{"rate limit", "ratelimit", "429", "too many requests", "quota"} {
if strings.Contains(msg, needle) {
return true
}
}
return false
}
// Returns nil when the required service is unavailable.
func (s *knowledgeService) resolveDocReader(ctx context.Context, engine, fileType string, isURL bool, overrides map[string]string) interfaces.DocReader {
switch engine {
case docparser.SimpleEngineName:
return &docparser.SimpleFormatReader{}
case docparser.WeKnoraCloudEngineName:
creds := s.tenantService.GetWeKnoraCloudCredentials(ctx)
if creds == nil {
logger.Warnf(ctx, "[resolveDocReader] WeKnoraCloud: no tenant credentials (fileType=%s)", fileType)
return nil
}
reader, err := docparser.NewWeKnoraCloudSignedDocumentReader(creds.AppID, creds.AppSecret)
if err != nil {
logger.Errorf(ctx, "[resolveDocReader] WeKnoraCloud reader init failed: %v", err)
return nil
}
return reader
case "mineru":
return docparser.NewMinerUReader(overrides)
case "mineru_cloud":
return docparser.NewMinerUCloudReader(overrides)
case "builtin":
// 明确指定使用 builtin 引擎docreader不使用 simple format 兜底
return s.documentReader
default:
// 未指定引擎时的兜底逻辑simple format 使用 Go 原生处理,其他使用 docreader
if !isURL && docparser.IsSimpleFormat(fileType) {
return &docparser.SimpleFormatReader{}
}
return s.documentReader
}
}
// failKnowledge marks knowledge as failed (only on last retry) and returns an error.
func (s *knowledgeService) failKnowledge(
ctx context.Context,
knowledge *types.Knowledge,
isLastRetry bool,
format string,
args ...interface{},
) (*types.ReadResult, error) {
errMsg := fmt.Sprintf(format, args...)
if isLastRetry {
knowledge.ParseStatus = "failed"
knowledge.ErrorMessage = errMsg
knowledge.UpdatedAt = time.Now()
s.repo.UpdateKnowledge(ctx, knowledge)
}
return nil, fmt.Errorf(format, args...)
}
// enqueueImageMultimodalTasks enqueues asynq tasks for multimodal image processing.
func (s *knowledgeService) enqueueImageMultimodalTasks(
ctx context.Context,
knowledge *types.Knowledge,
kb *types.KnowledgeBase,
images []docparser.StoredImage,
chunks []types.ParsedChunk,
metadata map[string]string,
) {
if s.task == nil || len(images) == 0 {
return
}
attempt := attemptFromCtx(ctx)
redisKey := fmt.Sprintf("multimodal:pending:%s", knowledge.ID)
if s.redisClient != nil {
if err := s.redisClient.Set(ctx, redisKey, len(images), 24*time.Hour).Err(); err != nil {
logger.Warnf(ctx, "Failed to set multimodal pending count for %s: %v", knowledge.ID, err)
}
}
for idx, img := range images {
// Match image to the ParsedChunk whose content contains the image URL.
// ChunkID was populated by processChunks with the real DB UUID.
chunkID := ""
for _, c := range chunks {
if strings.Contains(c.Content, img.ServingURL) {
chunkID = c.ChunkID
break
}
}
if chunkID == "" && len(chunks) > 0 {
chunkID = chunks[0].ChunkID
}
lang, _ := types.LanguageFromContext(ctx)
payload := types.ImageMultimodalPayload{
TenantID: knowledge.TenantID,
KnowledgeID: knowledge.ID,
KnowledgeBaseID: kb.ID,
ChunkID: chunkID,
ImageURL: img.ServingURL,
EnableOCR: true,
EnableCaption: true,
Language: lang,
ImageSourceType: metadata["image_source_type"],
Attempt: attempt,
ImageIndex: idx,
}
langfuse.InjectTracing(ctx, &payload)
payloadBytes, err := json.Marshal(payload)
if err != nil {
logger.Warnf(ctx, "Failed to marshal image multimodal payload: %v", err)
continue
}
task := asynq.NewTask(types.TypeImageMultimodal, payloadBytes)
if _, err := s.task.Enqueue(task); err != nil {
logger.Warnf(ctx, "Failed to enqueue image multimodal task for %s: %v", img.ServingURL, err)
} else {
logger.Infof(ctx, "Enqueued image:multimodal task for %s", img.ServingURL)
}
}
}