From 98e246483138f189954209d3d2c65dd66117c675 Mon Sep 17 00:00:00 2001 From: wizardchen Date: Wed, 8 Apr 2026 20:23:43 +0800 Subject: [PATCH] refactor: Enhance wiki ingest service with knowledge service integration and content sorting - Introduced a new `knowledgeSvc` dependency (an `interfaces.KnowledgeService`) on the `wikiIngestService`, allowing for improved document title retrieval based on knowledge ID. - Updated the document processing logic to prioritize titles from the knowledge service, falling back to the first line of chunk content when the lookup fails or returns an empty title. - Refactored content reconstruction to sort chunks by `StartAt` and then `ChunkIndex` and to merge overlapping chunk text, improving the organization of the final output. - Added functionality to append image information, deduplicated by URL, at the end of the content, ensuring a cleaner text flow. These changes improve the efficiency and clarity of the wiki ingest process, enhancing overall content management. --- internal/application/service/wiki_ingest.go | 107 +++++++++++++++++--- 1 file changed, 92 insertions(+), 15 deletions(-) diff --git a/internal/application/service/wiki_ingest.go b/internal/application/service/wiki_ingest.go index 8e8f6cab..9b14d00b 100644 --- a/internal/application/service/wiki_ingest.go +++ b/internal/application/service/wiki_ingest.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "sort" "strings" "text/template" "time" @@ -72,6 +73,7 @@ type WikiRetractPayload struct { type wikiIngestService struct { wikiService interfaces.WikiPageService kbService interfaces.KnowledgeBaseService + knowledgeSvc interfaces.KnowledgeService chunkRepo interfaces.ChunkRepository modelService interfaces.ModelService task interfaces.TaskEnqueuer @@ -82,6 +84,7 @@ type wikiIngestService struct { func NewWikiIngestService( wikiService interfaces.WikiPageService, kbService interfaces.KnowledgeBaseService, + knowledgeSvc interfaces.KnowledgeService, chunkRepo interfaces.ChunkRepository, modelService interfaces.ModelService, task interfaces.TaskEnqueuer, @@ -90,6 +93,7 @@ func NewWikiIngestService( svc := &wikiIngestService{ 
wikiService: wikiService, kbService: kbService, + knowledgeSvc: knowledgeSvc, chunkRepo: chunkRepo, modelService: modelService, task: task, @@ -535,12 +539,16 @@ func (s *wikiIngestService) processOneDocument( // Get document title docTitle := knowledgeID - for _, ch := range chunks { - if ch.Content != "" { - lines := strings.SplitN(ch.Content, "\n", 2) - if len(lines) > 0 && len(lines[0]) > 0 && len(lines[0]) < 200 { - docTitle = strings.TrimPrefix(strings.TrimSpace(lines[0]), "# ") - break + if kn, err := s.knowledgeSvc.GetKnowledgeByIDOnly(ctx, knowledgeID); err == nil && kn != nil && kn.Title != "" { + docTitle = kn.Title + } else { + for _, ch := range chunks { + if ch.Content != "" { + lines := strings.SplitN(ch.Content, "\n", 2) + if len(lines) > 0 && len(lines[0]) > 0 && len(lines[0]) < 200 { + docTitle = strings.TrimPrefix(strings.TrimSpace(lines[0]), "# ") + break + } } } } @@ -1389,20 +1397,89 @@ func reconstructContent(chunks []*types.Chunk) string { } } - // Sort by chunk index - for i := 0; i < len(textChunks); i++ { - for j := i + 1; j < len(textChunks); j++ { - if textChunks[i].ChunkIndex > textChunks[j].ChunkIndex { - textChunks[i], textChunks[j] = textChunks[j], textChunks[i] + // Sort by StartAt, then ChunkIndex + sort.Slice(textChunks, func(i, j int) bool { + if textChunks[i].StartAt == textChunks[j].StartAt { + return textChunks[i].ChunkIndex < textChunks[j].ChunkIndex + } + return textChunks[i].StartAt < textChunks[j].StartAt + }) + + var sb strings.Builder + lastEndAt := -1 + for _, c := range textChunks { + toAppend := c.Content + + if c.StartAt > lastEndAt || c.EndAt == 0 { + // Non-overlapping or missing position info + if sb.Len() > 0 { + sb.WriteString("\n") + } + sb.WriteString(toAppend) + if c.EndAt > 0 { + lastEndAt = c.EndAt + } + } else if c.EndAt > lastEndAt { + // Partial overlap + contentRunes := []rune(toAppend) + offset := len(contentRunes) - (c.EndAt - lastEndAt) + if offset >= 0 && offset < len(contentRunes) { + 
sb.WriteString(string(contentRunes[offset:])) + } else { + // Fallback if offset calculation is invalid + if sb.Len() > 0 { + sb.WriteString("\n") + } + sb.WriteString(toAppend) + } + lastEndAt = c.EndAt + } + // If c.EndAt <= lastEndAt, it's fully contained, so skip appending text + } + + // Append image information at the end to avoid interrupting text flow + var hasImages bool + seenURLs := make(map[string]bool) + for _, c := range textChunks { + if c.ImageInfo != "" { + var imageInfos []types.ImageInfo + if err := json.Unmarshal([]byte(c.ImageInfo), &imageInfos); err == nil && len(imageInfos) > 0 { + for _, img := range imageInfos { + // Deduplicate images by URL to avoid printing the same image multiple times from overlapping chunks + if img.URL != "" { + if seenURLs[img.URL] { + continue + } + seenURLs[img.URL] = true + } + + if !hasImages { + sb.WriteString("\n\n\n") + hasImages = true + } else { + sb.WriteString("\n") + } + + sb.WriteString("\n") + if img.URL != "" { + sb.WriteString(fmt.Sprintf(" %s\n", img.URL)) + } + if img.Caption != "" { + sb.WriteString(fmt.Sprintf(" %s\n", img.Caption)) + } + if img.OCRText != "" { + sb.WriteString(fmt.Sprintf(" %s\n", img.OCRText)) + } + sb.WriteString("\n") + } } } } - var sb strings.Builder - for _, c := range textChunks { - sb.WriteString(c.Content) - sb.WriteString("\n") + if hasImages { + sb.WriteString("\n") } + return sb.String() }