mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
refactor: Enhance wiki ingest service with knowledge service integration and content sorting
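- Added a `knowledgeSvc` dependency (of type `interfaces.KnowledgeService`) to `wikiIngestService` and its constructor, so the document title can be retrieved by knowledge ID.
- Updated the document processing logic to prefer the title from the knowledge service, falling back to the first line of chunk content when the lookup fails or returns an empty title, enhancing accuracy.
- Refactored content reconstruction to sort chunks by `StartAt`, then `ChunkIndex`, using `sort.Slice` instead of a hand-written nested loop, and to skip text that overlaps a previously written chunk, improving the organization of the final output.
- Added functionality to append image information (URL, caption, OCR text), deduplicated by URL, in an `<images>` section at the end of the content, ensuring a cleaner text flow.

These changes improve the efficiency and clarity of the wiki ingest process, enhancing overall content management.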
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"text/template"
|
||||
"time"
|
||||
@@ -72,6 +73,7 @@ type WikiRetractPayload struct {
|
||||
type wikiIngestService struct {
|
||||
wikiService interfaces.WikiPageService
|
||||
kbService interfaces.KnowledgeBaseService
|
||||
knowledgeSvc interfaces.KnowledgeService
|
||||
chunkRepo interfaces.ChunkRepository
|
||||
modelService interfaces.ModelService
|
||||
task interfaces.TaskEnqueuer
|
||||
@@ -82,6 +84,7 @@ type wikiIngestService struct {
|
||||
func NewWikiIngestService(
|
||||
wikiService interfaces.WikiPageService,
|
||||
kbService interfaces.KnowledgeBaseService,
|
||||
knowledgeSvc interfaces.KnowledgeService,
|
||||
chunkRepo interfaces.ChunkRepository,
|
||||
modelService interfaces.ModelService,
|
||||
task interfaces.TaskEnqueuer,
|
||||
@@ -90,6 +93,7 @@ func NewWikiIngestService(
|
||||
svc := &wikiIngestService{
|
||||
wikiService: wikiService,
|
||||
kbService: kbService,
|
||||
knowledgeSvc: knowledgeSvc,
|
||||
chunkRepo: chunkRepo,
|
||||
modelService: modelService,
|
||||
task: task,
|
||||
@@ -535,12 +539,16 @@ func (s *wikiIngestService) processOneDocument(
|
||||
|
||||
// Get document title
|
||||
docTitle := knowledgeID
|
||||
for _, ch := range chunks {
|
||||
if ch.Content != "" {
|
||||
lines := strings.SplitN(ch.Content, "\n", 2)
|
||||
if len(lines) > 0 && len(lines[0]) > 0 && len(lines[0]) < 200 {
|
||||
docTitle = strings.TrimPrefix(strings.TrimSpace(lines[0]), "# ")
|
||||
break
|
||||
if kn, err := s.knowledgeSvc.GetKnowledgeByIDOnly(ctx, knowledgeID); err == nil && kn != nil && kn.Title != "" {
|
||||
docTitle = kn.Title
|
||||
} else {
|
||||
for _, ch := range chunks {
|
||||
if ch.Content != "" {
|
||||
lines := strings.SplitN(ch.Content, "\n", 2)
|
||||
if len(lines) > 0 && len(lines[0]) > 0 && len(lines[0]) < 200 {
|
||||
docTitle = strings.TrimPrefix(strings.TrimSpace(lines[0]), "# ")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1389,20 +1397,89 @@ func reconstructContent(chunks []*types.Chunk) string {
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by chunk index
|
||||
for i := 0; i < len(textChunks); i++ {
|
||||
for j := i + 1; j < len(textChunks); j++ {
|
||||
if textChunks[i].ChunkIndex > textChunks[j].ChunkIndex {
|
||||
textChunks[i], textChunks[j] = textChunks[j], textChunks[i]
|
||||
// Sort by StartAt, then ChunkIndex
|
||||
sort.Slice(textChunks, func(i, j int) bool {
|
||||
if textChunks[i].StartAt == textChunks[j].StartAt {
|
||||
return textChunks[i].ChunkIndex < textChunks[j].ChunkIndex
|
||||
}
|
||||
return textChunks[i].StartAt < textChunks[j].StartAt
|
||||
})
|
||||
|
||||
var sb strings.Builder
|
||||
lastEndAt := -1
|
||||
for _, c := range textChunks {
|
||||
toAppend := c.Content
|
||||
|
||||
if c.StartAt > lastEndAt || c.EndAt == 0 {
|
||||
// Non-overlapping or missing position info
|
||||
if sb.Len() > 0 {
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
sb.WriteString(toAppend)
|
||||
if c.EndAt > 0 {
|
||||
lastEndAt = c.EndAt
|
||||
}
|
||||
} else if c.EndAt > lastEndAt {
|
||||
// Partial overlap
|
||||
contentRunes := []rune(toAppend)
|
||||
offset := len(contentRunes) - (c.EndAt - lastEndAt)
|
||||
if offset >= 0 && offset < len(contentRunes) {
|
||||
sb.WriteString(string(contentRunes[offset:]))
|
||||
} else {
|
||||
// Fallback if offset calculation is invalid
|
||||
if sb.Len() > 0 {
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
sb.WriteString(toAppend)
|
||||
}
|
||||
lastEndAt = c.EndAt
|
||||
}
|
||||
// If c.EndAt <= lastEndAt, it's fully contained, so skip appending text
|
||||
}
|
||||
|
||||
// Append image information at the end to avoid interrupting text flow
|
||||
var hasImages bool
|
||||
seenURLs := make(map[string]bool)
|
||||
for _, c := range textChunks {
|
||||
if c.ImageInfo != "" {
|
||||
var imageInfos []types.ImageInfo
|
||||
if err := json.Unmarshal([]byte(c.ImageInfo), &imageInfos); err == nil && len(imageInfos) > 0 {
|
||||
for _, img := range imageInfos {
|
||||
// Deduplicate images by URL to avoid printing the same image multiple times from overlapping chunks
|
||||
if img.URL != "" {
|
||||
if seenURLs[img.URL] {
|
||||
continue
|
||||
}
|
||||
seenURLs[img.URL] = true
|
||||
}
|
||||
|
||||
if !hasImages {
|
||||
sb.WriteString("\n\n<images>\n")
|
||||
hasImages = true
|
||||
} else {
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
|
||||
sb.WriteString("<image>\n")
|
||||
if img.URL != "" {
|
||||
sb.WriteString(fmt.Sprintf(" <url>%s</url>\n", img.URL))
|
||||
}
|
||||
if img.Caption != "" {
|
||||
sb.WriteString(fmt.Sprintf(" <caption>%s</caption>\n", img.Caption))
|
||||
}
|
||||
if img.OCRText != "" {
|
||||
sb.WriteString(fmt.Sprintf(" <ocr_text>%s</ocr_text>\n", img.OCRText))
|
||||
}
|
||||
sb.WriteString("</image>\n")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
for _, c := range textChunks {
|
||||
sb.WriteString(c.Content)
|
||||
sb.WriteString("\n")
|
||||
if hasImages {
|
||||
sb.WriteString("</images>\n")
|
||||
}
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user