From 16562eebbbc501de02963d78799cfce3639326c2 Mon Sep 17 00:00:00 2001 From: wizardchen Date: Wed, 3 Dec 2025 01:42:31 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0AI=E9=97=AE=E9=A2=98?= =?UTF-8?q?=E7=94=9F=E6=88=90=E5=8A=9F=E8=83=BD=EF=BC=8C=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E7=94=9F=E6=88=90=E6=95=B0=E9=87=8F=E5=B9=B6?= =?UTF-8?q?=E5=9C=A8=E5=89=8D=E7=AB=AF=E5=B1=95=E7=A4=BA=E7=94=9F=E6=88=90?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/config.yaml | 30 +- frontend/src/api/initialization/index.ts | 4 + frontend/src/components/doc-content.vue | 112 +++- frontend/src/i18n/locales/en-US.ts | 10 +- frontend/src/i18n/locales/ru-RU.ts | 10 +- frontend/src/i18n/locales/zh-CN.ts | 10 +- .../knowledge/KnowledgeBaseEditorModal.vue | 28 + .../knowledge/settings/KBAdvancedSettings.vue | 65 ++ internal/application/repository/chunk.go | 5 +- .../retriever/postgres/repository.go | 2 +- .../service/chat_pipline/preprocess.go | 211 +----- internal/application/service/knowledge.go | 627 ++++++++++++++++-- internal/config/config.go | 2 + internal/handler/initialization.go | 28 + internal/router/task.go | 6 + internal/types/extract_graph.go | 53 +- internal/types/faq.go | 37 ++ internal/types/interfaces/knowledge.go | 4 + internal/types/knowledgebase.go | 28 + ...17_add_question_generation_config.down.sql | 4 + ...0017_add_question_generation_config.up.sql | 7 + 21 files changed, 987 insertions(+), 296 deletions(-) create mode 100644 migrations/versioned/000017_add_question_generation_config.down.sql create mode 100644 migrations/versioned/000017_add_question_generation_config.up.sql diff --git a/config/config.yaml b/config/config.yaml index 919b810b..c7669909 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -518,6 +518,34 @@ conversation: } ] + generate_questions_prompt: | + 你是一个专业的问题生成助手。你的任务是根据给定的【主要内容】生成用户可能会问的相关问题。 + + {{.Context}} + ## 主要内容(请基于此内容生成问题) + 文档名称:{{.DocName}} + 文档内容: + {{.Content}} + + ## 核心要求 + - 生成的问题必须与【主要内容】直接相关 + - 问题中禁止使用任何代词或指代词(如"它"、"这个"、"该文档"、"本文"、"文中"、"其"等),必须用具体名称替代 + - 问题必须是完整独立的,脱离上下文也能被理解 + - 问题应该是用户在实际场景中可能会提出的自然问题 + - 问题应该多样化,覆盖内容的不同方面 + - 每个问题应该简洁明了,长度控制在30字以内 + - 生成的问题数量为 {{.QuestionCount}} 个 + + ## 问题类型建议 + - 定义类:什么是...?...是什么? + - 原因类:为什么...?...的原因是什么? + - 方法类:如何...?怎样...? + - 比较类:...和...有什么区别? + - 应用类:...可以用于什么场景? + + ## 输出格式 + 直接输出问题列表,每行一个问题,不要有序号或其他前缀。 + # 知识库配置 knowledge_base: chunk_size: 512 @@ -617,4 +645,4 @@ web_search: # 租户配置 tenant: # 是否启用跨租户访问功能(内网环境可开启) - enable_cross_tenant_access: true + enable_cross_tenant_access: false diff --git a/frontend/src/api/initialization/index.ts b/frontend/src/api/initialization/index.ts index 7ec62baf..8ec7ed06 100644 --- a/frontend/src/api/initialization/index.ts +++ b/frontend/src/api/initialization/index.ts @@ -107,6 +107,10 @@ export interface KBModelConfigRequest { nodes: Node[] relations: Relation[] } + questionGeneration?: { + enabled: boolean + questionCount: number + } } export function updateKBConfig(kbId: string, config: KBModelConfigRequest): Promise { diff --git a/frontend/src/components/doc-content.vue b/frontend/src/components/doc-content.vue index 2e2cf282..2671ae51 100644 --- a/frontend/src/components/doc-content.vue +++ b/frontend/src/components/doc-content.vue @@ -185,6 +185,32 @@ const getChunkMeta = (item: any) => { return parts.join(' · '); }; +// 解析生成的问题 +const getGeneratedQuestions = (item: any): string[] => { + if (!item || !item.metadata) return []; + try { + const metadata = typeof item.metadata === 'string' ? JSON.parse(item.metadata) : item.metadata; + return metadata.generated_questions || []; + } catch { + return []; + } +}; + +// 展开状态管理 +const expandedChunks = ref>(new Set()); + +const toggleQuestions = (index: number) => { + if (expandedChunks.value.has(index)) { + expandedChunks.value.delete(index); + } else { + expandedChunks.value.add(index); + } + // 触发响应式更新 + expandedChunks.value = new Set(expandedChunks.value); +}; + +const isExpanded = (index: number) => expandedChunks.value.has(index); + const downloadFile = () => { downKnowledgeDetails(props.details.id) .then((result) => { @@ -272,17 +298,44 @@ const handleDetailsScroll = () => {
{{ $t('common.noData') }}
-
{{ $t('knowledgeBase.segment') || '片段' }} {{ index + 1 }} - {{ getChunkMeta(item) }} +
+ + {{ $t('knowledgeBase.questions') || '问题' }} {{ getGeneratedQuestions(item).length }} + + {{ getChunkMeta(item) }} +
+ + +
+
+ + {{ $t('knowledgeBase.generatedQuestions') || '生成的问题' }} ({{ getGeneratedQuestions(item).length }}) +
+
+
+ + {{ question }} +
+
+
@@ -493,12 +546,65 @@ const handleDetailsScroll = () => { letter-spacing: 0.5px; } + .chunk-header-right { + display: flex; + align-items: center; + gap: 8px; + } + .chunk-meta { color: #00000066; font-size: 11px; } } +// 生成的问题样式 +.questions-section { + margin-top: 12px; + padding-top: 10px; + border-top: 1px dashed #e0e0e0; +} + +.questions-toggle { + display: flex; + align-items: center; + gap: 6px; + cursor: pointer; + color: #059669; + font-size: 12px; + font-weight: 500; + padding: 4px 0; + transition: color 0.2s ease; + + &:hover { + color: #07c05f; + } +} + +.questions-list { + margin-top: 8px; + padding-left: 4px; +} + +.question-item { + display: flex; + align-items: flex-start; + gap: 8px; + padding: 6px 8px; + margin-bottom: 4px; + background: #f0fdf4; + border-radius: 4px; + font-size: 13px; + color: #1d2129; + line-height: 1.5; + + .question-icon { + color: #059669; + flex-shrink: 0; + margin-top: 2px; + } +} + .md-content { word-break: break-word; line-height: 1.6; diff --git a/frontend/src/i18n/locales/en-US.ts b/frontend/src/i18n/locales/en-US.ts index 3e620ac0..0639b7cd 100644 --- a/frontend/src/i18n/locales/en-US.ts +++ b/frontend/src/i18n/locales/en-US.ts @@ -92,6 +92,8 @@ export default { createTime: 'Create Time', characters: 'chars', segment: 'Segment', + questions: 'Questions', + generatedQuestions: 'Generated Questions', notInitialized: 'Knowledge base is not initialized. Please configure models in settings before uploading files', getInfoFailed: 'Failed to get knowledge base information, file upload is not possible', missingId: 'Knowledge base ID is missing', @@ -1001,7 +1003,13 @@ export default { }, advanced: { title: 'Advanced Settings', - description: 'Configure multimodal features', + description: 'Configure question generation, multimodal features', + questionGeneration: { + label: 'AI Question Generation', + description: 'Generate related questions for each chunk using LLM during document parsing to improve retrieval recall. Enabling this will increase document parsing time.', + countLabel: 'Question Count', + countDescription: 'Number of questions to generate per document chunk (1-10)', + }, multimodal: { label: 'Multimodal Feature', description: 'Enable understanding of multimodal content such as images and videos', diff --git a/frontend/src/i18n/locales/ru-RU.ts b/frontend/src/i18n/locales/ru-RU.ts index fa81f3da..e1d49007 100644 --- a/frontend/src/i18n/locales/ru-RU.ts +++ b/frontend/src/i18n/locales/ru-RU.ts @@ -92,6 +92,8 @@ export default { createTime: 'Время создания', characters: 'символов', segment: 'Фрагмент', + questions: 'Вопросы', + generatedQuestions: 'Сгенерированные вопросы', notInitialized: 'База знаний не инициализирована. Пожалуйста, настройте модели в разделе настроек перед загрузкой файлов', getInfoFailed: 'Не удалось получить информацию о базе знаний, загрузка файла невозможна', missingId: 'Отсутствует ID базы знаний', @@ -1095,7 +1097,13 @@ export default { }, advanced: { title: 'Расширенные настройки', - description: 'Настройте мультимодальные возможности', + description: 'Настройте генерацию вопросов и мультимодальные возможности', + questionGeneration: { + label: 'AI генерация вопросов', + description: 'Генерация связанных вопросов для каждого фрагмента с помощью LLM при парсинге документа для улучшения полноты поиска. Включение увеличит время парсинга документа.', + countLabel: 'Количество вопросов', + countDescription: 'Количество вопросов для генерации на фрагмент документа (1-10)', + }, multimodal: { label: 'Мультимодальная функция', description: 'Включите понимание мультимедийного контента, такого как изображения и видео', diff --git a/frontend/src/i18n/locales/zh-CN.ts b/frontend/src/i18n/locales/zh-CN.ts index b086df51..51e5baf2 100644 --- a/frontend/src/i18n/locales/zh-CN.ts +++ b/frontend/src/i18n/locales/zh-CN.ts @@ -91,6 +91,8 @@ export default { createTime: "创建时间", characters: "字符", segment: "片段", + questions: "问题", + generatedQuestions: "生成的问题", docActionUnsupported: "当前知识库类型不支持该操作", notInitialized: "该知识库尚未完成初始化配置,请先前往设置页面配置模型信息后再上传文件", @@ -1334,7 +1336,13 @@ export default { }, advanced: { title: "高级设置", - description: "配置多模态等高级功能", + description: "配置问题生成、多模态等高级功能", + questionGeneration: { + label: "AI 问题生成", + description: "解析文档时调用大模型为每个分块生成相关问题,提高检索召回率。启用后会增加文档解析耗时。", + countLabel: "生成问题数量", + countDescription: "每个文档分块生成的问题数量(1-10)", + }, multimodal: { label: "多模态功能", description: "启用图片、视频等多模态内容的理解能力", diff --git a/frontend/src/views/knowledge/KnowledgeBaseEditorModal.vue b/frontend/src/views/knowledge/KnowledgeBaseEditorModal.vue index 62005931..bdbd2ba0 100644 --- a/frontend/src/views/knowledge/KnowledgeBaseEditorModal.vue +++ b/frontend/src/views/knowledge/KnowledgeBaseEditorModal.vue @@ -144,8 +144,10 @@ ref="advancedSettingsRef" v-if="formData" :multimodal="formData.multimodalConfig" + :question-generation="formData.questionGenerationConfig" :all-models="allModels" @update:multimodal="handleMultimodalUpdate" + @update:question-generation="handleQuestionGenerationUpdate" /> @@ -296,6 +298,10 @@ const initFormData = (type: 'document' | 'faq' = 'document') => { type: string }> }, + questionGenerationConfig: { + enabled: false, + questionCount: 3 + }, } } @@ -377,6 +383,10 @@ const loadKBData = async () => { })), relations: kb.extract_config?.relations || [] }, + questionGenerationConfig: { + enabled: kb.question_generation_config?.enabled || false, + questionCount: kb.question_generation_config?.question_count || 3 + }, } } catch (error) { console.error('Failed to load knowledge base data:', error) @@ -406,6 +416,12 @@ const handleMultimodalUpdate = (config: any) => { } } +const handleQuestionGenerationUpdate = (config: any) => { + if (formData.value) { + formData.value.questionGenerationConfig = { ...config } + } +} + const handleNodeExtractUpdate = (config: any) => { if (formData.value) { formData.value.nodeExtractConfig = { ...config } @@ -514,6 +530,14 @@ const buildSubmitData = () => { } } + // 添加问题生成配置 + if (formData.value.questionGenerationConfig?.enabled) { + data.question_generation_config = { + enabled: true, + question_count: formData.value.questionGenerationConfig.questionCount || 3 + } + } + if (formData.value.type === 'faq') { data.faq_config = { index_mode: formData.value.faqConfig?.indexMode || 'question_only', @@ -598,6 +622,10 @@ const handleSubmit = async () => { tags: data.extract_config?.tags || [], nodes: data.extract_config?.nodes || [], relations: data.extract_config?.relations || [] + }, + questionGeneration: { + enabled: data.question_generation_config?.enabled || false, + questionCount: data.question_generation_config?.question_count || 3 } } diff --git a/frontend/src/views/knowledge/settings/KBAdvancedSettings.vue b/frontend/src/views/knowledge/settings/KBAdvancedSettings.vue index a59f5ff9..f4a4011c 100644 --- a/frontend/src/views/knowledge/settings/KBAdvancedSettings.vue +++ b/frontend/src/views/knowledge/settings/KBAdvancedSettings.vue @@ -6,6 +6,42 @@
+ +
+
+ +

{{ $t('knowledgeEditor.advanced.questionGeneration.description') }}

+
+
+ +
+
+ + +
+
+
+ +

{{ $t('knowledgeEditor.advanced.questionGeneration.countDescription') }}

+
+
+ +
+
+
+
@@ -241,8 +277,14 @@ interface MultimodalConfig { } } +interface QuestionGenerationConfig { + enabled: boolean + questionCount: number +} + interface Props { multimodal: MultimodalConfig + questionGeneration?: QuestionGenerationConfig allModels?: any[] } @@ -250,9 +292,13 @@ const props = defineProps() const emit = defineEmits<{ 'update:multimodal': [value: MultimodalConfig] + 'update:questionGeneration': [value: QuestionGenerationConfig] }>() const localMultimodal = ref({ ...props.multimodal }) +const localQuestionGeneration = ref( + props.questionGeneration || { enabled: false, questionCount: 3 } +) const vllmSelectorRef = ref() const isMinioEnabled = ref(false) @@ -287,6 +333,25 @@ watch(() => props.multimodal, (newVal) => { localMultimodal.value = { ...newVal } }, { deep: true }) +watch(() => props.questionGeneration, (newVal) => { + if (newVal) { + localQuestionGeneration.value = { ...newVal } + } +}, { deep: true }) + +// Handle question generation toggle +const handleQuestionGenerationToggle = () => { + if (!localQuestionGeneration.value.enabled) { + localQuestionGeneration.value.questionCount = 3 + } + emit('update:questionGeneration', localQuestionGeneration.value) +} + +// Handle question generation config change +const handleQuestionGenerationChange = () => { + emit('update:questionGeneration', localQuestionGeneration.value) +} + // Handle multimodal toggle const handleMultimodalToggle = () => { // Reset related configuration when multimodal is disabled diff --git a/internal/application/repository/chunk.go b/internal/application/repository/chunk.go index 7159607b..21bfa98c 100644 --- a/internal/application/repository/chunk.go +++ b/internal/application/repository/chunk.go @@ -60,7 +60,6 @@ func (r *chunkRepository) ListChunksByKnowledgeID( ) ([]*types.Chunk, error) { var chunks []*types.Chunk if err := r.db.WithContext(ctx). - Select("id, content, knowledge_id, knowledge_base_id, start_at, end_at, chunk_index, is_enabled, chunk_type, parent_chunk_id, image_info"). Where("tenant_id = ? AND knowledge_id = ? and chunk_type = ?", tenantID, knowledgeID, "text"). Order("chunk_index ASC"). Find(&chunks).Error; err != nil { @@ -85,7 +84,7 @@ func (r *chunkRepository) ListPagedChunksByKnowledgeID( baseFilter := func(db *gorm.DB) *gorm.DB { db = db.Where("tenant_id = ? AND knowledge_id = ? AND chunk_type IN (?) AND status in (?)", - tenantID, knowledgeID, chunkType, []types.ChunkStatus{types.ChunkStatusIndexed, types.ChunkStatusDefault}) + tenantID, knowledgeID, chunkType, []int{int(types.ChunkStatusIndexed), int(types.ChunkStatusDefault)}) if tagID != "" { db = db.Where("tag_id = ?", tagID) } @@ -106,7 +105,7 @@ func (r *chunkRepository) ListPagedChunksByKnowledgeID( // Then query the paginated data dataQuery := baseFilter( r.db.WithContext(ctx). - Select("id, content, knowledge_id, knowledge_base_id, start_at, end_at, chunk_index, is_enabled, chunk_type, parent_chunk_id, image_info, metadata, tag_id"), + Select("id, content, knowledge_id, knowledge_base_id, start_at, end_at, chunk_index, is_enabled, chunk_type, parent_chunk_id, image_info, metadata, tag_id, status"), ) if err := dataQuery. diff --git a/internal/application/repository/retriever/postgres/repository.go b/internal/application/repository/retriever/postgres/repository.go index a551112c..adfe4208 100644 --- a/internal/application/repository/retriever/postgres/repository.go +++ b/internal/application/repository/retriever/postgres/repository.go @@ -291,7 +291,7 @@ func (g *pgRepository) VectorRetrieve(ctx context.Context, ORDER BY embedding::halfvec(%d) <=> $1::halfvec LIMIT $%d ) AS candidates - WHERE distance < $%d + WHERE distance <= $%d ORDER BY distance ASC LIMIT $%d `, dimension, whereClause, dimension, subqueryLimitParam, thresholdParam, finalLimitParam) diff --git a/internal/application/service/chat_pipline/preprocess.go b/internal/application/service/chat_pipline/preprocess.go index b6855bde..5d7d11a8 100644 --- a/internal/application/service/chat_pipline/preprocess.go +++ b/internal/application/service/chat_pipline/preprocess.go @@ -5,34 +5,24 @@ import ( "encoding/json" "regexp" "strings" - "unicode" - "unicode/utf8" "github.com/Tencent/WeKnora/internal/config" "github.com/Tencent/WeKnora/internal/models/chat" "github.com/Tencent/WeKnora/internal/types" "github.com/Tencent/WeKnora/internal/types/interfaces" - "github.com/yanyiwu/gojieba" ) // PluginPreprocess Query preprocessing plugin type PluginPreprocess struct { config *config.Config - jieba *gojieba.Jieba - stopwords map[string]struct{} modelService interfaces.ModelService } // Regular expressions for text cleaning var ( - multiSpaceRegex = regexp.MustCompile(`\s+`) // Multiple spaces - urlRegex = regexp.MustCompile(`https?://\S+`) // URLs - emailRegex = regexp.MustCompile(`\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b`) // Email addresses - punctRegex = regexp.MustCompile(`[^\p{L}\p{N}\s]`) // Punctuation marks + multiSpaceRegex = regexp.MustCompile(`\s+`) // Multiple spaces ) -const maxProcessedTokens = 12 - // NewPluginPreprocess Creates a new query preprocessing plugin func NewPluginPreprocess( eventManager *EventManager, @@ -40,51 +30,15 @@ func NewPluginPreprocess( cleaner interfaces.ResourceCleaner, modelService interfaces.ModelService, ) *PluginPreprocess { - // Use default dictionary for Jieba tokenizer - jieba := gojieba.NewJieba() - - // Load stopwords from built-in stopword library - stopwords := loadStopwords() - res := &PluginPreprocess{ config: config, - jieba: jieba, - stopwords: stopwords, modelService: modelService, } - // Register resource cleanup function - if cleaner != nil { - cleaner.RegisterWithName("JiebaPreprocessor", func() error { - res.Close() - return nil - }) - } - eventManager.Register(res) return res } -// Load stopwords -func loadStopwords() map[string]struct{} { - // Directly use some common stopwords built into Jieba - commonStopwords := []string{ - "的", "了", "和", "是", "在", "我", "你", "他", "她", "它", - "这", "那", "什么", "怎么", "如何", "为什么", "哪里", "什么时候", - "the", "is", "are", "am", "I", "you", "he", "she", "it", "this", - "that", "what", "how", "a", "an", "and", "or", "but", "if", "of", - "to", "in", "on", "at", "by", "for", "with", "about", "from", - "有", "无", "好", "来", "去", "说", "看", "想", "会", "可以", - "吗", "呢", "啊", "吧", "的话", "就是", "只是", "因为", "所以", - } - - result := make(map[string]struct{}, len(commonStopwords)) - for _, word := range commonStopwords { - result[word] = struct{}{} - } - return result -} - // ActivationEvents Register activation events func (p *PluginPreprocess) ActivationEvents() []types.EventType { return []types.EventType{types.PREPROCESS_QUERY} @@ -107,171 +61,22 @@ func (p *PluginPreprocess) OnEvent( "rewrite_query": rawQuery, }) - normalized := normalizeWhitespace(rawQuery) - sanitized := strings.TrimSpace(p.cleanText(normalized)) - if sanitized == "" { - sanitized = normalized - } - - var ( - processed = sanitized - strategy = "original" - tokenPreview string - tokenCount int - ) - - switch { - case containsChineseCharacters(sanitized): - segments := p.segmentText(sanitized) - tokens := p.selectMeaningfulTokens(segments) - tokenCount = len(tokens) - if len(tokens) >= 2 { - processed = strings.Join(tokens, " ") - strategy = "zh_tokens" - tokenPreview = strings.Join(tokens, ",") - } else { - strategy = "fallback_original" - } - case containsLatinLetters(sanitized): - processed = normalizeLatinQuery(sanitized) - if processed != sanitized { - strategy = "latin_normalize" - } - default: - strategy = "original" - } - - if strings.TrimSpace(processed) == "" { - processed = rawQuery - strategy = "fallback_original" - } + // Lightweight normalization: just collapse multiple spaces + processed := multiSpaceRegex.ReplaceAllString(rawQuery, " ") + processed = strings.TrimSpace(processed) chatManage.ProcessedQuery = processed - chatManage.QueryIntent = p.detectIntentLLM(ctx, chatManage, sanitized) + chatManage.QueryIntent = p.detectIntentLLM(ctx, chatManage, processed) pipelineInfo(ctx, "Preprocess", "output", map[string]interface{}{ "session_id": chatManage.SessionID, "processed_query": processed, - "strategy": strategy, - "token_count": tokenCount, - "token_preview": tokenPreview, "query_intent": chatManage.QueryIntent, }) return next() } -// cleanText Basic text cleaning -func (p *PluginPreprocess) cleanText(text string) string { - // Remove URLs - text = urlRegex.ReplaceAllString(text, " ") - - // Remove email addresses - text = emailRegex.ReplaceAllString(text, " ") - - // Remove excessive spaces - text = multiSpaceRegex.ReplaceAllString(text, " ") - - // Remove punctuation marks - text = punctRegex.ReplaceAllString(text, " ") - - // Trim leading and trailing spaces - text = strings.TrimSpace(text) - - return text -} - -// segmentText Text tokenization -func (p *PluginPreprocess) segmentText(text string) []string { - // Use Jieba tokenizer for tokenization, using search engine mode - segments := p.jieba.CutForSearch(text, true) - return segments -} - -// filterStopwords Filter stopwords -func (p *PluginPreprocess) selectMeaningfulTokens(segments []string) []string { - var tokens []string - seen := make(map[string]struct{}) - - for _, word := range segments { - word = strings.TrimSpace(word) - if word == "" { - continue - } - if _, stop := p.stopwords[word]; stop { - continue - } - if _, exists := seen[word]; exists { - continue - } - if !isInformativeToken(word) { - continue - } - - seen[word] = struct{}{} - tokens = append(tokens, word) - if len(tokens) >= maxProcessedTokens { - break - } - } - - return tokens -} - -// isBlank Check if a string is blank -func isInformativeToken(token string) bool { - if token == "" { - return false - } - - runeCount := utf8.RuneCountInString(token) - if runeCount == 1 { - r, _ := utf8.DecodeRuneInString(token) - if unicode.IsDigit(r) { - return true - } - if r <= unicode.MaxASCII && unicode.IsLetter(r) { - return true - } - return false - } - - return true -} - -// containsChineseCharacters checks if a string contains Chinese characters -func containsChineseCharacters(text string) bool { - for _, r := range text { - if unicode.Is(unicode.Han, r) { - return true - } - } - return false -} - -// containsLatinLetters checks if a string contains Latin letters -func containsLatinLetters(text string) bool { - for _, r := range text { - if r <= unicode.MaxASCII && unicode.IsLetter(r) { - return true - } - } - return false -} - -// normalizeWhitespace normalizes whitespace in a string -func normalizeWhitespace(text string) string { - text = strings.TrimSpace(text) - return multiSpaceRegex.ReplaceAllString(text, " ") -} - -// normalizeLatinQuery normalizes a Latin query -func normalizeLatinQuery(text string) string { - text = strings.ToLower(text) - text = multiSpaceRegex.ReplaceAllString(text, " ") - return strings.TrimSpace(text) -} - // intentResp is a response for intent detection type intentResp struct { Intent string `json:"intent"` @@ -349,12 +154,8 @@ func extractJSONBody(text string) string { return "{}" } -// Ensure resources are properly released +// Close Releases resources func (p *PluginPreprocess) Close() { - if p.jieba != nil { - p.jieba.Free() - p.jieba = nil - } } // ShutdownHandler Returns shutdown function diff --git a/internal/application/service/knowledge.go b/internal/application/service/knowledge.go index 071264f4..5a54540b 100644 --- a/internal/application/service/knowledge.go +++ b/internal/application/service/knowledge.go @@ -24,7 +24,6 @@ import ( "github.com/Tencent/WeKnora/internal/logger" "github.com/Tencent/WeKnora/internal/models/chat" "github.com/Tencent/WeKnora/internal/models/embedding" - "github.com/Tencent/WeKnora/internal/models/utils" "github.com/Tencent/WeKnora/internal/tracing" "github.com/Tencent/WeKnora/internal/types" "github.com/Tencent/WeKnora/internal/types/interfaces" @@ -283,14 +282,26 @@ func (s *knowledgeService) CreateKnowledgeFromFile(ctx context.Context, enableMultimodelValue = kb.VLMConfig.Enabled } + // Check question generation config + enableQuestionGeneration := false + questionCount := 3 // default + if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled { + enableQuestionGeneration = true + if kb.QuestionGenerationConfig.QuestionCount > 0 { + questionCount = kb.QuestionGenerationConfig.QuestionCount + } + } + taskPayload := types.DocumentProcessPayload{ - TenantID: tenantID, - KnowledgeID: knowledge.ID, - KnowledgeBaseID: kbID, - FilePath: filePath, - FileName: safeFilename, - FileType: getFileType(safeFilename), - EnableMultimodel: enableMultimodelValue, + TenantID: tenantID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: kbID, + FilePath: filePath, + FileName: safeFilename, + FileType: getFileType(safeFilename), + EnableMultimodel: enableMultimodelValue, + EnableQuestionGeneration: enableQuestionGeneration, + QuestionCount: questionCount, } payloadBytes, err := json.Marshal(taskPayload) @@ -405,12 +416,24 @@ func (s *knowledgeService) CreateKnowledgeFromURL(ctx context.Context, enableMultimodelValue = kb.VLMConfig.Enabled } + // Check question generation config + enableQuestionGeneration := false + questionCount := 3 // default + if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled { + enableQuestionGeneration = true + if kb.QuestionGenerationConfig.QuestionCount > 0 { + questionCount = kb.QuestionGenerationConfig.QuestionCount + } + } + taskPayload := types.DocumentProcessPayload{ - TenantID: tenantID, - KnowledgeID: knowledge.ID, - KnowledgeBaseID: kbID, - URL: url, - EnableMultimodel: enableMultimodelValue, + TenantID: tenantID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: kbID, + URL: url, + EnableMultimodel: enableMultimodelValue, + EnableQuestionGeneration: enableQuestionGeneration, + QuestionCount: questionCount, } payloadBytes, err := json.Marshal(taskPayload) @@ -595,12 +618,25 @@ func (s *knowledgeService) createKnowledgeFromPassageInternal(ctx context.Contex // Enqueue passage processing task to Asynq logger.Info(ctx, "Enqueuing passage processing task to Asynq") tenantID := ctx.Value(types.TenantIDContextKey).(uint64) + + // Check question generation config + enableQuestionGeneration := false + questionCount := 3 // default + if kb.QuestionGenerationConfig != nil && kb.QuestionGenerationConfig.Enabled { + enableQuestionGeneration = true + if kb.QuestionGenerationConfig.QuestionCount > 0 { + questionCount = kb.QuestionGenerationConfig.QuestionCount + } + } + taskPayload := types.DocumentProcessPayload{ - TenantID: tenantID, - KnowledgeID: knowledge.ID, - KnowledgeBaseID: kbID, - Passages: safePassages, - EnableMultimodel: false, // 文本段落不支持多模态 + TenantID: tenantID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: kbID, + Passages: safePassages, + EnableMultimodel: false, // 文本段落不支持多模态 + EnableQuestionGeneration: enableQuestionGeneration, + QuestionCount: questionCount, } payloadBytes, err := json.Marshal(taskPayload) @@ -917,10 +953,23 @@ func (s *knowledgeService) processDocumentFromPassage(ctx context.Context, s.processChunks(ctx, kb, knowledge, chunks) } +// ProcessChunksOptions contains options for processing chunks +type ProcessChunksOptions struct { + EnableQuestionGeneration bool + QuestionCount int +} + // processChunks processes chunks and creates embeddings for knowledge content func (s *knowledgeService) processChunks(ctx context.Context, kb *types.KnowledgeBase, knowledge *types.Knowledge, chunks []*proto.Chunk, + opts ...ProcessChunksOptions, ) { + // Get options + var options ProcessChunksOptions + if len(opts) > 0 { + options = opts[0] + } + ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.processChunks") defer span.End() span.SetAttributes( @@ -969,14 +1018,6 @@ func (s *knowledgeService) processChunks(ctx context.Context, logger.Infof(ctx, "Cleanup completed, starting to process new chunks") - // Generate document summary - 只使用文本类型的 Chunk - chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID) - if err != nil { - logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks get summary model failed") - span.RecordError(err) - return - } - // Create chunk objects from proto chunks maxSeq := 0 @@ -1118,52 +1159,19 @@ func (s *knowledgeService) processChunks(ctx context.Context, } } - span.AddEvent("extract summary") - summary, err := s.getSummary(ctx, chatModel, knowledge, textChunks) - if err != nil { - logger.GetLogger(ctx).WithField("knowledge_id", knowledge.ID). - WithField("error", err).Errorf("processChunks get summary failed, use first chunk as description") - if len(textChunks) > 0 { - knowledge.Description = textChunks[0].Content - } - } else { - knowledge.Description = summary - } - span.SetAttributes(attribute.String("summary", knowledge.Description)) - - // 批量索引 - if strings.TrimSpace(knowledge.Description) != "" && len(textChunks) > 0 { - sChunk := &types.Chunk{ - ID: uuid.New().String(), - TenantID: knowledge.TenantID, - KnowledgeID: knowledge.ID, - KnowledgeBaseID: knowledge.KnowledgeBaseID, - Content: fmt.Sprintf("# 文档名称\n%s\n\n# 摘要\n%s", knowledge.FileName, knowledge.Description), - ChunkIndex: maxSeq + 3, // 使用不冲突的索引方式 - IsEnabled: true, - CreatedAt: time.Now(), - UpdatedAt: time.Now(), - StartAt: 0, - EndAt: 0, - ChunkType: types.ChunkTypeSummary, - ParentChunkID: textChunks[0].ID, - } - logger.GetLogger(ctx).Infof("Created summary chunk for %s with index %d", - sChunk.ParentChunkID, sChunk.ChunkIndex) - insertChunks = append(insertChunks, sChunk) - } - - // Create index information for each chunk - indexInfoList := utils.MapSlice(insertChunks, func(chunk *types.Chunk) *types.IndexInfo { - return &types.IndexInfo{ + // Create index information for each chunk (without generated questions for now) + indexInfoList := make([]*types.IndexInfo, 0, len(insertChunks)) + for _, chunk := range insertChunks { + // Add original chunk content to index + indexInfoList = append(indexInfoList, &types.IndexInfo{ Content: chunk.Content, SourceID: chunk.ID, SourceType: types.ChunkSourceType, ChunkID: chunk.ID, KnowledgeID: knowledge.ID, KnowledgeBaseID: knowledge.KnowledgeBaseID, - } - }) + }) + } // Initialize retrieval engine @@ -1249,6 +1257,23 @@ func (s *knowledgeService) processChunks(ctx context.Context, logger.GetLogger(ctx).WithField("error", err).Errorf("processChunks update knowledge failed") } + // Enqueue question generation task if enabled (async, non-blocking) + if options.EnableQuestionGeneration && len(textChunks) > 0 { + questionCount := options.QuestionCount + if questionCount <= 0 { + questionCount = 3 + } + if questionCount > 10 { + questionCount = 10 + } + s.enqueueQuestionGenerationTask(ctx, knowledge.KnowledgeBaseID, knowledge.ID, questionCount) + } + + // Enqueue summary generation task (async, non-blocking) + if len(textChunks) > 0 { + s.enqueueSummaryGenerationTask(ctx, knowledge.KnowledgeBaseID, knowledge.ID) + } + // Update tenant's storage usage tenantInfo.StorageUsed += totalStorageSize if err := s.tenantRepo.AdjustStorageUsed(ctx, tenantInfo.ID, totalStorageSize); err != nil { @@ -1354,6 +1379,471 @@ func (s *knowledgeService) getSummary(ctx context.Context, return summary.Content, nil } +// enqueueQuestionGenerationTask enqueues an async task for question generation +func (s *knowledgeService) enqueueQuestionGenerationTask(ctx context.Context, + kbID, knowledgeID string, questionCount int, +) { + tenantID := ctx.Value(types.TenantIDContextKey).(uint64) + payload := types.QuestionGenerationPayload{ + TenantID: tenantID, + KnowledgeBaseID: kbID, + KnowledgeID: knowledgeID, + QuestionCount: questionCount, + } + + payloadBytes, err := json.Marshal(payload) + if err != nil { + logger.Errorf(ctx, "Failed to marshal question generation payload: %v", err) + return + } + + task := asynq.NewTask(types.TypeQuestionGeneration, payloadBytes, asynq.Queue("low"), asynq.MaxRetry(3)) + info, err := s.task.Enqueue(task) + if err != nil { + logger.Errorf(ctx, "Failed to enqueue question generation task: %v", err) + return + } + logger.Infof(ctx, "Enqueued question generation task: %s for knowledge: %s", info.ID, knowledgeID) +} + +// enqueueSummaryGenerationTask enqueues an async task for summary generation +func (s *knowledgeService) enqueueSummaryGenerationTask(ctx context.Context, + kbID, knowledgeID string, +) { + tenantID := ctx.Value(types.TenantIDContextKey).(uint64) + payload := types.SummaryGenerationPayload{ + TenantID: tenantID, + KnowledgeBaseID: kbID, + KnowledgeID: knowledgeID, + } + + payloadBytes, err := json.Marshal(payload) + if err != nil { + logger.Errorf(ctx, "Failed to marshal summary generation payload: %v", err) + return + } + + task := asynq.NewTask(types.TypeSummaryGeneration, payloadBytes, asynq.Queue("low"), asynq.MaxRetry(3)) + info, err := s.task.Enqueue(task) + if err != nil { + logger.Errorf(ctx, "Failed to enqueue summary generation task: %v", err) + return + } + logger.Infof(ctx, "Enqueued summary generation task: %s for knowledge: %s", info.ID, knowledgeID) +} + +// ProcessSummaryGeneration handles async summary generation task +func (s *knowledgeService) ProcessSummaryGeneration(ctx context.Context, t *asynq.Task) error { + var payload types.SummaryGenerationPayload + if err := json.Unmarshal(t.Payload(), &payload); err != nil { + logger.Errorf(ctx, "Failed to unmarshal summary generation payload: %v", err) + return nil // Don't retry on unmarshal error + } + + logger.Infof(ctx, "Processing summary generation for knowledge: %s", payload.KnowledgeID) + + // Set tenant context + ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID) + + // Get knowledge base + kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID) + if err != nil { + logger.Errorf(ctx, "Failed to get knowledge base: %v", err) + return nil + } + + // Get knowledge + knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID) + if err != nil { + logger.Errorf(ctx, "Failed to get knowledge: %v", err) + return nil + } + + // Get text chunks for this knowledge + chunks, err := s.chunkService.ListChunksByKnowledgeID(ctx, payload.KnowledgeID) + if err != nil { + logger.Errorf(ctx, "Failed to get chunks: %v", err) + return nil + } + + // Filter text chunks only + textChunks := make([]*types.Chunk, 0) + for _, chunk := range chunks { + if chunk.ChunkType == types.ChunkTypeText { + textChunks = append(textChunks, chunk) + } + } + + if len(textChunks) == 0 { + logger.Infof(ctx, "No text chunks found for knowledge: %s", payload.KnowledgeID) + return nil + } + + // Sort chunks by ChunkIndex for proper ordering + sort.Slice(textChunks, func(i, j int) bool { + return textChunks[i].ChunkIndex < textChunks[j].ChunkIndex + }) + + // Initialize chat model for summary + chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID) + if err != nil { + logger.Errorf(ctx, "Failed to get chat model: %v", err) + return fmt.Errorf("failed to get chat model: %w", err) + } + + // Generate summary + summary, err := s.getSummary(ctx, chatModel, knowledge, textChunks) + if err != nil { + logger.Errorf(ctx, "Failed to generate summary for knowledge %s: %v", payload.KnowledgeID, err) + // Use first chunk content as fallback + if len(textChunks) > 0 { + summary = textChunks[0].Content + if len(summary) > 500 { + summary = summary[:500] + } + } + } + + // Update knowledge description + knowledge.Description = summary + knowledge.UpdatedAt = time.Now() + if err := s.repo.UpdateKnowledge(ctx, knowledge); err != nil { + logger.Errorf(ctx, "Failed to update knowledge description: %v", err) + return fmt.Errorf("failed to update knowledge: %w", err) + } + + // Create summary chunk and index it + if strings.TrimSpace(summary) != "" { + // Get max chunk index + maxChunkIndex := 0 + for _, chunk := range chunks { + if chunk.ChunkIndex > maxChunkIndex { + maxChunkIndex = chunk.ChunkIndex + } + } + + summaryChunk := &types.Chunk{ + ID: uuid.New().String(), + TenantID: knowledge.TenantID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: knowledge.KnowledgeBaseID, + Content: fmt.Sprintf("# 文档名称\n%s\n\n# 摘要\n%s", knowledge.FileName, summary), + ChunkIndex: maxChunkIndex + 1, + IsEnabled: true, + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + StartAt: 0, + EndAt: 0, + ChunkType: types.ChunkTypeSummary, + ParentChunkID: textChunks[0].ID, + } + + // Save summary chunk + if err := s.chunkService.CreateChunks(ctx, []*types.Chunk{summaryChunk}); err != nil { + logger.Errorf(ctx, "Failed to create summary chunk: %v", err) + return fmt.Errorf("failed to create summary chunk: %w", err) + } + + // Index summary chunk + tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID) + if err != nil { + logger.Errorf(ctx, "Failed to get tenant info: %v", err) + return fmt.Errorf("failed to get tenant info: %w", err) + } + ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo) + + retrieveEngine, err := retriever.NewCompositeRetrieveEngine(s.retrieveEngine, tenantInfo.RetrieverEngines.Engines) + if err != nil { + logger.Errorf(ctx, "Failed to init retrieve engine: %v", err) + return fmt.Errorf("failed to init retrieve engine: %w", err) + } + + embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID) + if err != nil { + logger.Errorf(ctx, "Failed to get embedding model: %v", err) + return fmt.Errorf("failed to get embedding model: %w", err) + } + + indexInfo := []*types.IndexInfo{{ + Content: summaryChunk.Content, + SourceID: summaryChunk.ID, + SourceType: types.ChunkSourceType, + ChunkID: summaryChunk.ID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: knowledge.KnowledgeBaseID, + }} + + if err := retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfo); err != nil { + logger.Errorf(ctx, "Failed to index summary chunk: %v", err) + return fmt.Errorf("failed to index summary chunk: %w", err) + } + + logger.Infof(ctx, "Successfully created and indexed summary chunk for knowledge: %s", payload.KnowledgeID) + } + + logger.Infof(ctx, "Successfully generated summary for knowledge: %s", payload.KnowledgeID) + return nil +} + +// ProcessQuestionGeneration handles async question generation task +func (s *knowledgeService) ProcessQuestionGeneration(ctx context.Context, t *asynq.Task) error { + ctx, span := tracing.ContextWithSpan(ctx, "knowledgeService.ProcessQuestionGeneration") + defer span.End() + + var payload types.QuestionGenerationPayload + if err := json.Unmarshal(t.Payload(), &payload); err != nil { + logger.Errorf(ctx, "Failed to unmarshal question generation payload: %v", err) + return nil // Don't retry on unmarshal error + } + + logger.Infof(ctx, "Processing question generation for knowledge: %s", payload.KnowledgeID) + + // Set tenant context + ctx = context.WithValue(ctx, types.TenantIDContextKey, payload.TenantID) + + // Get knowledge base + kb, err := s.kbService.GetKnowledgeBaseByID(ctx, payload.KnowledgeBaseID) + if err != nil { + logger.Errorf(ctx, "Failed to get knowledge base: %v", err) + return nil + } + + // Get knowledge + knowledge, err := s.repo.GetKnowledgeByID(ctx, payload.TenantID, payload.KnowledgeID) + if err != nil { + logger.Errorf(ctx, "Failed to get knowledge: %v", err) + return nil + } + + // Get text chunks for this knowledge + chunks, err := s.chunkService.ListChunksByKnowledgeID(ctx, payload.KnowledgeID) + if err != nil { + logger.Errorf(ctx, "Failed to get chunks: %v", err) + return nil + } + + // Filter text chunks only + textChunks := make([]*types.Chunk, 0) + for _, chunk := range chunks { + if chunk.ChunkType == types.ChunkTypeText { + textChunks = append(textChunks, chunk) + } + } + + if len(textChunks) == 0 { + logger.Infof(ctx, "No text chunks found for knowledge: %s", payload.KnowledgeID) + return nil + } + + // Sort chunks by StartAt for context building + sort.Slice(textChunks, func(i, j int) bool { + return textChunks[i].StartAt < textChunks[j].StartAt + }) + + // Initialize chat model + chatModel, err := s.modelService.GetChatModel(ctx, kb.SummaryModelID) + if err != nil { + logger.Errorf(ctx, "Failed to get chat model: %v", err) + return fmt.Errorf("failed to get chat model: %w", err) + } + + // Initialize embedding model and retrieval engine + embeddingModel, err := s.modelService.GetEmbeddingModel(ctx, kb.EmbeddingModelID) + if err != nil { + logger.Errorf(ctx, "Failed to get embedding model: %v", err) + return fmt.Errorf("failed to get embedding model: %w", err) + } + + tenantInfo, err := s.tenantRepo.GetTenantByID(ctx, payload.TenantID) + if err != nil { + logger.Errorf(ctx, "Failed to get tenant info: %v", err) + return fmt.Errorf("failed to get tenant info: %w", err) + } + ctx = context.WithValue(ctx, types.TenantInfoContextKey, tenantInfo) + + retrieveEngine, err := retriever.NewCompositeRetrieveEngine(s.retrieveEngine, tenantInfo.RetrieverEngines.Engines) + if err != nil { + logger.Errorf(ctx, "Failed to init retrieve engine: %v", err) + return fmt.Errorf("failed to init retrieve engine: %w", err) + } + + questionCount := payload.QuestionCount + if questionCount <= 0 { + questionCount = 3 + } + if questionCount > 10 { + questionCount = 10 + } + + // Generate questions for each chunk with context + var indexInfoList []*types.IndexInfo + for i, chunk := range textChunks { + // Build context from adjacent chunks + var prevContent, nextContent string + if i > 0 { + prevContent = textChunks[i-1].Content + // Limit context size + if len(prevContent) > 500 { + prevContent = prevContent[len(prevContent)-500:] + } + } + if i < len(textChunks)-1 { + nextContent = textChunks[i+1].Content + // Limit context size + if len(nextContent) > 500 { + nextContent = nextContent[:500] + } + } + + questions, err := s.generateQuestionsWithContext(ctx, chatModel, chunk.Content, prevContent, nextContent, knowledge.Title, questionCount) + if err != nil { + logger.Warnf(ctx, "Failed to generate questions for chunk %s: %v", chunk.ID, err) + continue + } + + if len(questions) == 0 { + continue + } + + // Update chunk metadata + meta := &types.DocumentChunkMetadata{ + GeneratedQuestions: questions, + } + if err := chunk.SetDocumentMetadata(meta); err != nil { + logger.Warnf(ctx, "Failed to set document metadata for chunk %s: %v", chunk.ID, err) + continue + } + + // Update chunk in database + if err := s.chunkService.UpdateChunk(ctx, chunk); err != nil { + logger.Warnf(ctx, "Failed to update chunk %s: %v", chunk.ID, err) + continue + } + + // Create index entries for generated questions + for j, question := range questions { + sourceID := fmt.Sprintf("%s-q%d", chunk.ID, j) + indexInfoList = append(indexInfoList, &types.IndexInfo{ + Content: question, + SourceID: sourceID, + SourceType: types.ChunkSourceType, + ChunkID: chunk.ID, + KnowledgeID: knowledge.ID, + KnowledgeBaseID: knowledge.KnowledgeBaseID, + }) + } + logger.Debugf(ctx, "Generated %d questions for chunk %s", len(questions), chunk.ID) + } + + // Index generated questions + if len(indexInfoList) > 0 { + if err := retrieveEngine.BatchIndex(ctx, embeddingModel, indexInfoList); err != nil { + logger.Errorf(ctx, "Failed to index generated questions: %v", err) + return fmt.Errorf("failed to index questions: %w", err) + } + logger.Infof(ctx, "Successfully indexed %d generated questions for knowledge: %s", len(indexInfoList), payload.KnowledgeID) + } + + return nil +} + +// generateQuestionsWithContext generates questions for a chunk with surrounding context +func (s *knowledgeService) generateQuestionsWithContext(ctx context.Context, + chatModel chat.Chat, content, prevContent, nextContent, docName string, questionCount int, +) ([]string, error) { + if content == "" || questionCount <= 0 { + return nil, nil + } + + // Build prompt with context + prompt := s.config.Conversation.GenerateQuestionsPrompt + if prompt == "" { + prompt = defaultQuestionGenerationPrompt + } + + // Build context section + var contextSection string + if prevContent != "" || nextContent != "" { + contextSection = "## 上下文信息(仅供参考,帮助理解主要内容)\n" + if prevContent != "" { + contextSection += fmt.Sprintf("【前文】%s\n", prevContent) + } + if nextContent != "" { + contextSection += fmt.Sprintf("【后文】%s\n", nextContent) + } + contextSection += "\n" + } + + // Replace placeholders + prompt = strings.ReplaceAll(prompt, "{{.QuestionCount}}", fmt.Sprintf("%d", questionCount)) + prompt = strings.ReplaceAll(prompt, "{{.Content}}", content) + prompt = strings.ReplaceAll(prompt, "{{.Context}}", contextSection) + prompt = strings.ReplaceAll(prompt, "{{.DocName}}", docName) + + thinking := false + response, err := chatModel.Chat(ctx, []chat.Message{ + { + Role: "user", + Content: prompt, + }, + }, &chat.ChatOptions{ + Temperature: 0.7, + MaxTokens: 512, + Thinking: &thinking, + }) + if err != nil { + return nil, fmt.Errorf("failed to generate questions: %w", err) + } + + // Parse response + lines := strings.Split(response.Content, "\n") + questions := make([]string, 0, questionCount) + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + line = strings.TrimLeft(line, "0123456789.-*) ") + line = strings.TrimSpace(line) + if line != "" && len(line) > 5 { + questions = append(questions, line) + if len(questions) >= questionCount { + break + } + } + } + + return questions, nil +} + +// Default prompt for question generation with context support +const defaultQuestionGenerationPrompt = `你是一个专业的问题生成助手。你的任务是根据给定的【主要内容】生成用户可能会问的相关问题。 + +{{.Context}} +## 主要内容(请基于此内容生成问题) +文档名称:{{.DocName}} +文档内容: +{{.Content}} + +## 核心要求 +- 生成的问题必须与【主要内容】直接相关 +- 问题中禁止使用任何代词或指代词(如"它"、"这个"、"该文档"、"本文"、"文中"、"其"等),必须用具体名称替代 +- 问题必须是完整独立的,脱离上下文也能被理解 +- 问题应该是用户在实际场景中可能会提出的自然问题 +- 问题应该多样化,覆盖内容的不同方面 +- 每个问题应该简洁明了,长度控制在30字以内 +- 生成的问题数量为 {{.QuestionCount}} 个 + +## 问题类型建议 +- 定义类:什么是...?...是什么? +- 原因类:为什么...?...的原因是什么? +- 方法类:如何...?怎样...? +- 比较类:...和...有什么区别? +- 应用类:...可以用于什么场景? + +## 输出格式 +直接输出问题列表,每行一个问题,不要有序号或其他前缀。` + // GetKnowledgeFile retrieves the physical file associated with a knowledge entry func (s *knowledgeService) GetKnowledgeFile(ctx context.Context, id string) (io.ReadCloser, string, error) { // Get knowledge record @@ -4051,7 +4541,10 @@ func (s *knowledgeService) ProcessDocument(ctx context.Context, t *asynq.Task) e } // 处理chunks(这会更新状态为completed) - s.processChunks(ctx, kb, knowledge, chunks) + s.processChunks(ctx, kb, knowledge, chunks, ProcessChunksOptions{ + EnableQuestionGeneration: payload.EnableQuestionGeneration, + QuestionCount: payload.QuestionCount, + }) return nil } diff --git a/internal/config/config.go b/internal/config/config.go index 31768118..f70fe9e1 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -57,6 +57,8 @@ type ConversationConfig struct { SimplifyQueryPromptUser string `yaml:"simplify_query_prompt_user" json:"simplify_query_prompt_user"` ExtractEntitiesPrompt string `yaml:"extract_entities_prompt" json:"extract_entities_prompt"` ExtractRelationshipsPrompt string `yaml:"extract_relationships_prompt" json:"extract_relationships_prompt"` + // GenerateQuestionsPrompt is used to generate questions for document chunks to improve recall + GenerateQuestionsPrompt string `yaml:"generate_questions_prompt" json:"generate_questions_prompt"` } // SummaryConfig 摘要配置 diff --git a/internal/handler/initialization.go b/internal/handler/initialization.go index 42f62e10..6bdb4189 100644 --- a/internal/handler/initialization.go +++ b/internal/handler/initialization.go @@ -123,6 +123,12 @@ type KBModelConfigRequest struct { Nodes []types.GraphNode `json:"nodes"` Relations []types.GraphRelation `json:"relations"` } `json:"nodeExtract"` + + // 问题生成配置 + QuestionGeneration struct { + Enabled bool `json:"enabled"` + QuestionCount int `json:"questionCount"` + } `json:"questionGeneration"` } // InitializationRequest 初始化请求结构 @@ -192,6 +198,11 @@ type InitializationRequest struct { Type string `json:"type"` } `json:"relations"` } `json:"nodeExtract"` + + QuestionGeneration struct { + Enabled bool `json:"enabled"` + QuestionCount int `json:"questionCount"` + } `json:"questionGeneration"` } // UpdateKBConfig 根据知识库ID和模型ID更新配置(简化版) @@ -333,6 +344,23 @@ func (h *InitializationHandler) UpdateKBConfig(c *gin.Context) { return } + // 更新问题生成配置 + if req.QuestionGeneration.Enabled { + questionCount := req.QuestionGeneration.QuestionCount + if questionCount <= 0 { + questionCount = 3 + } + if questionCount > 10 { + questionCount = 10 + } + kb.QuestionGenerationConfig = &types.QuestionGenerationConfig{ + Enabled: true, + QuestionCount: questionCount, + } + } else { + kb.QuestionGenerationConfig = &types.QuestionGenerationConfig{Enabled: false} + } + // 保存更新后的知识库 if err := h.kbRepository.UpdateKnowledgeBase(ctx, kb); err != nil { logger.Error(ctx, "Failed to update knowledge base", err) diff --git a/internal/router/task.go b/internal/router/task.go index 4ace9643..e860c2fd 100644 --- a/internal/router/task.go +++ b/internal/router/task.go @@ -67,6 +67,12 @@ func RunAsynqServer(params AsynqTaskParams) *asynq.ServeMux { // Register FAQ import handler mux.HandleFunc(types.TypeFAQImport, params.KnowledgeService.ProcessFAQImport) + // Register question generation handler + mux.HandleFunc(types.TypeQuestionGeneration, params.KnowledgeService.ProcessQuestionGeneration) + + // Register summary generation handler + mux.HandleFunc(types.TypeSummaryGeneration, params.KnowledgeService.ProcessSummaryGeneration) + go func() { // Start the server if err := params.Server.Run(mux); err != nil { diff --git a/internal/types/extract_graph.go b/internal/types/extract_graph.go index 8d1c0c78..aae79d9c 100644 --- a/internal/types/extract_graph.go +++ b/internal/types/extract_graph.go @@ -1,9 +1,11 @@ package types const ( - TypeChunkExtract = "chunk:extract" - TypeDocumentProcess = "document:process" // 文档处理任务 - TypeFAQImport = "faq:import" // FAQ导入任务 + TypeChunkExtract = "chunk:extract" + TypeDocumentProcess = "document:process" // 文档处理任务 + TypeFAQImport = "faq:import" // FAQ导入任务 + TypeQuestionGeneration = "question:generation" // 问题生成任务 + TypeSummaryGeneration = "summary:generation" // 摘要生成任务 ) // ExtractChunkPayload represents the extract chunk task payload @@ -15,16 +17,18 @@ type ExtractChunkPayload struct { // DocumentProcessPayload represents the document process task payload type DocumentProcessPayload struct { - RequestId string `json:"request_id"` - TenantID uint64 `json:"tenant_id"` - KnowledgeID string `json:"knowledge_id"` - KnowledgeBaseID string `json:"knowledge_base_id"` - FilePath string `json:"file_path,omitempty"` // 文件路径(文件导入时使用) - FileName string `json:"file_name,omitempty"` // 文件名(文件导入时使用) - FileType string `json:"file_type,omitempty"` // 文件类型(文件导入时使用) - URL string `json:"url,omitempty"` // URL(URL导入时使用) - Passages []string `json:"passages,omitempty"` // 文本段落(文本导入时使用) - EnableMultimodel bool `json:"enable_multimodel"` + RequestId string `json:"request_id"` + TenantID uint64 `json:"tenant_id"` + KnowledgeID string `json:"knowledge_id"` + KnowledgeBaseID string `json:"knowledge_base_id"` + FilePath string `json:"file_path,omitempty"` // 文件路径(文件导入时使用) + FileName string `json:"file_name,omitempty"` // 文件名(文件导入时使用) + FileType string `json:"file_type,omitempty"` // 文件类型(文件导入时使用) + URL string `json:"url,omitempty"` // URL(URL导入时使用) + Passages []string `json:"passages,omitempty"` // 文本段落(文本导入时使用) + EnableMultimodel bool `json:"enable_multimodel"` + EnableQuestionGeneration bool `json:"enable_question_generation"` // 是否启用问题生成 + QuestionCount int `json:"question_count,omitempty"` // 每个chunk生成的问题数量 } // FAQImportPayload represents the FAQ import task payload @@ -37,6 +41,29 @@ type FAQImportPayload struct { Mode string `json:"mode"` } +// QuestionGenerationPayload represents the question generation task payload +type QuestionGenerationPayload struct { + TenantID uint64 `json:"tenant_id"` + KnowledgeBaseID string `json:"knowledge_base_id"` + KnowledgeID string `json:"knowledge_id"` + QuestionCount int `json:"question_count"` +} + +// SummaryGenerationPayload represents the summary generation task payload +type SummaryGenerationPayload struct { + TenantID uint64 `json:"tenant_id"` + KnowledgeBaseID string `json:"knowledge_base_id"` + KnowledgeID string `json:"knowledge_id"` +} + +// ChunkContext represents chunk content with surrounding context +type ChunkContext struct { + ChunkID string `json:"chunk_id"` + Content string `json:"content"` + PrevContent string `json:"prev_content,omitempty"` // Previous chunk content for context + NextContent string `json:"next_content,omitempty"` // Next chunk content for context +} + // PromptTemplateStructured represents the prompt template structured type PromptTemplateStructured struct { Description string `json:"description"` diff --git a/internal/types/faq.go b/internal/types/faq.go index 7c209c85..fb7f5fec 100644 --- a/internal/types/faq.go +++ b/internal/types/faq.go @@ -19,6 +19,43 @@ type FAQChunkMetadata struct { Source string `json:"source,omitempty"` } +// DocumentChunkMetadata 定义文档 Chunk 的元数据结构 +// 用于存储AI生成的问题等增强信息 +type DocumentChunkMetadata struct { + // GeneratedQuestions 存储AI为该Chunk生成的相关问题 + // 这些问题会被独立索引以提高召回率 + GeneratedQuestions []string `json:"generated_questions,omitempty"` +} + +// DocumentMetadata 解析 Chunk 中的文档元数据 +func (c *Chunk) DocumentMetadata() (*DocumentChunkMetadata, error) { + if c == nil || len(c.Metadata) == 0 { + return nil, nil + } + var meta DocumentChunkMetadata + if err := json.Unmarshal(c.Metadata, &meta); err != nil { + return nil, err + } + return &meta, nil +} + +// SetDocumentMetadata 设置 Chunk 的文档元数据 +func (c *Chunk) SetDocumentMetadata(meta *DocumentChunkMetadata) error { + if c == nil { + return nil + } + if meta == nil { + c.Metadata = nil + return nil + } + bytes, err := json.Marshal(meta) + if err != nil { + return err + } + c.Metadata = JSON(bytes) + return nil +} + // Normalize 清理空白与重复项 func (m *FAQChunkMetadata) Normalize() { if m == nil { diff --git a/internal/types/interfaces/knowledge.go b/internal/types/interfaces/knowledge.go index d5a27b3c..20a26594 100644 --- a/internal/types/interfaces/knowledge.go +++ b/internal/types/interfaces/knowledge.go @@ -99,6 +99,10 @@ type KnowledgeService interface { ProcessDocument(ctx context.Context, t *asynq.Task) error // ProcessFAQImport handles Asynq FAQ import tasks ProcessFAQImport(ctx context.Context, t *asynq.Task) error + // ProcessQuestionGeneration handles Asynq question generation tasks + ProcessQuestionGeneration(ctx context.Context, t *asynq.Task) error + // ProcessSummaryGeneration handles Asynq summary generation tasks + ProcessSummaryGeneration(ctx context.Context, t *asynq.Task) error } // KnowledgeRepository defines the interface for knowledge repositories. diff --git a/internal/types/knowledgebase.go b/internal/types/knowledgebase.go index cd542571..512c71fa 100644 --- a/internal/types/knowledgebase.go +++ b/internal/types/knowledgebase.go @@ -65,6 +65,8 @@ type KnowledgeBase struct { ExtractConfig *ExtractConfig `yaml:"extract_config" json:"extract_config" gorm:"column:extract_config;type:json"` // FAQConfig stores FAQ specific configuration such as indexing strategy FAQConfig *FAQConfig `yaml:"faq_config" json:"faq_config" gorm:"column:faq_config;type:json"` + // QuestionGenerationConfig stores question generation configuration for document knowledge bases + QuestionGenerationConfig *QuestionGenerationConfig `yaml:"question_generation_config" json:"question_generation_config" gorm:"column:question_generation_config;type:json"` // Creation time of the knowledge base CreatedAt time.Time `yaml:"created_at" json:"created_at"` // Last updated time of the knowledge base @@ -180,6 +182,32 @@ type VLMConfig struct { ModelID string `yaml:"model_id" json:"model_id"` } +// QuestionGenerationConfig represents the question generation configuration for document knowledge bases +// When enabled, the system will use LLM to generate questions for each chunk during document parsing +// These generated questions will be indexed separately to improve recall +type QuestionGenerationConfig struct { + Enabled bool `yaml:"enabled" json:"enabled"` + // Number of questions to generate per chunk (default: 3, max: 10) + QuestionCount int `yaml:"question_count" json:"question_count"` +} + +// Value implements the driver.Valuer interface +func (c QuestionGenerationConfig) Value() (driver.Value, error) { + return json.Marshal(c) +} + +// Scan implements the sql.Scanner interface +func (c *QuestionGenerationConfig) Scan(value interface{}) error { + if value == nil { + return nil + } + b, ok := value.([]byte) + if !ok { + return nil + } + return json.Unmarshal(b, c) +} + // Value implements the driver.Valuer interface, used to convert VLMConfig to database value func (c VLMConfig) Value() (driver.Value, error) { return json.Marshal(c) diff --git a/migrations/versioned/000017_add_question_generation_config.down.sql b/migrations/versioned/000017_add_question_generation_config.down.sql new file mode 100644 index 00000000..a8f39dac --- /dev/null +++ b/migrations/versioned/000017_add_question_generation_config.down.sql @@ -0,0 +1,4 @@ +-- Remove question_generation_config column from knowledge_bases table + +ALTER TABLE knowledge_bases +DROP COLUMN IF EXISTS question_generation_config; diff --git a/migrations/versioned/000017_add_question_generation_config.up.sql b/migrations/versioned/000017_add_question_generation_config.up.sql new file mode 100644 index 00000000..7b9c483a --- /dev/null +++ b/migrations/versioned/000017_add_question_generation_config.up.sql @@ -0,0 +1,7 @@ +-- Add question_generation_config column to knowledge_bases table +-- This column stores configuration for AI question generation feature +-- When enabled, the system generates questions for document chunks to improve recall + +ALTER TABLE knowledge_bases +ADD COLUMN IF NOT EXISTS question_generation_config JSON NULL +COMMENT 'Question generation configuration for document knowledge bases';