Files
WeKnora/internal/application/service/knowledge_housekeeping_test.go
wizardchen 44d6175559 feat: add knowledge parse cancellation with finalizing post-process state
Lets users stop an in-flight document parse to free up LLM / worker
resources without losing the chunks and index already written. The
core insight is that parse_status previously flipped to completed as
soon as primary chunks landed, while the most expensive subtasks
(graph extract = N LLM calls per chunk, plus summary, question
generation) were still running in the background — so "completed"
wasn't actually terminal from a resource standpoint.

State machine

  pending -> processing -> finalizing -> completed
                              |
                              +-> cancelled (any of the three
                                            in-flight states)
                              +-> failed
                              +-> deleting

`finalizing` is the new post-process fan-out window. parse_status
only promotes to `completed` once pending_subtasks_count (a new
column tracking summary + question + per-chunk graph extract)
drains to zero via atomic FinalizeSubtask. Wiki ingest is
intentionally excluded from the counter — it's a KB-scoped
debounced batch and would otherwise pin parse_status in
`finalizing` for the wiki batch window.

Backend

- New ParseStatusFinalizing + pending_subtasks_count column with
  migration 000056.
- knowledgeRepository.SetFinalizing transitions processing -> finalizing
  conditionally so a racing cancel cannot be clobbered.
- knowledgeRepository.FinalizeSubtask atomically decrements the
  counter and self-promotes the row to completed when it hits zero.
- KnowledgePostProcess restructured to compute expected subtask
  count up front, flip to finalizing (or completed when no
  enrichment is enabled), and only then fan out subtasks. Subtask
  handlers (summary, question, graph extract) defer-decrement on
  terminal exit using the existing isFinalAsynqAttempt convention.
- New POST /api/v1/knowledge/{id}/cancel-parse handler accepting
  pending / processing / finalizing. Marks the row cancelled,
  zeroes the counter, best-effort dequeues asynq tasks via a new
  TaskInspector abstraction (asynq-mode walks pending/scheduled/
  retry queues; Lite-mode noop), and scrubs wiki ingest pending op.
- SpanTracker.AbortAttempt flat-sweeps every still-running span
  for the attempt via a new repo.CancelAllOpenSpans helper so the
  trace viewer's striped bars all flip to cancelled, even leaf
  generations whose parent stage already EndSpan'd (multimodal
  fan-out pattern). knowledge_post_process closes its postSpan
  via SkipSpan on the cancel/deleting entry guard so a worker
  that opens a span AFTER the cancel sweep doesn't leak it.
- Housekeeping and resetPendingTasks sweep finalizing rows
  identically to processing so a crash/restart can't strand them.
- DeleteKnowledge/DeleteKnowledgeList proactively dequeue
  downstream tasks via the same TaskInspector path.
- ChunkExtractService gets a cancel entry guard so the most
  expensive enrichment (graph extract) bails immediately when the
  parent knowledge is aborted.

Frontend

- New cancelKnowledgeParse API client + "Stop parsing" entry in
  both list view and card view more menus, gated on
  pending/processing/finalizing.
- Polling predicate refactored to a shared isParseInFlight helper
  that recognises `finalizing` (previously the doc list silently
  stopped polling once parse_status flipped from processing).
- Knowledge processing timeline: isPolling includes finalizing,
  new isHardTerminal short-circuits LIVE for cancelled/failed/
  completed so stranded child spans cannot pin LIVE on.
- DocumentListView.computeStatus distinguishes finalizing
  ("增强中") from completed and shows the previous "生成摘要中"
  copy when summary_status is still pending under finalizing.
  Added cancelled badge as well.
- i18n: statusFinalizing / statusCancelled / cancelParse* keys
  across zh-CN, en-US, ko-KR, ru-RU.

Docs / SDK

- docs/api/knowledge.md: documents the new finalizing state,
  cancel-parse semantics, and which statuses accept cancel.
- client (Go SDK): CancelKnowledgeParse with docstring listing
  the cancellable statuses.
2026-05-28 20:16:02 +08:00

190 lines
7.1 KiB
Go

package service
import (
"context"
"testing"
"time"
"github.com/Tencent/WeKnora/internal/config"
"github.com/Tencent/WeKnora/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"gorm.io/driver/sqlite"
"gorm.io/gorm"
)
// knowledgeTestDDL creates the minimal subset of the knowledge schema this
// suite needs: the id and status columns, the pending_subtasks_count
// counter, a handful of NOT NULL columns with defaults, and the
// created/updated/deleted timestamps. We avoid AutoMigrate because
// Knowledge carries multiple JSONB-tagged fields whose SQLite mapping is
// fragile.
//
// Table name is `knowledges` (plural) — that's what migration 000000
// creates and what GORM's default pluralization expects when the
// service code uses Model(&types.Knowledge{}).
//
// insertKnowledge only sets id, parse_status and updated_at; every other
// column falls back to its DEFAULT or stays NULL (error_message included).
const knowledgeTestDDL = `
CREATE TABLE IF NOT EXISTS knowledges (
id VARCHAR(64) PRIMARY KEY,
tenant_id INTEGER NOT NULL DEFAULT 0,
knowledge_base_id VARCHAR(64),
parse_status VARCHAR(32) NOT NULL DEFAULT 'pending',
summary_status VARCHAR(32) NOT NULL DEFAULT 'none',
pending_subtasks_count INTEGER NOT NULL DEFAULT 0,
error_message TEXT,
title TEXT,
file_type TEXT,
enable_status TEXT NOT NULL DEFAULT 'enabled',
type TEXT NOT NULL DEFAULT 'document',
embedding_model_id TEXT NOT NULL DEFAULT '',
storage_size BIGINT NOT NULL DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
deleted_at DATETIME
);
`
// housekeepingSpansDDL creates the knowledge_processing_spans table used by
// this suite. Rows get an auto-increment id and must be unique per
// (knowledge_id, attempt, span_id). The tests only populate knowledge_id,
// attempt, span_id, name, kind, status and updated_at (see insertSpan);
// the remaining columns stay NULL or take their DEFAULT.
const housekeepingSpansDDL = `
CREATE TABLE IF NOT EXISTS knowledge_processing_spans (
id INTEGER PRIMARY KEY AUTOINCREMENT,
knowledge_id VARCHAR(64) NOT NULL,
attempt INTEGER NOT NULL DEFAULT 1,
span_id VARCHAR(64) NOT NULL,
parent_span_id VARCHAR(64),
name VARCHAR(64) NOT NULL,
kind VARCHAR(16) NOT NULL,
status VARCHAR(16) NOT NULL,
input TEXT,
output TEXT,
metadata TEXT,
error_code VARCHAR(64),
error_message TEXT,
error_detail TEXT,
started_at DATETIME,
finished_at DATETIME,
duration_ms BIGINT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
UNIQUE (knowledge_id, attempt, span_id)
);
`
// setupHousekeepingDB opens a fresh in-memory SQLite database, creates the
// knowledges and knowledge_processing_spans tables from knowledgeTestDDL and
// housekeepingSpansDDL, and returns the GORM handle. Any setup failure
// aborts the calling test. The underlying connection pool is closed
// automatically when the test (and its subtests) finish, so repeated calls
// across the suite do not accumulate open SQLite handles.
//
// NOTE(review): with a plain ":memory:" DSN every pooled connection gets its
// own private database. This works as long as the code under test stays on a
// single connection — confirm if runSweep ever starts issuing queries while
// another result set is still open.
func setupHousekeepingDB(t *testing.T) *gorm.DB {
	t.Helper()

	db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{})
	require.NoError(t, err)

	// Release the connection pool at test end instead of leaking it for the
	// lifetime of the test binary.
	sqlDB, err := db.DB()
	require.NoError(t, err)
	t.Cleanup(func() {
		// Best-effort: a close failure on a throwaway in-memory database
		// must not turn a passing test into a failing one.
		_ = sqlDB.Close()
	})

	require.NoError(t, db.Exec(knowledgeTestDDL).Error)
	require.NoError(t, db.Exec(housekeepingSpansDDL).Error)
	return db
}
// insertKnowledge seeds one knowledges row with the given id, parse_status
// and updated_at, failing the test if the insert errors. Raw SQL is used on
// purpose: going through GORM defaults would let CURRENT_TIMESTAMP replace
// the fixture's updated_at, and the timestamp is the whole point of the row.
func insertKnowledge(t *testing.T, db *gorm.DB, id, status string, updatedAt time.Time) {
	t.Helper()

	const stmt = `INSERT INTO knowledges (id, parse_status, updated_at) VALUES (?, ?, ?)`
	res := db.Exec(stmt, id, status, updatedAt)
	require.NoError(t, res.Error)
}
// insertSpan seeds one knowledge_processing_spans row for knowledge kid at
// the given attempt, with the supplied span id, status and updated_at. The
// span name and kind are fixed to 'docreader' / 'stage'. The test fails
// immediately if the insert errors.
func insertSpan(t *testing.T, db *gorm.DB, kid string, attempt int, spanID, status string, updatedAt time.Time) {
	t.Helper()

	const stmt = `INSERT INTO knowledge_processing_spans (knowledge_id, attempt, span_id, name, kind, status, updated_at)
VALUES (?, ?, ?, 'docreader', 'stage', ?, ?)`
	res := db.Exec(stmt, kid, attempt, spanID, status, updatedAt)
	require.NoError(t, res.Error)
}
// newHousekeepingSvcForTest builds a HousekeepingService on top of db with a
// config whose only populated field is KnowledgeBase.DocumentProcessTimeout.
func newHousekeepingSvcForTest(db *gorm.DB) *HousekeepingService {
	kbCfg := &config.KnowledgeBaseConfig{
		// A 1h floor plus the 10min buffer gives a 70min cutoff, which the
		// fixtures in this file straddle with simple relative offsets. The
		// production default of 2h+10min is just a constant scale factor.
		DocumentProcessTimeout: time.Hour,
	}
	return NewHousekeepingService(db, &config.Config{KnowledgeBase: kbCfg})
}
// TestHousekeeping_RecoversAbandoned exercises the happy path: a
// knowledge stuck at "processing" with no recent heartbeat (no spans,
// stale knowledge.updated_at) MUST be flipped to failed, and the row's
// error_message must explain that it was stuck in processing.
func TestHousekeeping_RecoversAbandoned(t *testing.T) {
	db := setupHousekeepingDB(t)
	svc := newHousekeepingSvcForTest(db)

	stale := time.Now().Add(-3 * time.Hour) // well past 70min cutoff
	insertKnowledge(t, db, "kid-abandoned", types.ParseStatusProcessing, stale)

	svc.runSweep(context.Background())

	// error_message is nullable and the fixture leaves it NULL. COALESCE
	// keeps the Scan into a plain string from erroring when the sweep did
	// not write a message, so a regression surfaces through the assertions
	// below rather than as an opaque "converting NULL to string" failure.
	var status, errMsg string
	require.NoError(t, db.Raw(
		`SELECT parse_status, COALESCE(error_message, '') FROM knowledges WHERE id = ?`, "kid-abandoned",
	).Row().Scan(&status, &errMsg))
	assert.Equal(t, types.ParseStatusFailed, status)
	assert.Contains(t, errMsg, "stuck in processing")
}
// TestHousekeeping_NoFalseKill_ActiveSpan is the regression test for the
// "long DocReader silently runs longer than DocumentProcessTimeout"
// scenario the user flagged: the knowledge row's updated_at looks stale,
// yet its span tree shows recent activity. Such a knowledge must survive
// the sweep untouched.
func TestHousekeeping_NoFalseKill_ActiveSpan(t *testing.T) {
	const kid = "kid-active"

	db := setupHousekeepingDB(t)
	svc := newHousekeepingSvcForTest(db)

	// The knowledge row on its own looks abandoned...
	knowledgeTouched := time.Now().Add(-3 * time.Hour)
	insertKnowledge(t, db, kid, types.ParseStatusProcessing, knowledgeTouched)

	// ...but a running span was touched well within the 70min cutoff,
	// i.e. "we're STILL working, the worker just hasn't transitioned the
	// parse_status column yet".
	spanTouched := time.Now().Add(-2 * time.Minute)
	insertSpan(t, db, kid, 1, "docreader-1", types.SpanStatusRunning, spanTouched)

	svc.runSweep(context.Background())

	var got string
	row := db.Raw(`SELECT parse_status FROM knowledges WHERE id = ?`, kid).Row()
	require.NoError(t, row.Scan(&got))
	assert.Equal(t, types.ParseStatusProcessing, got,
		"knowledge with recent span heartbeat must NOT be flipped to failed")
}
// TestHousekeeping_NoFalseKill_StaleSpanRecovers is the counterpart to the
// active-span test: when the span tree has ALSO been silent past the
// threshold, the knowledge is genuinely stuck and the sweep must still
// flip it to failed.
func TestHousekeeping_NoFalseKill_StaleSpanRecovers(t *testing.T) {
	const kid = "kid-stuck"

	db := setupHousekeepingDB(t)
	svc := newHousekeepingSvcForTest(db)

	// Knowledge row and its only span share the same stale timestamp, so
	// there is no recent activity anywhere.
	longAgo := time.Now().Add(-3 * time.Hour)
	insertKnowledge(t, db, kid, types.ParseStatusProcessing, longAgo)
	insertSpan(t, db, kid, 1, "docreader-1", types.SpanStatusRunning, longAgo)

	svc.runSweep(context.Background())

	var got string
	row := db.Raw(`SELECT parse_status FROM knowledges WHERE id = ?`, kid).Row()
	require.NoError(t, row.Scan(&got))
	assert.Equal(t, types.ParseStatusFailed, got,
		"genuinely stuck knowledge (knowledge AND spans both stale) must still be recovered")
}
// TestHousekeeping_PreservesRecentlyTouched covers the cheap fast path: a
// knowledge whose updated_at is within the cutoff is left alone, without
// the spans table even needing to be consulted.
func TestHousekeeping_PreservesRecentlyTouched(t *testing.T) {
	const kid = "kid-fresh"

	db := setupHousekeepingDB(t)
	svc := newHousekeepingSvcForTest(db)

	justNow := time.Now().Add(-30 * time.Second)
	insertKnowledge(t, db, kid, types.ParseStatusProcessing, justNow)

	svc.runSweep(context.Background())

	var got string
	row := db.Raw(`SELECT parse_status FROM knowledges WHERE id = ?`, kid).Row()
	require.NoError(t, row.Scan(&got))
	assert.Equal(t, types.ParseStatusProcessing, got,
		"knowledge updated within the cutoff must be left alone")
}