mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
fix(chunker): keep top-level heading chunks separate
This commit is contained in:
@@ -148,10 +148,11 @@ func coalesceTinyChunks(in []Chunk, chunkSize int) []Chunk {
|
||||
for i := 1; i < len(in); i++ {
|
||||
next := in[i]
|
||||
nextLen := utf8.RuneCountInString(next.Content)
|
||||
sharedHeader := commonHeadingPrefix(cur.ContextHeader, next.ContextHeader)
|
||||
// Adjacent + still-small + would not blow the size budget → merge.
|
||||
if cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize {
|
||||
if sharedHeader != "" && cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize {
|
||||
cur.Content += next.Content
|
||||
cur.ContextHeader = commonHeadingPrefix(cur.ContextHeader, next.ContextHeader)
|
||||
cur.ContextHeader = sharedHeader
|
||||
cur.End = next.End
|
||||
curLen += nextLen
|
||||
continue
|
||||
@@ -274,4 +275,3 @@ func observeSubHeadings(runes []rune, primaryLevel int, h *HeadingHierarchy) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -202,6 +202,27 @@ embedding 表缺列。`
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitByHeadings_DoesNotCoalesceDistinctTopLevelHeadings(t *testing.T) {
|
||||
doc := `# Intro
|
||||
short intro.
|
||||
|
||||
# Usage
|
||||
short usage.
|
||||
|
||||
# FAQ
|
||||
short faq.`
|
||||
cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0}
|
||||
chunks := splitByHeadingsImpl(doc, cfg, nil)
|
||||
if len(chunks) != 3 {
|
||||
t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks)
|
||||
}
|
||||
for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} {
|
||||
if !strings.Contains(chunks[i].Content, heading) {
|
||||
t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSplitByHeadings_CoalescePreservesPositionInvariant guards the
|
||||
// End-Start == len([]rune(Content)) invariant after merging. Adjacent
|
||||
// chunks (cur.End == next.Start) must concatenate cleanly; the merge must
|
||||
|
||||
@@ -48,6 +48,27 @@ func TestSplit_AutoStrategy_PicksHeadingForMarkdownDoc(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplit_HeadingStrategyKeepsDistinctTopLevelHeadings(t *testing.T) {
|
||||
doc := `# Intro
|
||||
short intro.
|
||||
|
||||
# Usage
|
||||
short usage.
|
||||
|
||||
# FAQ
|
||||
short faq.`
|
||||
cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0, Strategy: StrategyHeading}
|
||||
chunks := Split(doc, cfg)
|
||||
if len(chunks) != 3 {
|
||||
t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks)
|
||||
}
|
||||
for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} {
|
||||
if !strings.Contains(chunks[i].Content, heading) {
|
||||
t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSplit_PreservesPositionInvariantAcrossTiers ensures every chunk's
|
||||
// (Start, End, Content) triple stays consistent — End-Start must equal the
|
||||
// rune length of Content, and runes[Start:End] must equal Content. This is
|
||||
|
||||
Reference in New Issue
Block a user