fix(chunker): keep top-level heading chunks separate

This commit is contained in:
wolfkill
2026-05-19 15:29:08 +08:00
committed by lyingbug
parent 1cb522e621
commit eb52caf033
3 changed files with 45 additions and 3 deletions

View File

@@ -148,10 +148,11 @@ func coalesceTinyChunks(in []Chunk, chunkSize int) []Chunk {
for i := 1; i < len(in); i++ {
next := in[i]
nextLen := utf8.RuneCountInString(next.Content)
sharedHeader := commonHeadingPrefix(cur.ContextHeader, next.ContextHeader)
// Adjacent + still-small + would not blow the size budget → merge.
if cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize {
if sharedHeader != "" && cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize {
cur.Content += next.Content
cur.ContextHeader = commonHeadingPrefix(cur.ContextHeader, next.ContextHeader)
cur.ContextHeader = sharedHeader
cur.End = next.End
curLen += nextLen
continue
@@ -274,4 +275,3 @@ func observeSubHeadings(runes []rune, primaryLevel int, h *HeadingHierarchy) {
}
}
}

View File

@@ -202,6 +202,27 @@ embedding 表缺列。`
}
}
func TestSplitByHeadings_DoesNotCoalesceDistinctTopLevelHeadings(t *testing.T) {
doc := `# Intro
short intro.
# Usage
short usage.
# FAQ
short faq.`
cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0}
chunks := splitByHeadingsImpl(doc, cfg, nil)
if len(chunks) != 3 {
t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks)
}
for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} {
if !strings.Contains(chunks[i].Content, heading) {
t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content)
}
}
}
// TestSplitByHeadings_CoalescePreservesPositionInvariant guards the
// End-Start == len([]rune(Content)) invariant after merging. Adjacent
// chunks (cur.End == next.Start) must concatenate cleanly; the merge must

View File

@@ -48,6 +48,27 @@ func TestSplit_AutoStrategy_PicksHeadingForMarkdownDoc(t *testing.T) {
}
}
func TestSplit_HeadingStrategyKeepsDistinctTopLevelHeadings(t *testing.T) {
doc := `# Intro
short intro.
# Usage
short usage.
# FAQ
short faq.`
cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0, Strategy: StrategyHeading}
chunks := Split(doc, cfg)
if len(chunks) != 3 {
t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks)
}
for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} {
if !strings.Contains(chunks[i].Content, heading) {
t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content)
}
}
}
// TestSplit_PreservesPositionInvariantAcrossTiers ensures every chunk's
// (Start, End, Content) triple stays consistent — End-Start must equal the
// rune length of Content, and runes[Start:End] must equal Content. This is