diff --git a/internal/infrastructure/chunker/heading_splitter.go b/internal/infrastructure/chunker/heading_splitter.go index c9242d91..35555bfa 100644 --- a/internal/infrastructure/chunker/heading_splitter.go +++ b/internal/infrastructure/chunker/heading_splitter.go @@ -148,10 +148,11 @@ func coalesceTinyChunks(in []Chunk, chunkSize int) []Chunk { for i := 1; i < len(in); i++ { next := in[i] nextLen := utf8.RuneCountInString(next.Content) + sharedHeader := commonHeadingPrefix(cur.ContextHeader, next.ContextHeader) // Adjacent + still-small + would not blow the size budget → merge. - if cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize { + if sharedHeader != "" && cur.End == next.Start && curLen < target && curLen+nextLen <= chunkSize { cur.Content += next.Content - cur.ContextHeader = commonHeadingPrefix(cur.ContextHeader, next.ContextHeader) + cur.ContextHeader = sharedHeader cur.End = next.End curLen += nextLen continue @@ -274,4 +275,3 @@ func observeSubHeadings(runes []rune, primaryLevel int, h *HeadingHierarchy) { } } } - diff --git a/internal/infrastructure/chunker/heading_splitter_test.go b/internal/infrastructure/chunker/heading_splitter_test.go index 82e1509d..fb629ee8 100644 --- a/internal/infrastructure/chunker/heading_splitter_test.go +++ b/internal/infrastructure/chunker/heading_splitter_test.go @@ -202,6 +202,27 @@ embedding 表缺列。` } } +func TestSplitByHeadings_DoesNotCoalesceDistinctTopLevelHeadings(t *testing.T) { + doc := `# Intro +short intro. + +# Usage +short usage. + +# FAQ +short faq.` + cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0} + chunks := splitByHeadingsImpl(doc, cfg, nil) + if len(chunks) != 3 { + t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks) + } + for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} { + if !strings.Contains(chunks[i].Content, heading) { + t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content) + } + } +} + // TestSplitByHeadings_CoalescePreservesPositionInvariant guards the // End-Start == len([]rune(Content)) invariant after merging. Adjacent // chunks (cur.End == next.Start) must concatenate cleanly; the merge must diff --git a/internal/infrastructure/chunker/strategy_test.go b/internal/infrastructure/chunker/strategy_test.go index 48a8803e..205bf276 100644 --- a/internal/infrastructure/chunker/strategy_test.go +++ b/internal/infrastructure/chunker/strategy_test.go @@ -48,6 +48,27 @@ func TestSplit_AutoStrategy_PicksHeadingForMarkdownDoc(t *testing.T) { } } +func TestSplit_HeadingStrategyKeepsDistinctTopLevelHeadings(t *testing.T) { + doc := `# Intro +short intro. + +# Usage +short usage. + +# FAQ +short faq.` + cfg := SplitterConfig{ChunkSize: 500, ChunkOverlap: 0, Strategy: StrategyHeading} + chunks := Split(doc, cfg) + if len(chunks) != 3 { + t.Fatalf("expected one chunk per top-level heading, got %d:\n%v", len(chunks), chunks) + } + for i, heading := range []string{"# Intro", "# Usage", "# FAQ"} { + if !strings.Contains(chunks[i].Content, heading) { + t.Errorf("chunk %d should contain heading %q, got:\n%s", i, heading, chunks[i].Content) + } + } +} + // TestSplit_PreservesPositionInvariantAcrossTiers ensures every chunk's // (Start, End, Content) triple stays consistent — End-Start must equal the // rune length of Content, and runes[Start:End] must equal Content. This is