feat: add tests for incomplete Markdown image handling and improve stream flush logic

This commit introduces a new test, `TestFindIncompleteMarkdownImage`, to validate the detection of incomplete Markdown images in various scenarios. Additionally, it enhances the `holdbackCutoff` function to prioritize handling incomplete Markdown images, ensuring that they are correctly managed during stream flush operations. The changes improve the robustness of image processing in the application, addressing potential issues with unclosed image URLs in Markdown content.
This commit is contained in:
wizardchen
2026-05-29 12:21:24 +08:00
committed by lyingbug
parent 5d02404fdd
commit bdb164d432
2 changed files with 214 additions and 9 deletions

View File

@@ -2,6 +2,7 @@ package im
import (
"context"
"regexp"
"strings"
"testing"
@@ -231,6 +232,28 @@ func TestRewriteStorageURLs_COSPathNotSignedAsLocalKey(t *testing.T) {
}
}
// TestFindIncompleteMarkdownImage runs findIncompleteMarkdownImage over
// complete, truncated, and non-Markdown inputs. Each case's want is the byte
// offset at which the unclosed "![alt](" suffix is expected to start, or -1
// when the input should not be reported as incomplete.
func TestFindIncompleteMarkdownImage(t *testing.T) {
	type testCase struct {
		name string
		in   string
		want int
	}
	cases := []testCase{
		{name: "complete image", in: "![img](local://1/a.png)", want: -1},
		{name: "complete then text", in: "![img](local://1/a.png) trailing", want: -1},
		{name: "truncated provider URL in image", in: `![知识助理"知识库"管理视图界面](minio://wizard-test/10000/exports/c91cf852`, want: 0},
		{name: "open paren only", in: "text ![alt](", want: 5},
		{name: "bare provider suffix without markdown", in: "text minio://wizard-test/10000/exp", want: -1},
		{name: "two images complete", in: "![a](local://1/a.png) ![b](local://1/b.png)", want: -1},
		{name: "first complete second incomplete", in: "![a](local://1/a.png) ![b](minio://part", want: 22},
	}
	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			offset := findIncompleteMarkdownImage(tc.in)
			assert.Equal(t, tc.want, offset, "findIncompleteMarkdownImage(%q)", tc.in)
		})
	}
}
func TestHoldbackCutoff(t *testing.T) {
tests := []struct {
name string
@@ -240,13 +263,17 @@ func TestHoldbackCutoff(t *testing.T) {
{
"no holdback needed",
"plain text with complete ![img](local://1/img.png) content",
// URL is terminated by `)` then space → no holdback
-1, // placeholder, computed below
-1,
},
{
"truncated URL",
"truncated URL inside markdown image",
"text ![img](local://1/abc/im",
12,
5, // hold back from ![ — not from local://
},
{
"truncated markdown image with quotes in alt",
"### 2\n![知识助理\"知识库\"管理视图界面](minio://wizard-test/10000/exports/c91cf852",
-2, // computed via strings.Index below
},
{
"truncated XML",
@@ -254,19 +281,179 @@ func TestHoldbackCutoff(t *testing.T) {
5,
},
{
"both truncated, URL earlier",
"both truncated, XML earlier",
"<image url=\"local://1/im",
0, // <image at 0 is earlier than local:// at 12
0,
},
{
"bare truncated provider URL without markdown wrapper",
"prefix minio://wizard-test/10000/exp",
7,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
want := tt.want
if want == -2 {
want = strings.Index(tt.in, "![知识助理")
require.GreaterOrEqual(t, want, 0)
}
got := holdbackCutoff(tt.in)
if tt.want == -1 {
if want == -1 {
assert.Equal(t, len(tt.in), got, "holdbackCutoff(%q) should be len(in)", tt.in)
} else {
assert.Equal(t, tt.want, got, "holdbackCutoff(%q)", tt.in)
assert.Equal(t, want, got, "holdbackCutoff(%q)", tt.in)
}
})
}
}
// simulateIMStreamFlushStep mirrors one handleMessageStream flush (see flush()
// in service.go).
//
// The previous holdback is prepended to incoming. On a final flush the whole
// combined text is returned as sent. On a non-final flush the text is split at
// holdbackCutoff: everything before the cutoff is sent, and the rest becomes
// the new holdback. An empty combined text yields two empty strings.
func simulateIMStreamFlushStep(holdback, incoming string, final bool) (sent, newHoldback string) {
	combined := holdback + incoming
	if final || combined == "" {
		// Nothing to hold back: either there is no text, or this is the last flush.
		return combined, ""
	}
	cut := holdbackCutoff(combined)
	if cut >= len(combined) {
		return combined, ""
	}
	return combined[:cut], combined[cut:]
}
// simulateIMStreamFlush mirrors handleMessageStream's flush/holdback logic by
// feeding parts through simulateIMStreamFlushStep one at a time. Every part
// except the last is flushed as non-final (holdbackCutoff applies); the last
// part is flushed as final (everything is sent).
//
// It returns the non-empty chunks that were sent, in order, and whatever
// holdback is left after the last step.
func simulateIMStreamFlush(parts []string) (sent []string, remainder string) {
	lastIdx := len(parts) - 1
	held := ""
	for idx := range parts {
		out, rest := simulateIMStreamFlushStep(held, parts[idx], idx == lastIdx)
		held = rest
		if out == "" {
			// Empty flushes are not recorded as sent chunks.
			continue
		}
		sent = append(sent, out)
	}
	return sent, held
}
// orphanMarkdownImageOpenRe matches text that ends with an opened Markdown
// image whose URL is missing: "![alt](" optionally followed by whitespace and
// then end of input. The stream-flush tests in this file use it to assert that
// no flushed chunk ends this way (the bug we fixed).
var orphanMarkdownImageOpenRe = regexp.MustCompile(`!\[[^\]]*\]\(\s*$`)
// TestSimulateIMStreamFlush_MiddleImageNotSplit reproduces the user report:
// three images are streamed and the middle one used to be split at minio://.
//
// The stream arrives in three parts, with the middle image's URL cut between
// part2 and part3. The test asserts that no sent chunk ends with an orphan
// "![alt](", that the reassembled output is exactly the input, and that the
// middle image is delivered whole inside a single chunk.
func TestSimulateIMStreamFlush_MiddleImageNotSplit(t *testing.T) {
	part1 := "### 1⃣ 智能交互入口\n\n" +
		"![知识助理AI对话界面](minio://wizard-test/10000/exports/bb524693-a3f7-48bd-88ca-0c3957cb56e7.png)\n\n" +
		"### 2⃣ 知识管理\n- 支持创建\n\n"
	// part2 stops in the middle of the image URL. A raw literal is used because
	// the alt text contains double quotes.
	part2 := `![知识助理"知识库"管理视图界面](minio://wizard-test/10000/exports/c91cf852`
	// Interpreted literal on purpose: inside a raw (backtick) literal "\n" is a
	// backslash followed by 'n', not a newline, which would make this fixture
	// differ from the Markdown it is meant to reproduce.
	part3 := "-9c72-f549da25619a.png)\n\n### 3⃣ API\n\n" +
		"![API](minio://wizard-test/10000/exports/a0423e91-95c9-46bf-ab4c-9586b43d0a61.png)\n"

	sent, remainder := simulateIMStreamFlush([]string{part1, part2, part3})
	require.Empty(t, remainder, "final flush should drain holdback")

	for i, chunk := range sent {
		assert.False(t, orphanMarkdownImageOpenRe.MatchString(chunk),
			"chunk %d must not end with orphan ![alt]( : %q", i, chunk)
	}

	joined := strings.Join(sent, "")
	// Holding text back must only move chunk boundaries, never drop or reorder text.
	assert.Equal(t, part1+part2+part3, joined, "reassembled stream should equal the input")
	assert.Contains(t, joined, "bb524693-a3f7-48bd-88ca-0c3957cb56e7.png)")
	assert.Contains(t, joined, "c91cf852-9c72-f549da25619a.png)")
	assert.Contains(t, joined, "a0423e91-95c9-46bf-ab4c-9586b43d0a61.png)")

	// Middle image must appear intact in the joined output and in a single sent chunk.
	middleImg := `![知识助理"知识库"管理视图界面](minio://wizard-test/10000/exports/c91cf852-9c72-f549da25619a.png)`
	assert.Contains(t, joined, middleImg)

	foundMiddleIntact := false
	for _, chunk := range sent {
		if strings.Contains(chunk, middleImg) {
			foundMiddleIntact = true
			break
		}
	}
	assert.True(t, foundMiddleIntact, "middle markdown image should ship in one chunk, got %d chunks", len(sent))
}
// TestSimulateIMStreamFlush_FirstChunkHoldsMiddleImagePrefix drives two
// non-final flush steps. The first part, which ends after a complete image and
// a heading, is expected to be sent unchanged with no holdback. The second
// part, a truncated Markdown image, is expected to be held back in full,
// starting at its "![" prefix.
func TestSimulateIMStreamFlush_FirstChunkHoldsMiddleImagePrefix(t *testing.T) {
	const (
		completePart  = "### 1\n![ok](minio://wizard-test/10000/exports/ok.png)\n\n### 2\n"
		truncatedPart = `![知识助理"知识库"管理视图界面](minio://wizard-test/10000/exports/c91cf852`
	)

	firstSent, firstHeld := simulateIMStreamFlushStep("", completePart, false)
	assert.Equal(t, completePart, firstSent)
	assert.Empty(t, firstHeld)
	assert.False(t, orphanMarkdownImageOpenRe.MatchString(firstSent))

	secondSent, secondHeld := simulateIMStreamFlushStep(firstHeld, truncatedPart, false)
	assert.Empty(t, secondSent, "incomplete middle image must not be sent on non-final flush")
	require.NotEmpty(t, secondHeld)
	startsAtImage := strings.HasPrefix(secondHeld, "![知识助理")
	assert.True(t, startsAtImage,
		"holdback should include full markdown image prefix, got %q", secondHeld)
}
// TestSimulateIMStreamFlush_CompleteImageInOnePart feeds a single part that
// holds a complete Markdown image followed by text and expects it to come back
// as exactly one chunk, unchanged, with nothing left in the holdback.
func TestSimulateIMStreamFlush_CompleteImageInOnePart(t *testing.T) {
	input := `![x](minio://wizard-test/10000/exports/uuid.png) tail`

	chunks, leftover := simulateIMStreamFlush([]string{input})

	require.Empty(t, leftover)
	require.Len(t, chunks, 1)
	assert.Equal(t, input, chunks[0])
}
// TestSimulateIMStreamFlush_BareProviderURLStillHeld checks that a truncated
// provider URL with no Markdown wrapper is still withheld on a non-final flush:
// the plain intro text is sent, while the partial minio:// URL is not sent and
// ends up at the start of the holdback.
func TestSimulateIMStreamFlush_BareProviderURLStillHeld(t *testing.T) {
	const partialURL = "minio://wizard-test/10000/exp"

	introSent, heldAfterIntro := simulateIMStreamFlushStep("", "intro ", false)
	assert.Equal(t, "intro ", introSent)

	urlSent, heldAfterURL := simulateIMStreamFlushStep(heldAfterIntro, partialURL, false)
	assert.Empty(t, urlSent)
	require.NotEmpty(t, heldAfterURL)
	assert.True(t, strings.HasPrefix(heldAfterURL, partialURL))
}
// TestCleanIMContent_AfterStreamReassembly streams a Markdown image whose
// local:// URL is split across two parts, joins the flushed chunks, and passes
// the result to cleanIMContent with a COS-configured tenant. The output is
// expected to contain a presigned file API path and no local://10000 reference.
func TestCleanIMContent_AfterStreamReassembly(t *testing.T) {
	// NOTE(review): this key literal is 31 bytes long despite the "32bytes" in
	// its text — confirm the code under test does not need an exact 32-byte key.
	t.Setenv("SYSTEM_AES_KEY", "weknora-test-aes-key-32bytes!!!")
	t.Setenv("APP_EXTERNAL_URL", "https://weknora.example.com")

	cosCfg := &types.COSEngineConfig{
		SecretID:   "id",
		SecretKey:  "key",
		BucketName: "bucket",
		Region:     "ap-shanghai",
	}
	tenant := &types.Tenant{
		StorageEngineConfig: &types.StorageEngineConfig{
			DefaultProvider: "cos",
			COS:             cosCfg,
		},
	}

	// The image URL is cut between the two stream parts.
	chunks, leftover := simulateIMStreamFlush([]string{
		`![知识助理"知识库"管理视图界面](local://10000/exports/c91cf852`,
		`-9c72-f549da25619a.png)`,
	})
	require.Empty(t, leftover)

	reassembled := strings.Join(chunks, "")
	cleaned := cleanIMContent(context.Background(), reassembled, tenant)
	assert.Contains(t, cleaned, "/api/v1/files/presigned")
	assert.NotContains(t, cleaned, "local://10000")
}
// TestRewriteStorageURLs_MultipleImagesInOneChunk passes a document with three
// local:// Markdown images to rewriteStorageURLs and asserts that no local://
// reference remains and that the presigned file API path appears exactly three
// times in the output.
func TestRewriteStorageURLs_MultipleImagesInOneChunk(t *testing.T) {
	// NOTE(review): this key literal is 31 bytes long despite the "32bytes" in
	// its text — confirm the code under test does not need an exact 32-byte key.
	t.Setenv("SYSTEM_AES_KEY", "weknora-test-aes-key-32bytes!!!")
	t.Setenv("APP_EXTERNAL_URL", "https://weknora.example.com")

	storageCfg := &types.StorageEngineConfig{DefaultProvider: "cos"}
	tenant := &types.Tenant{StorageEngineConfig: storageCfg}

	// Headings and images are separated by blank lines; the document ends with
	// a single trailing newline.
	sections := []string{
		"### 1",
		"![a](local://10000/exports/bb524693.png)",
		"### 2",
		`![知识助理"知识库"管理视图界面](local://10000/exports/c91cf852.png)`,
		"### 3",
		"![c](local://10000/exports/a0423e91.png)\n",
	}
	doc := strings.Join(sections, "\n\n")

	rewritten := rewriteStorageURLs(context.Background(), doc, tenant)
	assert.NotContains(t, rewritten, "local://")
	presignedCount := strings.Count(rewritten, "/api/v1/files/presigned")
	assert.Equal(t, 3, presignedCount)
}

View File

@@ -135,6 +135,22 @@ func findIncompleteStorageURL(s string) int {
return loc[0]
}
// incompleteMarkdownImageSuffixRe matches a Markdown image at the end of the
// input whose parenthesized destination has been opened but not closed, such as
// "![alt](minio://part" or just "![alt](".
//
// The match starts at the "!" so the whole image can be held back: cutting only
// at "minio://" would flush "![alt](" to the IM client and break the image once
// the rest of the URL arrives in the next chunk.
var incompleteMarkdownImageSuffixRe = regexp.MustCompile(`!\[[^\]]*\]\([^)]*$`)

// findIncompleteMarkdownImage reports the byte offset in s where an unclosed
// "![alt](url" suffix begins, or -1 when s does not end in one.
func findIncompleteMarkdownImage(s string) int {
	if span := incompleteMarkdownImageSuffixRe.FindStringIndex(s); span != nil {
		return span[0]
	}
	return -1
}
// incompleteXMLTagRe matches the opening of an <image…>, <kb…>, or <web…> tag
// that reaches the end of the string without a closing '>'.
var incompleteXMLTagRe = regexp.MustCompile(
@@ -155,7 +171,9 @@ func findIncompleteXMLTag(s string) int {
// chunk, or len(chunk) if the chunk is safe to flush entirely.
func holdbackCutoff(chunk string) int {
cutoff := len(chunk)
if idx := findIncompleteStorageURL(chunk); idx >= 0 && idx < cutoff {
if idx := findIncompleteMarkdownImage(chunk); idx >= 0 && idx < cutoff {
cutoff = idx
} else if idx := findIncompleteStorageURL(chunk); idx >= 0 && idx < cutoff {
cutoff = idx
}
if idx := findIncompleteXMLTag(chunk); idx >= 0 && idx < cutoff {