mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
feat: add tests for incomplete Markdown image handling and improve stream flush logic
This commit introduces a new test, `TestFindIncompleteMarkdownImage`, to validate the detection of incomplete Markdown images in various scenarios. Additionally, it enhances the `holdbackCutoff` function to prioritize handling incomplete Markdown images, ensuring that they are correctly managed during stream flush operations. The changes improve the robustness of image processing in the application, addressing potential issues with unclosed image URLs in Markdown content.
This commit is contained in:
@@ -2,6 +2,7 @@ package im
|
||||
|
||||
import (
|
||||
"context"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -231,6 +232,28 @@ func TestRewriteStorageURLs_COSPathNotSignedAsLocalKey(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindIncompleteMarkdownImage(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"complete image", "", -1},
|
||||
{"complete then text", " trailing", -1},
|
||||
{"truncated provider URL in image", ` ", -1},
|
||||
{"first complete second incomplete", "  {
|
||||
got := findIncompleteMarkdownImage(tt.in)
|
||||
assert.Equal(t, tt.want, got, "findIncompleteMarkdownImage(%q)", tt.in)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHoldbackCutoff(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -240,13 +263,17 @@ func TestHoldbackCutoff(t *testing.T) {
|
||||
{
|
||||
"no holdback needed",
|
||||
"plain text with complete  content",
|
||||
// URL is terminated by `)` then space → no holdback
|
||||
-1, // placeholder, computed below
|
||||
-1,
|
||||
},
|
||||
{
|
||||
"truncated URL",
|
||||
"truncated URL inside markdown image",
|
||||
"text  {
|
||||
5,
|
||||
},
|
||||
{
|
||||
"both truncated, URL earlier",
|
||||
"both truncated, XML earlier",
|
||||
"<image url=\"local://1/im",
|
||||
0, // <image at 0 is earlier than local:// at 12
|
||||
0,
|
||||
},
|
||||
{
|
||||
"bare truncated provider URL without markdown wrapper",
|
||||
"prefix minio://wizard-test/10000/exp",
|
||||
7,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
want := tt.want
|
||||
if want == -2 {
|
||||
want = strings.Index(tt.in, "![知识助理")
|
||||
require.GreaterOrEqual(t, want, 0)
|
||||
}
|
||||
got := holdbackCutoff(tt.in)
|
||||
if tt.want == -1 {
|
||||
if want == -1 {
|
||||
assert.Equal(t, len(tt.in), got, "holdbackCutoff(%q) should be len(in)", tt.in)
|
||||
} else {
|
||||
assert.Equal(t, tt.want, got, "holdbackCutoff(%q)", tt.in)
|
||||
assert.Equal(t, want, got, "holdbackCutoff(%q)", tt.in)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// simulateIMStreamFlushStep mirrors one handleMessageStream flush (see flush() in service.go).
|
||||
func simulateIMStreamFlushStep(holdback, incoming string, final bool) (sent, newHoldback string) {
|
||||
chunk := holdback + incoming
|
||||
holdback = ""
|
||||
if chunk == "" {
|
||||
return "", ""
|
||||
}
|
||||
if !final {
|
||||
if cut := holdbackCutoff(chunk); cut < len(chunk) {
|
||||
holdback = chunk[cut:]
|
||||
chunk = chunk[:cut]
|
||||
}
|
||||
}
|
||||
return chunk, holdback
|
||||
}
|
||||
|
||||
// simulateIMStreamFlush mirrors handleMessageStream's flush/holdback logic (non-final
|
||||
// flushes apply holdbackCutoff; final flush sends everything).
|
||||
func simulateIMStreamFlush(parts []string) (sent []string, remainder string) {
|
||||
var pending string
|
||||
for i, part := range parts {
|
||||
final := i == len(parts)-1
|
||||
var chunk string
|
||||
chunk, pending = simulateIMStreamFlushStep(pending, part, final)
|
||||
if chunk != "" {
|
||||
sent = append(sent, chunk)
|
||||
}
|
||||
}
|
||||
return sent, pending
|
||||
}
|
||||
|
||||
// orphanMarkdownImageOpenRe detects a flush chunk that ends with an opened Markdown
|
||||
// image ("![alt](" followed only by optional whitespace, with no URL yet) — the bug we fixed.
|
||||
var orphanMarkdownImageOpenRe = regexp.MustCompile(`!\[[^\]]*\]\(\s*$`)
|
||||
|
||||
func TestSimulateIMStreamFlush_MiddleImageNotSplit(t *testing.T) {
|
||||
// Reproduces the user report: three images; the middle one was split at minio://.
|
||||
part1 := "### 1️⃣ 智能交互入口\n\n" +
|
||||
"\n\n" +
|
||||
"### 2️⃣ 知识管理\n- 支持创建\n\n"
|
||||
part2 := `\n\n### 3️⃣ API\n\n` +
|
||||
"\n"
|
||||
|
||||
sent, remainder := simulateIMStreamFlush([]string{part1, part2, part3})
|
||||
require.Empty(t, remainder, "final flush should drain holdback")
|
||||
|
||||
for i, chunk := range sent {
|
||||
assert.False(t, orphanMarkdownImageOpenRe.MatchString(chunk),
|
||||
"chunk %d must not end with orphan 
|
||||
}
|
||||
|
||||
joined := strings.Join(sent, "")
|
||||
assert.Contains(t, joined, "bb524693-a3f7-48bd-88ca-0c3957cb56e7.png)")
|
||||
assert.Contains(t, joined, "c91cf852-9c72-f549da25619a.png)")
|
||||
assert.Contains(t, joined, "a0423e91-95c9-46bf-ab4c-9586b43d0a61.png)")
|
||||
|
||||
// Middle image must appear intact in some single sent chunk (after merge for assertion).
|
||||
middleImg := ``
|
||||
assert.Contains(t, joined, middleImg)
|
||||
|
||||
foundMiddleIntact := false
|
||||
for _, chunk := range sent {
|
||||
if strings.Contains(chunk, middleImg) {
|
||||
foundMiddleIntact = true
|
||||
break
|
||||
}
|
||||
}
|
||||
assert.True(t, foundMiddleIntact, "middle markdown image should ship in one chunk, got %d chunks", len(sent))
|
||||
}
|
||||
|
||||
func TestSimulateIMStreamFlush_FirstChunkHoldsMiddleImagePrefix(t *testing.T) {
|
||||
part1 := "### 1\n\n\n### 2\n"
|
||||
part2 := `
|
||||
assert.Equal(t, part1, s1)
|
||||
assert.Empty(t, hb1)
|
||||
|
||||
s2, hb2 := simulateIMStreamFlushStep(hb1, part2, false)
|
||||
assert.Empty(t, s2, "incomplete middle image must not be sent on non-final flush")
|
||||
require.NotEmpty(t, hb2)
|
||||
assert.True(t, strings.HasPrefix(hb2, " tail`
|
||||
sent, rem := simulateIMStreamFlush([]string{one})
|
||||
require.Empty(t, rem)
|
||||
require.Len(t, sent, 1)
|
||||
assert.Equal(t, one, sent[0])
|
||||
}
|
||||
|
||||
func TestSimulateIMStreamFlush_BareProviderURLStillHeld(t *testing.T) {
|
||||
s1, hb1 := simulateIMStreamFlushStep("", "intro ", false)
|
||||
assert.Equal(t, "intro ", s1)
|
||||
|
||||
s2, hb2 := simulateIMStreamFlushStep(hb1, "minio://wizard-test/10000/exp", false)
|
||||
assert.Empty(t, s2)
|
||||
require.NotEmpty(t, hb2)
|
||||
assert.True(t, strings.HasPrefix(hb2, "minio://wizard-test/10000/exp"))
|
||||
}
|
||||
|
||||
// TestCleanIMContent_AfterStreamReassembly feeds its fixture through
// simulateIMStreamFlush, joins the sent chunks back together, runs the result
// through cleanIMContent with a tenant whose default storage provider is COS,
// and expects the output to contain a "/api/v1/files/presigned" path and no
// remaining "local://10000" reference.
func TestCleanIMContent_AfterStreamReassembly(t *testing.T) {
|
||||
t.Setenv("SYSTEM_AES_KEY", "weknora-test-aes-key-32bytes!!!")
|
||||
t.Setenv("APP_EXTERNAL_URL", "https://weknora.example.com")
|
||||
|
||||
tenant := &types.Tenant{
|
||||
StorageEngineConfig: &types.StorageEngineConfig{
|
||||
DefaultProvider: "cos",
|
||||
COS: &types.COSEngineConfig{
|
||||
SecretID: "id",
|
||||
|
||||
SecretKey: "key",
|
||||
BucketName: "bucket",
|
||||
|
||||
Region: "ap-shanghai",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// NOTE(review): the fixture below is an empty raw string in this copy, which
// cannot satisfy the assertions that follow. Presumably the original held
// Markdown image text with a local:// URL that was lost when this page was
// extracted — restore it from the upstream repository before relying on this test.
parts := []string{
|
||||
``,
|
||||
}
|
||||
sent, rem := simulateIMStreamFlush(parts)
|
||||
// Nothing may remain held back once the last part has been flushed.
require.Empty(t, rem)
|
||||
joined := strings.Join(sent, "")
|
||||
out := cleanIMContent(context.Background(), joined, tenant)
|
||||
assert.Contains(t, out, "/api/v1/files/presigned")
|
||||
assert.NotContains(t, out, "local://10000")
|
||||
}
|
||||
|
||||
// TestRewriteStorageURLs_MultipleImagesInOneChunk passes one document to
// rewriteStorageURLs with a tenant whose default storage provider is COS and
// expects the output to contain no "local://" text and exactly three
// occurrences of "/api/v1/files/presigned".
func TestRewriteStorageURLs_MultipleImagesInOneChunk(t *testing.T) {
|
||||
t.Setenv("SYSTEM_AES_KEY", "weknora-test-aes-key-32bytes!!!")
|
||||
t.Setenv("APP_EXTERNAL_URL", "https://weknora.example.com")
|
||||
|
||||
tenant := &types.Tenant{
|
||||
StorageEngineConfig: &types.StorageEngineConfig{DefaultProvider: "cos"},
|
||||
}
|
||||
|
||||
// NOTE(review): as shown here the document contains only headings and
// newlines. Presumably three Markdown images with local:// URLs (one per
// section) were stripped when this page was extracted — restore them from
// the upstream repository; without them the count assertion below cannot hold.
doc := "### 1\n\n\n\n### 2\n\n" +
|
||||
`` + "\n\n### 3\n\n" +
|
||||
"\n"
|
||||
|
||||
out := rewriteStorageURLs(context.Background(), doc, tenant)
|
||||
assert.NotContains(t, out, "local://")
|
||||
assert.Equal(t, 3, strings.Count(out, "/api/v1/files/presigned"))
|
||||
}
|
||||
|
||||
@@ -135,6 +135,22 @@ func findIncompleteStorageURL(s string) int {
|
||||
return loc[0]
|
||||
}
|
||||
|
||||
// incompleteMarkdownImageSuffixRe matches a Markdown image whose destination URL
|
||||
// (the parenthesized part) is not yet closed — e.g. "]*$`)
|
||||
|
||||
// findIncompleteMarkdownImage returns the byte offset of an unclosed  int {
|
||||
loc := incompleteMarkdownImageSuffixRe.FindStringIndex(s)
|
||||
if loc == nil {
|
||||
return -1
|
||||
}
|
||||
return loc[0]
|
||||
}
|
||||
|
||||
// incompleteXMLTagRe matches the opening of an <image…>, <kb…>, or <web…> tag
|
||||
// that reaches the end of the string without a closing '>'.
|
||||
var incompleteXMLTagRe = regexp.MustCompile(
|
||||
@@ -155,7 +171,9 @@ func findIncompleteXMLTag(s string) int {
|
||||
// chunk, or len(chunk) if the chunk is safe to flush entirely.
|
||||
func holdbackCutoff(chunk string) int {
|
||||
cutoff := len(chunk)
|
||||
if idx := findIncompleteStorageURL(chunk); idx >= 0 && idx < cutoff {
|
||||
if idx := findIncompleteMarkdownImage(chunk); idx >= 0 && idx < cutoff {
|
||||
cutoff = idx
|
||||
} else if idx := findIncompleteStorageURL(chunk); idx >= 0 && idx < cutoff {
|
||||
cutoff = idx
|
||||
}
|
||||
if idx := findIncompleteXMLTag(chunk); idx >= 0 && idx < cutoff {
|
||||
|
||||
Reference in New Issue
Block a user