From 6210f44fb618b7edff3f2fbedd85584b14867c80 Mon Sep 17 00:00:00 2001 From: MidnightSun Date: Tue, 19 May 2026 11:21:50 +0800 Subject: [PATCH] fix(docparser): preserve MinerU markdown and persist relative images

MinerU already returns markdown with embedded HTML blocks, but the current
reader runs the whole document back through html-to-markdown. That
second conversion escapes valid headings and image syntax, so chunk
profiling sees plain text instead of markdown structure and relative
image references stop matching the storage pipeline.

Keep MinerU output in its original markdown form and only apply narrow
compatibility normalization for the specific over-escaped patterns we
actually need to recover. The converter now matches image refs by the
paths that are really present in markdown or embedded HTML instead of
assuming a single images/ form.

Extend ImageResolver so relative HTML references share the
same storage rewrite path as markdown images, deduplicate repeated saves,
and keep the frontend sanitizer compatible with MinerU's details/summary
blocks. Add focused docparser tests that cover escaped markdown repair,
variant image path matching, and relative HTML image persistence.
--- frontend/src/utils/security.ts | 4 +- .../docparser/image_resolver.go | 140 +++++++++++++----- .../image_resolver_relative_html_test.go | 43 ++++++ .../docparser/mineru_converter.go | 111 ++++++++++---- .../docparser/mineru_converter_test.go | 66 +++++++++ 5 files changed, 298 insertions(+), 66 deletions(-) create mode 100644 internal/infrastructure/docparser/image_resolver_relative_html_test.go create mode 100644 internal/infrastructure/docparser/mineru_converter_test.go diff --git a/frontend/src/utils/security.ts b/frontend/src/utils/security.ts index 3d495b36..14025f87 100644 --- a/frontend/src/utils/security.ts +++ b/frontend/src/utils/security.ts @@ -14,7 +14,7 @@ const DOMPurifyConfig = { 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'pre', 'code', 'a', 'img', 'table', 'thead', 'tbody', 'tr', 'th', 'td', - 'div', 'span', 'figure', 'figcaption', 'think', + 'div', 'span', 'figure', 'figcaption', 'details', 'summary', 'think', // Mermaid SVG 支持的标签 'svg', 'g', 'path', 'rect', 'circle', 'ellipse', 'line', 'polygon', 'polyline', 'text', 'tspan', 'defs', 'marker', 'filter', 'use', @@ -26,7 +26,7 @@ const DOMPurifyConfig = { // 允许的属性 ALLOWED_ATTR: [ 'href', 'title', 'alt', 'src', 'class', 'id', 'style', 'data-protected-src', - 'target', 'rel', 'width', 'height', + 'target', 'rel', 'width', 'height', 'open', // Mermaid SVG 支持的属性 'd', 'fill', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-miterlimit', 'stroke-opacity', diff --git a/internal/infrastructure/docparser/image_resolver.go b/internal/infrastructure/docparser/image_resolver.go index fe84e9b4..d35bebde 100644 --- a/internal/infrastructure/docparser/image_resolver.go +++ b/internal/infrastructure/docparser/image_resolver.go @@ -80,7 +80,7 @@ func (r *ImageResolver) ResolveAndStore( fileSvc interfaces.FileService, tenantID uint64, ) (updatedMarkdown string, images []StoredImage, err error) { - markdown := 
UnwrapLinkedImages(result.MarkdownContent) + markdown := UnwrapLinkedImages(normalizeMinerUMarkdown(result.MarkdownContent)) md2, imgDataURIs, _ := r.ResolveDataURIImages(ctx, markdown, fileSvc, tenantID) markdown = md2 images = append(images, imgDataURIs...) @@ -102,6 +102,7 @@ func (r *ImageResolver) ResolveAndStore( for _, ref := range result.ImageRefs { refMap[ref.OriginalRef] = ref } + savedRefs := make(map[string]StoredImage) // Process each image reference found in the markdown. // The URL group supports one level of balanced parentheses so that URLs @@ -123,51 +124,77 @@ func (r *ImageResolver) ResolveAndStore( } // Find inline image bytes from the result - ref, found := refMap[refPath] - if !found || len(ref.ImageData) == 0 { + stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, refPath, refMap, savedRefs) + if !ok { continue } - - // Filter out small icons and decorative images. Skip the filter - // when the reference is the originally uploaded file itself, so - // that a standalone image upload is never silently dropped even - // if its dimensions are below the icon threshold. 
- if !ref.IsOriginal && isIconImage(ref.ImageData) { - // Remove the image reference from markdown entirely - markdown = markdown[:m[0]] + markdown[m[1]:] - continue - } - - // Determine extension - ext := extFromMime(ref.MimeType) - if ext == "" { - ext = filepath.Ext(ref.Filename) - } - if ext == "" { - ext = ".png" - } - - // Save via FileService — returns provider:// path - fileName := uuid.New().String() + ext - servingURL, saveErr := fileSvc.SaveBytes(ctx, ref.ImageData, tenantID, fileName, false) - if saveErr != nil { - log.Printf("WARN: failed to save image %s: %v", refPath, saveErr) - continue - } - - images = append(images, StoredImage{ - OriginalRef: refPath, - ServingURL: servingURL, - MimeType: ref.MimeType, - }) + images = appendStoredImage(images, stored) // Replace in markdown - markdown = markdown[:m[4]] + servingURL + markdown[m[5]:] + markdown = markdown[:m[4]] + stored.ServingURL + markdown[m[5]:] } + md5, imgRelativeHTML, _ := r.ResolveRelativeHTMLImages(ctx, markdown, fileSvc, tenantID, refMap, savedRefs) + markdown = md5 + images = append(images, imgRelativeHTML...) 
+ return markdown, images, nil } +func appendStoredImage(images []StoredImage, stored StoredImage) []StoredImage { + for _, existing := range images { + if existing.OriginalRef == stored.OriginalRef && existing.ServingURL == stored.ServingURL { + return images + } + } + return append(images, stored) +} + +func (r *ImageResolver) saveReferencedImage( + ctx context.Context, + fileSvc interfaces.FileService, + tenantID uint64, + refPath string, + refMap map[string]types.ImageRef, + savedRefs map[string]StoredImage, +) (StoredImage, bool) { + if stored, ok := savedRefs[refPath]; ok { + return stored, true + } + + ref, found := refMap[refPath] + if !found || len(ref.ImageData) == 0 { + return StoredImage{}, false + } + + if !ref.IsOriginal && isIconImage(ref.ImageData) { + return StoredImage{}, false + } + + ext := extFromMime(ref.MimeType) + if ext == "" { + ext = filepath.Ext(ref.Filename) + } + if ext == "" { + ext = ".png" + } + + fileName := uuid.New().String() + ext + servingURL, saveErr := fileSvc.SaveBytes(ctx, ref.ImageData, tenantID, fileName, false) + if saveErr != nil { + log.Printf("WARN: failed to save image %s: %v", refPath, saveErr) + return StoredImage{}, false + } + + stored := StoredImage{ + OriginalRef: refPath, + ServingURL: servingURL, + MimeType: ref.MimeType, + } + savedRefs[refPath] = stored + return stored, true +} + func extFromMime(mime string) string { switch mime { case "image/png": @@ -287,6 +314,10 @@ var imgHTMLDataURI = regexp.MustCompile( `(?i)]*?src\s*=\s*["'](data:image/[^;]+;base64,[^"']+)["'][^>]*?/?\s*>`, ) +var imgHTMLRelativeSrc = regexp.MustCompile( + `(?i)]*?)\bsrc\s*=\s*['"]([^'"]+)['"]([^>]*)>`, +) + // ResolveHTMLDataURIImages finds tags in markdown, // decodes the images, stores them via fileSvc, and replaces each tag with a markdown // image reference using the storage URL. 
@@ -349,6 +380,41 @@ func (r *ImageResolver) ResolveHTMLDataURIImages( return markdown, images, nil } +// ResolveRelativeHTMLImages finds HTML tags whose src points at a +// relative document image reference, stores the corresponding bytes via +// fileSvc, and replaces only the src attribute value with the storage URL. +func (r *ImageResolver) ResolveRelativeHTMLImages( + ctx context.Context, + markdown string, + fileSvc interfaces.FileService, + tenantID uint64, + refMap map[string]types.ImageRef, + savedRefs map[string]StoredImage, +) (updatedMarkdown string, images []StoredImage, err error) { + matches := imgHTMLRelativeSrc.FindAllStringSubmatchIndex(markdown, -1) + if len(matches) == 0 { + return markdown, nil, nil + } + + for i := len(matches) - 1; i >= 0; i-- { + m := matches[i] + src := strings.TrimSpace(markdown[m[4]:m[5]]) + if src == "" || strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://") || + isProviderScheme(src) || strings.HasPrefix(strings.ToLower(src), "data:image/") { + continue + } + + stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, src, refMap, savedRefs) + if !ok { + continue + } + images = appendStoredImage(images, stored) + markdown = markdown[:m[4]] + stored.ServingURL + markdown[m[5]:] + } + + return markdown, images, nil +} + // --------------------------------------------------------------------------- // Bare base64/data URI resolution (catch-all) // --------------------------------------------------------------------------- diff --git a/internal/infrastructure/docparser/image_resolver_relative_html_test.go b/internal/infrastructure/docparser/image_resolver_relative_html_test.go new file mode 100644 index 00000000..ccd6307f --- /dev/null +++ b/internal/infrastructure/docparser/image_resolver_relative_html_test.go @@ -0,0 +1,43 @@ +package docparser + +import ( + "context" + "strings" + "testing" + + "github.com/Tencent/WeKnora/internal/types" +) + +func TestResolveAndStoreRelativeHTMLImages(t *testing.T) { 
+ png := createTestPNG(200, 150) + result := &types.ReadResult{ + MarkdownContent: `
profile
`, + ImageRefs: []types.ImageRef{ + { + Filename: "profile.jpg", + OriginalRef: "images/profile.jpg", + MimeType: "image/png", + ImageData: png, + }, + }, + } + + svc := &captureSaveBytes{} + r := NewImageResolver() + out, imgs, err := r.ResolveAndStore(context.Background(), result, svc, 1) + if err != nil { + t.Fatal(err) + } + if len(imgs) != 1 { + t.Fatalf("expected 1 stored image but got %d", len(imgs)) + } + if len(svc.saved) != 1 { + t.Fatalf("expected SaveBytes to be called once but got %d", len(svc.saved)) + } + if strings.Contains(out, `src="images/profile.jpg"`) { + t.Fatalf("expected relative html img src to be replaced, got: %s", out) + } + if !strings.Contains(out, `src="local://test/`) { + t.Fatalf("expected stored file url in html img src, got: %s", out) + } +} diff --git a/internal/infrastructure/docparser/mineru_converter.go b/internal/infrastructure/docparser/mineru_converter.go index 3dd19c37..b2066072 100644 --- a/internal/infrastructure/docparser/mineru_converter.go +++ b/internal/infrastructure/docparser/mineru_converter.go @@ -10,12 +10,12 @@ import ( "mime" "mime/multipart" "net/http" + "net/url" "path/filepath" "regexp" "strings" "time" - htmltomd "github.com/JohannesKaufmann/html-to-markdown/v2" "github.com/Tencent/WeKnora/internal/logger" "github.com/Tencent/WeKnora/internal/types" ) @@ -66,13 +66,10 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types return nil, fmt.Errorf("MinerU file_parse: %w", err) } - // HTML -> Markdown conversion (equivalent to Python markdownify). - // MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. ), - // but html-to-markdown sees the whole string as HTML and escapes Markdown special - // chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`, - // which then breaks downstream image extraction. Unescape those after conversion. 
- mdContent = htmlToMarkdown(mdContent) - mdContent = unescapeMarkdownImageSyntax(mdContent) + // MinerU already returns markdown with embedded HTML blocks (e.g.
,
). + // Re-running the whole document through html-to-markdown corrupts valid markdown + // by escaping headings and image syntax. Only apply narrow compatibility fixes. + mdContent = normalizeMinerUMarkdown(mdContent) // Process images: decode base64, build ImageRef list, replace refs in markdown imageRefs, mdContent := c.processImages(mdContent, imagesB64) @@ -208,8 +205,8 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri var refs []types.ImageRef for ipath, b64Str := range imagesB64 { - originalRef := "images/" + ipath - if !strings.Contains(mdContent, originalRef) { + matchedRefs := mineruImageOriginalRefs(mdContent, ipath) + if len(matchedRefs) == 0 { continue } @@ -243,12 +240,14 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri mimeType = "image/png" } - refs = append(refs, types.ImageRef{ - Filename: ipath, - OriginalRef: originalRef, - MimeType: mimeType, - ImageData: imgBytes, - }) + for _, originalRef := range matchedRefs { + refs = append(refs, types.ImageRef{ + Filename: ipath, + OriginalRef: originalRef, + MimeType: mimeType, + ImageData: imgBytes, + }) + } } return refs, mdContent @@ -277,26 +276,84 @@ func PingMinerU(endpoint string) (bool, string) { return true, "" } -// htmlToMarkdown converts HTML content to markdown. -// Falls back to the original content if conversion fails. -func htmlToMarkdown(content string) string { - md, err := htmltomd.ConvertString(content) - if err != nil { - logger.Errorf(context.Background(), "[MinerU] html-to-markdown conversion failed, using raw content: %v", err) - return content - } - return md -} - // escapedImageSyntaxPattern matches markdown image references whose `[` was // over-escaped to `\[` by html-to-markdown. The URL group mirrors the // downstream image-extraction regex so escapes are only stripped for actual // image references. 
var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`) +// escapedHeadingPattern restores markdown headings that were over-escaped to +// \# Heading. We only touch line-leading heading markers to avoid rewriting +// body text that intentionally contains escaped # characters. +var escapedHeadingPattern = regexp.MustCompile(`(?m)^\\(#{1,6})(\s+)`) + // unescapeMarkdownImageSyntax restores `![alt](url)` from html-to-markdown's // over-escaped `!\[alt\](url)` form. Without this, the downstream image regex // in ImageResolver fails to match and images are never persisted. func unescapeMarkdownImageSyntax(content string) string { return escapedImageSyntaxPattern.ReplaceAllString(content, "![$1]($2)") } + +func normalizeEscapedMarkdownHeadings(content string) string { + return escapedHeadingPattern.ReplaceAllString(content, `$1$2`) +} + +func normalizeMinerUMarkdown(content string) string { + content = unescapeMarkdownImageSyntax(content) + content = normalizeEscapedMarkdownHeadings(content) + return content +} + +func mineruImageOriginalRefs(mdContent, imagePath string) []string { + normalizedTarget := normalizeMinerUImagePath(imagePath) + if normalizedTarget == "" { + return nil + } + + referenced := extractImageRefsFromContent(mdContent) + seen := make(map[string]struct{}, len(referenced)) + var matched []string + for _, refPath := range referenced { + if normalizeMinerUImagePath(refPath) != normalizedTarget { + continue + } + if _, ok := seen[refPath]; ok { + continue + } + matched = append(matched, refPath) + seen[refPath] = struct{}{} + } + + return matched +} + +func extractImageRefsFromContent(content string) []string { + var refs []string + + for _, match := range imgMarkdownPattern.FindAllStringSubmatch(content, -1) { + if len(match) >= 3 { + refs = append(refs, match[2]) + } + } + for _, match := range imgHTMLRelativeSrc.FindAllStringSubmatch(content, -1) { + if len(match) >= 3 { + refs = append(refs, match[2]) 
+ } + } + + return refs +} + +func normalizeMinerUImagePath(p string) string { + p = strings.TrimSpace(p) + if p == "" { + return "" + } + if decoded, err := url.PathUnescape(p); err == nil && decoded != "" { + p = decoded + } + p = strings.TrimPrefix(p, "./") + p = strings.TrimPrefix(p, "/") + p = strings.TrimPrefix(p, "images/") + return p +} diff --git a/internal/infrastructure/docparser/mineru_converter_test.go b/internal/infrastructure/docparser/mineru_converter_test.go new file mode 100644 index 00000000..36a89453 --- /dev/null +++ b/internal/infrastructure/docparser/mineru_converter_test.go @@ -0,0 +1,66 @@ +package docparser + +import ( + "encoding/base64" + "strings" + "testing" +) + +func TestNormalizeMinerUMarkdownPreservesMarkdownAndHTML(t *testing.T) { + input := strings.Join([]string{ + "# Heading", + "", + "![](images/cover.jpg)", + "", + `
text_imagecaption
`, + "", + `
`, + }, "\n") + + got := normalizeMinerUMarkdown(input) + + if !strings.Contains(got, "# Heading") { + t.Fatalf("expected heading to stay intact, got: %q", got) + } + if strings.Contains(got, `\# Heading`) { + t.Fatalf("expected heading to avoid escaped form, got: %q", got) + } + if !strings.Contains(got, "![](images/cover.jpg)") { + t.Fatalf("expected markdown image syntax to stay intact, got: %q", got) + } + if strings.Contains(got, `!\[](images/cover.jpg)`) { + t.Fatalf("expected markdown image syntax to avoid escaped form, got: %q", got) + } + if !strings.Contains(got, `
text_imagecaption
`) { + t.Fatalf("expected details/summary block to be preserved, got: %q", got) + } + if !strings.Contains(got, ``) { + t.Fatalf("expected html img tag to be preserved, got: %q", got) + } +} + +func TestProcessImagesKeepsReferencedVariants(t *testing.T) { + reader := &MinerUReader{} + mdContent := strings.Join([]string{ + "![](images/cover.jpg)", + ``, + `![](plain.jpg)`, + }, "\n") + + png := createTestPNG(200, 150) + b64 := base64.StdEncoding.EncodeToString(png) + images := map[string]string{ + "cover.jpg": "data:image/png;base64," + b64, + "profile.jpg": "data:image/png;base64," + b64, + "plain.jpg": "data:image/png;base64," + b64, + } + + refs, gotMarkdown := reader.processImages(mdContent, images) + + if gotMarkdown != mdContent { + t.Fatalf("processImages should not rewrite markdown content") + } + if len(refs) != 3 { + t.Fatalf("expected 3 image refs, got %d", len(refs)) + } +}