fix(docparser): preserve MinerU markdown and persist relative images

MinerU already returns markdown with embedded HTML blocks, but the current
reader runs the whole document back through html-to-markdown. That
second conversion escapes valid headings and image syntax, so chunk
profiling sees plain text instead of markdown structure and relative
image references stop matching the storage pipeline.

Keep MinerU output in its original markdown form and only apply narrow
compatibility normalization for the specific over-escaped patterns we
actually need to recover. The converter now matches image refs by the
paths that are really present in markdown or embedded HTML instead of
assuming a single images/<name> form.

Extend ImageResolver so relative HTML <img src=...> references share the
same storage rewrite path as markdown images, deduplicate repeated saves,
and keep the frontend sanitizer compatible with MinerU's details/summary
blocks. Add focused docparser tests that cover escaped markdown repair,
variant image path matching, and relative HTML image persistence.
This commit is contained in:
MidnightSun
2026-05-19 11:21:50 +08:00
committed by lyingbug
parent 9f6148784a
commit 6210f44fb6
5 changed files with 298 additions and 66 deletions

View File

@@ -14,7 +14,7 @@ const DOMPurifyConfig = {
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'ul', 'ol', 'li', 'blockquote', 'pre', 'code',
'a', 'img', 'table', 'thead', 'tbody', 'tr', 'th', 'td',
'div', 'span', 'figure', 'figcaption', 'think',
'div', 'span', 'figure', 'figcaption', 'details', 'summary', 'think',
// Mermaid SVG 支持的标签
'svg', 'g', 'path', 'rect', 'circle', 'ellipse', 'line', 'polygon',
'polyline', 'text', 'tspan', 'defs', 'marker', 'filter', 'use',
@@ -26,7 +26,7 @@ const DOMPurifyConfig = {
// 允许的属性
ALLOWED_ATTR: [
'href', 'title', 'alt', 'src', 'class', 'id', 'style', 'data-protected-src',
'target', 'rel', 'width', 'height',
'target', 'rel', 'width', 'height', 'open',
// Mermaid SVG 支持的属性
'd', 'fill', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-miterlimit', 'stroke-opacity',

View File

@@ -80,7 +80,7 @@ func (r *ImageResolver) ResolveAndStore(
fileSvc interfaces.FileService,
tenantID uint64,
) (updatedMarkdown string, images []StoredImage, err error) {
markdown := UnwrapLinkedImages(result.MarkdownContent)
markdown := UnwrapLinkedImages(normalizeMinerUMarkdown(result.MarkdownContent))
md2, imgDataURIs, _ := r.ResolveDataURIImages(ctx, markdown, fileSvc, tenantID)
markdown = md2
images = append(images, imgDataURIs...)
@@ -102,6 +102,7 @@ func (r *ImageResolver) ResolveAndStore(
for _, ref := range result.ImageRefs {
refMap[ref.OriginalRef] = ref
}
savedRefs := make(map[string]StoredImage)
// Process each image reference found in the markdown.
// The URL group supports one level of balanced parentheses so that URLs
@@ -123,51 +124,77 @@ func (r *ImageResolver) ResolveAndStore(
}
// Find inline image bytes from the result
ref, found := refMap[refPath]
if !found || len(ref.ImageData) == 0 {
stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, refPath, refMap, savedRefs)
if !ok {
continue
}
// Filter out small icons and decorative images. Skip the filter
// when the reference is the originally uploaded file itself, so
// that a standalone image upload is never silently dropped even
// if its dimensions are below the icon threshold.
if !ref.IsOriginal && isIconImage(ref.ImageData) {
// Remove the image reference from markdown entirely
markdown = markdown[:m[0]] + markdown[m[1]:]
continue
}
// Determine extension
ext := extFromMime(ref.MimeType)
if ext == "" {
ext = filepath.Ext(ref.Filename)
}
if ext == "" {
ext = ".png"
}
// Save via FileService — returns provider:// path
fileName := uuid.New().String() + ext
servingURL, saveErr := fileSvc.SaveBytes(ctx, ref.ImageData, tenantID, fileName, false)
if saveErr != nil {
log.Printf("WARN: failed to save image %s: %v", refPath, saveErr)
continue
}
images = append(images, StoredImage{
OriginalRef: refPath,
ServingURL: servingURL,
MimeType: ref.MimeType,
})
images = appendStoredImage(images, stored)
// Replace in markdown
markdown = markdown[:m[4]] + servingURL + markdown[m[5]:]
markdown = markdown[:m[4]] + stored.ServingURL + markdown[m[5]:]
}
md5, imgRelativeHTML, _ := r.ResolveRelativeHTMLImages(ctx, markdown, fileSvc, tenantID, refMap, savedRefs)
markdown = md5
images = append(images, imgRelativeHTML...)
return markdown, images, nil
}
// appendStoredImage returns images with stored appended, unless an entry with
// the same OriginalRef and ServingURL is already present, in which case the
// slice is returned unchanged. Other fields are not part of the comparison.
func appendStoredImage(images []StoredImage, stored StoredImage) []StoredImage {
	for i := range images {
		sameRef := images[i].OriginalRef == stored.OriginalRef
		sameURL := images[i].ServingURL == stored.ServingURL
		if sameRef && sameURL {
			return images
		}
	}
	return append(images, stored)
}
// saveReferencedImage stores the image bytes registered in refMap under
// refPath and returns the resulting StoredImage.
//
// savedRefs acts as a per-call cache: a refPath that was already stored is
// returned from the cache without calling fileSvc again, and every successful
// save is recorded in it.
//
// The boolean is false (with a zero StoredImage) when refPath is unknown, has
// no image data, is rejected by isIconImage while not being the original
// upload, or when SaveBytes fails. A save failure is logged, not returned.
func (r *ImageResolver) saveReferencedImage(
	ctx context.Context,
	fileSvc interfaces.FileService,
	tenantID uint64,
	refPath string,
	refMap map[string]types.ImageRef,
	savedRefs map[string]StoredImage,
) (StoredImage, bool) {
	if cached, hit := savedRefs[refPath]; hit {
		return cached, true
	}

	imgRef, known := refMap[refPath]
	switch {
	case !known, len(imgRef.ImageData) == 0:
		return StoredImage{}, false
	case !imgRef.IsOriginal && isIconImage(imgRef.ImageData):
		// The icon filter is skipped for the originally uploaded file.
		return StoredImage{}, false
	}

	// Extension preference: MIME type, then the filename, then ".png".
	ext := extFromMime(imgRef.MimeType)
	if ext == "" {
		if ext = filepath.Ext(imgRef.Filename); ext == "" {
			ext = ".png"
		}
	}

	storedName := uuid.New().String() + ext
	servingURL, err := fileSvc.SaveBytes(ctx, imgRef.ImageData, tenantID, storedName, false)
	if err != nil {
		log.Printf("WARN: failed to save image %s: %v", refPath, err)
		return StoredImage{}, false
	}

	result := StoredImage{
		OriginalRef: refPath,
		ServingURL:  servingURL,
		MimeType:    imgRef.MimeType,
	}
	savedRefs[refPath] = result
	return result, true
}
func extFromMime(mime string) string {
switch mime {
case "image/png":
@@ -287,6 +314,10 @@ var imgHTMLDataURI = regexp.MustCompile(
`(?i)<img\s[^>]*?src\s*=\s*["'](data:image/[^;]+;base64,[^"']+)["'][^>]*?/?\s*>`,
)
// imgHTMLRelativeSrc matches an HTML <img> tag and captures:
//
//	group 1: everything between "<img" and the src attribute name
//	group 2: the quoted src attribute value
//	group 3: the remainder of the tag up to (not including) ">"
//
// The src attribute name must be preceded by whitespace. A plain word boundary
// is not enough: it also sits after "-", so attributes such as data-src or
// data-protected-src would be mistaken for src and their value captured
// instead of the real one.
var imgHTMLRelativeSrc = regexp.MustCompile(
	`(?i)<img\b([^>]*?\s)src\s*=\s*['"]([^'"]+)['"]([^>]*)>`,
)
// ResolveHTMLDataURIImages finds <img src="data:image/*;base64,..."> tags in markdown,
// decodes the images, stores them via fileSvc, and replaces each tag with a markdown
// image reference using the storage URL.
@@ -349,6 +380,41 @@ func (r *ImageResolver) ResolveHTMLDataURIImages(
return markdown, images, nil
}
// ResolveRelativeHTMLImages rewrites HTML <img> tags whose src is a document
// image reference known to refMap. For each such tag the image is stored via
// saveReferencedImage and only the src attribute value is replaced with the
// returned serving URL; the rest of the tag is left untouched.
//
// Empty sources, http:// and https:// URLs, provider-scheme URLs and
// data:image/ URIs are skipped, as are sources that cannot be stored. The
// returned images slice holds each stored image once. The error result is
// always nil.
func (r *ImageResolver) ResolveRelativeHTMLImages(
	ctx context.Context,
	markdown string,
	fileSvc interfaces.FileService,
	tenantID uint64,
	refMap map[string]types.ImageRef,
	savedRefs map[string]StoredImage,
) (updatedMarkdown string, images []StoredImage, err error) {
	locs := imgHTMLRelativeSrc.FindAllStringSubmatchIndex(markdown, -1)
	if len(locs) == 0 {
		return markdown, nil, nil
	}

	// notRelative reports whether src should be left alone.
	notRelative := func(src string) bool {
		switch {
		case src == "":
			return true
		case strings.HasPrefix(src, "http://"), strings.HasPrefix(src, "https://"):
			return true
		case isProviderScheme(src):
			return true
		default:
			return strings.HasPrefix(strings.ToLower(src), "data:image/")
		}
	}

	// Walk the matches back to front so that earlier byte offsets stay valid
	// after each in-place replacement.
	for idx := len(locs) - 1; idx >= 0; idx-- {
		srcStart, srcEnd := locs[idx][4], locs[idx][5]
		src := strings.TrimSpace(markdown[srcStart:srcEnd])
		if notRelative(src) {
			continue
		}

		stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, src, refMap, savedRefs)
		if !ok {
			continue
		}

		images = appendStoredImage(images, stored)
		markdown = markdown[:srcStart] + stored.ServingURL + markdown[srcEnd:]
	}

	return markdown, images, nil
}
// ---------------------------------------------------------------------------
// Bare base64/data URI resolution (catch-all)
// ---------------------------------------------------------------------------

View File

@@ -0,0 +1,43 @@
package docparser
import (
"context"
"strings"
"testing"
"github.com/Tencent/WeKnora/internal/types"
)
func TestResolveAndStoreRelativeHTMLImages(t *testing.T) {
png := createTestPNG(200, 150)
result := &types.ReadResult{
MarkdownContent: `<table><tr><td><img src="images/profile.jpg" alt="profile"/></td></tr></table>`,
ImageRefs: []types.ImageRef{
{
Filename: "profile.jpg",
OriginalRef: "images/profile.jpg",
MimeType: "image/png",
ImageData: png,
},
},
}
svc := &captureSaveBytes{}
r := NewImageResolver()
out, imgs, err := r.ResolveAndStore(context.Background(), result, svc, 1)
if err != nil {
t.Fatal(err)
}
if len(imgs) != 1 {
t.Fatalf("expected 1 stored image but got %d", len(imgs))
}
if len(svc.saved) != 1 {
t.Fatalf("expected SaveBytes to be called once but got %d", len(svc.saved))
}
if strings.Contains(out, `src="images/profile.jpg"`) {
t.Fatalf("expected relative html img src to be replaced, got: %s", out)
}
if !strings.Contains(out, `src="local://test/`) {
t.Fatalf("expected stored file url in html img src, got: %s", out)
}
}

View File

@@ -10,12 +10,12 @@ import (
"mime"
"mime/multipart"
"net/http"
"net/url"
"path/filepath"
"regexp"
"strings"
"time"
htmltomd "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types"
)
@@ -66,13 +66,10 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types
return nil, fmt.Errorf("MinerU file_parse: %w", err)
}
// HTML -> Markdown conversion (equivalent to Python markdownify).
// MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>),
// but html-to-markdown sees the whole string as HTML and escapes Markdown special
// chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`,
// which then breaks downstream image extraction. Unescape those after conversion.
mdContent = htmlToMarkdown(mdContent)
mdContent = unescapeMarkdownImageSyntax(mdContent)
// MinerU already returns markdown with embedded HTML blocks (e.g. <table>, <details>).
// Re-running the whole document through html-to-markdown corrupts valid markdown
// by escaping headings and image syntax. Only apply narrow compatibility fixes.
mdContent = normalizeMinerUMarkdown(mdContent)
// Process images: decode base64, build ImageRef list, replace refs in markdown
imageRefs, mdContent := c.processImages(mdContent, imagesB64)
@@ -208,8 +205,8 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri
var refs []types.ImageRef
for ipath, b64Str := range imagesB64 {
originalRef := "images/" + ipath
if !strings.Contains(mdContent, originalRef) {
matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
if len(matchedRefs) == 0 {
continue
}
@@ -243,12 +240,14 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri
mimeType = "image/png"
}
refs = append(refs, types.ImageRef{
Filename: ipath,
OriginalRef: originalRef,
MimeType: mimeType,
ImageData: imgBytes,
})
for _, originalRef := range matchedRefs {
refs = append(refs, types.ImageRef{
Filename: ipath,
OriginalRef: originalRef,
MimeType: mimeType,
ImageData: imgBytes,
})
}
}
return refs, mdContent
@@ -277,26 +276,84 @@ func PingMinerU(endpoint string) (bool, string) {
return true, ""
}
// htmlToMarkdown runs content through the html-to-markdown converter and
// returns the converted text. If the converter reports an error, the error is
// logged and the input is returned unchanged.
func htmlToMarkdown(content string) string {
	converted, convErr := htmltomd.ConvertString(content)
	if convErr == nil {
		return converted
	}
	logger.Errorf(context.Background(), "[MinerU] html-to-markdown conversion failed, using raw content: %v", convErr)
	return content
}
var (
	// escapedImageSyntaxPattern matches a markdown image whose opening bracket
	// is escaped as `\[` (the closing bracket may or may not be escaped).
	// Group 1 is the alt text and group 2 the URL; the URL may contain one
	// level of balanced parentheses.
	escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)

	// escapedHeadingPattern matches a backslash-escaped heading marker
	// (`\#` through `\######`) at the start of a line, followed by whitespace.
	// Escaped `#` characters elsewhere in a line are not matched.
	escapedHeadingPattern = regexp.MustCompile(`(?m)^\\(#{1,6})(\s+)`)
)

// unescapeMarkdownImageSyntax turns over-escaped image references such as
// `!\[alt\](url)` back into `![alt](url)` so that image extraction regexes
// that expect plain markdown syntax can match them.
func unescapeMarkdownImageSyntax(content string) string {
	const restoredImage = "![$1]($2)"
	return escapedImageSyntaxPattern.ReplaceAllString(content, restoredImage)
}

// normalizeEscapedMarkdownHeadings strips the backslash from line-leading
// escaped heading markers, turning `\# Title` back into `# Title`.
func normalizeEscapedMarkdownHeadings(content string) string {
	const restoredHeading = `$1$2`
	return escapedHeadingPattern.ReplaceAllString(content, restoredHeading)
}

// normalizeMinerUMarkdown applies the narrow compatibility fixes for MinerU
// output: first image syntax is unescaped, then line-leading headings. All
// other content is returned as-is.
func normalizeMinerUMarkdown(content string) string {
	return normalizeEscapedMarkdownHeadings(unescapeMarkdownImageSyntax(content))
}
// mineruImageOriginalRefs returns the image references found in mdContent that
// point at imagePath once both sides are normalized with
// normalizeMinerUImagePath. Each distinct reference spelling is returned once,
// in the order extractImageRefsFromContent yields them. The result is nil when
// imagePath normalizes to the empty string or nothing matches.
func mineruImageOriginalRefs(mdContent, imagePath string) []string {
	target := normalizeMinerUImagePath(imagePath)
	if target == "" {
		return nil
	}

	candidates := extractImageRefsFromContent(mdContent)
	alreadyMatched := make(map[string]struct{}, len(candidates))

	var matched []string
	for _, candidate := range candidates {
		if _, dup := alreadyMatched[candidate]; dup {
			continue
		}
		if normalizeMinerUImagePath(candidate) != target {
			continue
		}
		alreadyMatched[candidate] = struct{}{}
		matched = append(matched, candidate)
	}
	return matched
}
// extractImageRefsFromContent collects the second capture group of every
// imgMarkdownPattern match in content, followed by the second capture group
// (the src value) of every imgHTMLRelativeSrc match. Duplicates are kept.
func extractImageRefsFromContent(content string) []string {
	markdownMatches := imgMarkdownPattern.FindAllStringSubmatch(content, -1)
	htmlMatches := imgHTMLRelativeSrc.FindAllStringSubmatch(content, -1)

	var refs []string
	for _, groups := range markdownMatches {
		if len(groups) < 3 {
			continue
		}
		refs = append(refs, groups[2])
	}
	for _, groups := range htmlMatches {
		if len(groups) < 3 {
			continue
		}
		refs = append(refs, groups[2])
	}
	return refs
}
// normalizeMinerUImagePath reduces an image reference to a comparable form:
// surrounding whitespace is trimmed, percent-encoding is decoded when it is
// valid and the decoded value is non-empty, and then one leading "./", one
// leading "/" and one leading "images/" are stripped, in that order.
// A blank input yields the empty string.
func normalizeMinerUImagePath(p string) string {
	cleaned := strings.TrimSpace(p)
	if cleaned == "" {
		return ""
	}

	if decoded, err := url.PathUnescape(cleaned); err == nil && decoded != "" {
		cleaned = decoded
	}

	// Order matters: "./images/x" must lose "./" before "images/" can match.
	for _, prefix := range []string{"./", "/", "images/"} {
		cleaned = strings.TrimPrefix(cleaned, prefix)
	}
	return cleaned
}

View File

@@ -0,0 +1,66 @@
package docparser
import (
"encoding/base64"
"strings"
"testing"
)
// TestNormalizeMinerUMarkdownPreservesMarkdownAndHTML feeds already-valid
// markdown with embedded HTML through normalizeMinerUMarkdown and checks that
// headings, image syntax, details/summary blocks and <img> tags come out
// intact and unescaped.
func TestNormalizeMinerUMarkdownPreservesMarkdownAndHTML(t *testing.T) {
	input := strings.Join([]string{
		"# Heading",
		"",
		"![](images/cover.jpg)",
		"",
		`<details><summary>text_image</summary>caption</details>`,
		"",
		`<table><tr><td><img src="images/profile.jpg"/></td></tr></table>`,
	}, "\n")

	got := normalizeMinerUMarkdown(input)

	// Each check fails when the needle's presence differs from mustContain.
	checks := []struct {
		needle      string
		mustContain bool
		failFormat  string
	}{
		{"# Heading", true, "expected heading to stay intact, got: %q"},
		{`\# Heading`, false, "expected heading to avoid escaped form, got: %q"},
		{"![](images/cover.jpg)", true, "expected markdown image syntax to stay intact, got: %q"},
		{`!\[](images/cover.jpg)`, false, "expected markdown image syntax to avoid escaped form, got: %q"},
		{`<details><summary>text_image</summary>caption</details>`, true, "expected details/summary block to be preserved, got: %q"},
		{`<img src="images/profile.jpg"/>`, true, "expected html img tag to be preserved, got: %q"},
	}
	for _, check := range checks {
		if strings.Contains(got, check.needle) != check.mustContain {
			t.Fatalf(check.failFormat, got)
		}
	}
}
func TestProcessImagesKeepsReferencedVariants(t *testing.T) {
reader := &MinerUReader{}
mdContent := strings.Join([]string{
"![](images/cover.jpg)",
`<img src="./images/profile.jpg"/>`,
`![](plain.jpg)`,
}, "\n")
png := createTestPNG(200, 150)
b64 := base64.StdEncoding.EncodeToString(png)
images := map[string]string{
"cover.jpg": "data:image/png;base64," + b64,
"profile.jpg": "data:image/png;base64," + b64,
"plain.jpg": "data:image/png;base64," + b64,
}
refs, gotMarkdown := reader.processImages(mdContent, images)
if gotMarkdown != mdContent {
t.Fatalf("processImages should not rewrite markdown content")
}
if len(refs) != 3 {
t.Fatalf("expected 3 image refs, got %d", len(refs))
}
}