mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
fix(docparser): preserve MinerU markdown and persist relative images
MinerU already returns markdown with embedded HTML blocks, but the current
reader runs the whole document back through html-to-markdown. That
second conversion escapes valid headings and image syntax, so chunk
profiling sees plain text instead of markdown structure and relative
image references stop matching the storage pipeline.

Keep MinerU output in its original markdown form and only apply narrow
compatibility normalization for the specific over-escaped patterns we
actually need to recover. The converter now matches image refs by the
paths that are really present in markdown or embedded HTML instead of
assuming a single images/<name> form.

Extend ImageResolver so relative HTML <img src=...> references share the
same storage rewrite path as markdown images, deduplicate repeated saves,
and keep the frontend sanitizer compatible with MinerU's details/summary
blocks. Add focused docparser tests that cover escaped markdown repair,
variant image path matching, and relative HTML image persistence.
This commit is contained in:
@@ -14,7 +14,7 @@ const DOMPurifyConfig = {
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'ul', 'ol', 'li', 'blockquote', 'pre', 'code',
|
||||
'a', 'img', 'table', 'thead', 'tbody', 'tr', 'th', 'td',
|
||||
'div', 'span', 'figure', 'figcaption', 'think',
|
||||
'div', 'span', 'figure', 'figcaption', 'details', 'summary', 'think',
|
||||
// Mermaid SVG 支持的标签
|
||||
'svg', 'g', 'path', 'rect', 'circle', 'ellipse', 'line', 'polygon',
|
||||
'polyline', 'text', 'tspan', 'defs', 'marker', 'filter', 'use',
|
||||
@@ -26,7 +26,7 @@ const DOMPurifyConfig = {
|
||||
// 允许的属性
|
||||
ALLOWED_ATTR: [
|
||||
'href', 'title', 'alt', 'src', 'class', 'id', 'style', 'data-protected-src',
|
||||
'target', 'rel', 'width', 'height',
|
||||
'target', 'rel', 'width', 'height', 'open',
|
||||
// Mermaid SVG 支持的属性
|
||||
'd', 'fill', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-miterlimit', 'stroke-opacity',
|
||||
|
||||
@@ -80,7 +80,7 @@ func (r *ImageResolver) ResolveAndStore(
|
||||
fileSvc interfaces.FileService,
|
||||
tenantID uint64,
|
||||
) (updatedMarkdown string, images []StoredImage, err error) {
|
||||
markdown := UnwrapLinkedImages(result.MarkdownContent)
|
||||
markdown := UnwrapLinkedImages(normalizeMinerUMarkdown(result.MarkdownContent))
|
||||
md2, imgDataURIs, _ := r.ResolveDataURIImages(ctx, markdown, fileSvc, tenantID)
|
||||
markdown = md2
|
||||
images = append(images, imgDataURIs...)
|
||||
@@ -102,6 +102,7 @@ func (r *ImageResolver) ResolveAndStore(
|
||||
for _, ref := range result.ImageRefs {
|
||||
refMap[ref.OriginalRef] = ref
|
||||
}
|
||||
savedRefs := make(map[string]StoredImage)
|
||||
|
||||
// Process each image reference found in the markdown.
|
||||
// The URL group supports one level of balanced parentheses so that URLs
|
||||
@@ -123,51 +124,77 @@ func (r *ImageResolver) ResolveAndStore(
|
||||
}
|
||||
|
||||
// Find inline image bytes from the result
|
||||
ref, found := refMap[refPath]
|
||||
if !found || len(ref.ImageData) == 0 {
|
||||
stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, refPath, refMap, savedRefs)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Filter out small icons and decorative images. Skip the filter
|
||||
// when the reference is the originally uploaded file itself, so
|
||||
// that a standalone image upload is never silently dropped even
|
||||
// if its dimensions are below the icon threshold.
|
||||
if !ref.IsOriginal && isIconImage(ref.ImageData) {
|
||||
// Remove the image reference from markdown entirely
|
||||
markdown = markdown[:m[0]] + markdown[m[1]:]
|
||||
continue
|
||||
}
|
||||
|
||||
// Determine extension
|
||||
ext := extFromMime(ref.MimeType)
|
||||
if ext == "" {
|
||||
ext = filepath.Ext(ref.Filename)
|
||||
}
|
||||
if ext == "" {
|
||||
ext = ".png"
|
||||
}
|
||||
|
||||
// Save via FileService — returns provider:// path
|
||||
fileName := uuid.New().String() + ext
|
||||
servingURL, saveErr := fileSvc.SaveBytes(ctx, ref.ImageData, tenantID, fileName, false)
|
||||
if saveErr != nil {
|
||||
log.Printf("WARN: failed to save image %s: %v", refPath, saveErr)
|
||||
continue
|
||||
}
|
||||
|
||||
images = append(images, StoredImage{
|
||||
OriginalRef: refPath,
|
||||
ServingURL: servingURL,
|
||||
MimeType: ref.MimeType,
|
||||
})
|
||||
images = appendStoredImage(images, stored)
|
||||
|
||||
// Replace in markdown
|
||||
markdown = markdown[:m[4]] + servingURL + markdown[m[5]:]
|
||||
markdown = markdown[:m[4]] + stored.ServingURL + markdown[m[5]:]
|
||||
}
|
||||
|
||||
md5, imgRelativeHTML, _ := r.ResolveRelativeHTMLImages(ctx, markdown, fileSvc, tenantID, refMap, savedRefs)
|
||||
markdown = md5
|
||||
images = append(images, imgRelativeHTML...)
|
||||
|
||||
return markdown, images, nil
|
||||
}
|
||||
|
||||
func appendStoredImage(images []StoredImage, stored StoredImage) []StoredImage {
|
||||
for _, existing := range images {
|
||||
if existing.OriginalRef == stored.OriginalRef && existing.ServingURL == stored.ServingURL {
|
||||
return images
|
||||
}
|
||||
}
|
||||
return append(images, stored)
|
||||
}
|
||||
|
||||
func (r *ImageResolver) saveReferencedImage(
|
||||
ctx context.Context,
|
||||
fileSvc interfaces.FileService,
|
||||
tenantID uint64,
|
||||
refPath string,
|
||||
refMap map[string]types.ImageRef,
|
||||
savedRefs map[string]StoredImage,
|
||||
) (StoredImage, bool) {
|
||||
if stored, ok := savedRefs[refPath]; ok {
|
||||
return stored, true
|
||||
}
|
||||
|
||||
ref, found := refMap[refPath]
|
||||
if !found || len(ref.ImageData) == 0 {
|
||||
return StoredImage{}, false
|
||||
}
|
||||
|
||||
if !ref.IsOriginal && isIconImage(ref.ImageData) {
|
||||
return StoredImage{}, false
|
||||
}
|
||||
|
||||
ext := extFromMime(ref.MimeType)
|
||||
if ext == "" {
|
||||
ext = filepath.Ext(ref.Filename)
|
||||
}
|
||||
if ext == "" {
|
||||
ext = ".png"
|
||||
}
|
||||
|
||||
fileName := uuid.New().String() + ext
|
||||
servingURL, saveErr := fileSvc.SaveBytes(ctx, ref.ImageData, tenantID, fileName, false)
|
||||
if saveErr != nil {
|
||||
log.Printf("WARN: failed to save image %s: %v", refPath, saveErr)
|
||||
return StoredImage{}, false
|
||||
}
|
||||
|
||||
stored := StoredImage{
|
||||
OriginalRef: refPath,
|
||||
ServingURL: servingURL,
|
||||
MimeType: ref.MimeType,
|
||||
}
|
||||
savedRefs[refPath] = stored
|
||||
return stored, true
|
||||
}
|
||||
|
||||
func extFromMime(mime string) string {
|
||||
switch mime {
|
||||
case "image/png":
|
||||
@@ -287,6 +314,10 @@ var imgHTMLDataURI = regexp.MustCompile(
|
||||
`(?i)<img\s[^>]*?src\s*=\s*["'](data:image/[^;]+;base64,[^"']+)["'][^>]*?/?\s*>`,
|
||||
)
|
||||
|
||||
// imgHTMLRelativeSrc matches an HTML <img> tag and captures:
//
//	group 1: the attribute text between "<img" and the src attribute
//	group 2: the quoted src value
//	group 3: the rest of the tag up to the closing ">"
//
// The src attribute must be preceded by whitespace. A plain word boundary is
// not enough: "-" is a non-word character, so `\bsrc` would also match the
// tail of attributes such as data-src or data-protected-src and capture the
// wrong value.
var imgHTMLRelativeSrc = regexp.MustCompile(
	`(?i)<img\b([^>]*?\s)src\s*=\s*['"]([^'"]+)['"]([^>]*)>`,
)
|
||||
|
||||
// ResolveHTMLDataURIImages finds <img src="data:image/*;base64,..."> tags in markdown,
|
||||
// decodes the images, stores them via fileSvc, and replaces each tag with a markdown
|
||||
// image reference using the storage URL.
|
||||
@@ -349,6 +380,41 @@ func (r *ImageResolver) ResolveHTMLDataURIImages(
|
||||
return markdown, images, nil
|
||||
}
|
||||
|
||||
// ResolveRelativeHTMLImages finds HTML <img> tags whose src points at a
|
||||
// relative document image reference, stores the corresponding bytes via
|
||||
// fileSvc, and replaces only the src attribute value with the storage URL.
|
||||
func (r *ImageResolver) ResolveRelativeHTMLImages(
|
||||
ctx context.Context,
|
||||
markdown string,
|
||||
fileSvc interfaces.FileService,
|
||||
tenantID uint64,
|
||||
refMap map[string]types.ImageRef,
|
||||
savedRefs map[string]StoredImage,
|
||||
) (updatedMarkdown string, images []StoredImage, err error) {
|
||||
matches := imgHTMLRelativeSrc.FindAllStringSubmatchIndex(markdown, -1)
|
||||
if len(matches) == 0 {
|
||||
return markdown, nil, nil
|
||||
}
|
||||
|
||||
for i := len(matches) - 1; i >= 0; i-- {
|
||||
m := matches[i]
|
||||
src := strings.TrimSpace(markdown[m[4]:m[5]])
|
||||
if src == "" || strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://") ||
|
||||
isProviderScheme(src) || strings.HasPrefix(strings.ToLower(src), "data:image/") {
|
||||
continue
|
||||
}
|
||||
|
||||
stored, ok := r.saveReferencedImage(ctx, fileSvc, tenantID, src, refMap, savedRefs)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
images = appendStoredImage(images, stored)
|
||||
markdown = markdown[:m[4]] + stored.ServingURL + markdown[m[5]:]
|
||||
}
|
||||
|
||||
return markdown, images, nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Bare base64/data URI resolution (catch-all)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
package docparser
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
)
|
||||
|
||||
func TestResolveAndStoreRelativeHTMLImages(t *testing.T) {
|
||||
png := createTestPNG(200, 150)
|
||||
result := &types.ReadResult{
|
||||
MarkdownContent: `<table><tr><td><img src="images/profile.jpg" alt="profile"/></td></tr></table>`,
|
||||
ImageRefs: []types.ImageRef{
|
||||
{
|
||||
Filename: "profile.jpg",
|
||||
OriginalRef: "images/profile.jpg",
|
||||
MimeType: "image/png",
|
||||
ImageData: png,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
svc := &captureSaveBytes{}
|
||||
r := NewImageResolver()
|
||||
out, imgs, err := r.ResolveAndStore(context.Background(), result, svc, 1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(imgs) != 1 {
|
||||
t.Fatalf("expected 1 stored image but got %d", len(imgs))
|
||||
}
|
||||
if len(svc.saved) != 1 {
|
||||
t.Fatalf("expected SaveBytes to be called once but got %d", len(svc.saved))
|
||||
}
|
||||
if strings.Contains(out, `src="images/profile.jpg"`) {
|
||||
t.Fatalf("expected relative html img src to be replaced, got: %s", out)
|
||||
}
|
||||
if !strings.Contains(out, `src="local://test/`) {
|
||||
t.Fatalf("expected stored file url in html img src, got: %s", out)
|
||||
}
|
||||
}
|
||||
@@ -10,12 +10,12 @@ import (
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
htmltomd "github.com/JohannesKaufmann/html-to-markdown/v2"
|
||||
"github.com/Tencent/WeKnora/internal/logger"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
)
|
||||
@@ -66,13 +66,10 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types
|
||||
return nil, fmt.Errorf("MinerU file_parse: %w", err)
|
||||
}
|
||||
|
||||
// HTML -> Markdown conversion (equivalent to Python markdownify).
|
||||
// MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>),
|
||||
// but html-to-markdown sees the whole string as HTML and escapes Markdown special
|
||||
// chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`,
|
||||
// which then breaks downstream image extraction. Unescape those after conversion.
|
||||
mdContent = htmlToMarkdown(mdContent)
|
||||
mdContent = unescapeMarkdownImageSyntax(mdContent)
|
||||
// MinerU already returns markdown with embedded HTML blocks (e.g. <table>, <details>).
|
||||
// Re-running the whole document through html-to-markdown corrupts valid markdown
|
||||
// by escaping headings and image syntax. Only apply narrow compatibility fixes.
|
||||
mdContent = normalizeMinerUMarkdown(mdContent)
|
||||
|
||||
// Process images: decode base64, build ImageRef list, replace refs in markdown
|
||||
imageRefs, mdContent := c.processImages(mdContent, imagesB64)
|
||||
@@ -208,8 +205,8 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri
|
||||
var refs []types.ImageRef
|
||||
|
||||
for ipath, b64Str := range imagesB64 {
|
||||
originalRef := "images/" + ipath
|
||||
if !strings.Contains(mdContent, originalRef) {
|
||||
matchedRefs := mineruImageOriginalRefs(mdContent, ipath)
|
||||
if len(matchedRefs) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -243,12 +240,14 @@ func (c *MinerUReader) processImages(mdContent string, imagesB64 map[string]stri
|
||||
mimeType = "image/png"
|
||||
}
|
||||
|
||||
refs = append(refs, types.ImageRef{
|
||||
Filename: ipath,
|
||||
OriginalRef: originalRef,
|
||||
MimeType: mimeType,
|
||||
ImageData: imgBytes,
|
||||
})
|
||||
for _, originalRef := range matchedRefs {
|
||||
refs = append(refs, types.ImageRef{
|
||||
Filename: ipath,
|
||||
OriginalRef: originalRef,
|
||||
MimeType: mimeType,
|
||||
ImageData: imgBytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return refs, mdContent
|
||||
@@ -277,26 +276,84 @@ func PingMinerU(endpoint string) (bool, string) {
|
||||
return true, ""
|
||||
}
|
||||
|
||||
// htmlToMarkdown converts HTML content to markdown.
|
||||
// Falls back to the original content if conversion fails.
|
||||
func htmlToMarkdown(content string) string {
|
||||
md, err := htmltomd.ConvertString(content)
|
||||
if err != nil {
|
||||
logger.Errorf(context.Background(), "[MinerU] html-to-markdown conversion failed, using raw content: %v", err)
|
||||
return content
|
||||
}
|
||||
return md
|
||||
}
|
||||
|
||||
// escapedImageSyntaxPattern matches markdown image references whose `[` was
// over-escaped to `\[` (the closing `]` may or may not be escaped as well).
// Group 1 is the alt text and group 2 the URL; the URL group tolerates one
// level of balanced parentheses, so escapes are only stripped for text that
// actually has the shape of an image reference.
var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)

// escapedHeadingPattern matches markdown headings that were over-escaped to
// `\# Heading`. Only line-leading markers (1-6 `#` followed by whitespace) are
// matched, so body text that intentionally contains an escaped `#` is left
// alone.
var escapedHeadingPattern = regexp.MustCompile(`(?m)^\\(#{1,6})(\s+)`)

// unescapeMarkdownImageSyntax rewrites the over-escaped `!\[alt\](url)` form
// back to `![alt](url)`, keeping the alt text and URL. Content without escaped
// image references is returned unchanged.
func unescapeMarkdownImageSyntax(content string) string {
	// The template must rebuild the image from the captured groups; an empty
	// template would delete the reference instead of repairing it.
	return escapedImageSyntaxPattern.ReplaceAllString(content, "![${1}](${2})")
}

// normalizeEscapedMarkdownHeadings drops the leading backslash from
// line-leading `\#` heading markers, keeping the marker and the whitespace
// that follows it.
func normalizeEscapedMarkdownHeadings(content string) string {
	return escapedHeadingPattern.ReplaceAllString(content, `$1$2`)
}

// normalizeMinerUMarkdown applies the narrow compatibility fixes for
// over-escaped markdown: first image references, then headings. Everything
// else, including embedded HTML, passes through untouched.
func normalizeMinerUMarkdown(content string) string {
	content = unescapeMarkdownImageSyntax(content)
	content = normalizeEscapedMarkdownHeadings(content)
	return content
}
|
||||
|
||||
func mineruImageOriginalRefs(mdContent, imagePath string) []string {
|
||||
normalizedTarget := normalizeMinerUImagePath(imagePath)
|
||||
if normalizedTarget == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
referenced := extractImageRefsFromContent(mdContent)
|
||||
seen := make(map[string]struct{}, len(referenced))
|
||||
var matched []string
|
||||
for _, refPath := range referenced {
|
||||
if normalizeMinerUImagePath(refPath) != normalizedTarget {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[refPath]; ok {
|
||||
continue
|
||||
}
|
||||
matched = append(matched, refPath)
|
||||
seen[refPath] = struct{}{}
|
||||
}
|
||||
|
||||
return matched
|
||||
}
|
||||
|
||||
func extractImageRefsFromContent(content string) []string {
|
||||
var refs []string
|
||||
|
||||
for _, match := range imgMarkdownPattern.FindAllStringSubmatch(content, -1) {
|
||||
if len(match) >= 3 {
|
||||
refs = append(refs, match[2])
|
||||
}
|
||||
}
|
||||
for _, match := range imgHTMLRelativeSrc.FindAllStringSubmatch(content, -1) {
|
||||
if len(match) >= 3 {
|
||||
refs = append(refs, match[2])
|
||||
}
|
||||
}
|
||||
|
||||
return refs
|
||||
}
|
||||
|
||||
// normalizeMinerUImagePath reduces an image reference to a comparable form:
// surrounding whitespace is trimmed, percent-escapes are decoded when the
// input decodes cleanly to a non-empty string, and then one leading "./", one
// leading "/" and one leading "images/" are removed, in that order. Blank
// input yields the empty string.
func normalizeMinerUImagePath(p string) string {
	trimmed := strings.TrimSpace(p)
	if trimmed == "" {
		return ""
	}

	// Keep the raw text when it is not valid percent-encoding.
	decoded, decodeErr := url.PathUnescape(trimmed)
	if decodeErr == nil && decoded != "" {
		trimmed = decoded
	}

	for _, prefix := range []string{"./", "/", "images/"} {
		trimmed = strings.TrimPrefix(trimmed, prefix)
	}
	return trimmed
}
|
||||
|
||||
66
internal/infrastructure/docparser/mineru_converter_test.go
Normal file
66
internal/infrastructure/docparser/mineru_converter_test.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package docparser
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNormalizeMinerUMarkdownPreservesMarkdownAndHTML(t *testing.T) {
|
||||
input := strings.Join([]string{
|
||||
"# Heading",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
`<details><summary>text_image</summary>caption</details>`,
|
||||
"",
|
||||
`<table><tr><td><img src="images/profile.jpg"/></td></tr></table>`,
|
||||
}, "\n")
|
||||
|
||||
got := normalizeMinerUMarkdown(input)
|
||||
|
||||
if !strings.Contains(got, "# Heading") {
|
||||
t.Fatalf("expected heading to stay intact, got: %q", got)
|
||||
}
|
||||
if strings.Contains(got, `\# Heading`) {
|
||||
t.Fatalf("expected heading to avoid escaped form, got: %q", got)
|
||||
}
|
||||
if !strings.Contains(got, "") {
|
||||
t.Fatalf("expected markdown image syntax to stay intact, got: %q", got)
|
||||
}
|
||||
if strings.Contains(got, `!\[](images/cover.jpg)`) {
|
||||
t.Fatalf("expected markdown image syntax to avoid escaped form, got: %q", got)
|
||||
}
|
||||
if !strings.Contains(got, `<details><summary>text_image</summary>caption</details>`) {
|
||||
t.Fatalf("expected details/summary block to be preserved, got: %q", got)
|
||||
}
|
||||
if !strings.Contains(got, `<img src="images/profile.jpg"/>`) {
|
||||
t.Fatalf("expected html img tag to be preserved, got: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessImagesKeepsReferencedVariants(t *testing.T) {
|
||||
reader := &MinerUReader{}
|
||||
mdContent := strings.Join([]string{
|
||||
"",
|
||||
`<img src="./images/profile.jpg"/>`,
|
||||
``,
|
||||
}, "\n")
|
||||
|
||||
png := createTestPNG(200, 150)
|
||||
b64 := base64.StdEncoding.EncodeToString(png)
|
||||
images := map[string]string{
|
||||
"cover.jpg": "data:image/png;base64," + b64,
|
||||
"profile.jpg": "data:image/png;base64," + b64,
|
||||
"plain.jpg": "data:image/png;base64," + b64,
|
||||
}
|
||||
|
||||
refs, gotMarkdown := reader.processImages(mdContent, images)
|
||||
|
||||
if gotMarkdown != mdContent {
|
||||
t.Fatalf("processImages should not rewrite markdown content")
|
||||
}
|
||||
if len(refs) != 3 {
|
||||
t.Fatalf("expected 3 image refs, got %d", len(refs))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user