fix(docparser): unescape MinerU markdown image syntax html-to-markdown over-escapes to on MinerU'smixed-Markdown+HTML output, breaking downstream image extraction. Restore the canonical image syntax after conversion so images are persisted.

This commit is contained in:
yuheng.huang
2026-05-18 20:55:23 +08:00
committed by lyingbug
parent 7444c2190d
commit 00d39e006d

View File

@@ -64,8 +64,13 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types
return nil, fmt.Errorf("MinerU file_parse: %w", err)
}
// HTML -> Markdown conversion (equivalent to Python markdownify)
// HTML -> Markdown conversion (equivalent to Python markdownify).
// MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>),
// but html-to-markdown sees the whole string as HTML and escapes Markdown special
// chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`,
// which then breaks downstream image extraction. Unescape those after conversion.
mdContent = htmlToMarkdown(mdContent)
mdContent = unescapeMarkdownImageSyntax(mdContent)
// Process images: decode base64, build ImageRef list, replace refs in markdown
imageRefs, mdContent := c.processImages(mdContent, imagesB64)
@@ -277,3 +282,16 @@ func htmlToMarkdown(content string) string {
}
return md
}
// escapedImageSyntaxPattern matches markdown image references whose `[` was
// over-escaped to `\[` by html-to-markdown. The URL group mirrors the
// downstream image-extraction regex so escapes are only stripped for actual
// image references.
var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)
// unescapeMarkdownImageSyntax restores `![alt](url)` from html-to-markdown's
// over-escaped `!\[alt\](url)` form. Without this, the downstream image regex
// in ImageResolver fails to match and images are never persisted.
func unescapeMarkdownImageSyntax(content string) string {
return escapedImageSyntaxPattern.ReplaceAllString(content, "![$1]($2)")
}