From 00d39e006dd565571b598ea2bd00d0e666964145 Mon Sep 17 00:00:00 2001 From: "yuheng.huang" Date: Mon, 18 May 2026 20:55:23 +0800 Subject: [PATCH] fix(docparser): unescape MinerU markdown image syntax html-to-markdown over-escapes `![](...)` to `!\[](...)` on MinerU's mixed-Markdown+HTML output, breaking downstream image extraction. Restore the canonical image syntax after conversion so images are persisted. --- .../docparser/mineru_converter.go | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/internal/infrastructure/docparser/mineru_converter.go b/internal/infrastructure/docparser/mineru_converter.go index ccdabb60..93b1f949 100644 --- a/internal/infrastructure/docparser/mineru_converter.go +++ b/internal/infrastructure/docparser/mineru_converter.go @@ -64,8 +64,13 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types return nil, fmt.Errorf("MinerU file_parse: %w", err) } - // HTML -> Markdown conversion (equivalent to Python markdownify) + // HTML -> Markdown conversion (equivalent to Python markdownify). + // MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>), + // but html-to-markdown sees the whole string as HTML and escapes Markdown special + // chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`, + // which then breaks downstream image extraction. Unescape those after conversion. mdContent = htmlToMarkdown(mdContent) + mdContent = unescapeMarkdownImageSyntax(mdContent) // Process images: decode base64, build ImageRef list, replace refs in markdown imageRefs, mdContent := c.processImages(mdContent, imagesB64) @@ -277,3 +282,16 @@ func htmlToMarkdown(content string) string { } return md } + +// escapedImageSyntaxPattern matches markdown image references whose `[` was +// over-escaped to `\[` by html-to-markdown. The URL group mirrors the +// downstream image-extraction regex so escapes are only stripped for actual +// image references. 
var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)

// unescapeMarkdownImageSyntax rewrites every over-escaped image reference in
// content back to the canonical `![alt](url)` form and returns the result.
//
// Both `!\[alt](url)` and `!\[alt\](url)` are recognised: the backslash before
// the opening bracket is required, the one before the closing bracket is
// optional. Any text that escapedImageSyntaxPattern does not match is passed
// through untouched, so input without escaped image references comes back
// unchanged.
func unescapeMarkdownImageSyntax(content string) string {
	// $1 is the alt text (first capture group) and $2 is the URL (second
	// capture group, which may contain one level of balanced parentheses).
	const canonicalImage = "![$1]($2)"

	restored := escapedImageSyntaxPattern.ReplaceAllString(content, canonicalImage)
	return restored
}