From 00d39e006dd565571b598ea2bd00d0e666964145 Mon Sep 17 00:00:00 2001
From: "yuheng.huang" <yuheng.huang@vipshop.com>
Date: Mon, 18 May 2026 20:55:23 +0800
Subject: [PATCH] fix(docparser): unescape MinerU markdown image syntax
 html-to-markdown over-escapes `![](...)` to `!\[](...)` on MinerU's mixed Markdown+HTML output,
 breaking downstream image extraction. Restore the canonical image syntax
 after conversion so images are persisted.

---
 .../docparser/mineru_converter.go             | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/internal/infrastructure/docparser/mineru_converter.go b/internal/infrastructure/docparser/mineru_converter.go
index ccdabb60..93b1f949 100644
--- a/internal/infrastructure/docparser/mineru_converter.go
+++ b/internal/infrastructure/docparser/mineru_converter.go
@@ -64,8 +64,13 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types
 		return nil, fmt.Errorf("MinerU file_parse: %w", err)
 	}
 
-	// HTML -> Markdown conversion (equivalent to Python markdownify)
+	// HTML -> Markdown conversion (equivalent to Python markdownify).
+	// MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>),
+	// but html-to-markdown sees the whole string as HTML and escapes Markdown special
+	// chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`,
+	// which then breaks downstream image extraction. Unescape those after conversion.
 	mdContent = htmlToMarkdown(mdContent)
+	mdContent = unescapeMarkdownImageSyntax(mdContent)
 
 	// Process images: decode base64, build ImageRef list, replace refs in markdown
 	imageRefs, mdContent := c.processImages(mdContent, imagesB64)
@@ -277,3 +282,16 @@ func htmlToMarkdown(content string) string {
 	}
 	return md
 }
+
+// escapedImageSyntaxPattern matches Markdown image references whose opening `[` (and, optionally,
+// closing `]`) was backslash-escaped by html-to-markdown: `!\[alt](url)` or `!\[alt\](url)`.
+// Group 1 is the alt text (lazy, cannot span a newline); group 2 is the URL, which may contain `(...)`
+// segments but no other parenthesis or newline. NOTE(review): said to mirror the downstream image regex — confirm.
+var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)
+
+// unescapeMarkdownImageSyntax returns content with every escapedImageSyntaxPattern match (`!\[alt](url)` or
+// `!\[alt\](url)`) rewritten to `![alt](url)`; the alt text and URL are kept verbatim and all other text is untouched.
+// Without this, the downstream image regex (in ImageResolver, per the original note) fails to match and images are never persisted.
+func unescapeMarkdownImageSyntax(content string) string {
+	return escapedImageSyntaxPattern.ReplaceAllString(content, "![$1]($2)")
+}