fix(docparser): unescape MinerU markdown image syntax — html-to-markdown over-escapes `![](...)` to `!\[](...)` on MinerU's mixed Markdown+HTML output, breaking downstream image extraction. Restore the canonical image syntax after conversion so images are persisted.

2026-06-04 13:30:32 +08:00 · 2026-05-18 20:55:23 +08:00
parent 7444c2190d
commit 00d39e006d
1 changed files with 19 additions and 1 deletions
--- a/internal/infrastructure/docparser/mineru_converter.go
+++ b/internal/infrastructure/docparser/mineru_converter.go
@@ -64,8 +64,13 @@ func (c *MinerUReader) Read(ctx context.Context, req *types.ReadRequest) (*types
 		return nil, fmt.Errorf("MinerU file_parse: %w", err)
 	}

-	// HTML -> Markdown conversion (equivalent to Python markdownify)
+	// HTML -> Markdown conversion (equivalent to Python markdownify).
+	// MinerU's md_content is mostly Markdown with embedded HTML blocks (e.g. <table>),
+	// but html-to-markdown sees the whole string as HTML and escapes Markdown special
+	// chars in already-valid Markdown — notably turning `![](...)` into `!\[](...)`,
+	// which then breaks downstream image extraction. Unescape those after conversion.
 	mdContent = htmlToMarkdown(mdContent)
+	mdContent = unescapeMarkdownImageSyntax(mdContent)

 	// Process images: decode base64, build ImageRef list, replace refs in markdown
 	imageRefs, mdContent := c.processImages(mdContent, imagesB64)
@@ -277,3 +282,16 @@ func htmlToMarkdown(content string) string {
 	}
 	return md
 }
+
+// escapedImageSyntaxPattern matches Markdown image references whose opening `[` was over-escaped
+// to `\[` by html-to-markdown; the `\` before the closing `]` is optional. Group 1 is the alt text
+// (lazy, single line); group 2 is the URL, which may contain `(...)` segments, e.g. `a_(b).png`.
+// The author notes group 2 mirrors the downstream image-extraction regex (not visible here), so only real image references are unescaped.
+var escapedImageSyntaxPattern = regexp.MustCompile(`!\\\[(.*?)\\?\]\(([^()\n]*(?:\([^)]*\)[^()\n]*)*)\)`)
+
+// unescapeMarkdownImageSyntax rewrites every match of escapedImageSyntaxPattern in content to the
+// canonical `![alt](url)` form, dropping the backslash before `[` and, if present, before `]`; all other text is returned unchanged.
+// Per the author, the downstream image regex in ImageResolver otherwise fails to match and images are never persisted.
+func unescapeMarkdownImageSyntax(content string) string {
+	return escapedImageSyntaxPattern.ReplaceAllString(content, "![$1]($2)")
+}