refactor(parser): reorganize Markdown parser and enhance gRPC document reading

- Moved the `_SEPARATOR_CELL` regex definition in the Markdown parser to after the imports and logger setup.
- Implemented a fallback in the gRPC document reader for the case where the ReadStream RPC is unimplemented, ensuring compatibility with older docreader builds.
- Added a readUnary method that calls the legacy unary Read RPC to provide that backward-compatible fallback.
- Improved cancellation handling in the MinerUCloud and PaddleOCR-VL cloud readers so their polling loops stop making API calls once the context is cancelled.
2026-06-04 13:30:32 +08:00 · 2026-06-03 12:26:16 +08:00
parent ef1047bf67
commit bbd3f6324a
5 changed files with 76 additions and 13 deletions
--- a/docreader/parser/markdown_parser.py
+++ b/docreader/parser/markdown_parser.py
@@ -18,8 +18,6 @@ import re
 import uuid
 from typing import Dict, List, Match, Optional, Tuple
 _SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
 from docreader.parser.chain_parser import PipelineParser
@@ -28,6 +26,8 @@ from docreader.utils import endecode
 # Get logger object
 logger = logging.getLogger(__name__)
 _SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
 class MarkdownTableUtil:
    """Utility class for formatting Markdown tables.
--- a/internal/infrastructure/docparser/grpc_parser.go
+++ b/internal/infrastructure/docparser/grpc_parser.go
@@ -14,7 +14,9 @@ import (
 	"github.com/Tencent/WeKnora/internal/logger"
 	"github.com/Tencent/WeKnora/internal/types"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/resolver"
 	"google.golang.org/grpc/status"
 )
 func getMaxMessageSize() int {
@@ -129,6 +131,27 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
 	// Use the streaming RPC so documents with many page images (large scanned
 	// PDFs) are not capped by the unary message-size limit. The meta frame
 	// arrives first, followed by one frame per image.
 	result, err := p.readStream(ctx, client, protoReq)
 	if err != nil {
 		// An older docreader build may not implement ReadStream. Fall back to
 		// the unary Read RPC so a version-skewed deployment still parses
 		// documents (small/medium docs only — the unary path remains capped by
 		// the gRPC message-size limit, which is exactly what streaming avoids).
 		if status.Code(err) == codes.Unimplemented {
 			logger.Warnf(ctx, "docreader ReadStream unimplemented, falling back to unary Read: %v", err)
 			return p.readUnary(ctx, client, protoReq)
 		}
 		return nil, err
 	}
 	return result, nil
 }
 // readStream consumes the server-streaming ReadStream RPC: one meta frame
 // followed by one frame per image. The stream-open error is wrapped with %w
 // (not returned verbatim), which keeps the underlying gRPC error in the chain
 // for callers that inspect the status code (e.g. Unimplemented) for fallback.
 func (p *GRPCDocumentReader) readStream(
 	ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
 ) (*types.ReadResult, error) {
 	stream, err := client.ReadStream(ctx, protoReq)
 	if err != nil {
 		return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
@@ -174,6 +197,37 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
 	return result, nil
 }
 // readUnary performs the legacy unary Read RPC and converts its response into
 // a types.ReadResult. It exists purely as a compatibility fallback for
 // docreader builds that do not implement ReadStream.
 //
 // An RPC failure is returned wrapped with %w. ImageRefs is left nil when the
 // response carries no image references.
 func (p *GRPCDocumentReader) readUnary(
 	ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
 ) (*types.ReadResult, error) {
 	reply, rpcErr := client.Read(ctx, protoReq)
 	if rpcErr != nil {
 		return nil, fmt.Errorf("gRPC Read failed: %w", rpcErr)
 	}
 	// Copy the scalar response fields across one by one.
 	out := new(types.ReadResult)
 	out.MarkdownContent = reply.GetMarkdownContent()
 	out.ImageDirPath = reply.GetImageDirPath()
 	out.Metadata = reply.GetMetadata()
 	out.Error = reply.GetError()
 	// Nothing more to convert when the response has no image references.
 	srcRefs := reply.GetImageRefs()
 	if len(srcRefs) == 0 {
 		return out, nil
 	}
 	// Translate each wire-level image reference into its domain counterpart,
 	// preserving the order of the response.
 	converted := make([]types.ImageRef, len(srcRefs))
 	for i := range srcRefs {
 		src := srcRefs[i]
 		converted[i] = types.ImageRef{
 			Filename:    src.GetFilename(),
 			OriginalRef: src.GetOriginalRef(),
 			MimeType:    src.GetMimeType(),
 			StorageKey:  src.GetStorageKey(),
 			ImageData:   src.GetImageData(),
 		}
 	}
 	out.ImageRefs = converted
 	return out, nil
 }
 func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
 	p.mu.RLock()
 	client := p.client
--- a/internal/infrastructure/docparser/mineru_cloud_converter.go
+++ b/internal/infrastructure/docparser/mineru_cloud_converter.go
@@ -215,6 +215,13 @@ func (c *MinerUCloudReader) pollBatchResult(ctx context.Context, batchID string)
 	}
 	for time.Now().Before(deadline) {
 		// Bail out promptly on caller cancellation instead of spinning:
 		// fetchBatchStatus fails immediately and sleepCtx returns at once on a
 		// cancelled ctx, so without this guard the loop busy-hammers the cloud
 		// API and floods logs until the deadline.
 		if err := ctx.Err(); err != nil {
 			return "", nil, err
 		}
 		pollCount++
 		items, err := c.fetchBatchStatus(ctx, batchID, headers)
--- a/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
@@ -183,6 +183,13 @@ func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (str
 	url := c.baseURL + "/" + jobID
 	for time.Now().Before(deadline) {
 		// Bail out promptly when the caller cancels (task cancelled / timed
 		// out) instead of spinning: client.Do would fail immediately and
 		// sleepCtx returns at once on a cancelled ctx, so without this guard
 		// the loop busy-hammers the cloud API and floods logs until deadline.
 		if err := ctx.Err(); err != nil {
 			return "", err
 		}
 		pollCount++
 		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
--- a/internal/infrastructure/docparser/paddleocr_vl_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_converter.go
@@ -26,19 +26,17 @@ const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a w
 // Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
 // response containing per-page markdown + inline base64 images.
 type PaddleOCRVLReader struct {
-	endpoint  string
+	endpoint string
-	useSeal   bool
+	useSeal  bool
-	useChart  bool
+	useChart bool
 	useLayout bool
 }
 // NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
 func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
 	return &PaddleOCRVLReader{
-		endpoint:  strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
+		endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
-		useSeal:   parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
+		useSeal:  parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
-		useChart:  parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
+		useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
 		useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
 	}
 }
@@ -139,9 +137,6 @@ func (c *PaddleOCRVLReader) callLayoutParsing(
 	payload["file"] = base64.StdEncoding.EncodeToString(content)
 	payload["fileType"] = fileTypeCode(req)
 	payload["visualize"] = false
 	if !c.useLayout {
 		payload["useLayoutDetection"] = false
 	}
 	body, err := json.Marshal(payload)
 	if err != nil {