refactor(parser): reorganize Markdown parser and enhance gRPC document reading

- Moved the _SEPARATOR_CELL regex definition below the imports and logger setup in the Markdown parser.
- Implemented a fallback mechanism in the gRPC document reader to handle cases where the ReadStream RPC is unimplemented, ensuring compatibility with older docreader versions.
- Added a readUnary method to maintain backward compatibility with the legacy unary Read RPC.
- Improved cancellation handling in the MinerUCloud and PaddleOCR-VL cloud readers to prevent excessive API calls after the context is cancelled.
2026-06-04 13:30:32 +08:00 · 2026-06-03 12:26:16 +08:00
parent ef1047bf67
commit bbd3f6324a
5 changed files with 76 additions and 13 deletions
--- a/docreader/parser/markdown_parser.py
+++ b/docreader/parser/markdown_parser.py
@@ -18,8 +18,6 @@ import re
 import uuid
 from typing import Dict, List, Match, Optional, Tuple

-_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
-
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
 from docreader.parser.chain_parser import PipelineParser
@@ -28,6 +26,8 @@ from docreader.utils import endecode
 # Get logger object
 logger = logging.getLogger(__name__)

+_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
+

 class MarkdownTableUtil:
    """Utility class for formatting Markdown tables.
--- a/internal/infrastructure/docparser/grpc_parser.go
+++ b/internal/infrastructure/docparser/grpc_parser.go
@@ -14,7 +14,9 @@ import (
 	"github.com/Tencent/WeKnora/internal/logger"
 	"github.com/Tencent/WeKnora/internal/types"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/resolver"
+	"google.golang.org/grpc/status"
 )

 func getMaxMessageSize() int {
@@ -129,6 +131,27 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
 	// Use the streaming RPC so documents with many page images (large scanned
 	// PDFs) are not capped by the unary message-size limit. The meta frame
 	// arrives first, followed by one frame per image.
+	result, err := p.readStream(ctx, client, protoReq)
+	if err != nil {
+		// An older docreader build may not implement ReadStream. Fall back to
+		// the unary Read RPC so a version-skewed deployment still parses
+		// documents (small/medium docs only — the unary path remains capped by
+		// the gRPC message-size limit, which is exactly what streaming avoids).
+		if status.Code(err) == codes.Unimplemented {
+			logger.Warnf(ctx, "docreader ReadStream unimplemented, falling back to unary Read: %v", err)
+			return p.readUnary(ctx, client, protoReq)
+		}
+		return nil, err
+	}
+	return result, nil
+}
+
+// readStream consumes the server-streaming ReadStream RPC: one meta frame
+// followed by one frame per image. Errors are wrapped with %w (not verbatim),
+// so the caller's status-code check (e.g. Unimplemented) must unwrap them.
+func (p *GRPCDocumentReader) readStream(
+	ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
+) (*types.ReadResult, error) {
 	stream, err := client.ReadStream(ctx, protoReq)
 	if err != nil {
 		return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
@@ -174,6 +197,37 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
 	return result, nil
 }

+// readUnary calls the legacy unary Read RPC. Used only as a compatibility
+// fallback when the connected docreader does not implement ReadStream.
+func (p *GRPCDocumentReader) readUnary(
+	ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
+) (*types.ReadResult, error) {
+	resp, err := client.Read(ctx, protoReq)
+	if err != nil {
+		return nil, fmt.Errorf("gRPC Read failed: %w", err)
+	}
+
+	result := &types.ReadResult{
+		MarkdownContent: resp.GetMarkdownContent(),
+		ImageDirPath:    resp.GetImageDirPath(),
+		Metadata:        resp.GetMetadata(),
+		Error:           resp.GetError(),
+	}
+	if refs := resp.GetImageRefs(); len(refs) > 0 {
+		result.ImageRefs = make([]types.ImageRef, 0, len(refs))
+		for _, img := range refs {
+			result.ImageRefs = append(result.ImageRefs, types.ImageRef{
+				Filename:    img.GetFilename(),
+				OriginalRef: img.GetOriginalRef(),
+				MimeType:    img.GetMimeType(),
+				StorageKey:  img.GetStorageKey(),
+				ImageData:   img.GetImageData(),
+			})
+		}
+	}
+	return result, nil
+}
+
 func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
 	p.mu.RLock()
 	client := p.client
--- a/internal/infrastructure/docparser/mineru_cloud_converter.go
+++ b/internal/infrastructure/docparser/mineru_cloud_converter.go
@@ -215,6 +215,13 @@ func (c *MinerUCloudReader) pollBatchResult(ctx context.Context, batchID string)
 	}

 	for time.Now().Before(deadline) {
+		// Bail out promptly on caller cancellation instead of spinning:
+		// fetchBatchStatus fails immediately and sleepCtx returns at once on a
+		// cancelled ctx, so without this guard the loop busy-hammers the cloud
+		// API and floods logs until the deadline.
+		if err := ctx.Err(); err != nil {
+			return "", nil, err
+		}
 		pollCount++

 		items, err := c.fetchBatchStatus(ctx, batchID, headers)
--- a/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_cloud_converter.go
@@ -183,6 +183,13 @@ func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (str
 	url := c.baseURL + "/" + jobID

 	for time.Now().Before(deadline) {
+		// Bail out promptly when the caller cancels (task cancelled / timed
+		// out) instead of spinning: client.Do would fail immediately and
+		// sleepCtx returns at once on a cancelled ctx, so without this guard
+		// the loop busy-hammers the cloud API and floods logs until deadline.
+		if err := ctx.Err(); err != nil {
+			return "", err
+		}
 		pollCount++

 		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
--- a/internal/infrastructure/docparser/paddleocr_vl_converter.go
+++ b/internal/infrastructure/docparser/paddleocr_vl_converter.go
@@ -26,19 +26,17 @@ const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a w
 // Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
 // response containing per-page markdown + inline base64 images.
 type PaddleOCRVLReader struct {
-	endpoint  string
-	useSeal   bool
-	useChart  bool
-	useLayout bool
+	endpoint string
+	useSeal  bool
+	useChart bool
 }

 // NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
 func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
 	return &PaddleOCRVLReader{
-		endpoint:  strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
-		useSeal:   parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
-		useChart:  parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
-		useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
+		endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
+		useSeal:  parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
+		useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
 	}
 }

@@ -139,9 +137,6 @@ func (c *PaddleOCRVLReader) callLayoutParsing(
 	payload["file"] = base64.StdEncoding.EncodeToString(content)
 	payload["fileType"] = fileTypeCode(req)
 	payload["visualize"] = false
-	if !c.useLayout {
-		payload["useLayoutDetection"] = false
-	}

 	body, err := json.Marshal(payload)
 	if err != nil {