refactor(parser): reorganize Markdown parser and enhance gRPC document reading

- Moved the _SEPARATOR_CELL regex definition to a more appropriate location in the Markdown parser.
- Implemented a fallback mechanism in the gRPC document reader to handle cases where the ReadStream RPC is unimplemented, ensuring compatibility with older versions.
- Added a readUnary method to maintain backward compatibility with the legacy unary Read RPC.
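- Improved cancellation handling in the MinerUCloud and PaddleOCR-VL cloud readers: their polling loops now return as soon as the context is cancelled instead of repeatedly calling the cloud API until the deadline.
- Removed the useLayout option (the paddleocr_vl_use_layout_detection override) from the PaddleOCR-VL reader.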
This commit is contained in:
wizardchen
2026-06-03 12:26:16 +08:00
committed by lyingbug
parent ef1047bf67
commit bbd3f6324a
5 changed files with 76 additions and 13 deletions

View File

@@ -18,8 +18,6 @@ import re
import uuid
from typing import Dict, List, Match, Optional, Tuple
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
from docreader.models.document import Document
from docreader.parser.base_parser import BaseParser
from docreader.parser.chain_parser import PipelineParser
@@ -28,6 +26,8 @@ from docreader.utils import endecode
# Get logger object
logger = logging.getLogger(__name__)
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
class MarkdownTableUtil:
"""Utility class for formatting Markdown tables.

View File

@@ -14,7 +14,9 @@ import (
"github.com/Tencent/WeKnora/internal/logger"
"github.com/Tencent/WeKnora/internal/types"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/resolver"
"google.golang.org/grpc/status"
)
func getMaxMessageSize() int {
@@ -129,6 +131,27 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
// Use the streaming RPC so documents with many page images (large scanned
// PDFs) are not capped by the unary message-size limit. The meta frame
// arrives first, followed by one frame per image.
result, err := p.readStream(ctx, client, protoReq)
if err != nil {
// An older docreader build may not implement ReadStream. Fall back to
// the unary Read RPC so a version-skewed deployment still parses
// documents (small/medium docs only — the unary path remains capped by
// the gRPC message-size limit, which is exactly what streaming avoids).
if status.Code(err) == codes.Unimplemented {
logger.Warnf(ctx, "docreader ReadStream unimplemented, falling back to unary Read: %v", err)
return p.readUnary(ctx, client, protoReq)
}
return nil, err
}
return result, nil
}
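// readStream consumes the server-streaming ReadStream RPC: one meta frame
// followed by one frame per image. Errors keep the underlying gRPC status
// reachable (a failed ReadStream call is wrapped with %w, not returned
// verbatim), so the caller can inspect the status code (e.g. Unimplemented)
// for fallback. NOTE(review): this relies on status.Code unwrapping wrapped
// errors, which only recent grpc-go releases do — confirm the pinned version.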
func (p *GRPCDocumentReader) readStream(
ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
) (*types.ReadResult, error) {
stream, err := client.ReadStream(ctx, protoReq)
if err != nil {
return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
@@ -174,6 +197,37 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
return result, nil
}
// readUnary is the compatibility path for docreader servers that do not
// implement ReadStream. It issues the legacy unary Read RPC and copies the
// response fields into a types.ReadResult.
//
// A failed RPC is returned wrapped with %w so the original gRPC status stays
// reachable through the error chain. ImageRefs is left nil when the response
// carries no image references.
func (p *GRPCDocumentReader) readUnary(
	ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
) (*types.ReadResult, error) {
	reply, rpcErr := client.Read(ctx, protoReq)
	if rpcErr != nil {
		return nil, fmt.Errorf("gRPC Read failed: %w", rpcErr)
	}

	// Convert the proto image references; the slice stays nil for an empty
	// response so callers see the same zero value as before.
	var images []types.ImageRef
	if protoRefs := reply.GetImageRefs(); len(protoRefs) != 0 {
		images = make([]types.ImageRef, len(protoRefs))
		for i, ref := range protoRefs {
			images[i] = types.ImageRef{
				Filename:    ref.GetFilename(),
				OriginalRef: ref.GetOriginalRef(),
				MimeType:    ref.GetMimeType(),
				StorageKey:  ref.GetStorageKey(),
				ImageData:   ref.GetImageData(),
			}
		}
	}

	return &types.ReadResult{
		MarkdownContent: reply.GetMarkdownContent(),
		ImageDirPath:    reply.GetImageDirPath(),
		Metadata:        reply.GetMetadata(),
		Error:           reply.GetError(),
		ImageRefs:       images,
	}, nil
}
func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
p.mu.RLock()
client := p.client

View File

@@ -215,6 +215,13 @@ func (c *MinerUCloudReader) pollBatchResult(ctx context.Context, batchID string)
}
for time.Now().Before(deadline) {
// Bail out promptly on caller cancellation instead of spinning:
// fetchBatchStatus fails immediately and sleepCtx returns at once on a
// cancelled ctx, so without this guard the loop busy-hammers the cloud
// API and floods logs until the deadline.
if err := ctx.Err(); err != nil {
return "", nil, err
}
pollCount++
items, err := c.fetchBatchStatus(ctx, batchID, headers)

View File

@@ -183,6 +183,13 @@ func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (str
url := c.baseURL + "/" + jobID
for time.Now().Before(deadline) {
// Bail out promptly when the caller cancels (task cancelled / timed
// out) instead of spinning: client.Do would fail immediately and
// sleepCtx returns at once on a cancelled ctx, so without this guard
// the loop busy-hammers the cloud API and floods logs until deadline.
if err := ctx.Err(); err != nil {
return "", err
}
pollCount++
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)

View File

@@ -26,19 +26,17 @@ const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a w
// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
// response containing per-page markdown + inline base64 images.
type PaddleOCRVLReader struct {
endpoint string
useSeal bool
useChart bool
useLayout bool
endpoint string
useSeal bool
useChart bool
}
// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
return &PaddleOCRVLReader{
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
}
}
@@ -139,9 +137,6 @@ func (c *PaddleOCRVLReader) callLayoutParsing(
payload["file"] = base64.StdEncoding.EncodeToString(content)
payload["fileType"] = fileTypeCode(req)
payload["visualize"] = false
if !c.useLayout {
payload["useLayoutDetection"] = false
}
body, err := json.Marshal(payload)
if err != nil {