mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
refactor(parser): reorganize Markdown parser and enhance gRPC document reading
- Moved the `_SEPARATOR_CELL` regex definition in the Markdown parser out of the middle of the import block to just after the module logger is created.
- Implemented a fallback in the gRPC document reader for the case where the ReadStream RPC is unimplemented (`codes.Unimplemented`), keeping it compatible with older docreader versions.
- Added a `readUnary` method that calls the legacy unary Read RPC to provide that backward compatibility.
- Improved cancellation handling in the MinerUCloud and PaddleOCR-VL cloud readers: the polling loops now return as soon as the context is cancelled instead of repeatedly calling the cloud API until the deadline.
This commit is contained in:
@@ -18,8 +18,6 @@ import re
|
||||
import uuid
|
||||
from typing import Dict, List, Match, Optional, Tuple
|
||||
|
||||
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
|
||||
|
||||
from docreader.models.document import Document
|
||||
from docreader.parser.base_parser import BaseParser
|
||||
from docreader.parser.chain_parser import PipelineParser
|
||||
@@ -28,6 +26,8 @@ from docreader.utils import endecode
|
||||
# Get logger object
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
|
||||
|
||||
|
||||
class MarkdownTableUtil:
|
||||
"""Utility class for formatting Markdown tables.
|
||||
|
||||
@@ -14,7 +14,9 @@ import (
|
||||
"github.com/Tencent/WeKnora/internal/logger"
|
||||
"github.com/Tencent/WeKnora/internal/types"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/resolver"
|
||||
"google.golang.org/grpc/status"
|
||||
)
|
||||
|
||||
func getMaxMessageSize() int {
|
||||
@@ -129,6 +131,27 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
|
||||
// Use the streaming RPC so documents with many page images (large scanned
|
||||
// PDFs) are not capped by the unary message-size limit. The meta frame
|
||||
// arrives first, followed by one frame per image.
|
||||
result, err := p.readStream(ctx, client, protoReq)
|
||||
if err != nil {
|
||||
// An older docreader build may not implement ReadStream. Fall back to
|
||||
// the unary Read RPC so a version-skewed deployment still parses
|
||||
// documents (small/medium docs only — the unary path remains capped by
|
||||
// the gRPC message-size limit, which is exactly what streaming avoids).
|
||||
if status.Code(err) == codes.Unimplemented {
|
||||
logger.Warnf(ctx, "docreader ReadStream unimplemented, falling back to unary Read: %v", err)
|
||||
return p.readUnary(ctx, client, protoReq)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// readStream consumes the server-streaming ReadStream RPC: one meta frame
|
||||
// followed by one frame per image. Errors are returned verbatim so the caller
|
||||
// can inspect the gRPC status code (e.g. Unimplemented) for fallback.
|
||||
func (p *GRPCDocumentReader) readStream(
|
||||
ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
|
||||
) (*types.ReadResult, error) {
|
||||
stream, err := client.ReadStream(ctx, protoReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
|
||||
@@ -174,6 +197,37 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// readUnary calls the legacy unary Read RPC. Used only as a compatibility
|
||||
// fallback when the connected docreader does not implement ReadStream.
|
||||
func (p *GRPCDocumentReader) readUnary(
|
||||
ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
|
||||
) (*types.ReadResult, error) {
|
||||
resp, err := client.Read(ctx, protoReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gRPC Read failed: %w", err)
|
||||
}
|
||||
|
||||
result := &types.ReadResult{
|
||||
MarkdownContent: resp.GetMarkdownContent(),
|
||||
ImageDirPath: resp.GetImageDirPath(),
|
||||
Metadata: resp.GetMetadata(),
|
||||
Error: resp.GetError(),
|
||||
}
|
||||
if refs := resp.GetImageRefs(); len(refs) > 0 {
|
||||
result.ImageRefs = make([]types.ImageRef, 0, len(refs))
|
||||
for _, img := range refs {
|
||||
result.ImageRefs = append(result.ImageRefs, types.ImageRef{
|
||||
Filename: img.GetFilename(),
|
||||
OriginalRef: img.GetOriginalRef(),
|
||||
MimeType: img.GetMimeType(),
|
||||
StorageKey: img.GetStorageKey(),
|
||||
ImageData: img.GetImageData(),
|
||||
})
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
|
||||
p.mu.RLock()
|
||||
client := p.client
|
||||
|
||||
@@ -215,6 +215,13 @@ func (c *MinerUCloudReader) pollBatchResult(ctx context.Context, batchID string)
|
||||
}
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
// Bail out promptly on caller cancellation instead of spinning:
|
||||
// fetchBatchStatus fails immediately and sleepCtx returns at once on a
|
||||
// cancelled ctx, so without this guard the loop busy-hammers the cloud
|
||||
// API and floods logs until the deadline.
|
||||
if err := ctx.Err(); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
pollCount++
|
||||
|
||||
items, err := c.fetchBatchStatus(ctx, batchID, headers)
|
||||
|
||||
@@ -183,6 +183,13 @@ func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (str
|
||||
url := c.baseURL + "/" + jobID
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
// Bail out promptly when the caller cancels (task cancelled / timed
|
||||
// out) instead of spinning: client.Do would fail immediately and
|
||||
// sleepCtx returns at once on a cancelled ctx, so without this guard
|
||||
// the loop busy-hammers the cloud API and floods logs until deadline.
|
||||
if err := ctx.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
pollCount++
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
|
||||
@@ -26,19 +26,17 @@ const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a w
|
||||
// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
|
||||
// response containing per-page markdown + inline base64 images.
|
||||
type PaddleOCRVLReader struct {
|
||||
endpoint string
|
||||
useSeal bool
|
||||
useChart bool
|
||||
useLayout bool
|
||||
endpoint string
|
||||
useSeal bool
|
||||
useChart bool
|
||||
}
|
||||
|
||||
// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
|
||||
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
|
||||
return &PaddleOCRVLReader{
|
||||
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
|
||||
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
|
||||
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
|
||||
useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
|
||||
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
|
||||
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
|
||||
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,9 +137,6 @@ func (c *PaddleOCRVLReader) callLayoutParsing(
|
||||
payload["file"] = base64.StdEncoding.EncodeToString(content)
|
||||
payload["fileType"] = fileTypeCode(req)
|
||||
payload["visualize"] = false
|
||||
if !c.useLayout {
|
||||
payload["useLayoutDetection"] = false
|
||||
}
|
||||
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user