mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
refactor(parser): reorganize Markdown parser and enhance gRPC document reading
- Moved the `_SEPARATOR_CELL` regex definition in the Markdown parser out of the import block to just below the logger setup.
- Implemented a fallback in the gRPC document reader for the case where the `ReadStream` RPC is unimplemented, ensuring compatibility with older docreader versions.
- Added a `readUnary` method to maintain backward compatibility with the legacy unary `Read` RPC.
- Improved cancellation handling in the polling loops of the MinerUCloud and PaddleOCR-VL readers so they return promptly once the context is cancelled, preventing excessive API calls.
This commit is contained in:
@@ -18,8 +18,6 @@ import re
|
|||||||
import uuid
|
import uuid
|
||||||
from typing import Dict, List, Match, Optional, Tuple
|
from typing import Dict, List, Match, Optional, Tuple
|
||||||
|
|
||||||
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
|
|
||||||
|
|
||||||
from docreader.models.document import Document
|
from docreader.models.document import Document
|
||||||
from docreader.parser.base_parser import BaseParser
|
from docreader.parser.base_parser import BaseParser
|
||||||
from docreader.parser.chain_parser import PipelineParser
|
from docreader.parser.chain_parser import PipelineParser
|
||||||
@@ -28,6 +26,8 @@ from docreader.utils import endecode
|
|||||||
# Get logger object
|
# Get logger object
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_SEPARATOR_CELL = re.compile(r"^:?-{3,}:?$")
|
||||||
|
|
||||||
|
|
||||||
class MarkdownTableUtil:
|
class MarkdownTableUtil:
|
||||||
"""Utility class for formatting Markdown tables.
|
"""Utility class for formatting Markdown tables.
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ import (
|
|||||||
"github.com/Tencent/WeKnora/internal/logger"
|
"github.com/Tencent/WeKnora/internal/logger"
|
||||||
"github.com/Tencent/WeKnora/internal/types"
|
"github.com/Tencent/WeKnora/internal/types"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
"google.golang.org/grpc/codes"
|
||||||
"google.golang.org/grpc/resolver"
|
"google.golang.org/grpc/resolver"
|
||||||
|
"google.golang.org/grpc/status"
|
||||||
)
|
)
|
||||||
|
|
||||||
func getMaxMessageSize() int {
|
func getMaxMessageSize() int {
|
||||||
@@ -129,6 +131,27 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
|
|||||||
// Use the streaming RPC so documents with many page images (large scanned
|
// Use the streaming RPC so documents with many page images (large scanned
|
||||||
// PDFs) are not capped by the unary message-size limit. The meta frame
|
// PDFs) are not capped by the unary message-size limit. The meta frame
|
||||||
// arrives first, followed by one frame per image.
|
// arrives first, followed by one frame per image.
|
||||||
|
result, err := p.readStream(ctx, client, protoReq)
|
||||||
|
if err != nil {
|
||||||
|
// An older docreader build may not implement ReadStream. Fall back to
|
||||||
|
// the unary Read RPC so a version-skewed deployment still parses
|
||||||
|
// documents (small/medium docs only — the unary path remains capped by
|
||||||
|
// the gRPC message-size limit, which is exactly what streaming avoids).
|
||||||
|
if status.Code(err) == codes.Unimplemented {
|
||||||
|
logger.Warnf(ctx, "docreader ReadStream unimplemented, falling back to unary Read: %v", err)
|
||||||
|
return p.readUnary(ctx, client, protoReq)
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// readStream consumes the server-streaming ReadStream RPC: one meta frame
|
||||||
|
// followed by one frame per image. Errors are returned verbatim so the caller
|
||||||
|
// can inspect the gRPC status code (e.g. Unimplemented) for fallback.
|
||||||
|
func (p *GRPCDocumentReader) readStream(
|
||||||
|
ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
|
||||||
|
) (*types.ReadResult, error) {
|
||||||
stream, err := client.ReadStream(ctx, protoReq)
|
stream, err := client.ReadStream(ctx, protoReq)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
|
return nil, fmt.Errorf("gRPC ReadStream failed: %w", err)
|
||||||
@@ -174,6 +197,37 @@ func (p *GRPCDocumentReader) Read(ctx context.Context, req *types.ReadRequest) (
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readUnary calls the legacy unary Read RPC. Used only as a compatibility
|
||||||
|
// fallback when the connected docreader does not implement ReadStream.
|
||||||
|
func (p *GRPCDocumentReader) readUnary(
|
||||||
|
ctx context.Context, client proto.DocReaderClient, protoReq *proto.ReadRequest,
|
||||||
|
) (*types.ReadResult, error) {
|
||||||
|
resp, err := client.Read(ctx, protoReq)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("gRPC Read failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
result := &types.ReadResult{
|
||||||
|
MarkdownContent: resp.GetMarkdownContent(),
|
||||||
|
ImageDirPath: resp.GetImageDirPath(),
|
||||||
|
Metadata: resp.GetMetadata(),
|
||||||
|
Error: resp.GetError(),
|
||||||
|
}
|
||||||
|
if refs := resp.GetImageRefs(); len(refs) > 0 {
|
||||||
|
result.ImageRefs = make([]types.ImageRef, 0, len(refs))
|
||||||
|
for _, img := range refs {
|
||||||
|
result.ImageRefs = append(result.ImageRefs, types.ImageRef{
|
||||||
|
Filename: img.GetFilename(),
|
||||||
|
OriginalRef: img.GetOriginalRef(),
|
||||||
|
MimeType: img.GetMimeType(),
|
||||||
|
StorageKey: img.GetStorageKey(),
|
||||||
|
ImageData: img.GetImageData(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
|
func (p *GRPCDocumentReader) ListEngines(ctx context.Context, overrides map[string]string) ([]types.ParserEngineInfo, error) {
|
||||||
p.mu.RLock()
|
p.mu.RLock()
|
||||||
client := p.client
|
client := p.client
|
||||||
|
|||||||
@@ -215,6 +215,13 @@ func (c *MinerUCloudReader) pollBatchResult(ctx context.Context, batchID string)
|
|||||||
}
|
}
|
||||||
|
|
||||||
for time.Now().Before(deadline) {
|
for time.Now().Before(deadline) {
|
||||||
|
// Bail out promptly on caller cancellation instead of spinning:
|
||||||
|
// fetchBatchStatus fails immediately and sleepCtx returns at once on a
|
||||||
|
// cancelled ctx, so without this guard the loop busy-hammers the cloud
|
||||||
|
// API and floods logs until the deadline.
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return "", nil, err
|
||||||
|
}
|
||||||
pollCount++
|
pollCount++
|
||||||
|
|
||||||
items, err := c.fetchBatchStatus(ctx, batchID, headers)
|
items, err := c.fetchBatchStatus(ctx, batchID, headers)
|
||||||
|
|||||||
@@ -183,6 +183,13 @@ func (c *PaddleOCRVLCloudReader) pollJob(ctx context.Context, jobID string) (str
|
|||||||
url := c.baseURL + "/" + jobID
|
url := c.baseURL + "/" + jobID
|
||||||
|
|
||||||
for time.Now().Before(deadline) {
|
for time.Now().Before(deadline) {
|
||||||
|
// Bail out promptly when the caller cancels (task cancelled / timed
|
||||||
|
// out) instead of spinning: client.Do would fail immediately and
|
||||||
|
// sleepCtx returns at once on a cancelled ctx, so without this guard
|
||||||
|
// the loop busy-hammers the cloud API and floods logs until deadline.
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
pollCount++
|
pollCount++
|
||||||
|
|
||||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||||
|
|||||||
@@ -26,19 +26,17 @@ const paddleOCRVLTimeout = 1000 * time.Second // large scanned PDFs can take a w
|
|||||||
// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
|
// Flow: POST {endpoint}/layout-parsing with base64 file → synchronous JSON
|
||||||
// response containing per-page markdown + inline base64 images.
|
// response containing per-page markdown + inline base64 images.
|
||||||
type PaddleOCRVLReader struct {
|
type PaddleOCRVLReader struct {
|
||||||
endpoint string
|
endpoint string
|
||||||
useSeal bool
|
useSeal bool
|
||||||
useChart bool
|
useChart bool
|
||||||
useLayout bool
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
|
// NewPaddleOCRVLReader creates a reader from ParserEngineOverrides.
|
||||||
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
|
func NewPaddleOCRVLReader(overrides map[string]string) *PaddleOCRVLReader {
|
||||||
return &PaddleOCRVLReader{
|
return &PaddleOCRVLReader{
|
||||||
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
|
endpoint: strings.TrimRight(overrides["paddleocr_vl_endpoint"], "/"),
|
||||||
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
|
useSeal: parseBoolOr(overrides["paddleocr_vl_use_seal_recognition"], true),
|
||||||
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
|
useChart: parseBoolOr(overrides["paddleocr_vl_use_chart_recognition"], false),
|
||||||
useLayout: parseBoolOr(overrides["paddleocr_vl_use_layout_detection"], true),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,9 +137,6 @@ func (c *PaddleOCRVLReader) callLayoutParsing(
|
|||||||
payload["file"] = base64.StdEncoding.EncodeToString(content)
|
payload["file"] = base64.StdEncoding.EncodeToString(content)
|
||||||
payload["fileType"] = fileTypeCode(req)
|
payload["fileType"] = fileTypeCode(req)
|
||||||
payload["visualize"] = false
|
payload["visualize"] = false
|
||||||
if !c.useLayout {
|
|
||||||
payload["useLayoutDetection"] = false
|
|
||||||
}
|
|
||||||
|
|
||||||
body, err := json.Marshal(payload)
|
body, err := json.Marshal(payload)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user