Files
WeKnora/internal/infrastructure/docparser/engine_registry.go
wizardchen ef1047bf67 feat(parser): add OpenDataLoader, PaddleOCR-VL engines, and parser improvements
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level
settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing.
Optional odl-hybrid sidecar stays local-build only and is excluded from
default dev-start and full profiles.
2026-06-03 12:29:13 +08:00

234 lines
8.9 KiB
Go

package docparser
import (
"strings"
"github.com/Tencent/WeKnora/internal/types"
)
// EngineRegistration is the interface every locally registered parser engine
// must implement. Remote-only engines (e.g. markitdown) are discovered via
// the docreader ListEngines RPC and do not need a local registration.
type EngineRegistration interface {
Name() string
Description() string
FileTypes(docreaderConnected bool) []string
CheckAvailable(docreaderConnected bool, overrides map[string]string) (available bool, reason string)
}
// localEngines holds all locally registered parser engines.
var localEngines []EngineRegistration
// RegisterEngine adds an engine to the local registry. Called in init().
func RegisterEngine(e EngineRegistration) {
localEngines = append(localEngines, e)
}
func init() {
RegisterEngine(&builtinEngine{})
RegisterEngine(&simpleEngine{})
RegisterEngine(&weKnoraCloudEngine{})
RegisterEngine(&mineruEngine{})
RegisterEngine(&mineruCloudEngine{})
RegisterEngine(&paddleOCRVLEngine{})
RegisterEngine(&paddleOCRVLCloudEngine{})
}
// ---------------------------------------------------------------------------
// builtin — DocReader-backed parser for complex document formats.
// ---------------------------------------------------------------------------
type builtinEngine struct{}
func (e *builtinEngine) Name() string { return "builtin" }
func (e *builtinEngine) Description() string {
return "DocReader built-in parser engine"
}
func (e *builtinEngine) FileTypes(_ bool) []string {
return []string{"docx", "doc", "pdf", "md", "markdown", "xlsx", "xls", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", "mp3", "wav", "m4a", "flac", "ogg"}
}
func (e *builtinEngine) CheckAvailable(docreaderConnected bool, _ map[string]string) (bool, string) {
if docreaderConnected {
return true, ""
}
return false, "DocReader service not connected"
}
// SimpleEngineName is the engine name for Go-native simple format handling.
const SimpleEngineName = "simple"
// WeKnoraCloudEngineName is the engine name for WeKnoraCloud-backed document parsing.
const WeKnoraCloudEngineName = "weknoracloud"
// ---------------------------------------------------------------------------
// simple — Go handles md/txt/csv natively, no external service needed.
// Distinct from docreader's "builtin" which uses Python libraries for
// complex formats (docx, pdf, etc.).
// ---------------------------------------------------------------------------
type simpleEngine struct{}
func (e *simpleEngine) Name() string { return SimpleEngineName }
func (e *simpleEngine) Description() string {
return "Simple format & image parsing (no external service required)"
}
func (e *simpleEngine) FileTypes(_ bool) []string {
return []string{"md", "markdown", "txt", "csv", "json", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", "mp3", "wav", "m4a", "flac", "ogg"}
}
func (e *simpleEngine) CheckAvailable(_ bool, _ map[string]string) (bool, string) {
return true, ""
}
// ---------------------------------------------------------------------------
// weknoracloud — Tenant-scoped WeKnoraCloud docreader with signed requests.
// ---------------------------------------------------------------------------
type weKnoraCloudEngine struct{}
func (e *weKnoraCloudEngine) Name() string { return WeKnoraCloudEngineName }
func (e *weKnoraCloudEngine) Description() string { return "WeKnoraCloud document reader" }
func (e *weKnoraCloudEngine) FileTypes(_ bool) []string {
return []string{"docx", "doc", "pdf", "md", "markdown", "xlsx", "xls", "pptx", "ppt"}
}
func (e *weKnoraCloudEngine) CheckAvailable(docreaderConnected bool, overrides map[string]string) (bool, string) {
if overrides["weknoracloud_app_id"] != "" {
return true, ""
}
return false, "WeKnora Cloud credentials not configured. Go to Settings → WeKnora Cloud to set up."
}
// ---------------------------------------------------------------------------
// mineru — Go-native, calls self-hosted MinerU API directly
// ---------------------------------------------------------------------------
type mineruEngine struct{}
func (e *mineruEngine) Name() string { return "mineru" }
func (e *mineruEngine) Description() string { return "MinerU self-hosted service" }
func (e *mineruEngine) FileTypes(_ bool) []string {
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff", "doc", "docx", "ppt", "pptx"}
}
func (e *mineruEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
endpoint := strings.TrimSpace(overrides["mineru_endpoint"])
if endpoint == "" {
return false, "MinerU service not configured"
}
return PingMinerU(endpoint)
}
// ---------------------------------------------------------------------------
// mineru_cloud — Go-native, calls MinerU Cloud API directly
// ---------------------------------------------------------------------------
type mineruCloudEngine struct{}
func (e *mineruCloudEngine) Name() string { return "mineru_cloud" }
func (e *mineruCloudEngine) Description() string { return "MinerU Cloud API" }
func (e *mineruCloudEngine) FileTypes(_ bool) []string {
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff", "doc", "docx", "ppt", "pptx"}
}
func (e *mineruCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
apiKey := strings.TrimSpace(overrides["mineru_api_key"])
if apiKey == "" {
return false, "MinerU API Key not configured"
}
return PingMinerUCloud(apiKey)
}
// ---------------------------------------------------------------------------
// paddleocr_vl — Go-native, calls a self-hosted PaddleOCR-VL pipeline service
// ---------------------------------------------------------------------------
type paddleOCRVLEngine struct{}
func (e *paddleOCRVLEngine) Name() string { return "paddleocr_vl" }
func (e *paddleOCRVLEngine) Description() string { return "PaddleOCR-VL self-hosted service" }
func (e *paddleOCRVLEngine) FileTypes(_ bool) []string {
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
}
func (e *paddleOCRVLEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"])
if endpoint == "" {
return false, "PaddleOCR-VL service not configured"
}
return PingPaddleOCRVL(endpoint)
}
// ---------------------------------------------------------------------------
// paddleocr_vl_cloud — Go-native, calls the PaddleOCR-VL AI Studio cloud API
// ---------------------------------------------------------------------------
type paddleOCRVLCloudEngine struct{}
func (e *paddleOCRVLCloudEngine) Name() string { return "paddleocr_vl_cloud" }
func (e *paddleOCRVLCloudEngine) Description() string { return "PaddleOCR-VL Cloud API" }
func (e *paddleOCRVLCloudEngine) FileTypes(_ bool) []string {
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
}
func (e *paddleOCRVLCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
token := strings.TrimSpace(overrides["paddleocr_vl_cloud_token"])
if token == "" {
return false, "PaddleOCR-VL Cloud Token not configured"
}
return PingPaddleOCRVLCloud(token)
}
// ---------------------------------------------------------------------------
// ListAllEngines — merge local + remote
// ---------------------------------------------------------------------------
// ListAllEngines returns the merged engine list: locally registered engines
// plus engines discovered from the remote docreader via ListEngines RPC.
//
// Merge rules:
// - Local engines are always included, with Go-side availability checks.
// - For a remote engine whose name matches a local one, the remote's
// file_types and description take precedence (the remote service is
// authoritative for its own capabilities).
// - Remote engines not present locally are appended as-is, enabling
// auto-discovery of newly added docreader engines without Go changes.
func ListAllEngines(docreaderConnected bool, overrides map[string]string, remoteEngines []types.ParserEngineInfo) []types.ParserEngineInfo {
remoteMap := make(map[string]types.ParserEngineInfo, len(remoteEngines))
for _, re := range remoteEngines {
remoteMap[re.Name] = re
}
seen := make(map[string]bool, len(localEngines))
result := make([]types.ParserEngineInfo, 0, len(localEngines)+len(remoteEngines))
for _, e := range localEngines {
name := e.Name()
seen[name] = true
fileTypes := e.FileTypes(docreaderConnected)
description := e.Description()
if re, ok := remoteMap[name]; ok {
if len(re.FileTypes) > 0 {
fileTypes = re.FileTypes
}
if re.Description != "" {
description = re.Description
}
}
available, reason := e.CheckAvailable(docreaderConnected, overrides)
result = append(result, types.ParserEngineInfo{
Name: name,
Description: description,
FileTypes: fileTypes,
Available: available,
UnavailableReason: reason,
})
}
for _, re := range remoteEngines {
if seen[re.Name] {
continue
}
result = append(result, re)
}
return result
}