mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 13:30:32 +08:00
Introduce opendataloader and PaddleOCR-VL parser engines with tenant-level settings UI, replace liteparse, and harden Excel/PPT/Markdown parsing. Optional odl-hybrid sidecar stays local-build only and is excluded from default dev-start and full profiles.
234 lines
8.9 KiB
Go
234 lines
8.9 KiB
Go
package docparser
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"github.com/Tencent/WeKnora/internal/types"
|
|
)
|
|
|
|
// EngineRegistration is the interface every locally registered parser engine
|
|
// must implement. Remote-only engines (e.g. markitdown) are discovered via
|
|
// the docreader ListEngines RPC and do not need a local registration.
|
|
type EngineRegistration interface {
|
|
Name() string
|
|
Description() string
|
|
FileTypes(docreaderConnected bool) []string
|
|
CheckAvailable(docreaderConnected bool, overrides map[string]string) (available bool, reason string)
|
|
}
|
|
|
|
// localEngines holds all locally registered parser engines.
|
|
var localEngines []EngineRegistration
|
|
|
|
// RegisterEngine adds an engine to the local registry. Called in init().
|
|
func RegisterEngine(e EngineRegistration) {
|
|
localEngines = append(localEngines, e)
|
|
}
|
|
|
|
func init() {
|
|
RegisterEngine(&builtinEngine{})
|
|
RegisterEngine(&simpleEngine{})
|
|
RegisterEngine(&weKnoraCloudEngine{})
|
|
RegisterEngine(&mineruEngine{})
|
|
RegisterEngine(&mineruCloudEngine{})
|
|
RegisterEngine(&paddleOCRVLEngine{})
|
|
RegisterEngine(&paddleOCRVLCloudEngine{})
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// builtin — DocReader-backed parser for complex document formats.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type builtinEngine struct{}
|
|
|
|
func (e *builtinEngine) Name() string { return "builtin" }
|
|
func (e *builtinEngine) Description() string {
|
|
return "DocReader built-in parser engine"
|
|
}
|
|
func (e *builtinEngine) FileTypes(_ bool) []string {
|
|
return []string{"docx", "doc", "pdf", "md", "markdown", "xlsx", "xls", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", "mp3", "wav", "m4a", "flac", "ogg"}
|
|
}
|
|
func (e *builtinEngine) CheckAvailable(docreaderConnected bool, _ map[string]string) (bool, string) {
|
|
if docreaderConnected {
|
|
return true, ""
|
|
}
|
|
return false, "DocReader service not connected"
|
|
}
|
|
|
|
// SimpleEngineName is the engine name for Go-native simple format handling.
|
|
const SimpleEngineName = "simple"
|
|
|
|
// WeKnoraCloudEngineName is the engine name for WeKnoraCloud-backed document parsing.
|
|
const WeKnoraCloudEngineName = "weknoracloud"
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// simple — Go handles md/txt/csv natively, no external service needed.
|
|
// Distinct from docreader's "builtin" which uses Python libraries for
|
|
// complex formats (docx, pdf, etc.).
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type simpleEngine struct{}
|
|
|
|
func (e *simpleEngine) Name() string { return SimpleEngineName }
|
|
func (e *simpleEngine) Description() string {
|
|
return "Simple format & image parsing (no external service required)"
|
|
}
|
|
func (e *simpleEngine) FileTypes(_ bool) []string {
|
|
return []string{"md", "markdown", "txt", "csv", "json", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp", "mp3", "wav", "m4a", "flac", "ogg"}
|
|
}
|
|
func (e *simpleEngine) CheckAvailable(_ bool, _ map[string]string) (bool, string) {
|
|
return true, ""
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// weknoracloud — Tenant-scoped WeKnoraCloud docreader with signed requests.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type weKnoraCloudEngine struct{}
|
|
|
|
func (e *weKnoraCloudEngine) Name() string { return WeKnoraCloudEngineName }
|
|
func (e *weKnoraCloudEngine) Description() string { return "WeKnoraCloud document reader" }
|
|
func (e *weKnoraCloudEngine) FileTypes(_ bool) []string {
|
|
return []string{"docx", "doc", "pdf", "md", "markdown", "xlsx", "xls", "pptx", "ppt"}
|
|
}
|
|
func (e *weKnoraCloudEngine) CheckAvailable(docreaderConnected bool, overrides map[string]string) (bool, string) {
|
|
if overrides["weknoracloud_app_id"] != "" {
|
|
return true, ""
|
|
}
|
|
return false, "WeKnora Cloud credentials not configured. Go to Settings → WeKnora Cloud to set up."
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// mineru — Go-native, calls self-hosted MinerU API directly
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type mineruEngine struct{}
|
|
|
|
func (e *mineruEngine) Name() string { return "mineru" }
|
|
func (e *mineruEngine) Description() string { return "MinerU self-hosted service" }
|
|
func (e *mineruEngine) FileTypes(_ bool) []string {
|
|
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff", "doc", "docx", "ppt", "pptx"}
|
|
}
|
|
func (e *mineruEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
|
endpoint := strings.TrimSpace(overrides["mineru_endpoint"])
|
|
if endpoint == "" {
|
|
return false, "MinerU service not configured"
|
|
}
|
|
return PingMinerU(endpoint)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// mineru_cloud — Go-native, calls MinerU Cloud API directly
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type mineruCloudEngine struct{}
|
|
|
|
func (e *mineruCloudEngine) Name() string { return "mineru_cloud" }
|
|
func (e *mineruCloudEngine) Description() string { return "MinerU Cloud API" }
|
|
func (e *mineruCloudEngine) FileTypes(_ bool) []string {
|
|
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff", "doc", "docx", "ppt", "pptx"}
|
|
}
|
|
func (e *mineruCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
|
apiKey := strings.TrimSpace(overrides["mineru_api_key"])
|
|
if apiKey == "" {
|
|
return false, "MinerU API Key not configured"
|
|
}
|
|
return PingMinerUCloud(apiKey)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// paddleocr_vl — Go-native, calls a self-hosted PaddleOCR-VL pipeline service
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type paddleOCRVLEngine struct{}
|
|
|
|
func (e *paddleOCRVLEngine) Name() string { return "paddleocr_vl" }
|
|
func (e *paddleOCRVLEngine) Description() string { return "PaddleOCR-VL self-hosted service" }
|
|
func (e *paddleOCRVLEngine) FileTypes(_ bool) []string {
|
|
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
|
|
}
|
|
func (e *paddleOCRVLEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
|
endpoint := strings.TrimSpace(overrides["paddleocr_vl_endpoint"])
|
|
if endpoint == "" {
|
|
return false, "PaddleOCR-VL service not configured"
|
|
}
|
|
return PingPaddleOCRVL(endpoint)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// paddleocr_vl_cloud — Go-native, calls the PaddleOCR-VL AI Studio cloud API
|
|
// ---------------------------------------------------------------------------
|
|
|
|
type paddleOCRVLCloudEngine struct{}
|
|
|
|
func (e *paddleOCRVLCloudEngine) Name() string { return "paddleocr_vl_cloud" }
|
|
func (e *paddleOCRVLCloudEngine) Description() string { return "PaddleOCR-VL Cloud API" }
|
|
func (e *paddleOCRVLCloudEngine) FileTypes(_ bool) []string {
|
|
return []string{"pdf", "jpg", "jpeg", "png", "bmp", "tiff"}
|
|
}
|
|
func (e *paddleOCRVLCloudEngine) CheckAvailable(_ bool, overrides map[string]string) (bool, string) {
|
|
token := strings.TrimSpace(overrides["paddleocr_vl_cloud_token"])
|
|
if token == "" {
|
|
return false, "PaddleOCR-VL Cloud Token not configured"
|
|
}
|
|
return PingPaddleOCRVLCloud(token)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// ListAllEngines — merge local + remote
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ListAllEngines returns the merged engine list: locally registered engines
|
|
// plus engines discovered from the remote docreader via ListEngines RPC.
|
|
//
|
|
// Merge rules:
|
|
// - Local engines are always included, with Go-side availability checks.
|
|
// - For a remote engine whose name matches a local one, the remote's
|
|
// file_types and description take precedence (the remote service is
|
|
// authoritative for its own capabilities).
|
|
// - Remote engines not present locally are appended as-is, enabling
|
|
// auto-discovery of newly added docreader engines without Go changes.
|
|
func ListAllEngines(docreaderConnected bool, overrides map[string]string, remoteEngines []types.ParserEngineInfo) []types.ParserEngineInfo {
|
|
remoteMap := make(map[string]types.ParserEngineInfo, len(remoteEngines))
|
|
for _, re := range remoteEngines {
|
|
remoteMap[re.Name] = re
|
|
}
|
|
|
|
seen := make(map[string]bool, len(localEngines))
|
|
result := make([]types.ParserEngineInfo, 0, len(localEngines)+len(remoteEngines))
|
|
|
|
for _, e := range localEngines {
|
|
name := e.Name()
|
|
seen[name] = true
|
|
|
|
fileTypes := e.FileTypes(docreaderConnected)
|
|
description := e.Description()
|
|
|
|
if re, ok := remoteMap[name]; ok {
|
|
if len(re.FileTypes) > 0 {
|
|
fileTypes = re.FileTypes
|
|
}
|
|
if re.Description != "" {
|
|
description = re.Description
|
|
}
|
|
}
|
|
|
|
available, reason := e.CheckAvailable(docreaderConnected, overrides)
|
|
result = append(result, types.ParserEngineInfo{
|
|
Name: name,
|
|
Description: description,
|
|
FileTypes: fileTypes,
|
|
Available: available,
|
|
UnavailableReason: reason,
|
|
})
|
|
}
|
|
|
|
for _, re := range remoteEngines {
|
|
if seen[re.Name] {
|
|
continue
|
|
}
|
|
result = append(result, re)
|
|
}
|
|
|
|
return result
|
|
}
|