Files
WeKnora/docreader/parser/base_parser.py
wizardchen 397689d2f3 feat: introduce WeKnora Lite edition with lightweight configuration and deployment
- Added a new `.env.lite.example` file for the Lite version, providing a minimal configuration template.
- Updated `.env.example` to remove deprecated variables and include new Docreader settings.
- Enhanced Docker configurations to support the Lite version, including a new Dockerfile for the Docreader service.
- Introduced a Makefile target for building and running the Lite version, along with packaging capabilities.
- Created GitHub workflows for building and releasing Lite binaries, including Homebrew formula support.
- Implemented a new service file for managing the Lite version as a system service.

This update enables a streamlined, single-binary deployment of WeKnora, reducing external dependencies and simplifying setup.
2026-03-02 21:21:49 +08:00

62 lines
1.7 KiB
Python

# -*- coding: utf-8 -*-
import logging
import os
from abc import ABC, abstractmethod
from typing import Optional
from docreader.models.document import Document
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Set this logger's own threshold to INFO explicitly instead of inheriting
# the effective level from its parent loggers.
logger.setLevel(logging.INFO)
class BaseParser(ABC):
    """Abstract base class shared by document parsers.

    A parser turns raw document bytes into a ``Document`` carrying markdown
    text plus raw image references. Since the lightweight refactoring,
    chunking, image storage, OCR and VLM captioning are no longer performed
    here; the Go App module is responsible for them.
    """

    def __init__(
        self,
        file_name: str = "",
        file_type: Optional[str] = None,
        **kwargs,
    ):
        """Record the file name and resolve the file type.

        Args:
            file_name: Name of the file being parsed. Also used to derive
                the type when ``file_type`` is not supplied.
            file_type: Explicit file type. When falsy (``None`` or empty),
                the extension of ``file_name`` with its leading dot removed
                is used instead.
            **kwargs: Accepted but not used by this base class.
        """
        self.file_name = file_name

        if not file_type:
            # Fall back to the extension, e.g. "report.pdf" -> "pdf".
            extension = os.path.splitext(file_name)[1]
            file_type = extension.lstrip(".")
        self.file_type = file_type

        logger.info(
            "Initializing parser for file=%s, type=%s",
            file_name,
            self.file_type,
        )

    @abstractmethod
    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes into markdown text.

        Concrete parsers must override this method.

        Args:
            content: Raw bytes of the document.

        Returns:
            A ``Document`` whose ``content`` is the markdown string and whose
            optional ``images`` dict maps storage-relative paths to base64
            data.
        """

    def parse(self, content: bytes) -> Document:
        """Parse a document and return its markdown and image references.

        Delegates the extraction to :meth:`parse_into_text` and logs the
        input size before and the extracted character count after. Chunking,
        OCR and VLM captioning are not done here; they happen on the Go side.

        Args:
            content: Raw bytes of the document.

        Returns:
            The ``Document`` produced by :meth:`parse_into_text`, unchanged.
        """
        parser_name = self.__class__.__name__
        byte_count = len(content)
        logger.info(
            "Parsing document with %s, bytes: %d",
            parser_name,
            byte_count,
        )

        parsed = self.parse_into_text(content)

        char_count = len(parsed.content)
        logger.info(
            "Extracted %d characters from %s",
            char_count,
            self.file_name,
        )
        return parsed