# WeKnora/docreader/parser/base_parser.py

# -*- coding: utf-8 -*-
import logging
import os
from abc import ABC, abstractmethod
from typing import Optional

from docreader.models.document import Document

# Module-level logger named after this module; its level is set to INFO here,
# so the INFO records emitted by the parser below pass this logger's own
# level check regardless of the level inherited from parent loggers.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class BaseParser(ABC):
    """Abstract foundation shared by the document parsers.

    Following the lightweight refactoring, a parser is only responsible for
    pulling markdown text and raw image references out of a document.
    Chunking, image storage, OCR, and VLM captioning are performed by the
    Go App module rather than here.

    Subclasses supply :meth:`parse_into_text`; :meth:`parse` wraps it with
    logging.
    """

    def __init__(
        self,
        file_name: str = "",
        file_type: Optional[str] = None,
        **kwargs,
    ):
        """Store the file name and resolve the file type.

        Args:
            file_name: Name of the file being parsed.
            file_type: Explicit file type. When falsy (``None`` or ``""``),
                the type is taken from the extension of ``file_name`` with
                any leading dots stripped.
            **kwargs: Accepted but not used by this base class.
        """
        self.file_name = file_name

        if file_type:
            resolved_type = file_type
        else:
            # Only consult the file name when no explicit type was given.
            _, extension = os.path.splitext(file_name)
            resolved_type = extension.lstrip(".")
        self.file_type = resolved_type

        logger.info(
            "Initializing parser for file=%s, type=%s", file_name, resolved_type
        )

    @abstractmethod
    def parse_into_text(self, content: bytes) -> Document:
        """Convert raw document bytes into markdown text.

        Args:
            content: Raw bytes of the document.

        Returns:
            Document whose ``content`` holds the markdown string and whose
            optional ``images`` dict maps storage-relative paths to base64
            data.
        """

    def parse(self, content: bytes) -> Document:
        """Run the parser and return markdown plus image references.

        Delegates to :meth:`parse_into_text` and logs the input size before
        and the extracted character count after. Chunking, OCR, and VLM
        captioning are not performed here — those are done in Go.

        Args:
            content: Raw bytes of the document.

        Returns:
            The Document produced by :meth:`parse_into_text`, unchanged.
        """
        parser_name = self.__class__.__name__
        byte_count = len(content)
        logger.info("Parsing document with %s, bytes: %d", parser_name, byte_count)

        parsed = self.parse_into_text(content)

        char_count = len(parsed.content)
        logger.info("Extracted %d characters from %s", char_count, self.file_name)
        return parsed