mirror of
https://github.com/Tencent/WeKnora.git
synced 2026-06-04 21:34:31 +08:00
移除日志设置与冗余代码,优化导入、类型提示及OCR后端管理 统一调整各文件模块导入路径为绝对导入 调整导入路径,移除部分导入,优化日志及注释 升级文档解析器为 Docx2Parser,优化超时与图片处理逻辑
29 lines
855 B
Python
29 lines
855 B
Python
import logging
|
||
|
||
from docreader.parser.chain_parser import FirstParser
|
||
from docreader.parser.docx_parser import DocxParser
|
||
from docreader.parser.markitdown_parser import MarkitdownParser
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class Docx2Parser(FirstParser):
    """DOCX parser built on ``FirstParser`` with two candidate backends.

    The class adds no behavior of its own; it only declares which parser
    classes the ``FirstParser`` base should work with.
    """

    # Candidate backends, with MarkitdownParser listed ahead of DocxParser.
    # NOTE(review): presumably FirstParser tries these in order and keeps the
    # first successful result -- confirm against docreader.parser.chain_parser.
    _parser_cls = (MarkitdownParser, DocxParser)
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: parse a DOCX file and log every resulting chunk.
    # Usage: python <this module> [path/to/file.docx]
    import sys

    logging.basicConfig(level=logging.DEBUG)

    # Read the input path from the first command-line argument so the demo
    # can be run without editing this file; keep the original placeholder as
    # the fallback when no argument is supplied.
    your_file = sys.argv[1] if len(sys.argv) > 1 else "/path/to/your/file.docx"

    # NOTE(review): "?" and "!" each appear twice in this list; the second
    # pair (after the CJK full stop) was presumably meant to be the fullwidth
    # forms -- confirm against the upstream file before changing.
    parser = Docx2Parser(separators=[".", "?", "!", "。", "?", "!"])

    # Binary mode: the parser is handed the raw bytes of the file.
    with open(your_file, "rb") as f:
        content = f.read()

    document = parser.parse(content)
    for cc in document.chunks:
        # Lazy %-style arguments instead of an f-string, per logging idiom.
        logger.info("chunk: %s", cc)

    # Alternative usage kept for reference: parse to plain text and inspect
    # the extracted content and images instead of the chunks.
    # document = parser.parse_into_text(content)
    # logger.info(f"docx content: {document.content}")
    # logger.info(f"find images {document.images.keys()}")
|