Commit 2d66abe

begoniezhao authored and lyingbug committed
feat: add document model classes, adjust config and parsing logic, improve logging and imports

Remove logging setup and redundant code; improve imports, type hints, and OCR backend management. Convert module imports across all files to absolute imports. Adjust import paths, remove some imports, and improve logging and comments. Upgrade the document parser to Docx2Parser; improve timeout and image handling logic.
1 parent af62080 commit 2d66abe

39 files changed, +2674 -1568 lines

.gitignore

Lines changed: 2 additions & 5 deletions
@@ -24,17 +24,14 @@ node_modules/
 tmp/
 temp/
 
-# Docker compose file (local settings)
-# docker-compose.yml
-
 WeKnora
 /models/
-**/__pycache__
 test/data/mswag.txt
 data/files/
 
-.python-version
 .venv/
+**/__pycache__
+.python-version
 
 ### macOS
 # General

docker-compose.yml

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ services:
       - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME:-}
       - MINIO_USE_SSL=${MINIO_USE_SSL:-}
       - WEB_PROXY=${WEB_PROXY:-}
+      - MINERU_ENDPOINT=${MINERU_ENDPOINT:-}
     healthcheck:
       test: ["CMD", "grpc_health_probe", "-addr=:50051"]
       interval: 30s
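
Note: MINERU_ENDPOINT follows the same `${VAR:-}` pass-through pattern as the neighbouring settings, defaulting to empty; it presumably points the docreader service at an external MinerU document-parsing endpoint (an inference from the name, not stated in this diff).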

docker/Dockerfile.docreader

Lines changed: 6 additions & 4 deletions
@@ -80,10 +80,11 @@ RUN pip install uv --break-system-packages && \
     python -m uv sync --locked --no-dev
 
 # Copy source code and generation scripts
-COPY docreader .
+COPY docreader docreader
 
 # Generate protobuf code
-RUN chmod +x scripts/generate_proto.sh && bash scripts/generate_proto.sh
+RUN chmod +x docreader/scripts/generate_proto.sh && \
+    bash docreader/scripts/generate_proto.sh
 
 # Ensure the model directory exists
 RUN ls -la /root/.paddleocr/whl/

@@ -150,10 +151,11 @@ RUN python -m playwright install-deps webkit
 # COPY docreader/scripts/download_deps.py download_deps.py
 # RUN python -m download_deps
 
-COPY --from=builder /app/ ./
+COPY docreader/pyproject.toml docreader/uv.lock ./
+COPY --from=builder /app/docreader docreader
 
 # Expose gRPC port
 EXPOSE 50051
 
 # Run the Python service directly (logs go to stdout/stderr)
-CMD ["uv", "run", "main.py"]
+CMD ["uv", "run", "-m", "docreader.main"]
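
Note: the layout change is what makes the new entrypoint work. `COPY docreader docreader` keeps the package directory intact under /app, so `CMD ["uv", "run", "-m", "docreader.main"]` can import it as the `docreader` package; this matches the switch to absolute imports and the removal of the sys.path manipulation in main.py below.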

docreader/.pylintrc

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+[LOGGING]
+logging-format-style=fstr
+
+[MESSAGES CONTROL]
+; disable=W1203
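
Note: W1203 is pylint's logging-fstring-interpolation warning; the disable line is left commented out, while `logging-format-style=fstr` (a value accepted by some pylint releases) declares that this codebase deliberately formats log messages with f-strings, as the logger.info calls in main.py do.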

docreader/main.py

Lines changed: 32 additions & 79 deletions
@@ -1,37 +1,25 @@
+import logging
 import os
+import re
 import sys
-import logging
-from concurrent import futures
 import traceback
-import grpc
 import uuid
-import atexit
+from concurrent import futures
+from typing import Optional
+
+import grpc
 from grpc_health.v1 import health_pb2_grpc
 from grpc_health.v1.health import HealthServicer
 
-# Add parent directory to Python path
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-if parent_dir not in sys.path:
-    sys.path.insert(0, parent_dir)
-
-from proto.docreader_pb2 import ReadResponse, Chunk, Image
-from proto import docreader_pb2_grpc
-from parser import Parser, OCREngine
-from parser.config import ChunkingConfig
-from utils.request import request_id_context, init_logging_request_id
+from docreader.models.read_config import ChunkingConfig
+from docreader.parser import Parser
+from docreader.parser.ocr_engine import OCREngine
+from docreader.proto import docreader_pb2_grpc
+from docreader.proto.docreader_pb2 import Chunk, Image, ReadResponse
+from docreader.utils.request import init_logging_request_id, request_id_context
 
-# --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
-import re
-from typing import Optional
-
-try:
-    # Optional dependency for charset detection; install via `pip install charset-normalizer`
-    from charset_normalizer import from_bytes as _cn_from_bytes  # type: ignore
-except Exception:  # pragma: no cover
-    _cn_from_bytes = None  # type: ignore
-
-# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
+# Surrogate range U+D800..U+DFFF are invalid Unicode scalar values
+# cannot be encoded to UTF-8
 _SURROGATE_RE = re.compile(r"[\ud800-\udfff]")
 
 

@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
     return s.encode("utf-8", errors="replace").decode("utf-8")
 
 
-def read_text_with_fallback(file_path: str) -> str:
-    """Read text from file supporting multiple encodings with graceful fallback.
-
-    This server currently receives bytes over gRPC and delegates decoding to the parser.
-    This helper is provided for future local-file reads if needed.
-    """
-    with open(file_path, "rb") as f:
-        raw = f.read()
-    if _cn_from_bytes is not None:
-        try:
-            result = _cn_from_bytes(raw).best()
-            if result:
-                return str(result)
-        except Exception:
-            pass
-    for enc in ("utf-8", "gb18030", "latin-1"):
-        try:
-            return raw.decode(enc, errors="replace")
-        except UnicodeDecodeError:
-            continue
-    return raw.decode("utf-8", errors="replace")
-
-
 # Ensure no existing handlers
 for handler in logging.root.handlers[:]:
     logging.root.removeHandler(handler)

@@ -113,7 +78,7 @@ def ReadFromFile(self, request, context):
             request.file_type or os.path.splitext(request.file_name)[1][1:]
         )
         logger.info(
-            f"Received ReadFromFile request for file: {request.file_name}, type: {file_type}"
+            f"ReadFromFile for file: {request.file_name}, type: {file_type}"
         )
         logger.info(f"File content size: {len(request.file_content)} bytes")
 

@@ -124,8 +89,8 @@ def ReadFromFile(self, request, context):
         enable_multimodal = request.read_config.enable_multimodal or False
 
         logger.info(
-            f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
-            f"multimodal={enable_multimodal}"
+            f"Using chunking config: size={chunk_size}, "
+            f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
         )
 
         # Get Storage and VLM config from request

@@ -144,7 +109,8 @@ def ReadFromFile(self, request, context):
                 "path_prefix": sc.path_prefix,
             }
             logger.info(
-                f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
+                f"Using Storage config: provider={storage_config.get('provider')}, "
+                f"bucket={storage_config['bucket_name']}"
             )
 
         vlm_config = {

@@ -170,7 +136,7 @@ def ReadFromFile(self, request, context):
             )
 
             # Parse file
-            logger.info(f"Starting file parsing process")
+            logger.info("Starting file parsing process")
             result = self.parser.parse_file(
                 request.file_name, file_type, request.file_content, chunking_config
             )

@@ -184,7 +150,7 @@ def ReadFromFile(self, request, context):
 
             # Convert to protobuf message
             logger.info(
-                f"Successfully parsed file {request.file_name}, returning {len(result.chunks)} chunks"
+                f"Parsed file {request.file_name}, with {len(result.chunks)} chunks"
             )
 
             # Build response, including image info

@@ -224,8 +190,8 @@ def ReadFromURL(self, request, context):
         enable_multimodal = request.read_config.enable_multimodal or False
 
         logger.info(
-            f"Using chunking config: size={chunk_size}, overlap={chunk_overlap}, "
-            f"multimodal={enable_multimodal}"
+            f"Using chunking config: size={chunk_size}, "
+            f"overlap={chunk_overlap}, multimodal={enable_multimodal}"
        )
 
         # Get Storage and VLM config from request

@@ -243,7 +209,8 @@ def ReadFromURL(self, request, context):
                 "path_prefix": sc.path_prefix,
             }
             logger.info(
-                f"Using Storage config: provider={storage_config.get('provider')}, bucket={storage_config['bucket_name']}"
+                f"Using Storage config: provider={storage_config.get('provider')}, "
+                f"bucket={storage_config['bucket_name']}"
             )
 
         vlm_config = {

@@ -269,7 +236,7 @@ def ReadFromURL(self, request, context):
             )
 
             # Parse URL
-            logger.info(f"Starting URL parsing process")
+            logger.info("Starting URL parsing process")
             result = self.parser.parse_url(
                 request.url, request.title, chunking_config
             )

@@ -282,7 +249,7 @@ def ReadFromURL(self, request, context):
 
             # Convert to protobuf message, including image info
             logger.info(
-                f"Successfully parsed URL {request.url}, returning {len(result.chunks)} chunks"
+                f"Parsed URL {request.url}, returning {len(result.chunks)} chunks"
             )
 
             response = ReadResponse(

@@ -335,29 +302,15 @@ def _convert_chunk_to_proto(self, chunk):
         return proto_chunk
 
 
-def init_ocr_engine(ocr_backend, ocr_config):
+def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
     """Initialize OCR engine"""
-    try:
-        logger.info(f"Initializing OCR engine with backend: {ocr_backend}")
-        ocr_engine = OCREngine.get_instance(backend_type=ocr_backend, **ocr_config)
-        if ocr_engine:
-            logger.info("OCR engine initialized successfully")
-            return True
-        else:
-            logger.error("OCR engine initialization failed")
-            return False
-    except Exception as e:
-        logger.error(f"Error initializing OCR engine: {str(e)}")
-        return False
+    backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
+    logger.info(f"Initializing OCR engine with backend: {backend_type}")
+    OCREngine.get_instance(backend_type=backend_type, **kwargs)
 
 
 def main():
-    init_ocr_engine(
-        os.getenv("OCR_BACKEND", "paddle"),
-        {
-            "OCR_API_BASE_URL": os.getenv("OCR_API_BASE_URL", ""),
-        },
-    )
+    init_ocr_engine()
 
     # Set max number of worker threads
     max_workers = int(os.environ.get("GRPC_MAX_WORKERS", "4"))
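
Note: the UTF-8 sanitizer that survives this cleanup can be exercised standalone. A minimal sketch follows; only the surrogate regex and the final return line of to_valid_utf8_text are visible in these hunks, so the surrogate-stripping step in the body is an assumption.

import re
from typing import Optional

# Surrogate code points U+D800..U+DFFF are invalid Unicode scalar values
# and cannot be encoded to UTF-8.
_SURROGATE_RE = re.compile(r"[\ud800-\udfff]")


def to_valid_utf8_text(s: Optional[str]) -> str:
    # Assumed body: drop lone surrogates, then round-trip through UTF-8
    # with errors="replace" so the result is always valid UTF-8.
    if s is None:
        return ""
    s = _SURROGATE_RE.sub("", s)
    return s.encode("utf-8", errors="replace").decode("utf-8")


print(to_valid_utf8_text("ok" + "\ud800" + "!"))  # -> "ok!"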

docreader/models/__init__.py

Whitespace-only changes.

docreader/models/document.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+"""Chunk document schema."""
+
+import json
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Field
+
+
+class Chunk(BaseModel):
+    """Document Chunk including chunk content, chunk metadata."""
+
+    content: str = Field(default="", description="chunk text content")
+    seq: int = Field(default=0, description="Chunk sequence number")
+    start: int = Field(default=0, description="Chunk start position")
+    end: int = Field(description="Chunk end position")
+    images: List[Dict[str, Any]] = Field(
+        default_factory=list, description="Images in the chunk"
+    )
+
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="metadata fields",
+    )
+
+    def to_dict(self, **kwargs: Any) -> Dict[str, Any]:
+        """Convert Chunk to dict."""
+
+        data = self.model_dump()
+        data.update(kwargs)
+        data["class_name"] = self.__class__.__name__
+        return data
+
+    def to_json(self, **kwargs: Any) -> str:
+        """Convert Chunk to json."""
+        data = self.to_dict(**kwargs)
+        return json.dumps(data)
+
+    def __hash__(self):
+        """Hash function."""
+        return hash((self.content,))
+
+    def __eq__(self, other):
+        """Equal function."""
+        return self.content == other.content
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any], **kwargs: Any):  # type: ignore
+        """Create Chunk from dict."""
+        if isinstance(kwargs, dict):
+            data.update(kwargs)
+
+        data.pop("class_name", None)
+        return cls(**data)
+
+    @classmethod
+    def from_json(cls, data_str: str, **kwargs: Any):  # type: ignore
+        """Create Chunk from json."""
+        data = json.loads(data_str)
+        return cls.from_dict(data, **kwargs)
+
+
+class Document(BaseModel):
+    """Document including document content, document metadata."""
+
+    model_config = {"arbitrary_types_allowed": True}
+
+    content: str = Field(default="", description="document text content")
+    images: Dict[str, str] = Field(
+        default_factory=dict, description="Images in the document"
+    )
+
+    chunks: List[Chunk] = Field(default_factory=list, description="document chunks")
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="metadata fields",
+    )
+
+    def set_content(self, content: str) -> None:
+        """Set document content."""
+        self.content = content
+
+    def get_content(self) -> str:
+        """Get document content."""
+        return self.content
+
+    def is_valid(self) -> bool:
+        return self.content != ""
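
Note: a quick usage sketch of the new models, relying only on what the file above defines (values are illustrative):

from docreader.models.document import Chunk, Document

chunk = Chunk(content="Hello world", seq=0, start=0, end=11)

payload = chunk.to_json()            # to_dict() injects "class_name": "Chunk"
restored = Chunk.from_json(payload)  # from_dict() pops "class_name" back out
assert restored == chunk             # __eq__ compares content only

doc = Document(content="Hello world", chunks=[chunk])
assert doc.is_valid()                # True: content is non-empty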

docreader/models/read_config.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ChunkingConfig:
+    """
+    Configuration for text chunking process.
+    Controls how documents are split into smaller pieces for processing.
+    """
+
+    # Maximum size of each chunk in tokens/chars
+    chunk_size: int = 512
+
+    # Number of tokens/chars to overlap between chunks
+    chunk_overlap: int = 50
+
+    # Text separators in order of priority
+    separators: list = field(default_factory=lambda: ["\n\n", "\n", "。"])
+
+    # Whether to enable multimodal processing (text + images)
+    enable_multimodal: bool = False
+
+    # Preferred field name going forward
+    storage_config: dict[str, str] = field(default_factory=dict)
+
+    # VLM configuration for image captioning
+    vlm_config: dict[str, str] = field(default_factory=dict)
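
Note: a short sketch of constructing the relocated config; the field values here are illustrative, and "example-vlm" is a hypothetical model name:

from docreader.models.read_config import ChunkingConfig

config = ChunkingConfig(
    chunk_size=512,       # max tokens/chars per chunk
    chunk_overlap=50,     # overlap between neighbouring chunks
    enable_multimodal=True,
    vlm_config={"model_name": "example-vlm"},  # hypothetical value
)
assert config.separators == ["\n\n", "\n", "。"]  # default priority order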
