1+ import logging
12import os
3+ import re
24import sys
3- import logging
4- from concurrent import futures
55import traceback
6- import grpc
76import uuid
8- import atexit
7+ from concurrent import futures
8+ from typing import Optional
9+
10+ import grpc
911from grpc_health .v1 import health_pb2_grpc
1012from grpc_health .v1 .health import HealthServicer
1113
12- # Add parent directory to Python path
13- current_dir = os .path .dirname (os .path .abspath (__file__ ))
14- parent_dir = os .path .dirname (current_dir )
15- if parent_dir not in sys .path :
16- sys .path .insert (0 , parent_dir )
17-
18- from proto .docreader_pb2 import ReadResponse , Chunk , Image
19- from proto import docreader_pb2_grpc
20- from parser import Parser , OCREngine
21- from parser .config import ChunkingConfig
22- from utils .request import request_id_context , init_logging_request_id
14+ from docreader .models .read_config import ChunkingConfig
15+ from docreader .parser import Parser
16+ from docreader .parser .ocr_engine import OCREngine
17+ from docreader .proto import docreader_pb2_grpc
18+ from docreader .proto .docreader_pb2 import Chunk , Image , ReadResponse
19+ from docreader .utils .request import init_logging_request_id , request_id_context
2320
24- # --- Encoding utilities: sanitize strings to valid UTF-8 and (optionally) multi-encoding read ---
25- import re
26- from typing import Optional
27-
28- try :
29- # Optional dependency for charset detection; install via `pip install charset-normalizer`
30- from charset_normalizer import from_bytes as _cn_from_bytes # type: ignore
31- except Exception : # pragma: no cover
32- _cn_from_bytes = None # type: ignore
33-
34- # Surrogate range U+D800..U+DFFF are invalid Unicode scalar values and cannot be encoded to UTF-8
# Surrogates U+D800..U+DFFF are not valid Unicode scalar values and
# cannot be encoded to UTF-8.
3523_SURROGATE_RE = re .compile (r"[\ud800-\udfff]" )
3624
3725
@@ -47,29 +35,6 @@ def to_valid_utf8_text(s: Optional[str]) -> str:
4735 return s .encode ("utf-8" , errors = "replace" ).decode ("utf-8" )
4836
4937
def read_text_with_fallback(file_path: str) -> str:
    """Read text from a file, supporting multiple encodings with graceful fallback.

    This server currently receives bytes over gRPC and delegates decoding to
    the parser; this helper is provided for future local-file reads if needed.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text. This function never raises on undecodable bytes:
        the final fallback decodes as UTF-8 with replacement characters.
    """
    with open(file_path, "rb") as f:
        raw = f.read()
    # Prefer charset detection when charset-normalizer is installed.
    if _cn_from_bytes is not None:
        try:
            result = _cn_from_bytes(raw).best()
            if result:
                return str(result)
        except Exception:  # pragma: no cover - detection is best-effort
            pass
    # Bug fix: the original decoded with errors="replace" inside this loop,
    # which never raises UnicodeDecodeError, so the gb18030/latin-1 fallbacks
    # were unreachable. Decode strictly here so the fallbacks can fire.
    for enc in ("utf-8", "gb18030", "latin-1"):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    # Safety net (latin-1 above cannot fail, but keep parity with the
    # original's guaranteed-success final return).
    return raw.decode("utf-8", errors="replace")
71-
72-
# Ensure no existing handlers: detach every handler currently registered on
# the root logger so the configuration below starts from a clean slate.
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[0])
@@ -113,7 +78,7 @@ def ReadFromFile(self, request, context):
11378 request .file_type or os .path .splitext (request .file_name )[1 ][1 :]
11479 )
11580 logger .info (
116- f"Received ReadFromFile request for file: { request .file_name } , type: { file_type } "
81+ f"ReadFromFile for file: { request .file_name } , type: { file_type } "
11782 )
11883 logger .info (f"File content size: { len (request .file_content )} bytes" )
11984
@@ -124,8 +89,8 @@ def ReadFromFile(self, request, context):
12489 enable_multimodal = request .read_config .enable_multimodal or False
12590
12691 logger .info (
127- f"Using chunking config: size={ chunk_size } , overlap= { chunk_overlap } , "
128- f"multimodal={ enable_multimodal } "
92+ f"Using chunking config: size={ chunk_size } , "
93+ f"overlap= { chunk_overlap } , multimodal={ enable_multimodal } "
12994 )
13095
13196 # Get Storage and VLM config from request
@@ -144,7 +109,8 @@ def ReadFromFile(self, request, context):
144109 "path_prefix" : sc .path_prefix ,
145110 }
146111 logger .info (
147- f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} "
112+ f"Using Storage config: provider={ storage_config .get ('provider' )} , "
113+ f"bucket={ storage_config ['bucket_name' ]} "
148114 )
149115
150116 vlm_config = {
@@ -170,7 +136,7 @@ def ReadFromFile(self, request, context):
170136 )
171137
172138 # Parse file
173- logger .info (f "Starting file parsing process" )
139+ logger .info ("Starting file parsing process" )
174140 result = self .parser .parse_file (
175141 request .file_name , file_type , request .file_content , chunking_config
176142 )
@@ -184,7 +150,7 @@ def ReadFromFile(self, request, context):
184150
185151 # Convert to protobuf message
186152 logger .info (
187- f"Successfully parsed file { request .file_name } , returning { len (result .chunks )} chunks"
153+ f"Parsed file { request .file_name } , with { len (result .chunks )} chunks"
188154 )
189155
190156 # Build response, including image info
@@ -224,8 +190,8 @@ def ReadFromURL(self, request, context):
224190 enable_multimodal = request .read_config .enable_multimodal or False
225191
226192 logger .info (
227- f"Using chunking config: size={ chunk_size } , overlap= { chunk_overlap } , "
228- f"multimodal={ enable_multimodal } "
193+ f"Using chunking config: size={ chunk_size } , "
194+ f"overlap= { chunk_overlap } , multimodal={ enable_multimodal } "
229195 )
230196
231197 # Get Storage and VLM config from request
@@ -243,7 +209,8 @@ def ReadFromURL(self, request, context):
243209 "path_prefix" : sc .path_prefix ,
244210 }
245211 logger .info (
246- f"Using Storage config: provider={ storage_config .get ('provider' )} , bucket={ storage_config ['bucket_name' ]} "
212+ f"Using Storage config: provider={ storage_config .get ('provider' )} , "
213+ f"bucket={ storage_config ['bucket_name' ]} "
247214 )
248215
249216 vlm_config = {
@@ -269,7 +236,7 @@ def ReadFromURL(self, request, context):
269236 )
270237
271238 # Parse URL
272- logger .info (f "Starting URL parsing process" )
239+ logger .info ("Starting URL parsing process" )
273240 result = self .parser .parse_url (
274241 request .url , request .title , chunking_config
275242 )
@@ -282,7 +249,7 @@ def ReadFromURL(self, request, context):
282249
283250 # Convert to protobuf message, including image info
284251 logger .info (
285- f"Successfully parsed URL { request .url } , returning { len (result .chunks )} chunks"
252+ f"Parsed URL { request .url } , returning { len (result .chunks )} chunks"
286253 )
287254
288255 response = ReadResponse (
@@ -335,29 +302,15 @@ def _convert_chunk_to_proto(self, chunk):
335302 return proto_chunk
336303
337304
def init_ocr_engine(ocr_backend: Optional[str] = None, **kwargs):
    """Initialize the process-wide OCR engine singleton.

    Args:
        ocr_backend: Backend identifier. Falls back to the ``OCR_BACKEND``
            environment variable, then to ``"paddle"``.
        **kwargs: Extra options forwarded to ``OCREngine.get_instance``.
    """
    backend_type = ocr_backend or os.getenv("OCR_BACKEND", "paddle")
    # Lazy %-style args avoid building the message when INFO is disabled.
    logger.info("Initializing OCR engine with backend: %s", backend_type)
    OCREngine.get_instance(backend_type=backend_type, **kwargs)
352310
353311
354312def main ():
355- init_ocr_engine (
356- os .getenv ("OCR_BACKEND" , "paddle" ),
357- {
358- "OCR_API_BASE_URL" : os .getenv ("OCR_API_BASE_URL" , "" ),
359- },
360- )
313+ init_ocr_engine ()
361314
362315 # Set max number of worker threads
363316 max_workers = int (os .environ .get ("GRPC_MAX_WORKERS" , "4" ))