# VisionTextGem/ocr_engine.py (thewornarchitect/VisionTextGem on GitHub, branch main)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pytesseract
import mss
import threading
import time
import sys
import os
from PIL import Image, ImageEnhance, ImageOps

class OCREngine:
    """Periodically OCR a screen region on a background thread.

    The engine asks ``get_region_callback`` for the current capture rectangle,
    grabs it with ``mss``, preprocesses the image for Tesseract and passes any
    non-blank recognised text to ``callback``.

    Args:
        callback: Called as ``callback(text)`` from the worker thread whenever
            OCR yields text that is not only whitespace.
        get_region_callback: Called with no arguments from the worker thread;
            returns a mapping ``{'top': y, 'left': x, 'width': w, 'height': h}``
            or a falsy value when there is nothing to capture.
    """

    # Regions narrower or shorter than this (in pixels) are skipped.
    MIN_REGION_SIZE = 10
    # Integer factor by which the capture is upscaled before OCR.
    UPSCALE_FACTOR = 3
    # Factors handed to ImageEnhance.Contrast / ImageEnhance.Sharpness.
    CONTRAST_FACTOR = 2.0
    SHARPNESS_FACTOR = 2.0
    # Grey levels strictly above this are treated as (light) text pixels.
    THRESHOLD = 100
    # --psm 6 usually assumes a single uniform block of text.
    TESSERACT_CONFIG = '--psm 6'

    # Stop signal of the current worker; class-level default so that stop()
    # is safe even before start() has ever been called.
    _stop_event = None

    def __init__(self, callback, get_region_callback):
        self.callback = callback
        self.get_region_callback = get_region_callback
        self.running = False
        self.thread = None
        self.interval = 1.0  # seconds between captures
        self._stop_event = None

        # If tesseract is on PATH nothing needs doing, but on Windows it is
        # often only present in one of the standard install directories.
        self._configure_tesseract()

    @staticmethod
    def _candidate_tesseract_paths(environ=None):
        """Return the well-known Windows locations of ``tesseract.exe``.

        Args:
            environ: Mapping used to look up ``LOCALAPPDATA``; defaults to
                ``os.environ``.

        Returns:
            A list of candidate paths, system-wide installs first and the
            per-user install last.
        """
        env = os.environ if environ is None else environ
        # os.getlogin() raises OSError without a controlling terminal, so the
        # per-user directory is derived from the environment instead.
        local_app_data = env.get('LOCALAPPDATA') or os.path.join(
            os.path.expanduser('~'), 'AppData', 'Local'
        )
        return [
            r'C:\Program Files\Tesseract-OCR\tesseract.exe',
            r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
            os.path.join(local_app_data, 'Tesseract-OCR', 'tesseract.exe'),
        ]

    def _configure_tesseract(self):
        """Point pytesseract at a Tesseract binary if it is not on PATH.

        Prints a warning (and leaves pytesseract untouched) when no binary is
        found in any of the candidate locations.
        """
        # Check if tesseract is in PATH
        try:
            pytesseract.get_tesseract_version()
            return
        except pytesseract.TesseractNotFoundError:
            pass

        for candidate in self._candidate_tesseract_paths():
            if os.path.exists(candidate):
                pytesseract.pytesseract.tesseract_cmd = candidate
                return

        print("Warning: Tesseract not found in standard paths. OCR may fail.")

    def start(self):
        """Start the capture loop on a daemon thread; no-op if already running."""
        worker_alive = self.thread is not None and self.thread.is_alive()
        if self.running and worker_alive:
            return
        self.running = True
        # Each run gets its own stop event so that a worker from a previous
        # run cannot be revived by `running` flipping back to True.
        self._stop_event = threading.Event()
        self.thread = threading.Thread(
            target=self._run_loop, args=(self._stop_event,), daemon=True
        )
        self.thread.start()

    def stop(self):
        """Signal the capture loop to exit.

        Does not join the worker: it wakes from its inter-capture wait as soon
        as the event is set and finishes on its own.
        """
        self.running = False
        if self._stop_event is not None:
            self._stop_event.set()

    def _region_to_monitor(self, region):
        """Convert a UI region into the integer dict ``mss`` expects.

        Returns:
            ``{'top', 'left', 'width', 'height'}`` with ``int`` values, or
            ``None`` when the region is falsy or smaller than
            ``MIN_REGION_SIZE`` in either dimension (box hidden or too small).

        Raises:
            KeyError: If a required key is missing from a non-empty region.
        """
        if not region:
            return None
        if (region['width'] < self.MIN_REGION_SIZE
                or region['height'] < self.MIN_REGION_SIZE):
            return None
        # mss requires integers
        return {key: int(region[key]) for key in ('top', 'left', 'width', 'height')}

    def _preprocess(self, img):
        """Turn a raw RGB capture into a black-on-white 'L' image for Tesseract."""
        # 1. Grayscale
        img = img.convert('L')

        # 2. Upscale for better clarity. Image.Resampling only exists on
        #    Pillow >= 9.1; older releases expose LANCZOS on Image directly.
        resampling = getattr(Image, 'Resampling', Image)
        img = img.resize(
            (img.width * self.UPSCALE_FACTOR, img.height * self.UPSCALE_FACTOR),
            resampling.LANCZOS,
        )

        # 3. Enhance contrast, 4. enhance sharpness
        img = ImageEnhance.Contrast(img).enhance(self.CONTRAST_FACTOR)
        img = ImageEnhance.Sharpness(img).enhance(self.SHARPNESS_FACTOR)

        # 5+6. Binarise and invert in a single lookup: the capture is assumed
        #      to be light text on a dark background, so levels above
        #      THRESHOLD become black (0) and everything else white (255).
        lut = [0 if level > self.THRESHOLD else 255 for level in range(256)]
        return img.point(lut)

    def _process_frame(self, sct):
        """Capture the current region once, OCR it and deliver any text."""
        monitor = self._region_to_monitor(self.get_region_callback())
        if monitor is None:
            return

        sct_img = sct.grab(monitor)
        # mss delivers BGRA bytes; decode as BGRX to drop the alpha channel.
        img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")

        text = pytesseract.image_to_string(
            self._preprocess(img), config=self.TESSERACT_CONFIG
        )
        if text.strip():
            self.callback(text)

    def _run_loop(self, stop_event=None):
        """Worker body: process one frame every ``interval`` seconds.

        Args:
            stop_event: Event that ends this particular run when set. A fresh
                one is created when omitted, in which case only ``running``
                controls the loop.
        """
        if stop_event is None:
            stop_event = threading.Event()
        with mss.mss() as sct:
            while self.running and not stop_event.is_set():
                try:
                    self._process_frame(sct)
                except Exception as e:
                    # Boundary of the worker thread: report and keep going so
                    # one bad frame does not end the capture loop.
                    print(f"OCR Loop Error: {e}")

                # Interruptible sleep: returns early once stop() is called.
                stop_event.wait(self.interval)