nexly-tech · dmitriyskoryy39 · Oct 19, 2024 · Oct 20, 2024 · Oct 20, 2024 · Oct 20, 2024
diff --git a/README.md b/README.md
@@ -62,4 +62,11 @@ Date validation failed: expected "2025-12-31", found "2023-12-31".
 - **Fork the repository.**
 - **Create a new branch** for your solution.
 - **Push your code** to the new branch.
-- **Send the results** for review via Telegram: https://t.me/ashugaev
+- **Send the results** for review via Telegram: https://t.me/ashugaev
+
+
+### Install requirements
+- `pip install pdfminer.six`
+
+### Start script
+`python3 main.py --company_name "Bluegem III GP SARL" --date "2024-10-12"`
diff --git a/executors.py b/executors.py
@@ -0,0 +1,15 @@
+
+class ExecutorText:
+    """Run one extractor and feed its output to one validator."""
+
+    def __init__(self, extractor: object, validator: object, search_text: str):
+        # Collaborators are duck-typed: the extractor must expose an awaitable
+        # ``get_text()`` and the validator an awaitable ``validate()``.
+        self.extractor = extractor
+        self.validator = validator
+        self.search_text = search_text
+
+    async def start_executor(self):
+        """Extract the text, validate it and return the validator's result."""
+        extracted = await self.extractor.get_text()
+        return await self.validator.validate(self.search_text, extracted)
diff --git a/extractors.py b/extractors.py
@@ -0,0 +1,113 @@
+import math
+import unicodedata
+from abc import ABC, abstractmethod
+from typing import Optional
+# For analysing and extracting text
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTChar, LTTextBoxHorizontal, LTRect, LTLine
+
+
+class BaseExtractorText(ABC):
+    """Base class for extractors.
+
+    Loads the page layouts of a PDF with pdfminer and exposes their
+    horizontal text boxes; subclasses implement ``get_text``.
+    """
+
+    def __init__(self, path_pdf, type_extractor: str, max_pages: Optional[int] = None):
+        self.path_pdf = path_pdf
+        self.type_extractor = type_extractor
+        self.max_pages = max_pages
+
+    def __repr__(self):
+        return self.__class__.__name__
+
+    async def _get_pages(self):
+        """Return the parsed page layouts as a tuple (empty on failure).
+
+        NOTE: ``extract_pages`` is a synchronous call, so this coroutine
+        does not yield to the event loop while the PDF is parsed.
+        """
+        try:
+            # 0 is pdfminer's default for ``maxpages`` (no page limit).
+            return tuple(extract_pages(self.path_pdf, maxpages=self.max_pages or 0))
+        except Exception as e:
+            # Best effort: report the problem and hand back "no pages"
+            # rather than a None that would fail again further down.
+            print(e)
+            return ()
+
+    async def _get_lines_text(self):
+        """Return every horizontal text box found on the loaded pages."""
+        pages = await self._get_pages()
+        return [
+            element
+            for page in pages
+            for element in page
+            if isinstance(element, LTTextBoxHorizontal)
+            and not isinstance(element, (LTLine, LTRect))
+        ]
+
+    @abstractmethod
+    async def get_text(self) -> list:
+        """factory method"""
+        raise NotImplementedError
+
+
+class ExtractorText(BaseExtractorText):
+    """Extract the text boxes typeset in a configured font and size."""
+
+    # Font name and size identifying each supported field.
+    # NOTE(review): the font names carry a prefix before "+" ("BCDFEE+");
+    # presumably specific to the sample report - confirm for other files.
+    params = {
+        'company_name': {
+            'font': 'BCDFEE+Aptos-Bold',
+            'size': 18.0,
+        },
+        'date': {
+            'font': 'BCDHEE+Aptos',
+            'size': 18.0,
+        },
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # None when ``type_extractor`` has no entry in ``params``.
+        self.param = ExtractorText.params.get(self.type_extractor, None)
+
+    def _is_target_char(self, ch) -> bool:
+        """Tell whether *ch* is a character in the configured font and size."""
+        return (
+            isinstance(ch, LTChar)
+            and ch.fontname == self.param.get('font')
+            # Sizes are floats: compare with a tolerance instead of ``==``.
+            and math.isclose(ch.size, self.param.get('size'), rel_tol=1e-6)
+        )
+
+    async def get_text(self) -> list:
+        """Return the NFC-normalised text of every matching text box.
+
+        A box matches when at least one of its characters uses the font and
+        size configured for ``type_extractor``. Each box contributes at most
+        one entry. An empty list is returned when nothing matches, when the
+        extractor type is unknown or when scanning the layout fails.
+        """
+        if self.param is None:
+            print(f"ExtractorText: unknown extractor type {self.type_extractor}")
+            return []
+
+        result = []
+        try:
+            for box in await self._get_lines_text():
+                chars = (ch for text_line in box for ch in text_line)
+                if any(self._is_target_char(ch) for ch in chars):
+                    text = unicodedata.normalize("NFC", box.get_text().strip())
+                    result.append(text)
+        except Exception as e:
+            print(e)
+            return []
+        if not result:
+            print(f"ExtractorText: doesn't find {self.type_extractor}")
+        return result
+
diff --git a/main.py b/main.py
@@ -0,0 +1,32 @@
+import asyncio
+import argparse
+
+from pipeline import Pipeline
+
+
+async def main():
+    """Parse the command-line arguments and run the validation pipeline."""
+    # accept terminal parameters
+    parser_terminal = argparse.ArgumentParser()
+    # Both values are mandatory: without them there is nothing to compare
+    # the PDF content against, so fail early with a usage message.
+    parser_terminal.add_argument('--company_name', required=True)
+    parser_terminal.add_argument('--date', required=True)
+    # The report location stays "report.pdf" unless overridden.
+    parser_terminal.add_argument('--path_pdf', default="report.pdf")
+    args_terminal = parser_terminal.parse_args()
+
+    params_terminal = {
+        'company_name': args_terminal.company_name,
+        'date': args_terminal.date,
+    }
+
+    # create a Pipeline object
+    pipeline = Pipeline(args_terminal.path_pdf, params_terminal)
+
+    # start working Pipeline
+    await pipeline.processing()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/pipeline.py b/pipeline.py
@@ -0,0 +1,32 @@
+from validators import ValidateCompanyNameText, ValidateDateText
+from extractors import ExtractorText
+from executors import ExecutorText
+
+
+class Pipeline:
+    """Validate the company name and the date found in a PDF report."""
+
+    def __init__(self, path_pdf, params: dict):
+        self.path_pdf = path_pdf
+        self.params = params
+
+    async def processing(self):
+        """Run the company-name check, then the date check, printing each result."""
+        # One row per check: extractor type / params key, and validator class.
+        checks = (
+            ('company_name', ValidateCompanyNameText),
+            ('date', ValidateDateText),
+        )
+        # Read every expected value up front, before any extraction starts.
+        expected = {key: self.params.get(key) for key, _ in checks}
+
+        for key, validator_cls in checks:
+            # The executor launches the extractor and then hands the
+            # extracted data to the validator.
+            executor = ExecutorText(
+                ExtractorText(self.path_pdf, key, 1),
+                validator_cls(),
+                expected[key],
+            )
+            print(await executor.start_executor())
+
diff --git a/validators.py b/validators.py
@@ -0,0 +1,39 @@
+import unicodedata
+
+
+def _validate_text(label: str, search_text: str, data_extractor: list[str]) -> str:
+    """Build the pass/fail message shared by the text validators.
+
+    Both sides are NFC-normalised before the membership test so that
+    strings differing only in their Unicode composition still match.
+    ``data_extractor`` may be None or empty, which counts as "not found".
+    """
+    found = [unicodedata.normalize("NFC", text) for text in data_extractor or []]
+    expected = (
+        unicodedata.normalize("NFC", search_text)
+        if isinstance(search_text, str)
+        else search_text
+    )
+    if expected in found:
+        return f'{label} validation passed.'
+    # Show the extracted values themselves rather than the list's repr.
+    found_text = ", ".join(found)
+    return f'{label} validation failed: expected "{search_text}", found "{found_text}".'
+
+
+class ValidateCompanyNameText:
+    """Check that the expected company name is among the extracted texts."""
+
+    @staticmethod
+    async def validate(search_text: str, data_extractor: list[str]):
+        """Return a "passed" message, or a "failed" one naming both values."""
+        return _validate_text('Company name', search_text, data_extractor)
+
+
+class ValidateDateText:
+    """Check that the expected date is among the extracted texts."""
+
+    @staticmethod
+    async def validate(search_text: str, data_extractor: list[str]):
+        """Return a "passed" message, or a "failed" one naming both values."""
+        return _validate_text('Date', search_text, data_extractor)