Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,11 @@ Date validation failed: expected "2025-12-31", found "2023-12-31".
- **Fork the repository.**
- **Create a new branch** for your solution.
- **Push your code** to the new branch.
- **Send the results** for review via Telegram: https://t.me/ashugaev
- **Send the results** for review via Telegram: https://t.me/ashugaev


### Install requirements
- pip install pdfminer.six

### Run the script
python3 main.py --company_name "Bluegem III GP SARL" --date "2024-10-12"
12 changes: 12 additions & 0 deletions executors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

class ExecutorText:
    """Glue object that feeds an extractor's output into a validator.

    ``extractor`` must provide an awaitable ``get_text()`` and ``validator``
    an awaitable ``validate(search_text, data)``; both are stored as given.
    """

    def __init__(self, extractor: object, validator: object, search_text: str):
        self.extractor = extractor
        self.validator = validator
        self.search_text = search_text

    async def start_executor(self):
        """Extract the text, validate it and return the validator's result."""
        extracted = await self.extractor.get_text()
        return await self.validator.validate(self.search_text, extracted)
86 changes: 86 additions & 0 deletions extractors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import unicodedata
from abc import abstractmethod
# For analysing and extracting text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTTextBoxHorizontal, LTRect, LTLine


class BaseExtractorText:
    """Base class for extractors.

    Stores the PDF location and extraction settings and offers helpers that
    load the page layouts and collect their horizontal text boxes.
    Subclasses implement :meth:`get_text`.

    Args:
        path_pdf: Path (or file object) handed to ``extract_pages``.
        type_extractor: Name of the kind of text the extractor looks for.
        max_pages: Upper bound on the number of pages to parse; ``None``
            means no limit.
    """

    def __init__(self, path_pdf, type_extractor: str, max_pages: int = None):
        self.path_pdf = path_pdf
        self.type_extractor = type_extractor
        self.max_pages = max_pages

    def __repr__(self):
        return self.__class__.__name__

    async def _get_pages(self):
        """Return the parsed page layouts as a tuple.

        Best effort: any failure is printed and an empty tuple is returned,
        so callers can always iterate over the result.
        """
        try:
            # 0 is pdfminer's own default for ``maxpages`` (no page limit).
            return tuple(extract_pages(self.path_pdf, maxpages=self.max_pages or 0))
        except Exception as e:
            print(e)
            return ()

    async def _get_lines_text(self):
        """Return every ``LTTextBoxHorizontal`` found on the parsed pages.

        Always returns a list (empty when nothing could be parsed).
        """
        pages = await self._get_pages()

        text_boxes = []
        # ``or ()`` keeps this safe if an override of _get_pages returns None.
        for page in pages or ():
            for element in page:
                # A text box is never an LTLine/LTRect, so one check suffices.
                if isinstance(element, LTTextBoxHorizontal):
                    text_boxes.append(element)
        return text_boxes

    # NOTE: this class does not derive from ``abc.ABC``, so the decorator is
    # not enforced at instantiation time; the NotImplementedError is the guard.
    @abstractmethod
    async def get_text(self) -> list:
        """Factory method: return the list of extracted text fragments."""
        raise NotImplementedError


class ExtractorText(BaseExtractorText):
    """Extract text boxes that contain a character in a known font and size.

    ``params`` maps each supported ``type_extractor`` to the font name and
    font size that identify the wanted text.
    NOTE(review): the font names look specific to one sample report -
    confirm before using this with other PDFs.
    """

    params = {
        'company_name': {
            'font': 'BCDFEE+Aptos-Bold',
            'size': 18.0,
        },
        'date': {
            'font': 'BCDHEE+Aptos',
            'size': 18.0,
        },
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # None when ``type_extractor`` is not a key of ``params``.
        self.param = ExtractorText.params.get(self.type_extractor, None)

    def _has_target_char(self, text_box) -> bool:
        """Return True if any character of the box has the target font and size."""
        import math

        font = self.param.get('font')
        size = self.param.get('size')
        return any(
            isinstance(ch, LTChar)
            and ch.fontname == font
            # Tolerant comparison: exact float equality on sizes is fragile.
            and math.isclose(ch.size, size, abs_tol=1e-6)
            for text_line in text_box
            for ch in text_line
        )

    async def get_text(self) -> list:
        """Return the NFC-normalised, stripped text of every matching text box.

        Each text box contributes at most one entry, even if several of its
        lines contain matching characters. Always returns a list; it is empty
        when the extractor type is unknown, nothing matches, or parsing fails.
        """
        if self.param is None:
            print(f"ExtractorText: unsupported extractor type {self.type_extractor}")
            return []

        # Guard against a helper that reports failure with None.
        text_boxes = await self._get_lines_text() or []

        result = []
        try:
            for text_box in text_boxes:
                if self._has_target_char(text_box):
                    text = unicodedata.normalize("NFC", text_box.get_text().strip())
                    result.append(text)
        except Exception as e:
            # Best effort: report the problem and keep what was collected.
            print(e)

        if not result:
            print(f"ExtractorText: doesn't find {self.type_extractor}")
        return result

28 changes: 28 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import asyncio
import argparse

from pipeline import Pipeline

async def main():
    """Parse the command-line arguments and run the validation pipeline.

    ``--company_name`` and ``--date`` are required: the pipeline validates
    both values, so a missing one cannot produce a meaningful result.
    ``--path_pdf`` is optional and defaults to ``report.pdf``.
    """
    # accept terminal parameters
    parser_terminal = argparse.ArgumentParser(
        description='Validate the company name and date found in a PDF report.',
    )
    parser_terminal.add_argument(
        '--company_name', required=True,
        help='company name expected in the report',
    )
    parser_terminal.add_argument(
        '--date', required=True,
        help='date expected in the report',
    )
    parser_terminal.add_argument(
        '--path_pdf', default='report.pdf',
        help='path to the PDF report (default: report.pdf)',
    )
    args_terminal = parser_terminal.parse_args()

    params_terminal = {
        'company_name': args_terminal.company_name,
        'date': args_terminal.date,
    }

    # create a Pipeline object
    pipeline = Pipeline(args_terminal.path_pdf, params_terminal)

    # start working Pipeline
    await pipeline.processing()


if __name__ == "__main__":
    asyncio.run(main())
36 changes: 36 additions & 0 deletions pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from validators import ValidateCompanyNameText, ValidateDateText
from extractors import ExtractorText
from executors import ExecutorText


class Pipeline:
    """Run the company-name check and then the date check on one PDF.

    ``params`` is a dict read with the keys ``'company_name'`` and ``'date'``.
    """

    def __init__(self, path_pdf, params: dict):
        self.params = params
        self.path_pdf = path_pdf

    async def processing(self):
        """Extract and validate each field in turn, printing every result."""
        # (params key / extractor type, validator class), in reporting order
        checks = (
            ('company_name', ValidateCompanyNameText),
            ('date', ValidateDateText),
        )

        # read the terminal parameters up front, before any extraction starts
        expected = [self.params.get(field) for field, _ in checks]

        for (field, validator_cls), search_text in zip(checks, expected):
            # extractor limited to the first page, then the matching validator
            extractor = ExtractorText(self.path_pdf, field, 1)
            validator = validator_cls()

            # the executor launches the extractor and passes the extracted
            # data on to the validator
            executor = ExecutorText(extractor, validator, search_text)
            print(await executor.start_executor())

29 changes: 29 additions & 0 deletions validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import unicodedata


class ValidateCompanyNameText:
    """Check that the expected company name is among the extracted texts."""

    @staticmethod
    async def validate(search_text: str, data_extractor: list[str]):
        """Compare ``search_text`` with the extracted texts.

        Both sides are normalised to Unicode NFC before the comparison, so
        composed and decomposed spellings of the same name match.

        Args:
            search_text: Company name expected in the document.
            data_extractor: Texts returned by the extractor; ``None`` is
                treated as "nothing found".

        Returns:
            A human-readable pass/fail message.
        """
        # str.normalize returns a new string; the result must be kept.
        if isinstance(search_text, str):
            expected = unicodedata.normalize("NFC", search_text)
        else:
            expected = search_text
        found = [unicodedata.normalize("NFC", text) for text in (data_extractor or [])]

        if expected in found:
            return 'Company name validation passed.'

        found_text = ", ".join(found)
        return f'Company name validation failed: expected "{expected}", found "{found_text}".'


class ValidateDateText:
    """Check that the expected date string is among the extracted texts."""

    @staticmethod
    async def validate(search_text: str, data_extractor: list[str]):
        """Compare ``search_text`` with the extracted texts.

        The comparison is a plain string match after both sides have been
        normalised to Unicode NFC; no date parsing is performed.

        Args:
            search_text: Date string expected in the document.
            data_extractor: Texts returned by the extractor; ``None`` is
                treated as "nothing found".

        Returns:
            A human-readable pass/fail message.
        """
        # str.normalize returns a new string; the result must be kept.
        if isinstance(search_text, str):
            expected = unicodedata.normalize("NFC", search_text)
        else:
            expected = search_text
        found = [unicodedata.normalize("NFC", text) for text in (data_extractor or [])]

        if expected in found:
            return 'Date validation passed.'

        found_text = ", ".join(found)
        return f'Date validation failed: expected "{expected}", found "{found_text}".'