Skip to content

Commit f40323e

Browse files
Sid Mohan
authored and committed
pre-commit checks passed
1 parent 4b1e29a commit f40323e

File tree

10 files changed

+161
-105
lines changed

10 files changed

+161
-105
lines changed

README.md

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ DataFog is an open-source DevSecOps platform that lets you scan and redact Perso
2727

2828
![image](https://github.com/DataFog/datafog-python/assets/61345237/57fba4e5-21cc-458f-ac6a-6fbbb70a8de1)
2929

30-
3130
How do you keep:
3231

3332
- Customer PII
@@ -45,7 +44,6 @@ from entering a Generative AI environment in the first place? What you need is a
4544

4645
![image](https://github.com/DataFog/datafog-python/assets/61345237/91f4634a-8a9f-4621-81bc-09930feda78a)
4746

48-
4947
### There are lots of PII tools out there; why DataFog?
5048

5149
If you look at the landscape of PII detection tools, their very existence was in many cases driven by regulatory requirements (i.e. 'comply with CCPA/GDPR/HIPAA').
@@ -55,11 +53,10 @@ are purpose-built for the problem that they are solving.
5553
However, Generative AI changes how we think about privacy. There's now a changing set of privacy requirements (new M&A deals and internal discussions mean new terms to scan/redact) as well as different and varying document sources to contend with. PII detection is no longer just about compliance; it's an active - and for some, new - internal security threat for CISOs and Eng Leaders to contend with. We want DataFog to be built and driven to meet the needs of the open-source community as they tackle this challenge.
5654

5755
### Roadmap
58-
DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!
59-
60-
![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)
6156

57+
DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!
6258

59+
![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)
6360

6461
## Installation
6562

src/datafog/__about__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
__version__ = "3.0.0-beta.6"
2-

src/datafog/__init__.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1-
from .main import DataFog
2-
from .config import PipelineOperationType, OperationConfig
1+
from .config import OperationConfig, PipelineOperationType
32
from .donuttransformer import DonutImageProcessor
4-
from .pii_annotation import PIIAnnotationModel, PIIAnnotationRequest, PIIAnnotationPipeline
3+
from .main import DataFog
4+
from .pii_annotation import (
5+
PIIAnnotationModel,
6+
PIIAnnotationPipeline,
7+
PIIAnnotationRequest,
8+
)
59

610
__all__ = [
7-
'DataFog',
8-
'PipelineOperationType',
9-
'OperationConfig',
10-
'DonutImageProcessor',
11-
'PIIAnnotationModel',
12-
'PIIAnnotationRequest',
13-
'PIIAnnotationPipeline'
11+
"DataFog",
12+
"PipelineOperationType",
13+
"OperationConfig",
14+
"DonutImageProcessor",
15+
"PIIAnnotationModel",
16+
"PIIAnnotationRequest",
17+
"PIIAnnotationPipeline",
1418
]
15-

src/datafog/config.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,21 @@
11
from enum import Enum
2+
from typing import Optional
3+
24
from pydantic import BaseModel
3-
from typing import Optional, Dict, List
5+
46

57
class PipelineOperationType(Enum):
6-
READ_IMAGE = 'read_image'
7-
PARSE_IMAGE = 'parse_image'
8-
TEXT_PII_ANNOTATION = 'text_pii_annotation'
9-
TEXT_PII_ANNOTATION_WITH_IMAGE = 'text_pii_annotation_with_image'
8+
READ_IMAGE = "read_image"
9+
PARSE_IMAGE = "parse_image"
10+
TEXT_PII_ANNOTATION = "text_pii_annotation"
11+
TEXT_PII_ANNOTATION_WITH_IMAGE = "text_pii_annotation_with_image"
12+
1013

1114
class ModelConfig(BaseModel):
1215
model: str
1316
processor: str
1417

18+
1519
class OperationConfig(BaseModel):
1620
operation_type: PipelineOperationType
1721
config: Optional[ModelConfig] = None
@@ -21,16 +25,18 @@ def model_validator(cls, v, values):
2125
configs = {
2226
"read_image": ModelConfig(
2327
model="naver-clova-ix/donut-base-finetuned-rvlcdip",
24-
processor="naver-clova-ix/donut-base-finetuned-rvlcdip"),
28+
processor="naver-clova-ix/donut-base-finetuned-rvlcdip",
29+
),
2530
"parse_image": ModelConfig(
2631
model="naver-clova-ix/donut-base-finetuned-cord-v2",
27-
processor="naver-clova-ix/donut-base-finetuned-cord-v2")
32+
processor="naver-clova-ix/donut-base-finetuned-cord-v2",
33+
),
2834
}
29-
operation_type = values.get('operation_type')
35+
operation_type = values.get("operation_type")
3036
if operation_type and operation_type.value in configs:
3137
return configs[operation_type.value]
3238
return v
33-
39+
3440
class Config:
3541
use_enum_values = True
3642
validate_assignment = True

src/datafog/donuttransformer.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
1-
from pydantic import HttpUrl
2-
from transformers import DonutProcessor, VisionEncoderDecoderModel
3-
from PIL import Image
1+
import re
42
import warnings
53
from io import BytesIO
6-
import re
7-
import torch
8-
from enum import Enum
4+
5+
from PIL import Image
6+
from transformers import DonutProcessor, VisionEncoderDecoderModel
7+
98
from .config import OperationConfig, PipelineOperationType
109

1110

@@ -35,9 +34,12 @@ class DonutImageProcessor:
3534
_process_image(image: Image.Image, operation_type_prompt: str) -> dict:
3635
A helper method to process the image with the model using a specific operation type prompt.
3736
"""
37+
3838
def __init__(self, operation_type: PipelineOperationType):
3939
self.operation_type = operation_type
40-
model_config = OperationConfig.model_validator(None, {'operation_type': operation_type})
40+
model_config = OperationConfig.model_validator(
41+
None, {"operation_type": operation_type}
42+
)
4143
self.processor = DonutProcessor.from_pretrained(model_config.processor)
4244
self.model = VisionEncoderDecoderModel.from_pretrained(model_config.model)
4345
self.device = "cpu"
@@ -52,20 +54,28 @@ def read_image(file: bytes) -> Image.Image:
5254
image = image.convert("RGB")
5355
return image
5456
except IOError as e:
55-
raise ValueError(f"Unable to read the image file: {e}. Ensure it is a valid image.")
57+
raise ValueError(
58+
f"Unable to read the image file: {e}. Ensure it is a valid image."
59+
)
5660

5761
def classify_image(self, image: Image.Image) -> dict:
5862
return self._process_image(image, operation_type_prompt="<s_rvlcdip>")
5963

6064
def parse_image(self, image: Image.Image) -> dict:
6165
return self._process_image(image, operation_type_prompt="<s_cord-v2>")
6266

63-
def question_image(self, image: Image.Image, question: str = "what is shown in this image?") -> dict:
64-
operation_type_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
67+
def question_image(
68+
self, image: Image.Image, question: str = "what is shown in this image?"
69+
) -> dict:
70+
operation_type_prompt = (
71+
f"<s_docvqa><s_question>{question}</s_question><s_answer>"
72+
)
6573
return self._process_image(image, operation_type_prompt=operation_type_prompt)
6674

6775
def _process_image(self, image: Image.Image, operation_type_prompt: str) -> dict:
68-
decoder_input_ids = self.processor.tokenizer(operation_type_prompt, add_special_tokens=False, return_tensors="pt").input_ids
76+
decoder_input_ids = self.processor.tokenizer(
77+
operation_type_prompt, add_special_tokens=False, return_tensors="pt"
78+
).input_ids
6979
pixel_values = self.processor(image, return_tensors="pt").pixel_values
7080

7181
outputs = self.model.generate(
@@ -82,7 +92,11 @@ def _process_image(self, image: Image.Image, operation_type_prompt: str) -> dict
8292
)
8393

8494
sequence = self.processor.batch_decode(outputs.sequences)[0]
85-
sequence = sequence.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "")
86-
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first operation_type start token
95+
sequence = sequence.replace(self.processor.tokenizer.eos_token, "").replace(
96+
self.processor.tokenizer.pad_token, ""
97+
)
98+
sequence = re.sub(
99+
r"<.*?>", "", sequence, count=1
100+
).strip() # remove first operation_type start token
87101

88102
return self.processor.token2json(sequence)

src/datafog/main.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
from .donuttransformer import DonutImageProcessor, PipelineOperationType
2-
from PIL import Image
3-
from .pii_annotation import PIIAnnotationModel, PIIAnnotationRequest, PIIAnnotationPipeline
4-
from typing import Any, List, Optional, Tuple
5-
from enum import Enum
6-
7-
1+
from .donuttransformer import DonutImageProcessor, PipelineOperationType
2+
from .pii_annotation import (
3+
PIIAnnotationModel,
4+
PIIAnnotationPipeline,
5+
PIIAnnotationRequest,
6+
)
87

98

109
class DataFog:
@@ -26,21 +25,25 @@ class DataFog:
2625
process_image_with_text(file: bytes, text: str) -> dict:
2726
Processes the image for different operations using the Donut model and text for PII entities using the Spacy model.
2827
"""
29-
def __init__(self, operation_type: PipelineOperationType, text_pii_annotation: bool = True, image_processor: bool = False):
28+
29+
def __init__(
30+
self,
31+
operation_type: PipelineOperationType,
32+
text_pii_annotation: bool = True,
33+
image_processor: bool = False,
34+
):
3035
self.text_pii_annotation = text_pii_annotation
3136
if text_pii_annotation:
3237
self.text_annotator = PIIAnnotationModel()
3338
self.image_processor = None
3439
if image_processor:
3540
self.image_processor = DonutImageProcessor(operation_type=operation_type)
3641

37-
3842
def process_text(self, text: str) -> list:
3943
request = PIIAnnotationRequest(text=text)
4044
workflow = PIIAnnotationPipeline(request=request, model=self.text_annotator)
4145
entities = workflow.process_request()
4246
return entities
43-
4447

4548
def process_image(self, file: bytes) -> dict:
4649
image = DonutImageProcessor.read_image(file)
@@ -49,7 +52,9 @@ def process_image(self, file: bytes) -> dict:
4952
elif self.image_processor.operation_type == PipelineOperationType.PARSE_IMAGE:
5053
result = self.image_processor.parse_image(image)
5154
else:
52-
raise ValueError(f"Unsupported operation type: {self.image_processor.operation_type}")
55+
raise ValueError(
56+
f"Unsupported operation type: {self.image_processor.operation_type}"
57+
)
5358
return result
5459

5560
def annotate_pii_in_images(self, file: bytes) -> dict:
@@ -59,9 +64,8 @@ def annotate_pii_in_images(self, file: bytes) -> dict:
5964
for key, value in result.items():
6065
if isinstance(value, list):
6166
for item in value:
62-
if 'nm' in item:
63-
item['entities'] = self.process_text(item['nm']) # Store entities directly within the item
67+
if "nm" in item:
68+
item["entities"] = self.process_text(
69+
item["nm"]
70+
) # Store entities directly within the item
6471
return result # Return the modified result dictionary
65-
66-
67-

src/datafog/pii_annotation.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Description: Define the data models for the PII Detection Workflow
22

3-
from typing import Any, List, Optional, Tuple
3+
from typing import Any, List, Tuple
4+
45
import en_spacy_pii_fast
5-
import requests
6-
from pydantic import BaseModel, HttpUrl, FilePath, DirectoryPath
6+
from pydantic import BaseModel, DirectoryPath, FilePath, HttpUrl
77

88

99
class PIIAnnotationModel(BaseModel):
@@ -32,8 +32,7 @@ def validate_fields(self):
3232
if not any([self.text, self.file_path, self.url, self.directory_path]):
3333
raise ValueError("At least one of the fields must be filled out")
3434
return True
35-
36-
35+
3736

3837
class PIIAnnotationResponse(BaseModel):
3938
text: str
@@ -57,4 +56,3 @@ def run(self):
5756

5857
class Config:
5958
arbitrary_types_allowed = True
60-

tests/test_donuttransformer.py

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,42 @@
1-
import unittest
2-
from PIL import Image
31
import pytest
4-
from PIL import Image
5-
from datafog import PipelineOperationType, DonutImageProcessor
6-
7-
# class TestDonutImageProcessor(unittest.TestCase):
8-
# def setUp(self):
9-
# self.processor = DonutImageProcessor(operation_type=PipelineOperationType.PARSE_IMAGE)
10-
# with open("/Users/sidmohan/Desktop/v3.0.0/datafog-python/src/datafog/test-invoice.png", "rb") as image_file:
11-
# self.image_data = image_file.read()
12-
13-
# def test_parse_image(self):
14-
# image = DonutImageProcessor.read_image(self.image_data)
15-
# result = self.processor.parse_image(image)
16-
# self.assertIsInstance(result, dict)
17-
# self.assertIn('MEDICAL BILLING INVOICE', [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist])
18-
# self.assertIn('12245', [item['price']['unitprice'] for sublist in result.values() if isinstance(sublist, list) for item in sublist if 'unitprice' in item['price']])
19-
# self.assertIn('Full Check Up', [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist])
20-
# self.assertIn('Ear & Throat Examination', [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist])
21-
22-
# if __name__ == '__main__':
23-
# unittest.main()
242

3+
from datafog import DonutImageProcessor, PipelineOperationType
254

265

276
@pytest.fixture
287
def processor():
298
return DonutImageProcessor(operation_type=PipelineOperationType.PARSE_IMAGE)
309

10+
3111
def test_parse_image(processor):
3212
sample_image_path = "tests/test-invoice.png"
3313
with open(sample_image_path, "rb") as image_file:
3414
image_data = image_file.read()
3515
image = DonutImageProcessor.read_image(image_data)
3616
result = processor.parse_image(image)
3717
assert isinstance(result, dict)
38-
assert 'MEDICAL BILLING INVOICE' in [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist]
39-
assert '12245' in [item['price']['unitprice'] for sublist in result.values() if isinstance(sublist, list) for item in sublist if 'unitprice' in item['price']]
40-
assert 'Full Check Up' in [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist]
41-
assert 'Ear & Throat Examination' in [item['nm'] for sublist in result.values() if isinstance(sublist, list) for item in sublist]
18+
assert "MEDICAL BILLING INVOICE" in [
19+
item["nm"]
20+
for sublist in result.values()
21+
if isinstance(sublist, list)
22+
for item in sublist
23+
]
24+
assert "12245" in [
25+
item["price"]["unitprice"]
26+
for sublist in result.values()
27+
if isinstance(sublist, list)
28+
for item in sublist
29+
if "unitprice" in item["price"]
30+
]
31+
assert "Full Check Up" in [
32+
item["nm"]
33+
for sublist in result.values()
34+
if isinstance(sublist, list)
35+
for item in sublist
36+
]
37+
assert "Ear & Throat Examination" in [
38+
item["nm"]
39+
for sublist in result.values()
40+
if isinstance(sublist, list)
41+
for item in sublist
42+
]

0 commit comments

Comments
 (0)