Commit 4abd1ea

Merge pull request #21 from DataFog/v3.0.0-beta.5
V3.0.0
2 parents: 808696a + 622c903

23 files changed (+465 additions, -1405 deletions)

.DS_Store

Binary file not shown (0 Bytes).

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -16,4 +16,4 @@ build/
 node_modules/
 datafog_debug.log
 sotu_2023.txt
-.DS_Store
+.DS_Store

README.md

Lines changed: 2 additions & 5 deletions
@@ -27,7 +27,6 @@ DataFog is an open-source DevSecOps platform that lets you scan and redact Perso

 ![image](https://github.com/DataFog/datafog-python/assets/61345237/57fba4e5-21cc-458f-ac6a-6fbbb70a8de1)

-
 How do you keep:

 - Customer PII
@@ -45,7 +44,6 @@ from entering a Generative AI environment in the first place? What you need is a

 ![image](https://github.com/DataFog/datafog-python/assets/61345237/91f4634a-8a9f-4621-81bc-09930feda78a)

-
 ### There's lots of PII tools out there; why DataFog?

 If you look at the landscape of PII detection tools, their very existence was in many cases driven by regulatory requirements (i.e. 'comply with CCPA/GDPR/HIPAA').
@@ -55,11 +53,10 @@ are purpose-built for the problem that they are solving.
 However, Generative AI changes how we think about privacy. There's now a changing set of privacy requirements (new M&A deals, internal discussions means new terms to scan/redact) as well as different and varying document sources to contend with. PII detection is no longer just about compliance, it's an active - and for some, new - internal security threat for CISOs and Eng Leaders to contend with. We want DataFog to be built and driven to meet the needs of the open-source community as they tackle this challenge.

 ### Roadmap
-DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!
-
-![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)

+DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!

+![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)

 ## Installation

requirements.txt

Lines changed: 10 additions & 14 deletions
@@ -1,14 +1,10 @@
-# spacy==3.4.4
-# thinc==8.1.0
-presidio_analyzer==2.2.353
-pandas==2.2.1
-pytest==8.0.2
-Requests==2.31.0
-aiohttp==3.8.2
-yarl==1.8.1
-frozenlist==1.3.1
-en_spacy_pii_fast
-unstructured[pdf]
-unstructured[pptx]
-
-
+en_spacy_pii_fast==0.0.0
+transformers
+torch
+pyspark
+pydantic
+pandas
+Pillow
+sentencepiece
+protobuf
+pytest

setup.py

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,7 @@


 def __version__():
-    return "2.4.0"
+    return "3.0.0"


 project_urls = {
@@ -27,9 +27,7 @@ def __version__():
     long_description=long_description,
     long_description_content_type="text/markdown",
     install_requires=[
-        "pandas==2.2.1",
-        "presidio_analyzer==2.2.353",
-        "pytest==8.0.2",
+        "pandas==2.2.2",
         "Requests==2.31.0",
         "spacy==3.4.4",
         "en_spacy_pii_fast",

src/datafog/__about__.py

Lines changed: 1 addition & 2 deletions
@@ -1,2 +1 @@
-# SSOT for the package version
-__version__ = "2.4.0"
+__version__ = "3.0.0"

src/datafog/__init__.py

Lines changed: 15 additions & 146 deletions
@@ -1,149 +1,18 @@
-# datafog-python/src/datafog/__init__.py
-import json
-import logging
-import tempfile
-from pathlib import Path
-from typing import List
-
-import pandas as pd
-import requests
-import spacy
-from unstructured.partition.auto import partition
-
-from .__about__ import __version__
-from .pii_tools import PresidioEngine
-
-logger = logging.getLogger(__name__).setLevel(logging.ERROR)
+from .config import OperationConfig, PipelineOperationType
+from .donuttransformer import DonutImageProcessor
+from .main import DataFog
+from .pii_annotation import (
+    PIIAnnotationModel,
+    PIIAnnotationPipeline,
+    PIIAnnotationRequest,
+)

 __all__ = [
-    "__version__",
-    "PresidioEngine",
+    "DataFog",
+    "PipelineOperationType",
+    "OperationConfig",
+    "DonutImageProcessor",
+    "PIIAnnotationModel",
+    "PIIAnnotationRequest",
+    "PIIAnnotationPipeline",
 ]
-
-
-class DataFog:
-    """
-    DataFog class for performing privacy operations on input data.
-
-    This class uses the Spacy library to process and analyze input data for
-    personally identifiable information (PII) and applies specified privacy
-    operations to protect sensitive data.
-
-    Attributes:
-        nlp (spacy.lang): Spacy language model for PII detection.
-    """
-
-    # Maintaining support
-    def __init__(self):
-        """
-        Initialize the DataFog instance.
-
-        Loads the Spacy language model for PII detection.
-        """
-        self.nlp = spacy.load("en_spacy_pii_fast")
-
-    @staticmethod
-    def client():
-        """
-        Create a new instance of the DataFog client.
-
-        Returns:
-            DataFog: A new instance of the DataFog client.
-        """
-        return DataFog()
-
-    @staticmethod
-    def upload_file(uploaded_file_path):
-        uploaded_file_path = Path(uploaded_file_path)
-        bytes_data = uploaded_file_path.read_bytes()
-        texts = {}
-
-        if not uploaded_file_path.exists():
-            return "File not found."
-        else:
-
-            temp_file = tempfile.NamedTemporaryFile(
-                delete=True, suffix=uploaded_file_path.suffix
-            )
-            temp_file.write(bytes_data)
-            elements = partition(temp_file.name)
-            text = ""
-            for element in elements:
-                text += element.text + "\n"
-            texts[uploaded_file_path.name] = text
-
-        return texts
-
-    @staticmethod
-    def upload_files(uploaded_files: List[str]):
-        """
-        Process uploaded files.
-
-        Args:
-            uploaded_files (List[str]): A list of file paths uploaded by the user.
-
-        Returns:
-            Dict[str, str]: A dictionary containing the processed text for each uploaded file.
-        """
-        texts = {}
-        for uploaded_file in uploaded_files:
-            result = DataFog.upload_file(uploaded_file)
-            texts.update(result)
-        return texts
-
-    def __call__(self, input_source, privacy_operation):
-        """
-        Process the input data and apply the specified privacy operation.
-
-        Args:
-            input_source (Union[str, pd.DataFrame]): The input data source.
-                Can be a URL, file path, or a string containing the data.
-                Supported file formats: CSV, TXT, JSON, Parquet.
-            privacy_operation (str): The privacy operation to apply.
-                Supported operations: 'redact', 'annotate'.
-        Returns:
-            str: The processed text with the applied privacy operation.
-
-        Raises:
-            ValueError: If an unsupported input source type or privacy operation is provided.
-        """
-        if isinstance(input_source, str):
-            if input_source.startswith(("http://", "https://")):
-                print("Downloading file from URL")
-                response = requests.get(input_source)
-                text = response.text
-            elif input_source.endswith((".csv", ".txt")):
-                print("Reading CSV/TXT from local path")
-                with open(input_source, "r") as file:
-                    text = file.read()
-            elif input_source.endswith(".json"):
-                print("Reading JSON from local path")
-                with open(input_source, "r") as file:
-                    data = json.load(file)
-                text = json.dumps(data)
-            elif input_source.endswith(".parquet"):
-                print("Reading Parquet from local path")
-                df = pd.read_parquet(input_source)
-                text = df.to_csv(index=False)
-            else:
-                text = input_source
-        else:
-            raise ValueError("Unsupported input source type")
-
-        doc = self.nlp(text)
-
-        # Chunk the text and perform privacy operation
-        for ent in doc.ents:
-            if ent.label_ in ["PERSON", "ORG", "GPE", "PHONE", "EMAIL", "URL"]:
-                # Perform privacy operation based on the entity type
-                if privacy_operation == "redact":
-                    text = text.replace(ent.text, "[REDACTED]")
-                elif privacy_operation == "annotate":
-                    text = text.replace(ent.text, f"[{ent.label_}]")
-
-                else:
-                    raise ValueError(
-                        f"Unsupported privacy operation: {privacy_operation}"
-                    )
-
-        return text
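
The new __init__.py swaps the old spaCy/Presidio exports for the v3 pipeline classes. A minimal smoke test of the reshaped public surface, assuming only the names listed in __all__ above (the DataFog and PIIAnnotation* constructors live in main.py and pii_annotation.py, which are not part of this diff, so nothing is instantiated here):

import datafog

# Names this commit re-exports from the package root via __all__.
expected = {
    "DataFog",
    "PipelineOperationType",
    "OperationConfig",
    "DonutImageProcessor",
    "PIIAnnotationModel",
    "PIIAnnotationRequest",
    "PIIAnnotationPipeline",
}

# Every name in __all__ should be importable directly from the top-level package.
missing = expected - set(dir(datafog))
assert not missing, f"missing exports: {missing}"
print("datafog exports:", sorted(expected))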

src/datafog/config.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class PipelineOperationType(Enum):
+    READ_IMAGE = "read_image"
+    PARSE_IMAGE = "parse_image"
+    TEXT_PII_ANNOTATION = "text_pii_annotation"
+    TEXT_PII_ANNOTATION_WITH_IMAGE = "text_pii_annotation_with_image"
+
+
+class ModelConfig(BaseModel):
+    model: str
+    processor: str
+
+
+class OperationConfig(BaseModel):
+    operation_type: PipelineOperationType
+    config: Optional[ModelConfig] = None
+
+    @classmethod
+    def model_validator(cls, v, values):
+        configs = {
+            "read_image": ModelConfig(
+                model="naver-clova-ix/donut-base-finetuned-rvlcdip",
+                processor="naver-clova-ix/donut-base-finetuned-rvlcdip",
+            ),
+            "parse_image": ModelConfig(
+                model="naver-clova-ix/donut-base-finetuned-cord-v2",
+                processor="naver-clova-ix/donut-base-finetuned-cord-v2",
+            ),
+        }
+        operation_type = values.get("operation_type")
+        if operation_type and operation_type.value in configs:
+            return configs[operation_type.value]
+        return v
+
+    class Config:
+        use_enum_values = True
+        validate_assignment = True
+        arbitrary_types_allowed = True
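
Note that model_validator here is written as a plain classmethod rather than a registered pydantic validator, and DonutImageProcessor (below) calls it directly. A minimal sketch of how the defaults resolve, using only the names defined in config.py above:

from datafog.config import ModelConfig, OperationConfig, PipelineOperationType

# PARSE_IMAGE maps to the CORD-v2 Donut checkpoint registered in the configs dict.
resolved = OperationConfig.model_validator(
    None, {"operation_type": PipelineOperationType.PARSE_IMAGE}
)
assert isinstance(resolved, ModelConfig)
print(resolved.model)      # naver-clova-ix/donut-base-finetuned-cord-v2
print(resolved.processor)  # naver-clova-ix/donut-base-finetuned-cord-v2

# Operation types without a registered default fall through to the passed-in value.
print(OperationConfig.model_validator(
    None, {"operation_type": PipelineOperationType.TEXT_PII_ANNOTATION}
))  # None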

src/datafog/donuttransformer.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import re
+import warnings
+from io import BytesIO
+
+from PIL import Image
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+
+from .config import OperationConfig, PipelineOperationType
+
+
+class DonutImageProcessor:
+    """
+    A class to process images using the Donut model for different operations like classification, parsing, and question answering.
+
+    Attributes:
+        operation_type (PipelineOperationType): The type of operation to perform (e.g., READ_IMAGE, PARSE_IMAGE).
+        processor (DonutProcessor): The processor associated with the specific Donut model.
+        model (VisionEncoderDecoderModel): The model loaded based on the operation type.
+        device (str): The device on which the model will run (default is 'cpu').
+
+    Methods:
+        read_image(file: bytes) -> Image.Image:
+            Reads an image from bytes, converts it to RGB if necessary, and returns the PIL Image.
+
+        classify_image(image: Image.Image) -> dict:
+            Processes the image for classification using the Donut model.
+
+        parse_image(image: Image.Image) -> dict:
+            Processes the image for parsing key information using the Donut model.
+
+        question_image(image: Image.Image, question: str) -> dict:
+            Processes the image to answer a question about the image using the Donut model.
+
+        _process_image(image: Image.Image, operation_type_prompt: str) -> dict:
+            A helper method to process the image with the model using a specific operation type prompt.
+    """
+
+    def __init__(self, operation_type: PipelineOperationType):
+        self.operation_type = operation_type
+        model_config = OperationConfig.model_validator(
+            None, {"operation_type": operation_type}
+        )
+        self.processor = DonutProcessor.from_pretrained(model_config.processor)
+        self.model = VisionEncoderDecoderModel.from_pretrained(model_config.model)
+        self.device = "cpu"
+        self.model.to(self.device)
+
+    @staticmethod
+    def read_image(file: bytes) -> Image.Image:
+        try:
+            image = Image.open(BytesIO(file))
+            if image.mode != "RGB":
+                warnings.warn("Image mode is not RGB. Converting to RGB.")
+                image = image.convert("RGB")
+            return image
+        except IOError as e:
+            raise ValueError(
+                f"Unable to read the image file: {e}. Ensure it is a valid image."
+            )
+
+    def classify_image(self, image: Image.Image) -> dict:
+        return self._process_image(image, operation_type_prompt="<s_rvlcdip>")
+
+    def parse_image(self, image: Image.Image) -> dict:
+        return self._process_image(image, operation_type_prompt="<s_cord-v2>")
+
+    def question_image(
+        self, image: Image.Image, question: str = "what is shown in this image?"
+    ) -> dict:
+        operation_type_prompt = (
+            f"<s_docvqa><s_question>{question}</s_question><s_answer>"
+        )
+        return self._process_image(image, operation_type_prompt=operation_type_prompt)
+
+    def _process_image(self, image: Image.Image, operation_type_prompt: str) -> dict:
+        decoder_input_ids = self.processor.tokenizer(
+            operation_type_prompt, add_special_tokens=False, return_tensors="pt"
+        ).input_ids
+        pixel_values = self.processor(image, return_tensors="pt").pixel_values
+
+        outputs = self.model.generate(
+            pixel_values.to(self.device),
+            decoder_input_ids=decoder_input_ids.to(self.device),
+            max_length=self.model.decoder.config.max_position_embeddings,
+            early_stopping=True,
+            pad_token_id=self.processor.tokenizer.pad_token_id,
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            use_cache=True,
+            num_beams=1,
+            bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
+            return_dict_in_generate=True,
+        )
+
+        sequence = self.processor.batch_decode(outputs.sequences)[0]
+        sequence = sequence.replace(self.processor.tokenizer.eos_token, "").replace(
+            self.processor.tokenizer.pad_token, ""
+        )
+        sequence = re.sub(
+            r"<.*?>", "", sequence, count=1
+        ).strip()  # remove first operation_type start token
+
+        return self.processor.token2json(sequence)
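
To exercise the new processor end-to-end, a usage sketch along these lines should work; "receipt.png" is a placeholder path, and the first run downloads the Donut checkpoints from the Hugging Face Hub, so network access and a few GB of disk are assumed:

from datafog.config import PipelineOperationType
from datafog.donuttransformer import DonutImageProcessor

# PARSE_IMAGE loads donut-base-finetuned-cord-v2 via OperationConfig.model_validator.
parser = DonutImageProcessor(PipelineOperationType.PARSE_IMAGE)

with open("receipt.png", "rb") as f:  # placeholder input file
    image = DonutImageProcessor.read_image(f.read())

print(parser.parse_image(image))  # dict produced by processor.token2json()

# Document classification uses READ_IMAGE, which loads donut-base-finetuned-rvlcdip.
classifier = DonutImageProcessor(PipelineOperationType.READ_IMAGE)
print(classifier.classify_image(image))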
