Commit 4abd1ea

Merge pull request #21 from DataFog/v3.0.0-beta.5
V3.0.0
2 parents: 808696a + 622c903

23 files changed (+465 additions, -1405 deletions)

.DS_Store

Binary file not shown (0 Bytes).

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -16,4 +16,4 @@ build/
 node_modules/
 datafog_debug.log
 sotu_2023.txt
-.DS_Store
+.DS_Store

README.md

Lines changed: 2 additions & 5 deletions
@@ -27,7 +27,6 @@ DataFog is an open-source DevSecOps platform that lets you scan and redact Perso

 ![image](https://github.com/DataFog/datafog-python/assets/61345237/57fba4e5-21cc-458f-ac6a-6fbbb70a8de1)

-
 How do you keep:

 - Customer PII
@@ -45,7 +44,6 @@ from entering a Generative AI environment in the first place? What you need is a

 ![image](https://github.com/DataFog/datafog-python/assets/61345237/91f4634a-8a9f-4621-81bc-09930feda78a)

-
 ### There's lots of PII tools out there; why DataFog?

 If you look at the landscape of PII detection tools, their very existence was in many cases driven by regulatory requirements (i.e. 'comply with CCPA/GDPR/HIPAA').
@@ -55,11 +53,10 @@ are purpose-built for the problem that they are solving.
 However, Generative AI changes how we think about privacy. There's now a changing set of privacy requirements (new M&A deals, internal discussions means new terms to scan/redact) as well as different and varying document sources to contend with. PII detection is no longer just about compliance, it's an active - and for some, new - internal security threat for CISOs and Eng Leaders to contend with. We want DataFog to be built and driven to meet the needs of the open-source community as they tackle this challenge.

 ### Roadmap
-DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!
-
-![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)

+DataFog is an active project with regular weekly releases to production (typically on/around Monday evenings US PT). Here's a snapshot of our coming roadmap; if you have questions or would like to weigh in, join our discord and let us know what we can do to make the product better!

+![image](https://github.com/DataFog/datafog-python/assets/61345237/62964d22-a221-4f1d-a0e6-0cc99de2ba92)

 ## Installation

requirements.txt

Lines changed: 10 additions & 14 deletions
@@ -1,14 +1,10 @@
-# spacy==3.4.4
-# thinc==8.1.0
-presidio_analyzer==2.2.353
-pandas==2.2.1
-pytest==8.0.2
-Requests==2.31.0
-aiohttp==3.8.2
-yarl==1.8.1
-frozenlist==1.3.1
-en_spacy_pii_fast
-unstructured[pdf]
-unstructured[pptx]
-
-
+en_spacy_pii_fast==0.0.0
+transformers
+torch
+pyspark
+pydantic
+pandas
+Pillow
+sentencepiece
+protobuf
+pytest

setup.py

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,7 @@


 def __version__():
-    return "2.4.0"
+    return "3.0.0"


 project_urls = {
@@ -27,9 +27,7 @@ def __version__():
     long_description=long_description,
     long_description_content_type="text/markdown",
     install_requires=[
-        "pandas==2.2.1",
-        "presidio_analyzer==2.2.353",
-        "pytest==8.0.2",
+        "pandas==2.2.2",
         "Requests==2.31.0",
         "spacy==3.4.4",
         "en_spacy_pii_fast",

src/datafog/__about__.py

Lines changed: 1 addition & 2 deletions
@@ -1,2 +1 @@
-# SSOT for the package version
-__version__ = "2.4.0"
+__version__ = "3.0.0"

src/datafog/__init__.py

Lines changed: 15 additions & 146 deletions
@@ -1,149 +1,18 @@
-# datafog-python/src/datafog/__init__.py
-import json
-import logging
-import tempfile
-from pathlib import Path
-from typing import List
-
-import pandas as pd
-import requests
-import spacy
-from unstructured.partition.auto import partition
-
-from .__about__ import __version__
-from .pii_tools import PresidioEngine
-
-logger = logging.getLogger(__name__).setLevel(logging.ERROR)
+from .config import OperationConfig, PipelineOperationType
+from .donuttransformer import DonutImageProcessor
+from .main import DataFog
+from .pii_annotation import (
+    PIIAnnotationModel,
+    PIIAnnotationPipeline,
+    PIIAnnotationRequest,
+)

 __all__ = [
-    "__version__",
-    "PresidioEngine",
+    "DataFog",
+    "PipelineOperationType",
+    "OperationConfig",
+    "DonutImageProcessor",
+    "PIIAnnotationModel",
+    "PIIAnnotationRequest",
+    "PIIAnnotationPipeline",
 ]
-
-
-class DataFog:
-    """
-    DataFog class for performing privacy operations on input data.
-
-    This class uses the Spacy library to process and analyze input data for
-    personally identifiable information (PII) and applies specified privacy
-    operations to protect sensitive data.
-
-    Attributes:
-        nlp (spacy.lang): Spacy language model for PII detection.
-    """
-
-    # Maintaining support
-    def __init__(self):
-        """
-        Initialize the DataFog instance.
-
-        Loads the Spacy language model for PII detection.
-        """
-        self.nlp = spacy.load("en_spacy_pii_fast")
-
-    @staticmethod
-    def client():
-        """
-        Create a new instance of the DataFog client.
-
-        Returns:
-            DataFog: A new instance of the DataFog client.
-        """
-        return DataFog()
-
-    @staticmethod
-    def upload_file(uploaded_file_path):
-        uploaded_file_path = Path(uploaded_file_path)
-        bytes_data = uploaded_file_path.read_bytes()
-        texts = {}
-
-        if not uploaded_file_path.exists():
-            return "File not found."
-        else:
-
-            temp_file = tempfile.NamedTemporaryFile(
-                delete=True, suffix=uploaded_file_path.suffix
-            )
-            temp_file.write(bytes_data)
-            elements = partition(temp_file.name)
-            text = ""
-            for element in elements:
-                text += element.text + "\n"
-            texts[uploaded_file_path.name] = text
-
-        return texts
-
-    @staticmethod
-    def upload_files(uploaded_files: List[str]):
-        """
-        Process uploaded files.
-
-        Args:
-            uploaded_files (List[str]): A list of file paths uploaded by the user.
-
-        Returns:
-            Dict[str, str]: A dictionary containing the processed text for each uploaded file.
-        """
-        texts = {}
-        for uploaded_file in uploaded_files:
-            result = DataFog.upload_file(uploaded_file)
-            texts.update(result)
-        return texts
-
-    def __call__(self, input_source, privacy_operation):
-        """
-        Process the input data and apply the specified privacy operation.
-
-        Args:
-            input_source (Union[str, pd.DataFrame]): The input data source.
-                Can be a URL, file path, or a string containing the data.
-                Supported file formats: CSV, TXT, JSON, Parquet.
-            privacy_operation (str): The privacy operation to apply.
-                Supported operations: 'redact', 'annotate'.
-        Returns:
-            str: The processed text with the applied privacy operation.
-
-        Raises:
-            ValueError: If an unsupported input source type or privacy operation is provided.
-        """
-        if isinstance(input_source, str):
-            if input_source.startswith(("http://", "https://")):
-                print("Downloading file from URL")
-                response = requests.get(input_source)
-                text = response.text
-            elif input_source.endswith((".csv", ".txt")):
-                print("Reading CSV/TXT from local path")
-                with open(input_source, "r") as file:
-                    text = file.read()
-            elif input_source.endswith(".json"):
-                print("Reading JSON from local path")
-                with open(input_source, "r") as file:
-                    data = json.load(file)
-                text = json.dumps(data)
-            elif input_source.endswith(".parquet"):
-                print("Reading Parquet from local path")
-                df = pd.read_parquet(input_source)
-                text = df.to_csv(index=False)
-            else:
-                text = input_source
-        else:
-            raise ValueError("Unsupported input source type")
-
-        doc = self.nlp(text)
-
-        # Chunk the text and perform privacy operation
-        for ent in doc.ents:
-            if ent.label_ in ["PERSON", "ORG", "GPE", "PHONE", "EMAIL", "URL"]:
-                # Perform privacy operation based on the entity type
-                if privacy_operation == "redact":
-                    text = text.replace(ent.text, "[REDACTED]")
-                elif privacy_operation == "annotate":
-                    text = text.replace(ent.text, f"[{ent.label_}]")
-
-                else:
-                    raise ValueError(
-                        f"Unsupported privacy operation: {privacy_operation}"
-                    )
-
-        return text
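
The new __init__.py swaps the old spaCy/Presidio exports for the v3 pipeline classes. A minimal smoke test of the reshaped public surface, assuming only the names listed in __all__ above (the DataFog and PIIAnnotation* constructors live in main.py and pii_annotation.py, which are not part of this diff, so nothing is instantiated here):

import datafog

# Names this commit re-exports from the package root via __all__.
expected = {
    "DataFog",
    "PipelineOperationType",
    "OperationConfig",
    "DonutImageProcessor",
    "PIIAnnotationModel",
    "PIIAnnotationRequest",
    "PIIAnnotationPipeline",
}

# Every name in __all__ should be importable directly from the top-level package.
missing = expected - set(dir(datafog))
assert not missing, f"missing exports: {missing}"
print("datafog exports:", sorted(expected))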

src/datafog/config.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class PipelineOperationType(Enum):
+    READ_IMAGE = "read_image"
+    PARSE_IMAGE = "parse_image"
+    TEXT_PII_ANNOTATION = "text_pii_annotation"
+    TEXT_PII_ANNOTATION_WITH_IMAGE = "text_pii_annotation_with_image"
+
+
+class ModelConfig(BaseModel):
+    model: str
+    processor: str
+
+
+class OperationConfig(BaseModel):
+    operation_type: PipelineOperationType
+    config: Optional[ModelConfig] = None
+
+    @classmethod
+    def model_validator(cls, v, values):
+        configs = {
+            "read_image": ModelConfig(
+                model="naver-clova-ix/donut-base-finetuned-rvlcdip",
+                processor="naver-clova-ix/donut-base-finetuned-rvlcdip",
+            ),
+            "parse_image": ModelConfig(
+                model="naver-clova-ix/donut-base-finetuned-cord-v2",
+                processor="naver-clova-ix/donut-base-finetuned-cord-v2",
+            ),
+        }
+        operation_type = values.get("operation_type")
+        if operation_type and operation_type.value in configs:
+            return configs[operation_type.value]
+        return v
+
+    class Config:
+        use_enum_values = True
+        validate_assignment = True
+        arbitrary_types_allowed = True
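
Note that model_validator here is written as a plain classmethod rather than a registered pydantic validator, and DonutImageProcessor (below) calls it directly. A minimal sketch of how the defaults resolve, using only the names defined in config.py above:

from datafog.config import ModelConfig, OperationConfig, PipelineOperationType

# PARSE_IMAGE maps to the CORD-v2 Donut checkpoint registered in the configs dict.
resolved = OperationConfig.model_validator(
    None, {"operation_type": PipelineOperationType.PARSE_IMAGE}
)
assert isinstance(resolved, ModelConfig)
print(resolved.model)      # naver-clova-ix/donut-base-finetuned-cord-v2
print(resolved.processor)  # naver-clova-ix/donut-base-finetuned-cord-v2

# Operation types without a registered default fall through to the passed-in value.
print(OperationConfig.model_validator(
    None, {"operation_type": PipelineOperationType.TEXT_PII_ANNOTATION}
))  # None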

src/datafog/donuttransformer.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import re
+import warnings
+from io import BytesIO
+
+from PIL import Image
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+
+from .config import OperationConfig, PipelineOperationType
+
+
+class DonutImageProcessor:
+    """
+    A class to process images using the Donut model for different operations like classification, parsing, and question answering.
+
+    Attributes:
+        operation_type (PipelineOperationType): The type of operation to perform (e.g., READ_IMAGE, PARSE_IMAGE).
+        processor (DonutProcessor): The processor associated with the specific Donut model.
+        model (VisionEncoderDecoderModel): The model loaded based on the operation type.
+        device (str): The device on which the model will run (default is 'cpu').
+
+    Methods:
+        read_image(file: bytes) -> Image.Image:
+            Reads an image from bytes, converts it to RGB if necessary, and returns the PIL Image.
+
+        classify_image(image: Image.Image) -> dict:
+            Processes the image for classification using the Donut model.
+
+        parse_image(image: Image.Image) -> dict:
+            Processes the image for parsing key information using the Donut model.
+
+        question_image(image: Image.Image, question: str) -> dict:
+            Processes the image to answer a question about the image using the Donut model.
+
+        _process_image(image: Image.Image, operation_type_prompt: str) -> dict:
+            A helper method to process the image with the model using a specific operation type prompt.
+    """
+
+    def __init__(self, operation_type: PipelineOperationType):
+        self.operation_type = operation_type
+        model_config = OperationConfig.model_validator(
+            None, {"operation_type": operation_type}
+        )
+        self.processor = DonutProcessor.from_pretrained(model_config.processor)
+        self.model = VisionEncoderDecoderModel.from_pretrained(model_config.model)
+        self.device = "cpu"
+        self.model.to(self.device)
+
+    @staticmethod
+    def read_image(file: bytes) -> Image.Image:
+        try:
+            image = Image.open(BytesIO(file))
+            if image.mode != "RGB":
+                warnings.warn("Image mode is not RGB. Converting to RGB.")
+                image = image.convert("RGB")
+            return image
+        except IOError as e:
+            raise ValueError(
+                f"Unable to read the image file: {e}. Ensure it is a valid image."
+            )
+
+    def classify_image(self, image: Image.Image) -> dict:
+        return self._process_image(image, operation_type_prompt="<s_rvlcdip>")
+
+    def parse_image(self, image: Image.Image) -> dict:
+        return self._process_image(image, operation_type_prompt="<s_cord-v2>")
+
+    def question_image(
+        self, image: Image.Image, question: str = "what is shown in this image?"
+    ) -> dict:
+        operation_type_prompt = (
+            f"<s_docvqa><s_question>{question}</s_question><s_answer>"
+        )
+        return self._process_image(image, operation_type_prompt=operation_type_prompt)
+
+    def _process_image(self, image: Image.Image, operation_type_prompt: str) -> dict:
+        decoder_input_ids = self.processor.tokenizer(
+            operation_type_prompt, add_special_tokens=False, return_tensors="pt"
+        ).input_ids
+        pixel_values = self.processor(image, return_tensors="pt").pixel_values
+
+        outputs = self.model.generate(
+            pixel_values.to(self.device),
+            decoder_input_ids=decoder_input_ids.to(self.device),
+            max_length=self.model.decoder.config.max_position_embeddings,
+            early_stopping=True,
+            pad_token_id=self.processor.tokenizer.pad_token_id,
+            eos_token_id=self.processor.tokenizer.eos_token_id,
+            use_cache=True,
+            num_beams=1,
+            bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
+            return_dict_in_generate=True,
+        )
+
+        sequence = self.processor.batch_decode(outputs.sequences)[0]
+        sequence = sequence.replace(self.processor.tokenizer.eos_token, "").replace(
+            self.processor.tokenizer.pad_token, ""
+        )
+        sequence = re.sub(
+            r"<.*?>", "", sequence, count=1
+        ).strip()  # remove first operation_type start token
+
+        return self.processor.token2json(sequence)
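
To exercise the new processor end-to-end, a usage sketch along these lines should work; "receipt.png" is a placeholder path, and the first run downloads the Donut checkpoints from the Hugging Face Hub, so network access and a few GB of disk are assumed:

from datafog.config import PipelineOperationType
from datafog.donuttransformer import DonutImageProcessor

# PARSE_IMAGE loads donut-base-finetuned-cord-v2 via OperationConfig.model_validator.
parser = DonutImageProcessor(PipelineOperationType.PARSE_IMAGE)

with open("receipt.png", "rb") as f:  # placeholder input file
    image = DonutImageProcessor.read_image(f.read())

print(parser.parse_image(image))  # dict produced by processor.token2json()

# Document classification uses READ_IMAGE, which loads donut-base-finetuned-rvlcdip.
classifier = DonutImageProcessor(PipelineOperationType.READ_IMAGE)
print(classifier.classify_image(image))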
