Skip to content

Commit c08bfd4

Browse files
Merge pull request #147 from SuffolkLITLab/replace-passivepy
Replace passivepy with a call to an LLM
2 parents 0c936f6 + a720859 commit c08bfd4

File tree

12 files changed

+1837
-108
lines changed

12 files changed

+1837
-108
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@ dev-testing/.DS_Store
1111
.env
1212
.venv
1313
venv
14+
*results.json
15+
.code/

LICENSE

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
22+
23+
ATTRIBUTIONS:
24+
25+
This project contains code from PassivePy (https://github.com/mitramir55/PassivePy),
26+
which is included under the MIT License.
27+
28+
Copyright (c) 2021 Mitra Mirshafiee

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,19 @@ cp .env.example .env
3838

3939
The library looks for `SPOT_TOKEN` or `TOOLS_TOKEN` for Spot access and `OPENAI_API_KEY` with an optional `OPENAI_ORGANIZATION` for OpenAI features. Any standard environment variable loader that populates those values will work; the package uses [python-dotenv](https://github.com/theskumar/python-dotenv) to read `.env` automatically.
4040

41+
## Passive Voice Evaluation
42+
43+
You can sanity check the LLM-backed passive voice detector with [Promptfoo](https://promptfoo.dev/).
44+
45+
```bash
46+
promptfoo eval -c promptfooconfig.yaml
47+
```
48+
49+
The eval uses `formfyxer/tests/passive_voice_test_dataset.csv` to ensure sentences labeled as passive produce a non-empty `fragments` array. If you change the prompt inside `formfyxer/passive_voice_detection.py`, copy the updated text (the `system_prompt` and numbering format) into `promptfooconfig.yaml` so the evaluation mirrors runtime behavior.
50+
51+
Raw percentage correct performance using gpt-5-nano on the benchmark dataset is **95.56%** (we did not
52+
calculate a more specific F1 score).
53+
4154
## Functions
4255

4356
Functions from `pdf_wrangling` are found on [our documentation site](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/reference/formfyxer/pdf_wrangling).

dev-testing/passive_example.pdf

1.43 KB
Binary file not shown.

formfyxer/docx_wrangling.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def add_run_after(run, text):
5252

5353

5454
def update_docx(
55-
document: Union[docx.document.Document, str], modified_runs: List[Tuple[int, int, str, int]]
55+
document: Union[docx.document.Document, str],
56+
modified_runs: List[Tuple[int, int, str, int]],
5657
) -> docx.document.Document:
5758
"""Update the document with the modified runs.
5859
@@ -449,7 +450,7 @@ def make_docx_plain_language(docx_path: str) -> docx.document.Document:
449450
)
450451
return update_docx(docx.Document(docx_path), guesses)
451452

452-
453+
453454
def modify_docx_with_openai_guesses(docx_path: str) -> docx.document.Document:
454455
"""Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
455456

formfyxer/lit_explorer.py

Lines changed: 47 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
# Updated on 2022-12-12
2-
31
import os
42
import re
53
import subprocess
6-
#import spacy
7-
#from spacy.tokens import Doc
4+
85
from pdfminer.high_level import extract_text
96
from pdfminer.layout import LAParams
107

@@ -22,8 +19,7 @@
2219
from sklearn.preprocessing import normalize
2320
from joblib import load
2421
import nltk
25-
from nltk.tokenize import sent_tokenize
26-
#from PassivePySrc import PassivePy
22+
2723
import eyecite
2824
from enum import Enum
2925
import sigfig
@@ -68,6 +64,8 @@
6864
from openai import OpenAI
6965
from dotenv import load_dotenv
7066

67+
from .passive_voice_detection import detect_passive_voice_segments, split_sentences
68+
7169
from transformers import GPT2TokenizerFast
7270

7371
load_dotenv()
@@ -76,32 +74,6 @@
7674

7775
stop_words = set(stopwords.words("english"))
7876

79-
#try:
80-
# this takes a while to load
81-
# import en_core_web_lg
82-
83-
# nlp = en_core_web_lg.load()
84-
#except:
85-
# try:
86-
# import en_core_web_sm
87-
88-
# nlp = en_core_web_sm.load()
89-
# except:
90-
# print("Downloading word2vec model en_core_web_sm")
91-
# import subprocess
92-
93-
# bashCommand = "python -m spacy download en_core_web_sm"
94-
# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
95-
# output, error = process.communicate()
96-
# print(f"output of word2vec model download: {str(output)}")
97-
# import en_core_web_sm
98-
99-
# nlp = en_core_web_sm.load()
100-
101-
102-
#passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
103-
104-
10577
# Load local variables, models, and API key(s).
10678

10779
###############
@@ -374,7 +346,7 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
374346
text: a string of multiple words to vectorize
375347
tools_token: the token to tools.suffolklitlab.org, used for micro-service
376348
to reduce the amount of memory you need on your machine. If
377-
not passed, you need to have `en_core_web_lg` installed. NOTE: this
349+
not passed, you need to have `en_core_web_lg` installed. NOTE: this
378350
last bit is no longer correct, you have to use the micro-service
379351
as we have had to remove spaCy due to a breaking change
380352
"""
@@ -400,9 +372,9 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
400372
return [np.array(embed) for embed in r.json().get("embeddings", [])]
401373
else:
402374
raise Exception("Couldn't access tools.suffolklitlab.org, no token provided")
403-
#if isinstance(text, str):
375+
# if isinstance(text, str):
404376
# return norm(nlp(text).vector)
405-
#else:
377+
# else:
406378
# return [norm(nlp(indiv_text).vector) for indiv_text in text]
407379

408380

@@ -881,96 +853,65 @@ def describe_form(text, creds: Optional[OpenAiCreds] = None) -> str:
881853
command = 'If the above is a court form, write a brief description of its purpose at a sixth grade reading level, otherwise respond with the word "abortthisnow.".'
882854
return complete_with_command(text, command, 250, creds=creds)
883855

856+
884857
def needs_calculations(text: Union[str]) -> bool:
885-
# since we reomved SpaCy we can't use Doc,
886-
# so I rewrote this to provide similar functionality absent Doc
887-
# old code is commented out
888-
#def needs_calculations(text: Union[str, Doc]) -> bool:
858+
    # since we removed SpaCy we can't use Doc,
859+
# so I rewrote this to provide similar functionality absent Doc
860+
# old code is commented out
861+
# def needs_calculations(text: Union[str, Doc]) -> bool:
889862
"""A conservative guess at if a given form needs the filler to make math calculations,
890863
something that should be avoided. If"""
891864
CALCULATION_WORDS = ["subtract", "total", "minus", "multiply" "divide"]
892-
#if isinstance(text, str):
865+
# if isinstance(text, str):
893866
# doc = nlp(text)
894-
#else:
867+
# else:
895868
# doc = text
896-
#for token in doc:
869+
# for token in doc:
897870
# if token.text.lower() in CALCULATION_WORDS:
898871
# return True
899872
for word in CALCULATION_WORDS:
900873
if word in text.lower():
901874
return True
902-
875+
903876
# TODO(brycew): anything better than a binary yes-no value on this?
904877
return False
905878

906-
def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = None):
907-
"""
908-
Ping passive voice API for list of sentences using the passive voice
909-
"""
910-
if tools_token:
911-
headers = {
912-
"Authorization": "Bearer " + tools_token,
913-
"Content-Type": "application/json",
914-
}
915-
body = {"input": input}
916-
r = requests.post(
917-
"https://tools.suffolklitlab.org/passive/",
918-
headers=headers,
919-
data=json.dumps(body),
920-
)
921-
if not r.ok:
922-
raise Exception("Couldn't access tools.suffolklitlab.org")
923-
if isinstance(input, str):
924-
output = np.array(r.json().get("results", []),dtype=object)
925-
if len(output) <= 0:
926-
raise Exception("Vector from tools.suffolklitlab.org is empty")
927-
return output
928-
else:
929-
return [np.array(embed) for embed in r.json().get("results", [])]
930-
else:
931-
raise Exception("Couldn't access tools.suffolklitlab.org, no token provided")
932-
933879

934880
def get_passive_sentences(
935-
text: Union[List, str], tools_token: Optional[str] = None) -> List[Tuple[str, List[Tuple[int, int]]]]:
936-
"""Return a list of tuples, where each tuple represents a
937-
sentence in which passive voice was detected along with a list of the
938-
starting and ending position of each fragment that is phrased in the passive voice.
939-
The combination of the two can be used in the PDFStats frontend to highlight the
940-
passive text in an individual sentence.
941-
942-
Text can either be a string or a list of strings.
943-
If provided a single string, it will be tokenized with NTLK and
944-
sentences containing fewer than 2 words will be ignored.
881+
text: Union[List, str], tools_token: Optional[str] = None, model: str = "gpt-5-nano"
882+
) -> List[Tuple[str, List[Tuple[int, int]]]]:
883+
"""Return passive voice fragments for each sentence in ``text``.
884+
885+
The function relies on OpenAI's language model (via ``passive_voice_detection``)
886+
to detect passive constructions. ``tools_token`` is kept for backward compatibility
887+
but is no longer used.
888+
889+
Args:
890+
text (Union[List, str]): The input text or list of texts to analyze.
891+
tools_token (Optional[str], optional): Deprecated. Previously used for authentication with
892+
tools.suffolklitlab.org. Defaults to None.
893+
model (str, optional): The OpenAI model to use for detection. Defaults to "gpt-5-nano".
894+
Returns:
895+
List[Tuple[str, List[Tuple[int, int]]]]: A list of tuples, each containing the original text
896+
and a list of tuples representing the start and end positions of detected passive voice fragments.
897+
898+
Note:
899+
At least for now, the fragment detection is no longer meaningful (except in tokenized sentences) because
900+
the LLM detection simply returns the full original sentence if it contains passive voice. We have not reimplemented
901+
this behavior of PassivePy.
945902
"""
946-
# since we reomoved SpaCy, I rewrote this function to call the passive voice API
947-
# already up and running on tools.suffolklitlab.org
948-
# old code is commented out
949-
# Sepehri, A., Markowitz, D. M., & Mir, M. (2022, February 3).
950-
# PassivePy: A Tool to Automatically Identify Passive Voice in Big Text Data. Retrieved from psyarxiv.com/bwp3t
951-
#
952-
#if isinstance(text, str):
953-
# sentences = [s for s in sent_tokenize(text) if len(s.split(" ")) > 2]
954-
# if not sentences:
955-
# raise ValueError(
956-
# "There are no sentences over 2 words in the provided text."
957-
# )
958-
#elif isinstance(text, list):
959-
# sentences = text
960-
#else:
961-
# raise ValueError(f"Can't tokenize {type(text)} object into sentences")
962-
963-
#if not sentences:
964-
# return []
965-
966-
#passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
967-
#matching_rows = passive_text_df[passive_text_df["binary"] > 0]
903+
if tools_token:
904+
pass # deprecated
968905

969906
sentences_with_highlights = []
970-
tools_output = tools_passive(text, tools_token=tools_token) #list(zip(matching_rows["document"], matching_rows["all_passives"]))
971907

972-
#for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
973-
for item in tools_output:
908+
passive_voice_results = detect_passive_voice_segments(
909+
text,
910+
openai_client=client if client else None,
911+
model=model,
912+
)
913+
914+
for item in passive_voice_results:
974915
for fragment in item[1]:
975916
sentences_with_highlights.append(
976917
(
@@ -1262,7 +1203,7 @@ def parse_form(
12621203
new_names = field_names
12631204
new_names_conf = []
12641205

1265-
tokenized_sentences = sent_tokenize(original_text)
1206+
tokenized_sentences = split_sentences(original_text)
12661207
# No need to detect passive voice in very short sentences
12671208
sentences = [s for s in tokenized_sentences if len(s.split(" ")) > 2]
12681209

0 commit comments

Comments
 (0)