Skip to content

Commit c08bfd4

Browse files
Merge pull request #147 from SuffolkLITLab/replace-passivepy
Replace passivepy with a call to an LLM
2 parents 0c936f6 + a720859 commit c08bfd4

File tree

12 files changed

+1837
-108
lines changed

12 files changed

+1837
-108
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@ dev-testing/.DS_Store
1111
.env
1212
.venv
1313
venv
14+
*results.json
15+
.code/

LICENSE

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
22+
23+
ATTRIBUTIONS:
24+
25+
This project contains code from PassivePy (https://github.com/mitramir55/PassivePy),
26+
which is included under the MIT License.
27+
28+
Copyright (c) 2021 Mitra Mirshafiee

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,19 @@ cp .env.example .env
3838

3939
The library looks for `SPOT_TOKEN` or `TOOLS_TOKEN` for Spot access and `OPENAI_API_KEY` with an optional `OPENAI_ORGANIZATION` for OpenAI features. Any standard environment variable loader that populates those values will work; the package uses [python-dotenv](https://github.com/theskumar/python-dotenv) to read `.env` automatically.
4040

41+
## Passive Voice Evaluation
42+
43+
You can sanity check the LLM-backed passive voice detector with [Promptfoo](https://promptfoo.dev/).
44+
45+
```bash
46+
promptfoo eval -c promptfooconfig.yaml
47+
```
48+
49+
The eval uses `formfyxer/tests/passive_voice_test_dataset.csv` to ensure sentences labeled as passive produce a non-empty `fragments` array. If you change the prompt inside `formfyxer/passive_voice_detection.py`, copy the updated text (the `system_prompt` and numbering format) into `promptfooconfig.yaml` so the evaluation mirrors runtime behavior.
50+
51+
Raw percentage correct performance using gpt-5-nano on the benchmark dataset is **95.56%** (we did not
52+
calculate a more specific F1 score).
53+
4154
## Functions
4255

4356
Functions from `pdf_wrangling` are found on [our documentation site](https://suffolklitlab.org/docassemble-AssemblyLine-documentation/docs/reference/formfyxer/pdf_wrangling).

dev-testing/passive_example.pdf

1.43 KB
Binary file not shown.

formfyxer/docx_wrangling.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def add_run_after(run, text):
5252

5353

5454
def update_docx(
55-
document: Union[docx.document.Document, str], modified_runs: List[Tuple[int, int, str, int]]
55+
document: Union[docx.document.Document, str],
56+
modified_runs: List[Tuple[int, int, str, int]],
5657
) -> docx.document.Document:
5758
"""Update the document with the modified runs.
5859
@@ -449,7 +450,7 @@ def make_docx_plain_language(docx_path: str) -> docx.document.Document:
449450
)
450451
return update_docx(docx.Document(docx_path), guesses)
451452

452-
453+
453454
def modify_docx_with_openai_guesses(docx_path: str) -> docx.document.Document:
454455
"""Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
455456

formfyxer/lit_explorer.py

Lines changed: 47 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
# Updated on 2022-12-12
2-
31
import os
42
import re
53
import subprocess
6-
#import spacy
7-
#from spacy.tokens import Doc
4+
85
from pdfminer.high_level import extract_text
96
from pdfminer.layout import LAParams
107

@@ -22,8 +19,7 @@
2219
from sklearn.preprocessing import normalize
2320
from joblib import load
2421
import nltk
25-
from nltk.tokenize import sent_tokenize
26-
#from PassivePySrc import PassivePy
22+
2723
import eyecite
2824
from enum import Enum
2925
import sigfig
@@ -68,6 +64,8 @@
6864
from openai import OpenAI
6965
from dotenv import load_dotenv
7066

67+
from .passive_voice_detection import detect_passive_voice_segments, split_sentences
68+
7169
from transformers import GPT2TokenizerFast
7270

7371
load_dotenv()
@@ -76,32 +74,6 @@
7674

7775
stop_words = set(stopwords.words("english"))
7876

79-
#try:
80-
# this takes a while to load
81-
# import en_core_web_lg
82-
83-
# nlp = en_core_web_lg.load()
84-
#except:
85-
# try:
86-
# import en_core_web_sm
87-
88-
# nlp = en_core_web_sm.load()
89-
# except:
90-
# print("Downloading word2vec model en_core_web_sm")
91-
# import subprocess
92-
93-
# bashCommand = "python -m spacy download en_core_web_sm"
94-
# process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
95-
# output, error = process.communicate()
96-
# print(f"output of word2vec model download: {str(output)}")
97-
# import en_core_web_sm
98-
99-
# nlp = en_core_web_sm.load()
100-
101-
102-
#passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
103-
104-
10577
# Load local variables, models, and API key(s).
10678

10779
###############
@@ -374,7 +346,7 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
374346
text: a string of multiple words to vectorize
375347
tools_token: the token to tools.suffolklitlab.org, used for micro-service
376348
to reduce the amount of memory you need on your machine. If
377-
not passed, you need to have `en_core_web_lg` installed. NOTE: this
349+
not passed, you need to have `en_core_web_lg` installed. NOTE: this
378350
last bit is no longer correct, you have to use the micro-service
379351
as we have had to remove spaCy due to a breaking change
380352
"""
@@ -400,9 +372,9 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
400372
return [np.array(embed) for embed in r.json().get("embeddings", [])]
401373
else:
402374
raise Exception("Couldn't access tools.suffolklitlab.org, no token provided")
403-
#if isinstance(text, str):
375+
# if isinstance(text, str):
404376
# return norm(nlp(text).vector)
405-
#else:
377+
# else:
406378
# return [norm(nlp(indiv_text).vector) for indiv_text in text]
407379

408380

@@ -881,96 +853,65 @@ def describe_form(text, creds: Optional[OpenAiCreds] = None) -> str:
881853
command = 'If the above is a court form, write a brief description of its purpose at a sixth grade reading level, otherwise respond with the word "abortthisnow.".'
882854
return complete_with_command(text, command, 250, creds=creds)
883855

856+
884857
def needs_calculations(text: Union[str]) -> bool:
885-
# since we reomved SpaCy we can't use Doc,
886-
# so I rewrote this to provide similar functionality absent Doc
887-
# old code is commented out
888-
#def needs_calculations(text: Union[str, Doc]) -> bool:
858+
    # since we removed SpaCy we can't use Doc,
859+
# so I rewrote this to provide similar functionality absent Doc
860+
# old code is commented out
861+
# def needs_calculations(text: Union[str, Doc]) -> bool:
889862
"""A conservative guess at if a given form needs the filler to make math calculations,
890863
something that should be avoided. If"""
891864
CALCULATION_WORDS = ["subtract", "total", "minus", "multiply" "divide"]
892-
#if isinstance(text, str):
865+
# if isinstance(text, str):
893866
# doc = nlp(text)
894-
#else:
867+
# else:
895868
# doc = text
896-
#for token in doc:
869+
# for token in doc:
897870
# if token.text.lower() in CALCULATION_WORDS:
898871
# return True
899872
for word in CALCULATION_WORDS:
900873
if word in text.lower():
901874
return True
902-
875+
903876
# TODO(brycew): anything better than a binary yes-no value on this?
904877
return False
905878

906-
def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = None):
907-
"""
908-
Ping passive voice API for list of sentences using the passive voice
909-
"""
910-
if tools_token:
911-
headers = {
912-
"Authorization": "Bearer " + tools_token,
913-
"Content-Type": "application/json",
914-
}
915-
body = {"input": input}
916-
r = requests.post(
917-
"https://tools.suffolklitlab.org/passive/",
918-
headers=headers,
919-
data=json.dumps(body),
920-
)
921-
if not r.ok:
922-
raise Exception("Couldn't access tools.suffolklitlab.org")
923-
if isinstance(input, str):
924-
output = np.array(r.json().get("results", []),dtype=object)
925-
if len(output) <= 0:
926-
raise Exception("Vector from tools.suffolklitlab.org is empty")
927-
return output
928-
else:
929-
return [np.array(embed) for embed in r.json().get("results", [])]
930-
else:
931-
raise Exception("Couldn't access tools.suffolklitlab.org, no token provided")
932-
933879

934880
def get_passive_sentences(
935-
text: Union[List, str], tools_token: Optional[str] = None) -> List[Tuple[str, List[Tuple[int, int]]]]:
936-
"""Return a list of tuples, where each tuple represents a
937-
sentence in which passive voice was detected along with a list of the
938-
starting and ending position of each fragment that is phrased in the passive voice.
939-
The combination of the two can be used in the PDFStats frontend to highlight the
940-
passive text in an individual sentence.
941-
942-
Text can either be a string or a list of strings.
943-
If provided a single string, it will be tokenized with NTLK and
944-
sentences containing fewer than 2 words will be ignored.
881+
text: Union[List, str], tools_token: Optional[str] = None, model: str = "gpt-5-nano"
882+
) -> List[Tuple[str, List[Tuple[int, int]]]]:
883+
"""Return passive voice fragments for each sentence in ``text``.
884+
885+
The function relies on OpenAI's language model (via ``passive_voice_detection``)
886+
to detect passive constructions. ``tools_token`` is kept for backward compatibility
887+
but is no longer used.
888+
889+
Args:
890+
text (Union[List, str]): The input text or list of texts to analyze.
891+
tools_token (Optional[str], optional): Deprecated. Previously used for authentication with
892+
tools.suffolklitlab.org. Defaults to None.
893+
model (str, optional): The OpenAI model to use for detection. Defaults to "gpt-5-nano".
894+
Returns:
895+
List[Tuple[str, List[Tuple[int, int]]]]: A list of tuples, each containing the original text
896+
and a list of tuples representing the start and end positions of detected passive voice fragments.
897+
898+
Note:
899+
At least for now, the fragment detection is no longer meaningful (except in tokenized sentences) because
900+
the LLM detection simply returns the full original sentence if it contains passive voice. We have not reimplemented
901+
this behavior of PassivePy.
945902
"""
946-
# since we reomoved SpaCy, I rewrote this function to call the passive voice API
947-
# already up and running on tools.suffolklitlab.org
948-
# old code is commented out
949-
# Sepehri, A., Markowitz, D. M., & Mir, M. (2022, February 3).
950-
# PassivePy: A Tool to Automatically Identify Passive Voice in Big Text Data. Retrieved from psyarxiv.com/bwp3t
951-
#
952-
#if isinstance(text, str):
953-
# sentences = [s for s in sent_tokenize(text) if len(s.split(" ")) > 2]
954-
# if not sentences:
955-
# raise ValueError(
956-
# "There are no sentences over 2 words in the provided text."
957-
# )
958-
#elif isinstance(text, list):
959-
# sentences = text
960-
#else:
961-
# raise ValueError(f"Can't tokenize {type(text)} object into sentences")
962-
963-
#if not sentences:
964-
# return []
965-
966-
#passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
967-
#matching_rows = passive_text_df[passive_text_df["binary"] > 0]
903+
if tools_token:
904+
pass # deprecated
968905

969906
sentences_with_highlights = []
970-
tools_output = tools_passive(text, tools_token=tools_token) #list(zip(matching_rows["document"], matching_rows["all_passives"]))
971907

972-
#for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
973-
for item in tools_output:
908+
passive_voice_results = detect_passive_voice_segments(
909+
text,
910+
openai_client=client if client else None,
911+
model=model,
912+
)
913+
914+
for item in passive_voice_results:
974915
for fragment in item[1]:
975916
sentences_with_highlights.append(
976917
(
@@ -1262,7 +1203,7 @@ def parse_form(
12621203
new_names = field_names
12631204
new_names_conf = []
12641205

1265-
tokenized_sentences = sent_tokenize(original_text)
1206+
tokenized_sentences = split_sentences(original_text)
12661207
# No need to detect passive voice in very short sentences
12671208
sentences = [s for s in tokenized_sentences if len(s.split(" ")) > 2]
12681209

0 commit comments

Comments
 (0)