1- # Updated on 2022-12-12
2-
31import os
42import re
53import subprocess
6- #import spacy
7- #from spacy.tokens import Doc
4+
85from pdfminer .high_level import extract_text
96from pdfminer .layout import LAParams
107
2219from sklearn .preprocessing import normalize
2320from joblib import load
2421import nltk
25- from nltk .tokenize import sent_tokenize
26- #from PassivePySrc import PassivePy
22+
2723import eyecite
2824from enum import Enum
2925import sigfig
6864from openai import OpenAI
6965from dotenv import load_dotenv
7066
67+ from .passive_voice_detection import detect_passive_voice_segments , split_sentences
68+
7169from transformers import GPT2TokenizerFast
7270
7371load_dotenv ()
7674
7775stop_words = set (stopwords .words ("english" ))
7876
79- #try:
80- # this takes a while to load
81- # import en_core_web_lg
82-
83- # nlp = en_core_web_lg.load()
84- #except:
85- # try:
86- # import en_core_web_sm
87-
88- # nlp = en_core_web_sm.load()
89- # except:
90- # print("Downloading word2vec model en_core_web_sm")
91- # import subprocess
92-
93- # bashCommand = "python -m spacy download en_core_web_sm"
94- # process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
95- # output, error = process.communicate()
96- # print(f"output of word2vec model download: {str(output)}")
97- # import en_core_web_sm
98-
99- # nlp = en_core_web_sm.load()
100-
101-
102- #passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
103-
104-
10577# Load local variables, models, and API key(s).
10678
10779###############
@@ -374,7 +346,7 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
374346 text: a string of multiple words to vectorize
375347 tools_token: the token to tools.suffolklitlab.org, used for micro-service
376348 to reduce the amount of memory you need on your machine. If
377- not passed, you need to have `en_core_web_lg` installed. NOTE: this
349+ not passed, you need to have `en_core_web_lg` installed. NOTE: this
378350 last bit is no longer correct, you have to use the micro-service
379351 as we have had to remove SpaCY due to a breaking change
380352 """
@@ -400,9 +372,9 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
400372 return [np .array (embed ) for embed in r .json ().get ("embeddings" , [])]
401373 else :
402374 raise Exception ("Couldn't access tools.suffolklitlab.org, no token provided" )
403- #if isinstance(text, str):
375+ # if isinstance(text, str):
404376 # return norm(nlp(text).vector)
405- #else:
377+ # else:
406378 # return [norm(nlp(indiv_text).vector) for indiv_text in text]
407379
408380
@@ -881,96 +853,65 @@ def describe_form(text, creds: Optional[OpenAiCreds] = None) -> str:
881853 command = 'If the above is a court form, write a brief description of its purpose at a sixth grade reading level, otherwise respond with the word "abortthisnow.".'
882854 return complete_with_command (text , command , 250 , creds = creds )
883855
856+
884857def needs_calculations (text : Union [str ]) -> bool :
885- # since we removed SpaCy we can't use Doc,
886- # so I rewrote this to provide similar functionality absent Doc
887- # old code is commented out
888- # def needs_calculations(text: Union[str, Doc]) -> bool:
858+ # since we removed SpaCy we can't use Doc,
859+ # so I rewrote this to provide similar functionality absent Doc
860+ # old code is commented out
861+ # def needs_calculations(text: Union[str, Doc]) -> bool:
889862 """A conservative guess at if a given form needs the filler to make math calculations,
890863 something that should be avoided. If"""
891864 CALCULATION_WORDS = ["subtract" , "total" , "minus" , "multiply" "divide" ]
892- #if isinstance(text, str):
865+ # if isinstance(text, str):
893866 # doc = nlp(text)
894- #else:
867+ # else:
895868 # doc = text
896- #for token in doc:
869+ # for token in doc:
897870 # if token.text.lower() in CALCULATION_WORDS:
898871 # return True
899872 for word in CALCULATION_WORDS :
900873 if word in text .lower ():
901874 return True
902-
875+
903876 # TODO(brycew): anything better than a binary yes-no value on this?
904877 return False
905878
906- def tools_passive (input : Union [List [str ], str ], tools_token : Optional [str ] = None ):
907- """
908- Ping passive voice API for list of sentences using the passive voice
909- """
910- if tools_token :
911- headers = {
912- "Authorization" : "Bearer " + tools_token ,
913- "Content-Type" : "application/json" ,
914- }
915- body = {"input" : input }
916- r = requests .post (
917- "https://tools.suffolklitlab.org/passive/" ,
918- headers = headers ,
919- data = json .dumps (body ),
920- )
921- if not r .ok :
922- raise Exception ("Couldn't access tools.suffolklitlab.org" )
923- if isinstance (input , str ):
924- output = np .array (r .json ().get ("results" , []),dtype = object )
925- if len (output ) <= 0 :
926- raise Exception ("Vector from tools.suffolklitlab.org is empty" )
927- return output
928- else :
929- return [np .array (embed ) for embed in r .json ().get ("results" , [])]
930- else :
931- raise Exception ("Couldn't access tools.suffolklitlab.org, no token provided" )
932-
933879
934880def get_passive_sentences (
935- text : Union [List , str ], tools_token : Optional [str ] = None ) -> List [Tuple [str , List [Tuple [int , int ]]]]:
936- """Return a list of tuples, where each tuple represents a
937- sentence in which passive voice was detected along with a list of the
938- starting and ending position of each fragment that is phrased in the passive voice.
939- The combination of the two can be used in the PDFStats frontend to highlight the
940- passive text in an individual sentence.
941-
942- Text can either be a string or a list of strings.
943- If provided a single string, it will be tokenized with NLTK and
944- sentences containing fewer than 2 words will be ignored.
881+ text : Union [List , str ], tools_token : Optional [str ] = None , model : str = "gpt-5-nano"
882+ ) -> List [Tuple [str , List [Tuple [int , int ]]]]:
883+ """Return passive voice fragments for each sentence in ``text``.
884+
885+ The function relies on OpenAI's language model (via ``passive_voice_detection``)
886+ to detect passive constructions. ``tools_token`` is kept for backward compatibility
887+ but is no longer used.
888+
889+ Args:
890+ text (Union[List, str]): The input text or list of texts to analyze.
891+ tools_token (Optional[str], optional): Deprecated. Previously used for authentication with
892+ tools.suffolklitlab.org. Defaults to None.
893+ model (str, optional): The OpenAI model to use for detection. Defaults to "gpt-5-nano".
894+ Returns:
895+ List[Tuple[str, List[Tuple[int, int]]]]: A list of tuples, each containing the original text
896+ and a list of tuples representing the start and end positions of detected passive voice fragments.
897+
898+ Note:
899+ At least for now, the fragment detection is no longer meaningful (except in tokenized sentences) because
900+ the LLM detection simply returns the full original sentence if it contains passive voice. We have not reimplemented
901+ this behavior of PassivePy.
945902 """
946- # since we removed SpaCy, I rewrote this function to call the passive voice API
947- # already up and running on tools.suffolklitlab.org
948- # old code is commented out
949- # Sepehri, A., Markowitz, D. M., & Mir, M. (2022, February 3).
950- # PassivePy: A Tool to Automatically Identify Passive Voice in Big Text Data. Retrieved from psyarxiv.com/bwp3t
951- #
952- #if isinstance(text, str):
953- # sentences = [s for s in sent_tokenize(text) if len(s.split(" ")) > 2]
954- # if not sentences:
955- # raise ValueError(
956- # "There are no sentences over 2 words in the provided text."
957- # )
958- #elif isinstance(text, list):
959- # sentences = text
960- #else:
961- # raise ValueError(f"Can't tokenize {type(text)} object into sentences")
962-
963- #if not sentences:
964- # return []
965-
966- #passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
967- #matching_rows = passive_text_df[passive_text_df["binary"] > 0]
903+ if tools_token :
904+ pass # deprecated
968905
969906 sentences_with_highlights = []
970- tools_output = tools_passive (text , tools_token = tools_token ) #list(zip(matching_rows["document"], matching_rows["all_passives"]))
971907
972- #for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
973- for item in tools_output :
908+ passive_voice_results = detect_passive_voice_segments (
909+ text ,
910+ openai_client = client if client else None ,
911+ model = model ,
912+ )
913+
914+ for item in passive_voice_results :
974915 for fragment in item [1 ]:
975916 sentences_with_highlights .append (
976917 (
@@ -1262,7 +1203,7 @@ def parse_form(
12621203 new_names = field_names
12631204 new_names_conf = []
12641205
1265- tokenized_sentences = sent_tokenize (original_text )
1206+ tokenized_sentences = split_sentences (original_text )
12661207 # No need to detect passive voice in very short sentences
12671208 sentences = [s for s in tokenized_sentences if len (s .split (" " )) > 2 ]
12681209
0 commit comments