33import os
44import re
55import subprocess
6- #import spacy
7- #from spacy.tokens import Doc
6+
7+ # import spacy
8+ # from spacy.tokens import Doc
89from pdfminer .high_level import extract_text
910from pdfminer .layout import LAParams
1011
2324from joblib import load
2425import nltk
2526from nltk .tokenize import sent_tokenize
26- #from PassivePySrc import PassivePy
27+
28+ # from PassivePySrc import PassivePy
2729import eyecite
2830from enum import Enum
2931import sigfig
7678
7779stop_words = set (stopwords .words ("english" ))
7880
79- #try:
80- # this takes a while to load
81+ # try:
82+ # this takes a while to load
8183# import en_core_web_lg
8284
8385# nlp = en_core_web_lg.load()
84- #except:
86+ # except:
8587# try:
8688# import en_core_web_sm
8789
99101# nlp = en_core_web_sm.load()
100102
101103
102- #passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
104+ # passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
103105
104106
105107# Load local variables, models, and API key(s).
@@ -374,7 +376,7 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
374376 text: a string of multiple words to vectorize
375377 tools_token: the token to tools.suffolklitlab.org, used for micro-service
376378 to reduce the amount of memory you need on your machine. If
377- not passed, you need to have `en_core_web_lg` installed. NOTE: this
379+ not passed, you need to have `en_core_web_lg` installed. NOTE: this
378380 last bit is no longer correct, you have to use the micro-service
379381 as we have had to remove spaCy due to a breaking change
380382 """
@@ -400,9 +402,9 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
400402 return [np .array (embed ) for embed in r .json ().get ("embeddings" , [])]
401403 else :
402404 raise Exception ("Couldn't access tools.suffolklitlab.org, no token provided" )
403- #if isinstance(text, str):
405+ # if isinstance(text, str):
404406 # return norm(nlp(text).vector)
405- #else:
407+ # else:
406408 # return [norm(nlp(indiv_text).vector) for indiv_text in text]
407409
408410
@@ -881,31 +883,33 @@ def describe_form(text, creds: Optional[OpenAiCreds] = None) -> str:
881883 command = 'If the above is a court form, write a brief description of its purpose at a sixth grade reading level, otherwise respond with the word "abortthisnow.".'
882884 return complete_with_command (text , command , 250 , creds = creds )
883885
886+
def needs_calculations(text: Union[str]) -> bool:
    """A conservative guess at whether a given form needs the filler to make
    math calculations, something that should be avoided in form design.

    Args:
        text: the full text of the form to scan (case-insensitive match).

    Returns:
        True if any calculation-related keyword appears anywhere in the text.
    """
    # NOTE: this used to walk a spaCy Doc token-by-token; it was rewritten as a
    # plain substring scan after spaCy was removed from the project.
    # Fix: the original list was missing a comma, so "multiply" "divide"
    # silently concatenated into the single (never-matching) token
    # "multiplydivide" — neither word was ever detected on its own.
    CALCULATION_WORDS = ["subtract", "total", "minus", "multiply", "divide"]
    lowered = text.lower()  # lowercase once instead of per keyword
    # TODO(brycew): anything better than a binary yes-no value on this?
    return any(word in lowered for word in CALCULATION_WORDS)
905908
909+
906910def tools_passive (input : Union [List [str ], str ], tools_token : Optional [str ] = None ):
907911 """
908- Ping passive voice API for list of sentences using the passive voice
912+ Ping passive voice API for list of sentences using the passive voice
909913 """
910914 if tools_token :
911915 headers = {
@@ -921,7 +925,7 @@ def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = Non
921925 if not r .ok :
922926 raise Exception ("Couldn't access tools.suffolklitlab.org" )
923927 if isinstance (input , str ):
924- output = np .array (r .json ().get ("results" , []),dtype = object )
928+ output = np .array (r .json ().get ("results" , []), dtype = object )
925929 if len (output ) <= 0 :
926930 raise Exception ("Vector from tools.suffolklitlab.org is empty" )
927931 return output
@@ -932,7 +936,8 @@ def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = Non
932936
933937
934938def get_passive_sentences (
935- text : Union [List , str ], tools_token : Optional [str ] = None ) -> List [Tuple [str , List [Tuple [int , int ]]]]:
939+ text : Union [List , str ], tools_token : Optional [str ] = None
940+ ) -> List [Tuple [str , List [Tuple [int , int ]]]]:
936941 """Return a list of tuples, where each tuple represents a
937942 sentence in which passive voice was detected along with a list of the
938943 starting and ending position of each fragment that is phrased in the passive voice.
@@ -943,33 +948,35 @@ def get_passive_sentences(
943948 If provided a single string, it will be tokenized with NLTK and
944949 sentences containing fewer than 2 words will be ignored.
945950 """
946- # since we removed SpaCy, I rewrote this function to call the passive voice API
951+ # since we removed SpaCy, I rewrote this function to call the passive voice API
947952 # already up and running on tools.suffolklitlab.org
948953 # old code is commented out
949954 # Sepehri, A., Markowitz, D. M., & Mir, M. (2022, February 3).
950955 # PassivePy: A Tool to Automatically Identify Passive Voice in Big Text Data. Retrieved from psyarxiv.com/bwp3t
951956 #
952- #if isinstance(text, str):
957+ # if isinstance(text, str):
953958 # sentences = [s for s in sent_tokenize(text) if len(s.split(" ")) > 2]
954959 # if not sentences:
955960 # raise ValueError(
956961 # "There are no sentences over 2 words in the provided text."
957962 # )
958- #elif isinstance(text, list):
963+ # elif isinstance(text, list):
959964 # sentences = text
960- #else:
965+ # else:
961966 # raise ValueError(f"Can't tokenize {type(text)} object into sentences")
962967
963- #if not sentences:
968+ # if not sentences:
964969 # return []
965970
966- #passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
967- #matching_rows = passive_text_df[passive_text_df["binary"] > 0]
971+ # passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
972+ # matching_rows = passive_text_df[passive_text_df["binary"] > 0]
968973
969974 sentences_with_highlights = []
970- tools_output = tools_passive (text , tools_token = tools_token ) #list(zip(matching_rows["document"], matching_rows["all_passives"]))
975+ tools_output = tools_passive (
976+ text , tools_token = tools_token
977+ ) # list(zip(matching_rows["document"], matching_rows["all_passives"]))
971978
972- #for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
979+ # for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
973980 for item in tools_output :
974981 for fragment in item [1 ]:
975982 sentences_with_highlights .append (
0 commit comments