Skip to content

Commit 85e7d8c

Browse files
Add license and passive voice test dataset from PassivePy repo for future evaluations
1 parent 0c936f6 commit 85e7d8c

File tree

4 files changed

+1198
-32
lines changed

4 files changed

+1198
-32
lines changed

LICENSE

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,10 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
SOFTWARE.
22+
23+
ATTRIBUTIONS:
24+
25+
This project contains code from PassivePy (https://github.com/mitramir55/PassivePy),
26+
which is included under the MIT License.
27+
28+
Copyright (c) 2021 Mitra Mirshafiee

formfyxer/docx_wrangling.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def add_run_after(run, text):
5252

5353

5454
def update_docx(
55-
document: Union[docx.document.Document, str], modified_runs: List[Tuple[int, int, str, int]]
55+
document: Union[docx.document.Document, str],
56+
modified_runs: List[Tuple[int, int, str, int]],
5657
) -> docx.document.Document:
5758
"""Update the document with the modified runs.
5859
@@ -449,7 +450,7 @@ def make_docx_plain_language(docx_path: str) -> docx.document.Document:
449450
)
450451
return update_docx(docx.Document(docx_path), guesses)
451452

452-
453+
453454
def modify_docx_with_openai_guesses(docx_path: str) -> docx.document.Document:
454455
"""Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
455456

formfyxer/lit_explorer.py

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
import os
44
import re
55
import subprocess
6-
#import spacy
7-
#from spacy.tokens import Doc
6+
7+
# import spacy
8+
# from spacy.tokens import Doc
89
from pdfminer.high_level import extract_text
910
from pdfminer.layout import LAParams
1011

@@ -23,7 +24,8 @@
2324
from joblib import load
2425
import nltk
2526
from nltk.tokenize import sent_tokenize
26-
#from PassivePySrc import PassivePy
27+
28+
# from PassivePySrc import PassivePy
2729
import eyecite
2830
from enum import Enum
2931
import sigfig
@@ -76,12 +78,12 @@
7678

7779
stop_words = set(stopwords.words("english"))
7880

79-
#try:
80-
# this takes a while to load
81+
# try:
82+
# this takes a while to load
8183
# import en_core_web_lg
8284

8385
# nlp = en_core_web_lg.load()
84-
#except:
86+
# except:
8587
# try:
8688
# import en_core_web_sm
8789

@@ -99,7 +101,7 @@
99101
# nlp = en_core_web_sm.load()
100102

101103

102-
#passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
104+
# passivepy = PassivePy.PassivePyAnalyzer(nlp=nlp)
103105

104106

105107
# Load local variables, models, and API key(s).
@@ -374,7 +376,7 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
374376
text: a string of multiple words to vectorize
375377
tools_token: the token to tools.suffolklitlab.org, used for micro-service
376378
to reduce the amount of memory you need on your machine. If
377-
not passed, you need to have `en_core_web_lg` installed. NOTE: this
379+
not passed, you need to have `en_core_web_lg` installed. NOTE: this
378380
last bit is no longer correct, you have to use the micro-service
379381
as we have had to remove spaCy due to a breaking change
380382
"""
@@ -400,9 +402,9 @@ def vectorize(text: Union[List[str], str], tools_token: Optional[str] = None):
400402
return [np.array(embed) for embed in r.json().get("embeddings", [])]
401403
else:
402404
raise Exception("Couldn't access tools.suffolklitlab.org, no token provided")
403-
#if isinstance(text, str):
405+
# if isinstance(text, str):
404406
# return norm(nlp(text).vector)
405-
#else:
407+
# else:
406408
# return [norm(nlp(indiv_text).vector) for indiv_text in text]
407409

408410

@@ -881,31 +883,33 @@ def describe_form(text, creds: Optional[OpenAiCreds] = None) -> str:
881883
command = 'If the above is a court form, write a brief description of its purpose at a sixth grade reading level, otherwise respond with the word "abortthisnow.".'
882884
return complete_with_command(text, command, 250, creds=creds)
883885

886+
884887
def needs_calculations(text: Union[str]) -> bool:
885-
# since we removed SpaCy we can't use Doc,
886-
# so I rewrote this to provide similar functionality absent Doc
887-
# old code is commented out
888-
#def needs_calculations(text: Union[str, Doc]) -> bool:
888+
# since we removed SpaCy we can't use Doc,
889+
# so I rewrote this to provide similar functionality absent Doc
890+
# old code is commented out
891+
# def needs_calculations(text: Union[str, Doc]) -> bool:
889892
"""A conservative guess at if a given form needs the filler to make math calculations,
890893
something that should be avoided. If"""
891894
CALCULATION_WORDS = ["subtract", "total", "minus", "multiply", "divide"]
892-
#if isinstance(text, str):
895+
# if isinstance(text, str):
893896
# doc = nlp(text)
894-
#else:
897+
# else:
895898
# doc = text
896-
#for token in doc:
899+
# for token in doc:
897900
# if token.text.lower() in CALCULATION_WORDS:
898901
# return True
899902
for word in CALCULATION_WORDS:
900903
if word in text.lower():
901904
return True
902-
905+
903906
# TODO(brycew): anything better than a binary yes-no value on this?
904907
return False
905908

909+
906910
def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = None):
907911
"""
908-
Ping passive voice API for list of sentences using the passive voice
912+
Ping passive voice API for list of sentences using the passive voice
909913
"""
910914
if tools_token:
911915
headers = {
@@ -921,7 +925,7 @@ def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = Non
921925
if not r.ok:
922926
raise Exception("Couldn't access tools.suffolklitlab.org")
923927
if isinstance(input, str):
924-
output = np.array(r.json().get("results", []),dtype=object)
928+
output = np.array(r.json().get("results", []), dtype=object)
925929
if len(output) <= 0:
926930
raise Exception("Vector from tools.suffolklitlab.org is empty")
927931
return output
@@ -932,7 +936,8 @@ def tools_passive(input: Union[List[str], str], tools_token: Optional[str] = Non
932936

933937

934938
def get_passive_sentences(
935-
text: Union[List, str], tools_token: Optional[str] = None) -> List[Tuple[str, List[Tuple[int, int]]]]:
939+
text: Union[List, str], tools_token: Optional[str] = None
940+
) -> List[Tuple[str, List[Tuple[int, int]]]]:
936941
"""Return a list of tuples, where each tuple represents a
937942
sentence in which passive voice was detected along with a list of the
938943
starting and ending position of each fragment that is phrased in the passive voice.
@@ -943,33 +948,35 @@ def get_passive_sentences(
943948
If provided a single string, it will be tokenized with NLTK and
944949
sentences containing fewer than 2 words will be ignored.
945950
"""
946-
# since we removed SpaCy, I rewrote this function to call the passive voice API
951+
# since we removed SpaCy, I rewrote this function to call the passive voice API
947952
# already up and running on tools.suffolklitlab.org
948953
# old code is commented out
949954
# Sepehri, A., Markowitz, D. M., & Mir, M. (2022, February 3).
950955
# PassivePy: A Tool to Automatically Identify Passive Voice in Big Text Data. Retrieved from psyarxiv.com/bwp3t
951956
#
952-
#if isinstance(text, str):
957+
# if isinstance(text, str):
953958
# sentences = [s for s in sent_tokenize(text) if len(s.split(" ")) > 2]
954959
# if not sentences:
955960
# raise ValueError(
956961
# "There are no sentences over 2 words in the provided text."
957962
# )
958-
#elif isinstance(text, list):
963+
# elif isinstance(text, list):
959964
# sentences = text
960-
#else:
965+
# else:
961966
# raise ValueError(f"Can't tokenize {type(text)} object into sentences")
962967

963-
#if not sentences:
968+
# if not sentences:
964969
# return []
965970

966-
#passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
967-
#matching_rows = passive_text_df[passive_text_df["binary"] > 0]
971+
# passive_text_df = passivepy.match_corpus_level(pd.DataFrame(sentences), 0)
972+
# matching_rows = passive_text_df[passive_text_df["binary"] > 0]
968973

969974
sentences_with_highlights = []
970-
tools_output = tools_passive(text, tools_token=tools_token) #list(zip(matching_rows["document"], matching_rows["all_passives"]))
975+
tools_output = tools_passive(
976+
text, tools_token=tools_token
977+
) # list(zip(matching_rows["document"], matching_rows["all_passives"]))
971978

972-
#for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
979+
# for item in list(zip(matching_rows["document"], matching_rows["all_passives"])):
973980
for item in tools_output:
974981
for fragment in item[1]:
975982
sentences_with_highlights.append(

0 commit comments

Comments
 (0)