-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.py
More file actions
261 lines (227 loc) · 6.69 KB
/
setup.py
File metadata and controls
261 lines (227 loc) · 6.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import multiprocessing
import os
from multiprocessing import Process
from openpyxl.reader.excel import load_workbook
import time
from semantic_text_splitter import TextSplitter
import pickle
import re
from unidecode import unidecode
from cltk.data.fetch import FetchCorpus
from cltk import NLP
from DB import getText
import re
def getCorpora():
    """Download every available Latin corpus via CLTK's fetcher.

    A failure on one corpus is reported and skipped so a single bad
    download does not abort the rest.
    """
    downloader = FetchCorpus(language="lat")
    for corpus_name in downloader.list_corpora:
        try:
            downloader.import_corpus(corpus_name)
        except Exception as err:
            print(corpus_name)
            print(err)
        else:
            print(f"{corpus_name} successfully downloaded.")
def splitText(text, num):
    """Split *text* into roughly *num* semantically coherent chunks.

    :param text: full text to divide
    :param num: desired number of parts (chunk size = len(text) // num)
    :return: list of (index, chunk) tuples in document order
    """
    print(f"Splitting Text into {num} parts")
    pieces = TextSplitter(len(text) // num).chunks(text)
    indexed = list(enumerate(pieces))
    print(f"Returning {len(pieces)} chunks.")
    return indexed
def analyseLarge(chunk, return_list):
    """Multiprocessing worker: run the CLTK pipeline on one (index, text) chunk.

    Appends (index, doc) to *return_list* (a Manager list) so the parent
    process can reassemble the chunks in order afterwards.
    """
    index, passage = chunk
    pipeline = NLP(language="lat", suppress_banner=True)
    print(f"Beginning analysis of chunk {index} of large text. Please wait.")
    analysed = pipeline.analyze(passage)
    print(f"Finished analysing chunk {index} of large text")
    return_list.append((index, analysed))
def analyseSmall(text):
    """Run the CLTK pipeline on a short text in the current process.

    :param text: string to analyse
    :return: the analysed nlp-document
    """
    pipeline = NLP(language="lat", suppress_banner=True)
    print("Beginning analysis of small text. Please wait.")
    analysed = pipeline.analyze(text)
    print("Finished analysing")
    return analysed
def store_data(author, title, part, nlp_doc):
    """Pickle an analysed NLP document to docs/<author>/<title>/<title><part>.pickle.

    :param author: string, author name (stored lower-cased)
    :param title: string, work title (stored lower-cased)
    :param part: string of a digit identifying the chunk
    :param nlp_doc: nlp-document to serialise
    :return: None
    """
    author = author.lower()
    title = title.lower()
    # makedirs builds the whole docs/<author>/<title> chain at once and is
    # a no-op when it already exists. The original pair of os.mkdir calls
    # failed when the top-level docs/ folder was missing and was race-prone
    # between the isdir check and the mkdir.
    directory = f'docs/{author}/{title}'
    os.makedirs(directory, exist_ok=True)
    with open(f'{directory}/{title}{part}.pickle', 'wb') as handle:
        pickle.dump(nlp_doc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("{} by {} successfully saved".format(title, author))
def normalize_text(text):
    """
    Lower-cases the passage, removes macrons, replaces consonantal J/V
    with I/U, and rejoins words hyphenated across a line break.

    :param text: string of a passage (coerced to str if not already)
    :return: normalised string
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # Strip macrons in a single pass. The original looped over str.replace
    # but discarded the (immutable-string) return value, so nothing was
    # ever replaced; it also mapped 'ī' to an upper-case 'I'.
    text = text.translate(str.maketrans('āēōīūȳ', 'aeoiuy'))
    # Consonantal j/v before a vowel -> vocalic i/u. The original loop had
    # the same discarded-return bug; the result must be reassigned.
    jv = {
        'ja': 'ia',
        'je': 'ie',
        'ji': 'ii',
        'jo': 'io',
        'ju': 'iu',
        'va': 'ua',
        've': 'ue',
        'vi': 'ui',
        'vo': 'uo',
        'vu': 'uu',
    }
    for consonantal, vocalic in jv.items():
        text = text.replace(consonantal, vocalic)
    # Join words that split over line break (hyphen plus up to 3 whitespace chars).
    text = re.sub(r'(-\s{0,3})', "", text)
    return text
def set_wordlist(name, listSource):
    """
    Build a normalised vocabulary list and append it to a wordlist file.

    :param name: string used to name the output file
    :param listSource: either a filepath to a spreadsheet or a list object.
        Latin terms must be in first column of spreadsheet.
    :return: normalised version of list
    :raises TypeError: if listSource is neither a list nor an existing path
    """
    # Test for a list FIRST: os.path.exists() raises TypeError when handed
    # a list, so the original's elif-list branch was unreachable.
    if isinstance(listSource, list):
        result = [normalize_text(i) for i in listSource]
    elif os.path.exists(listSource):
        wb = load_workbook(listSource)
        ws = wb.active
        # Row 1 is assumed to be a header. max_row is inclusive, so the
        # original range(2, ws.max_row) silently dropped the final row.
        raw_list = [ws.cell(row=i, column=1).value for i in range(2, ws.max_row + 1)]
        result = [normalize_text(i) for i in raw_list]
    else:
        # The original fell through with `result` unbound and crashed later.
        raise TypeError("listSource must be a list or a path to a spreadsheet")
    filename = f'data/wordlists/{name} list.txt'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # NOTE(review): mode "a" appends, so re-running duplicates entries —
    # confirm whether accumulation across runs is intended.
    with open(filename, "a", encoding="utf-8") as f:
        for item in result:
            if not item:
                continue
            # Keep only the first word of multi-word entries; ASCII-fold all.
            if len(item.split()) > 1:
                item = unidecode(item.split()[0])
            else:
                item = unidecode(item)
            f.write(item)
            f.write("\n")
    return result
def pickleExists(title, author):
    """Return True if a pickled analysis of *title* by *author* is on disk.

    Looks for files named <title><digit>.pickle inside any subfolder of
    docs/<author>/.

    :param title: work title (matched case-insensitively)
    :param author: author folder name under docs/
    :return: bool
    """
    base = f"docs/{author}"
    if not os.path.isdir(base):
        return False
    # re.escape protects titles containing regex metacharacters, e.g.
    # "In Sallustium [sp.]" or "In Q. Caecilium"; the original raw
    # interpolation could raise re.error or match the wrong files, and its
    # unescaped '.' before "pickle" matched any character.
    pattern = re.compile(rf"{re.escape(title.lower())}\d\.pickle")
    for folder in os.listdir(base):
        for filename in os.listdir(f"{base}/{folder}"):
            if pattern.search(filename):
                return True
    # The original fell off the end and returned an implicit None when the
    # author folder existed but held no match; make the result explicit.
    return False
def getVocab(name):
    """Load the saved word list for *name* from data/wordlists/.

    :param name: wordlist name (file is '<name> list.txt')
    :return: list of stripped vocabulary strings, one per line
    """
    path = f"data/wordlists/{name} list.txt"
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
if __name__ == "__main__":
    print("Starting...\n")
    # getCorpora()
    # set vocab lists: (short name, spreadsheet path) pairs
    vocabLists = [
        ("dcc", "data/vocab/Latin Core Vocab.xlsx"),
        ("clc", "data/vocab/cambridge_latin_course.xlsx"),  # Missing some Bk5 vocab
        ("llpsi", "data/vocab/lingua_latina.xlsx"),
        ("olc", "data/vocab/oxford_latin_course.xlsx"),  # Only to Ch.22
        ("ecrom", "data/vocab/ecce_romani.xlsx"),
        ("sub", "data/vocab/suburani.xlsx"),
        ("wheel", "data/vocab/wheelock.xlsx"),
        ("newmil", "data/vocab/latin_for_the_new_millennium.xlsx")
    ]
    # get senior texts: (author, title) pairs
    works = [
        ("cicero", "In Catilinam"),
        ("cicero", "In Pisonem"),
        ("cicero", "In Q. Caecilium"),
        ("cicero", "In Sallustium [sp.]"),
        ("cicero", "In Vatinium"),
        ("cicero", "In Verrem"),
        ("cicero", "Pro Archia"),
        ("cicero", "Pro Balbo"),
        ("cicero", "Pro Caecina"),
        ("cicero", "Pro Caelio"),
        ("cicero", "Pro Cluentio"),
        ("cicero", "Pro Flacco"),
        ("cicero", "Pro Fonteio"),
        ("cicero", "Pro Lege Manilia"),
        ("cicero", "Pro Ligario"),
        ("cicero", "Pro Marcello"),
        ("cicero", "Pro Milone"),
        ("cicero", "Pro Murena"),
        ("cicero", "Pro Plancio"),
        ("cicero", "Pro Q. Roscio Comoedo"),
        ("cicero", "Pro Quinctio"),
        ("cicero", "Pro Rabirio Perduellionis Reo"),
        ("cicero", "Pro Rabirio Postumo"),
        ("cicero", "Pro Rege Deiotaro"),
        ("cicero", "Pro S. Roscio Amerino"),
        ("cicero", "Pro Scauro"),
        ("cicero", "Pro Sestio"),
        ("cicero", "Pro Sulla"),
        ("cicero", "Pro Tullio"),
        ("Caesar", "de bello gallico"),
        ("catullus", "carmina"),
        ("livius", "ab urbe condita"),
        ("ovidius", "metamorphoses"),
        ("ovidius", "amores"),
        ("ovidius", "remedia amoris"),
        ("ovidius", "Epistulae (vel Heroides)"),
        ("plinius", "epistulae"),
        ("virgilius", "aeneis")
    ]
    for author_name, work_title in works:
        text, author = getText(work_title, author_name)
        if pickleExists(work_title, author):
            print("Text analysis already stored.")
            continue
        # Split text if long then analyse each chunk
        length = len(text.split())
        print("Text length: {}".format(length))
        # Aim for one chunk per ~2000 words, capped at 10 worker processes.
        num = length // 2000
        manager = multiprocessing.Manager()
        docs = manager.list()
        if num > 1:
            chunks = splitText(text, min(10, num))
            start_time = time.time()
            # NOTE: multiprocessing must be done in __main__
            processes = [Process(target=analyseLarge, args=(ch, docs)) for ch in chunks]
            for process in processes:
                process.start()
            for process in processes:
                process.join()
            # Precision belongs inside round(); the original passed 2 as an
            # ignored extra argument to str.format, rounding to whole minutes.
            print("Analysis took {} minutes.".format(round((time.time() - start_time) / 60, 2)))
        else:
            start_time = time.time()
            docs.append((0, analyseSmall(text)))
            print("Analysis took {} minutes.".format(round((time.time() - start_time) / 60, 2)))
        # pickle & store each chunk in document order
        # TODO save in database
        res = sorted(docs, key=lambda x: x[0])
        for d in res:
            store_data(author, work_title, str(d[0]), d[1])
    # vocabLists is a list of pairs, not a dict (the old commented loop
    # called .items() on it):
    # for list_name, list_path in vocabLists:
    #     set_wordlist(list_name, list_path)