text_extraction.py
"""Extract the readable article text from an HTML page (a BeautifulSoup tree),
filtering out navigation links, citations, and other boilerplate."""
import re
import urllib.request
from urllib.request import Request
# URL fragments; any of these found in a line are stripped out of it.
web = ["www.", "://", ".com", ".net", ".org", ".us", ".gov"]
# Tags whose contents count as body text.
accepted_tags = ["p", "span", "article", "font", "blockquote"]
# Tags skipped entirely during extraction.
exclude = ["cite"]
# CSS classes that mark an element as body text regardless of its tag.
accepted_classes = {"paragraph", "text"}
# A line exactly matching one of these (after lowercasing) is dropped.
bad_phrases = ["back to top", "home", "welcome", "you are here:", "itunes", "google", "facebook", "twitter", "comment"]
# A line containing any of these substrings is dropped.
bad_subphrases = ["powered by", "around the web", "around the internet", "et al", "ndl", "view source", "view history",
                  "edit links", "last modified", "text is available under", "creative commons"]
# Section headers that mark the end of useful article content (e.g. on Wikipedia).
bad_headers = ["References", "Citations", "Further Reading", "External Links", "Footnotes", "See Also"]
# Predicate: true when a class name is already all lowercase.
a = lambda x: x == x.lower()
# Citation/noise patterns removed by destroy_citations:
A = re.compile(r"[a-zA-Z]{2,}[0-9]{2,}[ \.]*")            # letters fused to digits, e.g. "ref12"
B = re.compile(r"([0-9]+[a-zA-Z]+)+[\s\.]+")              # digits fused to letters, e.g. "12th "
C = re.compile(r"[\[\{].*[\]\}]")                         # bracketed spans, e.g. "[12]" or "{note}"
D = re.compile(r"[A-Z]{2,3}: {0,2}[0-9]{3,}.{0,2}[0-9]*") # ID-style tokens, e.g. "DOI: 10123"
E = re.compile(r"\([a-zA-Z\s]+ ([0-9]+[.]*)+\)")          # parenthetical citations, e.g. "(Smith 2004)"
F = re.compile(r"(\\[a-zA-Z0-9]{1,5})")                   # stray backslash escapes leaking into text
def add_item(goods, parent):
    goods.append(parent)

def find_good(parent, goods, wiki_mode):
    # Recursively collect nodes that look like body text. A None appended to
    # goods is a sentinel meaning "stop: a terminal header was reached".
    if parent is not None:
        if not parent.__class__.__name__ == "NavigableString" and not parent.__class__.__name__ == "Comment":
            if hasattr(parent, "name"):
                if parent.name in accepted_tags:
                    add_item(goods, parent)
                else:
                    # Keep only class names that are already lowercase.
                    classes_proto = parent.get("class")
                    classes = set() if classes_proto is None else set(filter(a, classes_proto))
                    # ids are not currently used, but may be used later:
                    # ids_proto = parent.get("id")
                    # ids = set() if ids_proto is None else set(filter(a, ids_proto))
                    if bool(classes & accepted_classes):
                        # An accepted class marks the element as body text.
                        add_item(goods, parent)
                    else:
                        if hasattr(parent, "children"):
                            for item in parent.children:
                                if hasattr(item, "get_text"):
                                    if not (item.name == "a" or item.parent.name == "a"):
                                        t = item.get_text().strip()
                                        if t in bad_headers:
                                            # Terminal header: append the sentinel and stop.
                                            add_item(goods, None)
                                            return False
                                # Recurse into the child's own subtree.
                                find_good(item, goods, wiki_mode)
        elif parent.__class__.__name__ == "NavigableString":
            # Bare text nodes are kept as-is.
            add_item(goods, parent)
def decide(factors, threshold):
    # Weighted average of (value, weight) pairs; True when it exceeds the threshold.
    totalWeight = 0
    totalValue = 0
    for value, weight in factors:
        totalValue += value * weight
        totalWeight += weight
    adjusted = totalValue / totalWeight
    return adjusted > threshold
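# Illustrative example (values chosen here, not from the original code):
#   decide([(1, 2), (0.5, 1), (0.2, 3)], 0.4) returns True, since the weighted
#   average (1*2 + 0.5*1 + 0.2*3) / (2 + 1 + 3) = 3.1 / 6 ~ 0.52 exceeds 0.4.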
def check(text):
    # Filter text line by line, dropping lines that look like boilerplate.
    result = ""
    for item in text.split("\n"):
        result += checkIndividual(item) + "\n"
    return result
def checkIndividual(text):
    # Short lines containing a degree sign are almost always weather-widget noise.
    if "°" in text and len(text) < 100:
        return ""
    text = destroy_citations(text.replace("\r", "\n"))
    stripped = text.lower().strip()
    if stripped in bad_phrases:
        return ""
    for item in bad_subphrases:
        if item in stripped:
            return ""
    # Strip URL fragments rather than dropping the whole line.
    for item in web:
        if item in text:
            text = text.replace(item, "")
    if len(stripped) < 7:
        return ""
    # Body text should start with an alphanumeric character or whitespace.
    # (The original compared against "/t", which never matches; "\t" is intended.)
    if not text[0].isalnum() and text[0] not in (" ", "\t", "\n"):
        return ""
    lastchr = stripped[-1]
    if not lastchr.isalnum() and lastchr not in (".", "?", " "):
        return ""
    if stripped.isdigit():
        return ""
    # Scoring factors, each in [0, 1]; higher values count toward rejection.
    endsWithPunc = 0 if stripped[-1] == "." else 1  # 1 when the line lacks a closing period
    length = 1 / (len(stripped) - 6)                # safe: len(stripped) >= 7 was checked above
    numSpaces = 1 / (stripped.count(" ") + 1)
    if numSpaces > 1 / 3:
        # Fewer than two spaces: too short to be a sentence.
        return ""
    factors = [(endsWithPunc, 2), (length, 1), (numSpaces, 3)]
    if decide(factors, 0.4):
        return ""
    return text
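# Illustrative examples (expected behavior under the rules above):
#   checkIndividual("back to top")                 -> ""  (exact bad phrase)
#   checkIndividual("12")                          -> ""  (too short)
#   checkIndividual("This is a normal sentence.")  -> kept unchanged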
def extract(item):
    # Pull the text out of a node, skipping excluded tags (e.g. <cite>).
    result = ""
    if hasattr(item, "children"):
        for text in item.children:
            if not hasattr(text, "name") or text.name not in exclude:
                result += extract(text)
    elif hasattr(item, "get_text"):
        result = item.get_text()
    else:
        result = item
    return result.replace("\n", " ")
def check_spaces(text):
    # Collapse runs of horizontal whitespace into a single space. (The original
    # pattern had a stray "]" appended, which prevented it from ever matching.)
    text = re.compile(r"[ \t]{2,}").sub(" ", text)
    # Trim spaces that sit directly before or after a newline.
    text = text.replace("\n ", "\n").replace(" \n", "\n")
    # Collapse runs of three or more newlines/carriage returns into one newline.
    text = re.compile(r"[\r\n]{3,}").sub("\n", text)
    return text
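# Illustrative example: check_spaces("a   b \n\n\n\nc") -> "a b\nc"
# (horizontal runs collapse, the space before the newline is trimmed,
# and the blank-line run collapses to a single newline).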
def destroy_citations(text):
    # Apply the citation/noise patterns innermost-first (F, E, D, C, B, A);
    # bracketed spans (C) are deleted, everything else is replaced by a space.
    return A.sub(" ", B.sub(" ", C.sub("", D.sub(" ", E.sub(" ", F.sub(" ", text))))))
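# Illustrative example: destroy_citations("See [12] for details") -> "See  for details".
# Note that C's ".*" is greedy, so a line with several bracketed spans loses
# everything between the first "[" and the last "]".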
def get_text(soup):
    soup = soup.html
    # Crude site detection; wiki_mode is threaded through find_good but not
    # currently used there.
    wiki_mode = "wikipedia.org" in str(soup)
    goods = []
    find_good(soup, goods, wiki_mode)
    text = ""
    for item in goods:
        if item is None:
            # Sentinel appended when a terminal header (e.g. "References")
            # was reached; stop collecting.
            break
        extraction = extract(item)
        if extraction is not None:
            text += extraction + "\n"
    text = check_spaces(text)
    text = check(text)
    text = check_spaces(text)
    return text
def getInp(url):
    # Fetch a URL with a browser-like User-Agent and return the decoded HTML,
    # or an empty string on any failure.
    try:
        req = Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36")
        response = urllib.request.urlopen(req).read()
        html = response.decode("utf-8", errors="ignore").strip()
        # Unescape literal escape sequences that sometimes survive in scraped pages.
        html = html.replace("\\n", "\n").replace("\\'", "'").replace("\\r", " ").replace("\\t", " ")
        print("REACHED", url)
        return html
    except Exception:
        print("COULD NOT REACH", url)
        return ""