This repository was archived by the owner on Mar 8, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathex13_9.py
More file actions
55 lines (42 loc) · 1.18 KB
/
ex13_9.py
File metadata and controls
55 lines (42 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
'''
TODO: draw the rank-frequency plot.
On log-log axes the plot is the line y = -s*x + b,
where y = log f (f: frequency), x = log r (r: rank) and b = log c.
'''
import string
import collections
def read_file(filename, skip_header=True):
    """Build a word-frequency histogram from a text file.

    Each line has its hyphens replaced by spaces and is then split on
    whitespace; every token is stripped of leading/trailing punctuation
    and whitespace and lowercased.  Tokens that are empty after stripping
    (for example a lone quote mark or "...") are not counted.

    filename: string, path of the file to read (opened as UTF-8)
    skip_header: boolean, default=True; when true, skip_gutenberg_header()
        is called on the open file before any words are counted
    returns: collections.Counter mapping each word to the number of times
        it appears.
    """
    # Loop-invariant: the set of characters trimmed from both ends of a token.
    strippables = string.punctuation + string.whitespace
    hist = collections.Counter()
    with open(filename, encoding="utf8") as fin:
        if skip_header:
            skip_gutenberg_header(fin)
        for line in fin:
            words = (
                token.strip(strippables).lower()
                for token in line.replace('-', ' ').split()
            )
            # Pure-punctuation tokens strip down to '', which is not a word.
            hist.update(word for word in words if word)
    return hist
def skip_gutenberg_header(fp):
    """Advance fp just past the line that terminates the header.

    Lines are consumed up to and including the first one that begins with
    the end-of-header marker.  If no line matches, fp is read to the end.

    fp: open file object (any iterable of lines)
    """
    end_marker = '*END*THE SMALL PRINT!'
    # next() pulls from the filtering generator until the first matching
    # line appears (or the input is exhausted); the line itself is unused.
    next((text for text in fp if text.startswith(end_marker)), None)
def freq_and_rank(hist):
    """Yield (rank, frequency) pairs in order of decreasing frequency.

    hist: map from each word to the number of times it appears
    yields: tuples (rank, freq); rank starts at 1 for the largest
        frequency and increases by one per entry, so equal frequencies
        receive consecutive ranks.
    """
    # enumerate(start=1) replaces the hand-maintained rank counter.
    yield from enumerate(sorted(hist.values(), reverse=True), start=1)
def main():
    """Print one "rank frequency" line per word of emma.txt."""
    word_counts = read_file('emma.txt')
    for pair in freq_and_rank(word_counts):
        # Unpacking prints the two numbers separated by a single space.
        print(*pair)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()