# Source: TP_solutions/ex13_9.py at master · dexhunter/TP_solutions · GitHub
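'''
Print the rank and frequency of every word in a text, most frequent first.

TODO: draw the rank-frequency plot on log-log axes.
If f = c * r**(-s) (Zipf's law), taking logs gives a straight line:
y = -s*x + b  (y = log f, x = log r, b = log c)
'''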

import string
import collections

def read_file(filename, skip_header=True):
	"""Makes a histogram that contains the words from a file.

	Hyphens are replaced by spaces, each line is split on whitespace, and
	every token is stripped of surrounding punctuation/whitespace and
	lowercased. Tokens that are empty after stripping (for example a
	stand-alone "..." or quote mark) are not counted, because they are
	not words.

	filename: string, path of a UTF-8 encoded text file
	skip_header: boolean, default=True; when true, skip_gutenberg_header()
		is called on the open file before counting starts

	returns: collections.Counter mapping each word to the number of times
		it appears.
	"""
	# Loop-invariant: characters removed from both ends of every token.
	strippables = string.punctuation + string.whitespace
	hist = collections.Counter()

	with open(filename, encoding="utf8") as fin:
		if skip_header:
			skip_gutenberg_header(fin)

		for line in fin:
			tokens = line.replace('-', ' ').split()
			cleaned = (token.strip(strippables).lower() for token in tokens)
			# Skip tokens that were pure punctuation; otherwise '' would be
			# counted as a word and distort the rank/frequency list.
			hist.update(word for word in cleaned if word)

	return hist

def skip_gutenberg_header(fp):
	"""Advances fp past the line that ends the header.

	Lines are consumed up to and including the first one that starts with
	the end-of-header marker. If no such line exists, fp is consumed
	entirely.

	fp: open file object (any iterable of strings works)
	"""
	end_marker = '*END*THE SMALL PRINT!'
	marker_lines = (row for row in fp if row.startswith(end_marker))
	# next() pulls from fp only until the first marker line is found;
	# the default keeps an exhausted fp from raising StopIteration.
	next(marker_lines, None)

def freq_and_rank(hist):
	"""Generates (rank, frequency) pairs from a histogram.

	hist: map from each word to the number of times it appears

	yields: tuples (rank, frequency); rank starts at 1 and frequencies are
		in descending order.
	"""
	descending = sorted(hist.values(), reverse=True)
	yield from enumerate(descending, start=1)

def main():
	"""Prints one "rank frequency" line per word of 'emma.txt'."""
	word_counts = read_file('emma.txt')
	for pair in freq_and_rank(word_counts):
		print(*pair)


# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
	main()