import csv
import itertools
import re
from collections import Counter
from os.path import join

import nltk
import numpy as np
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
def read_file(data_dir, with_evaluation):
    """Read dataset.csv under data_dir and return (documents, labels).

    Labels come back as a 0-based numpy array when with_evaluation is True,
    otherwise None.
    """
    data = []
    target = []
    with open(join(data_dir, 'dataset.csv'), 'rt', encoding='utf-8') as csvfile:
        csv.field_size_limit(500 * 1024 * 1024)  # allow very long documents
        reader = csv.reader(csvfile)
        for row in reader:
            if data_dir == './agnews':
                # AG News rows are (label, title, description): join title and body.
                doc = row[1] + '. ' + row[2]
                data.append(doc)
                target.append(int(row[0]) - 1)  # labels in the file are 1-based
            elif data_dir == './yelp':
                data.append(row[1])
                target.append(int(row[0]) - 1)  # labels in the file are 1-based
            else:
                data.append(row[1])
                target.append(int(row[0]))
    if with_evaluation:
        y = np.asarray(target)
        assert len(data) == len(y)
        # Class ids must form a contiguous range starting at 0.
        assert set(range(len(np.unique(y)))) == set(np.unique(y))
    else:
        y = None
    return data, y
def clean_str(string):
    """Tokenization/string cleaning: space-pad punctuation and lowercase."""
    string = re.sub(r"[^A-Za-z0-9(),.!?_\"'`]", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"\"", ' " ', string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'m", " 'm", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"\.", " . ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\$", " $ ", string)
    # The replacement strings must not carry the regex backslash, otherwise a
    # literal "\(" / "\)" / "\?" ends up in the cleaned text.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
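

# A quick illustration of clean_str on a made-up sentence (the input text is
# a hypothetical example, not taken from any dataset):
#   clean_str("It's a test (really)!")  ->  "it 's a test ( really ) !"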
def preprocess_doc(data):
    """Strip and clean every document in the corpus."""
    data = [s.strip() for s in data]
    data = [clean_str(s) for s in data]
    return data
def pad_sequences(sentences, padding_word="<PAD/>", pad_len=None):
    """Pad all token lists to the same length (pad_len, or the longest list)."""
    if pad_len is not None:
        sequence_length = pad_len
    else:
        sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        padded_sentences.append(sentence + [padding_word] * num_padding)
    return padded_sentences
def build_vocab(sentences):
    """Build the vocabulary, sorted by descending word frequency."""
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv
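

# For instance (hypothetical input), build_vocab([["a", "b", "a"], ["b", "a"]])
# yields word_counts == Counter({'a': 3, 'b': 2}), vocabulary == {'a': 0, 'b': 1}
# and vocabulary_inv == ['a', 'b'].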
def build_input_data_cnn(sentences, vocabulary):
    """Map each padded document to a fixed-length array of word indices."""
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    return x
def build_input_data_rnn(data, vocabulary, max_doc_len, max_sent_len):
    """Map each document to a (max_doc_len, max_sent_len) array of word indices.

    Sentences beyond max_doc_len and words beyond max_sent_len are truncated;
    unused cells stay 0.
    """
    x = np.zeros((len(data), max_doc_len, max_sent_len), dtype='int32')
    for i, doc in enumerate(data):
        for j, sent in enumerate(doc):
            if j >= max_doc_len:
                break
            for k, word in enumerate(sent):
                if k >= max_sent_len:
                    break
                x[i, j, k] = vocabulary[word]
    return x
def extract_keywords(data_path, vocab, class_type, num_keywords, data, perm):
    """Extract class keywords from labeled seed documents via tf-idf.

    doc_id.txt lists the seed document ids per class; the top tf-idf terms of
    each class's averaged seed vectors become its keywords. Returns the keyword
    lists and the (permuted, length-padded) seed document indices.
    """
    sup_data = []
    sup_idx = []
    sup_label = []
    file_name = 'doc_id.txt'
    with open(join(data_path, file_name), mode='r', encoding='utf-8') as infile:
        text = infile.readlines()
    for i, line in enumerate(text):
        line = line.split('\n')[0]
        class_id, doc_ids = line.split(':')
        assert int(class_id) == i
        seed_idx = [int(idx) for idx in doc_ids.split(',')]
        sup_idx.append(seed_idx)
        for idx in seed_idx:
            sup_data.append(" ".join(data[idx]))
            sup_label.append(i)
    tfidf = TfidfVectorizer(norm='l2', sublinear_tf=True, max_df=0.2, stop_words='english')
    sup_x = tfidf.fit_transform(sup_data)
    sup_x = np.asarray(sup_x.todense())
    vocab_dict = tfidf.vocabulary_
    vocab_inv_dict = {v: k for k, v in vocab_dict.items()}
    keywords = []
    cnt = 0
    for i in range(len(sup_idx)):
        # Average the tf-idf vectors of this class's seed documents.
        class_vec = np.average(sup_x[cnt:cnt + len(sup_idx[i])], axis=0)
        cnt += len(sup_idx[i])
        sort_idx = np.argsort(class_vec)[::-1]
        keyword = []
        j = 0
        k = 0
        if class_type == 'topic':
            # Take the top-ranked terms that also appear in the model vocabulary.
            while j < num_keywords:
                w = vocab_inv_dict[sort_idx[k]]
                if w in vocab:
                    keyword.append(w)
                    j += 1
                k += 1
        elif class_type == 'sentiment':
            # For sentiment classes, keep only adjectives (POS tags starting with "J").
            while j < num_keywords:
                w = vocab_inv_dict[sort_idx[k]]
                w, t = nltk.pos_tag([w])[0]
                if t.startswith("J") and w in vocab:
                    keyword.append(w)
                    j += 1
                k += 1
        keywords.append(keyword)
    # Remap seed document ids through the shuffling permutation.
    m = {v: k for k, v in enumerate(perm)}
    new_sup_idx = [[m[ele] for ele in seed_idx] for seed_idx in sup_idx]
    # Pad every class's id list to the longest one by repeating it cyclically.
    maxlen = max(len(idlist) for idlist in new_sup_idx)
    new_sup_idx0 = []
    for idlist in new_sup_idx:
        idlist0 = idlist * (maxlen // len(idlist) + 1)
        new_sup_idx0.append(idlist0[:maxlen])
    new_sup_idx0 = np.asarray(new_sup_idx0)
    return keywords, new_sup_idx0
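

# The doc_id.txt format, as implied by the parsing above (the ids shown are
# hypothetical): one line per class, "class_id:comma-separated document ids",
# e.g.
#   0:12,35,77
#   1:4,18,230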
def load_keywords(data_path, sup_source):
    """Load label surface names (classes.txt) or class keywords (keywords.txt)."""
    if sup_source == 'labels':
        file_name = 'classes.txt'
        print("\n### Supervision type: Label Surface Names ###")
        print("Label Names for each class: ")
    elif sup_source == 'keywords':
        file_name = 'keywords.txt'
        print("\n### Supervision type: Class-related Keywords ###")
        print("Keywords for each class: ")
    with open(join(data_path, file_name), mode='r', encoding='utf-8') as infile:
        text = infile.readlines()
    keywords = []
    for i, line in enumerate(text):
        line = line.split('\n')[0]
        class_id, contents = line.split(':')
        assert int(class_id) == i
        keyword = contents.split(',')
        print("Supervision content of class {}:".format(i))
        print(keyword)
        keywords.append(keyword)
    return keywords
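

# Both files share the line format of doc_id.txt, with words in place of ids
# (the class names below are hypothetical examples):
#   classes.txt   ->  0:politics
#   keywords.txt  ->  0:politics,government,election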
def load_cnn(dataset_name, sup_source, num_keywords=10, with_evaluation=True, truncate_len=None):
    """Load a dataset as flat word-index sequences for the CNN model."""
    data_path = './' + dataset_name
    data, y = read_file(data_path, with_evaluation)
    sz = len(data)
    np.random.seed(1234)
    perm = np.random.permutation(sz)
    data = preprocess_doc(data)
    data = [s.split(" ") for s in data]
    tmp_list = [len(doc) for doc in data]
    len_max = max(tmp_list)
    len_avg = np.average(tmp_list)
    len_std = np.std(tmp_list)
    print("\n### Dataset statistics: ###")
    print('Document max length: {} (words)'.format(len_max))
    print('Document average length: {} (words)'.format(len_avg))
    print('Document length std: {} (words)'.format(len_std))
    if truncate_len is None:
        # Truncate at mean + 3 std, capped at the longest document.
        truncate_len = min(int(len_avg + 3 * len_std), len_max)
    print("Defined maximum document length: {} (words)".format(truncate_len))
    print('Fraction of truncated documents: {}'.format(
        sum(tmp > truncate_len for tmp in tmp_list) / len(tmp_list)))
    sequences_padded = pad_sequences(data)
    word_counts, vocabulary, vocabulary_inv = build_vocab(sequences_padded)
    x = build_input_data_cnn(sequences_padded, vocabulary)
    x = x[perm]
    if with_evaluation:
        print("Number of classes: {}".format(len(np.unique(y))))
        print("Number of documents in each class:")
        for i in range(len(np.unique(y))):
            print("Class {}: {}".format(i, len(np.where(y == i)[0])))
        y = y[perm]
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))
    if sup_source == 'labels' or sup_source == 'keywords':
        keywords = load_keywords(data_path, sup_source)
        return x, y, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, keywords, perm
    elif sup_source == 'docs':
        class_type = 'topic'
        keywords, sup_idx = extract_keywords(data_path, vocabulary, class_type, num_keywords, data, perm)
        return x, y, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, keywords, sup_idx, perm
def load_rnn(dataset_name, sup_source, num_keywords=10, with_evaluation=True, truncate_len=None):
    """Load a dataset as (document, sentence, word) index tensors for the RNN model.

    truncate_len, if given, is a pair (max_doc_len, max_sent_len).
    """
    data_path = './' + dataset_name
    data, y = read_file(data_path, with_evaluation)
    sz = len(data)
    np.random.seed(1234)
    perm = np.random.permutation(sz)
    data = preprocess_doc(data)
    data_copy = [s.split(" ") for s in data]
    docs_padded = pad_sequences(data_copy)
    word_counts, vocabulary, vocabulary_inv = build_vocab(docs_padded)
    # Split each document into sentences for the hierarchical input.
    data = [tokenize.sent_tokenize(doc) for doc in data]
    flat_data = [sent for doc in data for sent in doc]
    tmp_list = [len(sent.split(" ")) for sent in flat_data]
    max_sent_len = max(tmp_list)
    avg_sent_len = np.average(tmp_list)
    std_sent_len = np.std(tmp_list)
    print("\n### Dataset statistics: ###")
    print('Sentence max length: {} (words)'.format(max_sent_len))
    print('Sentence average length: {} (words)'.format(avg_sent_len))
    if truncate_len is None:
        truncate_sent_len = min(int(avg_sent_len + 3 * std_sent_len), max_sent_len)
    else:
        truncate_sent_len = truncate_len[1]
    print("Defined maximum sentence length: {} (words)".format(truncate_sent_len))
    print('Fraction of truncated sentences: {}'.format(
        sum(tmp > truncate_sent_len for tmp in tmp_list) / len(tmp_list)))
    tmp_list = [len(doc) for doc in data]
    max_doc_len = max(tmp_list)
    avg_doc_len = np.average(tmp_list)
    std_doc_len = np.std(tmp_list)
    print('Document max length: {} (sentences)'.format(max_doc_len))
    print('Document average length: {} (sentences)'.format(avg_doc_len))
    if truncate_len is None:
        truncate_doc_len = min(int(avg_doc_len + 3 * std_doc_len), max_doc_len)
    else:
        truncate_doc_len = truncate_len[0]
    print("Defined maximum document length: {} (sentences)".format(truncate_doc_len))
    print('Fraction of truncated documents: {}'.format(
        sum(tmp > truncate_doc_len for tmp in tmp_list) / len(tmp_list)))
    len_avg = [avg_doc_len, avg_sent_len]
    len_std = [std_doc_len, std_sent_len]
    data = [[sent.split(" ") for sent in doc] for doc in data]
    # Use the truncation lengths computed (or supplied) above; the original
    # call recomputed mean + 3 std here and silently ignored truncate_len.
    x = build_input_data_rnn(data, vocabulary, truncate_doc_len, truncate_sent_len)
    x = x[perm]
    if with_evaluation:
        print("Number of classes: {}".format(len(np.unique(y))))
        print("Number of documents in each class:")
        for i in range(len(np.unique(y))):
            print("Class {}: {}".format(i, len(np.where(y == i)[0])))
        y = y[perm]
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))
    if sup_source == 'labels' or sup_source == 'keywords':
        keywords = load_keywords(data_path, sup_source)
        return x, y, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, keywords, perm
    elif sup_source == 'docs':
        # Yelp is a sentiment task; the other datasets are topic classification.
        class_type = 'sentiment' if dataset_name == 'yelp' else 'topic'
        keywords, sup_idx = extract_keywords(data_path, vocabulary, class_type, num_keywords, data_copy, perm)
        return x, y, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, keywords, sup_idx, perm
def load_dataset(dataset_name, sup_source, model='cnn', with_evaluation=True, truncate_len=None):
    """Dispatch to the CNN or RNN loader."""
    if model == 'cnn':
        return load_cnn(dataset_name, sup_source, with_evaluation=with_evaluation, truncate_len=truncate_len)
    elif model == 'rnn':
        return load_rnn(dataset_name, sup_source, with_evaluation=with_evaluation, truncate_len=truncate_len)
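

# Minimal usage sketch (an assumption, not part of the original module: it
# presumes an './agnews' directory containing dataset.csv and keywords.txt in
# the formats described above):
if __name__ == '__main__':
    x, y, word_counts, vocabulary, vocabulary_inv, len_avg, len_std, keywords, perm = \
        load_dataset('agnews', 'keywords', model='cnn')
    print('Loaded {} documents with vocabulary size {}'.format(len(x), len(vocabulary)))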