Commit bcc8445

add data processor and requirement
1 parent 3633c89 commit bcc8445

File tree

3 files changed: +41 additions, -0 deletions
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# -*- coding: UTF-8 -*-
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# -*- coding: UTF-8 -*-
from multiprocessing import Pool
import en_core_web_sm
from pysenal import get_chunk, read_jsonline_lazy


class Kp20kPreprocessor(object):
    # load the spaCy English model once at class level; only tokenization is needed
    nlp = en_core_web_sm.load(disable=['ner', 'parser', 'textcat'])

    def __init__(self, src_filename, dest_filename, parallel_count=10):
        self.src_filename = src_filename
        self.dest_filename = dest_filename
        self.pool = Pool(parallel_count)

    def process(self):
        chunk_size = 100
        # read the source jsonl lazily and tokenize it chunk by chunk
        for item_chunk in get_chunk(read_jsonline_lazy(self.src_filename), chunk_size):
            # tokenize the chunk in parallel; results are not yet written out in this commit
            self.pool.map(self.tokenize_record, item_chunk)

    def tokenize_record(self, record):
        abstract_tokens = self.tokenize(record['abstract'])
        title_tokens = self.tokenize(record['title'])
        keyword_token_list = []
        for keyword in record['keyword'].split(';'):
            keyword_token_list.append(self.tokenize(keyword))
        result = {'title_tokens': title_tokens, 'abstract_tokens': abstract_tokens,
                  'keyword_tokens': keyword_token_list}
        return result

    def tokenize(self, text, lower=True, stem=False):
        tokens = []
        for token in self.nlp(text):
            if lower:
                token_text = token.text.lower()
            else:
                token_text = token.text

            tokens.append(token_text)
        return tokens
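
The commit does not include a driver for this class. A minimal sketch of how it might be invoked, assuming a KP20k-style jsonl file whose records carry 'title', 'abstract', and a semicolon-separated 'keyword' field (the file paths below are placeholders, not from the commit):

if __name__ == '__main__':
    # hypothetical invocation; paths are made up, and note that this commit
    # does not yet persist the tokenized records to dest_filename
    processor = Kp20kPreprocessor('kp20k.train.jsonl', 'kp20k.train.token.jsonl',
                                  parallel_count=10)
    processor.process()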

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@ torch>=1.2.0
 pysenal>=0.0.8
 tensorboard>=2.0.1
 munch==2.5.0
+en_core_web_sm
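
Note that en_core_web_sm is a spaCy model rather than an ordinary PyPI distribution, so a bare entry in requirements.txt may not resolve on its own; it is typically installed with `python -m spacy download en_core_web_sm` or pinned to the model's release wheel URL.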
