-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheva_nlp.py
More file actions
105 lines (83 loc) · 2.91 KB
/
eva_nlp.py
File metadata and controls
105 lines (83 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-01-13 17:50:09
# @Author : Tom Hu (h1994st@gmail.com)
# @Link : http://h1994st.com
# @Version : 1.0
import re
import subprocess
import peewee
from auto_flow_leaker.comb_coder.public_pool import SINA_EN
from auto_flow_leaker.comb_coder.public_pool import CHINA_DAILY
def parse(sentence, classes):
    """Run the Stanford parser demo jar on one sentence and return its output.

    Launches ``java -Xmx512M -jar StanfordParserDemo.jar <sentence> <classes>``
    and captures everything the process writes to standard output.

    Args:
        sentence: Text handed to the jar as its first argument.
        classes: Label handed to the jar as its second argument.

    Returns:
        The captured standard output, or an empty byte string when the
        process could not be started.
    """
    # An argument list (and no shell) keeps quotes, ``$`` or backticks that
    # appear inside the sentence from being interpreted as shell syntax.
    command = [
        'java', '-Xmx512M', '-jar', 'StanfordParserDemo.jar',
        # '%s' formatting coerces the values the same way the old command
        # string did, so non-string arguments are still accepted.
        '%s' % (sentence,),
        '%s' % (classes,),
    ]
    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        # communicate() drains the pipe, waits for the process to exit and
        # closes the pipe handle.
        output, _ = process.communicate()
    except (OSError, ValueError) as err:
        print('>>>> Run Time Error (%s) When parsing: %s' % (err, sentence))
        # Callers write the result straight to a file, so hand back an empty
        # payload instead of failing on an unbound local.
        return b''
    return output
def generate_random_post(num):
    """Fetch random posts drawn from SINA_EN and CHINA_DAILY together.

    Each side selects the TITLE, SUMMARY and URL columns of rows whose
    SUMMARY is not the empty string.

    Args:
        num: Row limit applied to the combined query.

    Returns:
        Whatever ``execute()`` yields for the combined query.
    """
    sina_posts = SINA_EN.select(
        SINA_EN.TITLE,
        SINA_EN.SUMMARY,
        SINA_EN.URL,
    ).where(SINA_EN.SUMMARY != '')
    china_daily_posts = CHINA_DAILY.select(
        CHINA_DAILY.TITLE,
        CHINA_DAILY.SUMMARY,
        CHINA_DAILY.URL,
    ).where(CHINA_DAILY.SUMMARY != '')

    # Combine both selects first; ordering and the limit are applied to the
    # combined query, not to either side on its own.
    combined = sina_posts | china_daily_posts
    randomized = combined.order_by(peewee.fn.Rand()).limit(num)
    return randomized.execute()
def generate_random_post2(num):
    """Fetch random posts from CHINA_DAILY only.

    Selects the TITLE, SUMMARY and URL columns of rows whose SUMMARY is not
    the empty string.

    Args:
        num: Row limit applied to the query.

    Returns:
        Whatever ``execute()`` yields for the query.
    """
    with_summary = CHINA_DAILY.select(
        CHINA_DAILY.TITLE,
        CHINA_DAILY.SUMMARY,
        CHINA_DAILY.URL,
    ).where(CHINA_DAILY.SUMMARY != '')
    randomized = with_summary.order_by(peewee.fn.Rand()).limit(num)
    return randomized.execute()
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross line breaks, so a ``<`` that
    is not closed by a ``>`` on the same line is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
def main(num_news_digest, num_testing_file):
    """Write parser output for random news posts into a numbered file.

    For every digest index, ten posts are fetched with
    ``generate_random_post`` and the ``parse`` result of each post's TITLE
    and SUMMARY is written to ``data/nlp/testing/testing<N>.csv``.

    Args:
        num_news_digest: Number of digests (batches of ten posts) to write.
        num_testing_file: Number substituted into the output file name.
    """
    output_path = 'data/nlp/testing/testing%d.csv' % num_testing_file
    with open(output_path, 'w') as out:
        print(out.name)
        for digest_index in xrange(num_news_digest):
            print(digest_index)
            # Every post of one digest is parsed with the same label.
            label = 'news %d' % digest_index
            for post in generate_random_post(10):
                out.write(parse(post.TITLE, label))
                out.write(parse(post.SUMMARY, label))
    print('Done!')
def main2(num_news_digest, num_testing_file):
    """Write tag-stripped news summaries, one per line, into a digest file.

    For every digest index, ten posts are fetched with
    ``generate_random_post2``. Each SUMMARY is UTF-8 encoded, stripped of
    ``<...>`` markup by ``cleanhtml``, terminated with a full stop if it does
    not already end with one, and written to
    ``data/nlp/testing/testing-news-digest-<N>.csv``.

    Summaries that are empty once the markup is removed are skipped.

    Args:
        num_news_digest: Number of digests (batches of ten posts) to write.
        num_testing_file: Number substituted into the output file name.
    """
    output_path = (
        'data/nlp/testing/testing-news-digest-%d.csv' % num_testing_file)
    with open(output_path, 'w') as fp:
        print(fp.name)
        for i in xrange(num_news_digest):
            print(i)
            for result in generate_random_post2(10):
                line = cleanhtml(result.SUMMARY.encode('utf8'))
                if not line:
                    # A summary consisting only of markup is empty after
                    # cleaning; indexing its last character would raise
                    # IndexError, so there is nothing to write for it.
                    continue
                if not line.endswith('.'):
                    line += '.'
                fp.write(line)
                fp.write('\n')
                fp.flush()
if __name__ == '__main__':
    # Script entry point: build ten news-digest testing files, numbered 0 to
    # 9, each made of 1000 digests.
    for file_index in xrange(10):
        main2(1000, file_index)