Skip to content

Commit 317c5d1

Browse files
authored
Update gensim_similarity.py
1 parent d397550 commit 317c5d1

File tree

1 file changed

+38
-42
lines changed

1 file changed

+38
-42
lines changed

similarity/gensim_similarity.py

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,42 @@
1-
2-
# Document similarity using the NLTK and Gensim libraries
31
import gensim
4-
import nltk
52
from nltk.tokenize import word_tokenize
63

7-
raw_documents = ["I'm taking the show on the road.",
8-
"My socks are a force multiplier.",
9-
"I am the barber who cuts everyone's hair who doesn't cut their own.",
10-
"Legend has it that the mind is a mad monkey.",
4+
class GensimSimilarity:
    """Hold the raw text documents that the similarity comparison runs over.

    The documents are stored on ``self.raw_documents`` as a list of strings;
    ``getSimilarity`` reads that attribute.
    """

    def __init__(self, raw_documents=None):
        """Store the documents to compare against.

        Args:
            raw_documents: Optional iterable of document strings. When
                omitted (or ``None``), the five built-in sample sentences
                are used, which matches the previous hard-coded behavior.
        """
        if raw_documents is None:
            raw_documents = [
                "I'm taking the show on the road.",
                "My socks are a force multiplier.",
                "I am the barber who cuts everyone's hair who doesn't cut their own.",
                "Legend has it that the mind is a mad monkey.",
                "I make my own fun.",
            ]
        # Copy so that each instance owns its list, independent of the caller's.
        self.raw_documents = list(raw_documents)
12-
# Previous flat-script version (removed side of the diff); the diff line markers are preserved as-is.
print("Number of documents:",len(raw_documents))
13-
14-
# Tokenize every document with word_tokenize and lowercase each token.
gen_docs = [[w.lower() for w in word_tokenize(text)]
15-
for text in raw_documents]
16-
print(gen_docs)
17-
18-
# Build a gensim Dictionary from the tokenized documents and print a few lookups.
dictionary = gensim.corpora.Dictionary(gen_docs)
19-
print(dictionary[5])
20-
print(dictionary.token2id['road'])
21-
print("Number of words in dictionary:",len(dictionary))
22-
for i in range(len(dictionary)):
23-
print(i, dictionary[i])
24-
25-
# Convert each tokenized document with dictionary.doc2bow.
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
26-
print(corpus)
27-
28-
# Fit a TfidfModel on the corpus.
tf_idf = gensim.models.TfidfModel(corpus)
29-
print(tf_idf)
30-
# Add up the length of every entry in corpus and print the total.
s = 0
31-
for i in corpus:
32-
s += len(i)
33-
print(s)
34-
35-
# Build the Similarity index over the tf-idf corpus using the 'workdir/' output prefix.
sims = gensim.similarities.Similarity('workdir/',tf_idf[corpus],
36-
num_features=len(dictionary))
37-
print(sims)
38-
print(type(sims))
39-
40-
# Run the query sentence through the same tokenize -> doc2bow -> tf-idf steps, then query the index.
query_doc = [w.lower() for w in word_tokenize("Socks are a force for good.")]
41-
print(query_doc)
42-
query_doc_bow = dictionary.doc2bow(query_doc)
43-
print(query_doc_bow)
44-
query_doc_tf_idf = tf_idf[query_doc_bow]
45-
print(query_doc_tf_idf)
46-
print(sims[query_doc_tf_idf])
11+
12+
def getSimilarity(gen, query="Socks are a force for good."):
    """Print and return the similarity of ``query`` to each stored document.

    Each document in ``gen.raw_documents`` is tokenized with
    ``word_tokenize`` and lowercased, mapped through a gensim ``Dictionary``
    with ``doc2bow``, and weighted with a ``TfidfModel``. A gensim
    ``Similarity`` index is built over the weighted corpus, and the query is
    pushed through the same steps before being looked up in the index.
    Intermediate values are printed along the way.

    Args:
        gen: The instance (conventionally ``self``) whose ``raw_documents``
            attribute holds the document strings.
        query: Text to compare against the documents. Defaults to the
            sentence that was previously hard-coded.

    Returns:
        The value of ``sims[query_tf_idf]`` (per the gensim documentation,
        one similarity score per indexed document). Earlier versions
        returned ``None`` and only printed this value.
    """
    import os  # local import: only needed to prepare the index directory

    gen_docs = [[w.lower() for w in word_tokenize(text)]
                for text in gen.raw_documents]
    print(gen_docs)
    dictionary = gensim.corpora.Dictionary(gen_docs)
    print("Number of words in dictionary:", len(dictionary))

    for i in range(len(dictionary)):
        print(i, dictionary[i])

    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    print(corpus)

    tf_idf = gensim.models.TfidfModel(corpus)
    print(tf_idf)
    # Total number of entries across all bag-of-words documents.
    print(sum(len(bow) for bow in corpus))

    # The index uses 'workdir/' as its output prefix; make sure that
    # directory exists so a fresh checkout does not fail here.
    os.makedirs('workdir', exist_ok=True)
    sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                          num_features=len(dictionary))

    query_doc = [w.lower() for w in word_tokenize(query)]
    print(query_doc)
    query_doc_bow = dictionary.doc2bow(query_doc)
    print(query_doc_bow)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    result = sims[query_doc_tf_idf]
    print(f'Result: {result}')
    return result
40+
41+
# Module-level driver: instantiate GensimSimilarity and run getSimilarity on it.
similarity = GensimSimilarity()
42+
similarity.getSimilarity()

0 commit comments

Comments
 (0)