Skip to content

Commit 27babef

Browse files
Merge pull request #11 from tgunda/resolve-gensim-update
Resolve gensim update
2 parents ad37bea + 7537963 commit 27babef

File tree

6 files changed

+1982
-274
lines changed

6 files changed

+1982
-274
lines changed

docs/conf.py

Lines changed: 8 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -14,12 +14,17 @@
1414
import sys
1515

1616
import mock
17-
MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection', 'scipy.sparse', 'pandas', 'scipy', 'sklearn.base', 'gensim.models.doc2vec', 'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords', 'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network', 'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib", "matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize", "plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker', 'scipy.signal.find_peaks']
17+
MODULES = ['numpy', 'nltk', 'sklearn.pipeline', 'sklearn.model_selection',
18+
'scipy.sparse', 'pandas', 'scipy', 'sklearn.base', 'gensim.models.doc2vec',
19+
'nltk.tokenize', 'datefinder', 'text_remove_nondate_nums', 'text_remove_numbers_stopwords',
20+
'get_dates', 'gensim.models', 'sklearn.svm', 'sklearn.tree', 'sklearn.neural_network',
21+
'sklearn.linear_model', 'sklearn.ensemble', "sklearn.cluster", "networkx", "matplotlib",
22+
"matplotlib.pyplot", "gensim.models.doc2vec", "sklearn.feature_extraction.text", "nltk.tokenize",
23+
"plotly.graph_objects", "scipy.signal", 'matplotlib.colors', 'seaborn', 'matplotlib.ticker',
24+
'scipy.signal.find_peaks']
1825
for module in MODULES:
1926
sys.modules[module] = mock.Mock()
2027

21-
import shlex
22-
2328
sys.path.insert(0, os.path.abspath("../pvops"))
2429
sys.path.insert(0, os.path.abspath("../pvops/text2time"))
2530
sys.path.insert(0, os.path.abspath("../pvops/text"))

examples/text_class_example.py

Lines changed: 10 additions & 29 deletions
Original file line number · Diff line number · Diff line change
@@ -1,43 +1,27 @@
1-
import numpy as np
2-
import pandas as pd
3-
import pickle
4-
import traceback
5-
import nltk
6-
71
import sys
82
import os
93

104
sys.path.append("..")
115
pvops_directory = os.path.join("..", "pvops")
126
sys.path.append(pvops_directory)
137

14-
# Utilities
8+
# pvOps subpackages
159
from pvops.text import nlp_utils
1610
from pvops.text import utils
17-
18-
# Visualizations
1911
from pvops.text import visualize
20-
21-
# Preprocessing
2212
from pvops.text import preprocess
23-
24-
# Classification
2513
from pvops.text import classify
26-
27-
# Library example definitions
2814
from pvops.text import defaults
2915

30-
# Embedding
16+
import nltk
17+
import traceback
18+
import pandas as pd
19+
import numpy as np
3120
from sklearn.feature_extraction.text import TfidfVectorizer
3221
from gensim.models.doc2vec import Doc2Vec
33-
34-
# Clustering
3522
from sklearn.cluster import KMeans
36-
37-
# Scoring
3823
from sklearn.metrics import make_scorer, f1_score, homogeneity_score
3924

40-
4125
class Example:
4226
def __init__(self, df, LABEL_COLUMN):
4327
self.LABEL_COLUMN = LABEL_COLUMN
@@ -75,7 +59,7 @@ def extract_dates(
7559
EVENTSTART_COLUMN,
7660
SAVE_DATA_COLUMN="CleanDesc",
7761
SAVE_DATE_COLUMN="ExtractedDates",
78-
print_info = False,
62+
print_info=False,
7963
):
8064

8165
col_dict = {
@@ -505,7 +489,7 @@ def predict_best_model(
505489
output_col = f"Unsupervised_Pred_{self.LABEL_COLUMN}"
506490

507491
self.df[output_col] = pred_y
508-
print(f"Predictions stored to {output_col} in `df` attribute")
492+
print(f"Predictions stored to {output_col} in `df`")
509493

510494
print(f"Score: {score}")
511495

@@ -519,9 +503,6 @@ def predict_best_model(
519503
df = pd.read_csv(folder + filename)
520504

521505
e = Example(df, LABEL_COLUMN)
522-
# df = e.extract_dates(DATA_COLUMN, DATE_COLUMN, SAVE_DATE_COLUMN='ExtractedDates')
523-
e.prep_data_for_ML(DATA_COLUMN, DATE_COLUMN)
524-
# e.test_doc2vec()
525-
# Setting few cross validation splits because of few example data
526-
e.classify_supervised(n_cv_splits=2, embedding="doc2vec")
527-
e.predict_best_model()
506+
e.summarize_text_data(DATA_COLUMN)
507+
508+
print("\nMessage from pvOps team: See `tutorial_textmodule.ipynb` for a more in-depth demonstration of the text module's functionality.")

examples/tutorial_textmodule.ipynb

Lines changed: 1959 additions & 238 deletions
Large diffs are not rendered by default.

pvops/text/nlp_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -39,6 +39,7 @@ def __init__(
3939
self.dv_mapfile = dv_mapfile
4040
self.comment = comment
4141
self.trim_rule = trim_rule
42+
self.callbacks = callbacks
4243
self.window = window
4344
self.epochs = epochs
4445

@@ -70,7 +71,7 @@ def fit(self, raw_documents, y=None):
7071
self.d2v_model.train(
7172
tagged_documents,
7273
total_examples=len(tagged_documents),
73-
epochs=self.d2v_model.iter,
74+
epochs=self.d2v_model.epochs,
7475
)
7576
return self
7677

pvops/text/utils.py

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -31,10 +31,10 @@ def summarize_text_data(om_df, colname):
3131
tokenized = [sentence.split() for sentence in nonan_text]
3232
avg_n_words = np.array([len(tokens) for tokens in tokenized]).mean()
3333
sum_n_words = np.array([len(tokens) for tokens in tokenized]).sum()
34-
model = Word2Vec(tokenized, min_count=1, size=64)
34+
model = Word2Vec(tokenized, min_count=1)
3535

3636
# Total vocabulary
37-
vocab = model.wv.vocab
37+
vocab = model.wv
3838

3939
# Bold title.
4040
print("\033[1m" + "DETAILS" + "\033[0m")

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -7,5 +7,5 @@ datefinder
77
matplotlib
88
seaborn
99
plotly
10-
gensim
10+
gensim>=4.0.0
1111
networkx

0 commit comments

Comments (0)