Skip to content

Commit a4de9a4

Browse files
authored
Keyword search in text from reference dictionary (#46)
* init file and pull sample data * add helper functions and reference dict * clean up text, tokenize, add new labels from regex function * move example into its own file, seperate out functions to use * clean functions and make generic names * clean up example * move reference dict to example_data * remove pdb * move dict out of functions file * missed import * add metrics to measure accuracy * rename other category to unknown for fair comparison * move keyword search to preprocess step, util * move add label fucntion to classify * fix imports, add comment for viz * remove temp file * change arg to dict * add keyword util * update formating * whitespace * update args to match * remove whitespace * function rename * clean up dict * add sample data * update docstring and change col dict name * clean up and use sample data in examples * add plot function * add in visualization function for confusion matrix * add more detail to dox string * clean up function to expect predicted_col * clean up example * remove whitespace * change col name from new to predicted * clean up sample data * add notebook example * add functionality to replace text in preprocessing * add text replacement step * linter * remove example .py file in favor of jupyter notebook * remove reference dict * add new csv mapping rfiles * add csv * update jupyternotebook example * use df instead of dict * remove dict * remove unused imports and functions * update jupyter notebook and remove module * remove plot * update docstrings * linter * remove whitespace
1 parent 2526d49 commit a4de9a4

File tree

7 files changed

+420
-0
lines changed

7 files changed

+420
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
in,out_
2+
combiner,combiner
3+
comb,combiner
4+
cb,combiner
5+
battery,battery
6+
bess,battery
7+
inverter,inverter
8+
invert,inverter
9+
inv,inverter
10+
met,met
11+
meter,meter
12+
module,module
13+
mod,module
14+
recloser,recloser
15+
reclose,recloser
16+
relay,relay
17+
substation,substation
18+
switchgear,switchgear
19+
switch,switchgear
20+
tracker,tracker
21+
transformer,transformer
22+
xfmr,transformer
23+
wiring,wiring
24+
wire,wiring
25+
wires,wiring
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
in,out_
2+
comm,communication
3+
energy,energy
4+
kwh,energy
5+
mwh,energy
6+
grid,grid
7+
curtailment,grid
8+
curtail,grid
9+
poi,grid
10+
offline,outage
11+
solar,solar
12+
pv,solar
13+
photovoltaic,solar
14+
system,system
15+
site,system
16+
farm,system
17+
project,system
18+
sma,make_model
19+
cm,corrective_maintenance
20+
pm,preventative_maintenance
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
{
2+
"cells": [
3+
{
4+
"attachments": {},
5+
"cell_type": "markdown",
6+
"metadata": {},
7+
"source": [
8+
"# Adding keyword labels to O&M data\n",
9+
"This notebook demonstrates the use of the `pvops.text.classify.get_attributes_from_keywords` function for adding asset labels based on O&M notes."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import pandas as pd\n",
19+
"from sklearn.metrics import accuracy_score\n",
20+
"\n",
21+
"from pvops.text import utils, preprocess\n",
22+
"from pvops.text.classify import get_attributes_from_keywords\n",
23+
"from pvops.text.visualize import visualize_classification_confusion_matrix"
24+
]
25+
},
26+
{
27+
"attachments": {},
28+
"cell_type": "markdown",
29+
"metadata": {},
30+
"source": [
31+
"# Step 0: Get sample data, remap assets"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": null,
37+
"metadata": {},
38+
"outputs": [],
39+
"source": [
40+
"# pull in sample data and remap assets for ease of comparison\n",
41+
"\n",
42+
"om_df = pd.read_csv('example_data/example_ML_ticket_data.csv')\n",
43+
"col_dict = {\n",
44+
" \"data\" : \"CompletionDesc\",\n",
45+
" \"eventstart\" : \"Date_EventStart\",\n",
46+
" \"save_data_column\" : \"processed_data\",\n",
47+
" \"save_date_column\" : \"processed_date\",\n",
48+
" \"attribute_col\" : \"Asset\",\n",
49+
" \"predicted_col\" : \"Keyword_Asset\",\n",
50+
" \"remapping_col_from\": \"in\",\n",
51+
" \"remapping_col_to\": \"out_\"\n",
52+
"}\n",
53+
"\n",
54+
"# remap assets\n",
55+
"remapping_df = pd.read_csv('example_data/remappings_asset.csv')\n",
56+
"remapping_df['out_'] = remapping_df['out_'].replace({'met station': 'met',\n",
57+
" 'energy storage': 'battery',\n",
58+
" 'energy meter': 'meter'})\n",
59+
"om_df = utils.remap_attributes(om_df, remapping_df, col_dict, allow_missing_mappings=True)\n",
60+
"om_df.head()"
61+
]
62+
},
63+
{
64+
"attachments": {},
65+
"cell_type": "markdown",
66+
"metadata": {},
67+
"source": [
68+
"# Step 1: Text preprocessing"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": null,
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"# preprocessing steps\n",
78+
"om_df[col_dict['attribute_col']] = om_df.apply(lambda row: row[col_dict['attribute_col']].lower(), axis=1)\n",
79+
"om_df = preprocess.preprocessor(om_df, lst_stopwords=[], col_dict=col_dict, print_info=False, extract_dates_only=False)\n",
80+
"\n",
81+
"DATA_COL = col_dict['data']\n",
82+
"om_df[DATA_COL] = om_df['processed_data']\n",
83+
"\n",
84+
"# replace terms\n",
85+
"equipment_df = pd.read_csv('~/pvOps/examples/example_data/mappings_equipment.csv')\n",
86+
"pv_terms_df = pd.read_csv('~/pvOps/examples/example_data/mappings_pv_terms.csv')\n",
87+
"pv_reference_df = pd.concat([equipment_df, pv_terms_df])\n",
88+
"om_df = utils.remap_words_in_text(om_df=om_df, remapping_df=pv_reference_df, remapping_col_dict=col_dict)\n",
89+
"\n",
90+
"om_df.head()"
91+
]
92+
},
93+
{
94+
"attachments": {},
95+
"cell_type": "markdown",
96+
"metadata": {},
97+
"source": [
98+
"# Step 2: Search for keywords to use as labels"
99+
]
100+
},
101+
{
102+
"cell_type": "code",
103+
"execution_count": null,
104+
"metadata": {},
105+
"outputs": [],
106+
"source": [
107+
"# add asset labels from keyword reference dict\n",
108+
"om_df = get_attributes_from_keywords(om_df=om_df,\n",
109+
" col_dict=col_dict,\n",
110+
" reference_df=equipment_df)\n",
111+
"om_df.head()"
112+
]
113+
},
114+
{
115+
"attachments": {},
116+
"cell_type": "markdown",
117+
"metadata": {},
118+
"source": [
119+
"# Step 3: Metrics"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": null,
125+
"metadata": {},
126+
"outputs": [],
127+
"source": [
128+
"# get accuracy measures and count metrics\n",
129+
"PREDICT_COL = col_dict['predicted_col']\n",
130+
"LABEL_COL = col_dict['attribute_col']\n",
131+
"\n",
132+
"# entries with some keyword of interest, over all entries\n",
133+
"label_count = om_df[PREDICT_COL].count() / len(om_df)\n",
134+
"\n",
135+
"# replace 'Other' values with 'Unknown'\n",
136+
"om_df[LABEL_COL] = om_df[LABEL_COL].replace('other', 'unknown')\n",
137+
"# replace NaN values to use accuracy score\n",
138+
"om_df[[LABEL_COL, PREDICT_COL]] = om_df[[LABEL_COL, PREDICT_COL]].fillna('unknown')\n",
139+
"acc_score = accuracy_score(y_true=om_df[LABEL_COL], y_pred=om_df[PREDICT_COL])\n",
140+
"\n",
141+
"msg = f'{label_count:.2%} of entries had a keyword of interest, with {acc_score:.2%} accuracy.'\n",
142+
"print(msg)"
143+
]
144+
},
145+
{
146+
"attachments": {},
147+
"cell_type": "markdown",
148+
"metadata": {},
149+
"source": [
150+
"# Step 4: Visualization"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {},
157+
"outputs": [],
158+
"source": [
159+
"# plot confusion matrix\n",
160+
"title = 'Confusion Matrix of Actual and Predicted Asset Labels'\n",
161+
"visualize_classification_confusion_matrix(om_df, col_dict, title)"
162+
]
163+
}
164+
],
165+
"metadata": {
166+
"kernelspec": {
167+
"display_name": "Python 3",
168+
"language": "python",
169+
"name": "python3"
170+
},
171+
"language_info": {
172+
"codemirror_mode": {
173+
"name": "ipython",
174+
"version": 3
175+
},
176+
"file_extension": ".py",
177+
"mimetype": "text/x-python",
178+
"name": "python",
179+
"nbconvert_exporter": "python",
180+
"pygments_lexer": "ipython3",
181+
"version": "3.7.5"
182+
},
183+
"orig_nbformat": 4
184+
},
185+
"nbformat": 4,
186+
"nbformat_minor": 2
187+
}

pvops/text/classify.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import pandas as pd
99
import copy
1010

11+
from pvops.text.preprocess import get_keywords_of_interest
1112

1213
def classification_deployer(
1314
X,
@@ -187,3 +188,53 @@ def classification_deployer(
187188
best_gs_instance = gs_clf
188189

189190
return pd.concat(rows, axis=1).T, best_gs_instance.best_estimator_
191+
192+
def get_attributes_from_keywords(om_df, col_dict, reference_df, reference_col_dict=None):
    """Find keywords of interest in specified column of dataframe, return as new column value.

    If keywords of interest given in a reference dataframe are in the specified column of the
    dataframe, return the keyword category, or categories.
    For example, if the string 'inverter' is in the list of text, return ['inverter'].

    Parameters
    ----------
    om_df : pd.DataFrame
        Dataframe to search for keywords of interest, must include the column
        named by ``col_dict['data']``.
        Note: the predicted column is assigned onto this dataframe before it is
        exploded, so the caller's object also gains that column (holding the
        un-exploded lists of categories).
    col_dict : dict of {str : str}
        A dictionary that contains the column names needed:

        - data : string, should be assigned to associated column which stores the tokenized text logs
        - predicted_col : string, will be used to create keyword search label column
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for keywords of interest,
        Note: This function can currently only handle single words, no n-gram functionality.
    reference_col_dict : dict of {str : str}, optional
        A dictionary that contains the column names that describes how
        referencing is going to be done. When omitted, defaults to
        ``{'reference_col_from': 'in', 'reference_col_to': 'out_'}``.

        - reference_col_from : string, should be assigned to
          associated column name in reference_df that are possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to
          associated column name in reference_df that are the output reference values
          of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    om_df: pd.DataFrame
        Input df with the predicted column added, where each found keyword is its own row,
        may result in duplicate rows if more than one keywords of interest was found in
        the text column.
    """
    # Built per call (never as a mutable default argument) so callers cannot
    # accidentally share or modify the default mapping.
    if reference_col_dict is None:
        reference_col_dict = {'reference_col_from': 'in', 'reference_col_to': 'out_'}

    predicted_col = col_dict['predicted_col']
    om_df[predicted_col] = om_df[col_dict['data']].apply(
        get_keywords_of_interest,
        reference_df=reference_df,
        reference_col_dict=reference_col_dict,
    )

    # Record which entries matched several categories *before* exploding,
    # because afterwards every row holds at most one category.
    multiple_keywords_df = om_df[om_df[predicted_col].str.len() > 1]

    # each multi-category now in its own row, some logs have multiple equipment issues
    om_df = om_df.explode(predicted_col)

    msg = f'{len(multiple_keywords_df)} entries had multiple keywords of interest. Reference: {multiple_keywords_df.index} in original dataframe.'
    print(msg)

    return om_df

pvops/text/preprocess.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,3 +449,47 @@ def text_remove_numbers_stopwords(document, lst_stopwords):
449449
document = " ".join(document)
450450

451451
return document
452+
453+
454+
def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
    """Find keywords of interest in list of strings from reference dict.

    If keywords of interest given in a reference dict are in the list of
    strings, return the keyword category, or categories. For example,
    if the string 'inverter' is in the list of text, return ['inverter'].

    Parameters
    ----------
    document_tok : list of str
        Tokenized text, functionally a list of string values.
        A plain string is split on whitespace before matching, and a
        missing value (None or a float such as NaN) yields an empty list.
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for keywords of interest,
        Note: This function can currently only handle single words, no n-gram functionality.
    reference_col_dict : dict of {str : str}
        A dictionary that contains the column names that describes how
        referencing is going to be done

        - reference_col_from : string, should be assigned to
          associated column name in reference_df that are possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to
          associated column name in reference_df that are the output reference values
          of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    included_keywords: list of str
        Unique output reference values whose input keyword appears in
        document_tok, can be more than one value. Sorted so the result is
        reproducible from run to run.
    """
    REFERENCE_COL_FROM = reference_col_dict["reference_col_from"]
    REFERENCE_COL_TO = reference_col_dict["reference_col_to"]

    # Missing text cannot contain any keyword; pandas represents it as a
    # float NaN, which is not iterable.
    if document_tok is None or isinstance(document_tok, float):
        return []

    # Intersecting with a bare string would compare single characters, so
    # split it into words first.
    if isinstance(document_tok, str):
        document_tok = document_tok.split()

    reference_dict = dict(
        zip(reference_df[REFERENCE_COL_FROM], reference_df[REFERENCE_COL_TO])
    )

    # keywords of interest
    overlap_keywords = reference_dict.keys() & document_tok

    # Several input keywords can map to the same category: de-duplicate with a
    # set, then sort (set order is arbitrary). key=str keeps sorting safe if
    # the reference column holds a non-string value.
    included_keywords = sorted(
        {reference_dict[keyword] for keyword in overlap_keywords}, key=str
    )
    return included_keywords

0 commit comments

Comments
 (0)