Keyword search in text from reference dictionary (#46)

charity-kwha · web-flow · commit a4de9a4722e3 · 2023-07-07T12:24:17.000-04:00
* init file and pull sample data

* add helper functions and reference dict

* clean up text, tokenize, add new labels from regex function

* move example into its own file, separate out functions to use

* clean functions and make generic names

* clean up example

* move reference dict to example_data

* remove pdb

* move dict out of functions file

* missed import

* add metrics to measure accuracy

* rename other category to unknown for fair comparison

* move keyword search to preprocess step, util

* move add label function to classify

* fix imports, add comment for viz

* remove temp file

* change arg to dict

* add keyword util

* update formatting

* whitespace

* update args to match

* remove whitespace

* function rename

* clean up dict

* add sample data

* update docstring and change col dict name

* clean up and use sample data in examples

* add plot function

* add in visualization function for confusion matrix

* add more detail to docstring

* clean up function to expect predicted_col

* clean up example

* remove whitespace

* change col name from new to predicted

* clean up sample data

* add notebook example

* add functionality to replace text in preprocessing

* add text replacement step

* linter

* remove example .py file in favor of jupyter notebook

* remove reference dict

* add new csv mapping files

* add csv

* update jupyternotebook example

* use df instead of dict

* remove dict

* remove unused imports and functions

* update jupyter notebook and remove module

* remove plot

* update docstrings

* linter

* remove whitespace
diff --git a/examples/example_data/mappings_equipment.csv b/examples/example_data/mappings_equipment.csv
@@ -0,0 +1,25 @@
+in,out_
+combiner,combiner
+comb,combiner
+cb,combiner
+battery,battery
+bess,battery
+inverter,inverter
+invert,inverter
+inv,inverter
+met,met
+meter,meter
+module,module
+mod,module
+recloser,recloser
+reclose,recloser
+relay,relay
+substation,substation
+switchgear,switchgear
+switch,switchgear
+tracker,tracker
+transformer,transformer
+xfmr,transformer
+wiring,wiring
+wire,wiring
+wires,wiring
diff --git a/examples/example_data/mappings_pv_terms.csv b/examples/example_data/mappings_pv_terms.csv
@@ -0,0 +1,20 @@
+in,out_
+comm,communication
+energy,energy
+kwh,energy
+mwh,energy
+grid,grid
+curtailment,grid
+curtail,grid
+poi,grid
+offline,outage
+solar,solar
+pv,solar
+photovoltaic,solar
+system,system
+site,system
+farm,system
+project,system
+sma,make_model
+cm,corrective_maintenance
+pm,preventative_maintenance
diff --git a/examples/tutorial_text_classify_regex_example.ipynb b/examples/tutorial_text_classify_regex_example.ipynb
@@ -0,0 +1,187 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Adding keyword labels to O&M data\n",
+    "This notebook demonstrates the use of the `pvops.text.classify.get_attributes_from_keywords` function for adding asset labels based on O&M notes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "from pvops.text import utils, preprocess\n",
+    "from pvops.text.classify import get_attributes_from_keywords\n",
+    "from pvops.text.visualize import visualize_classification_confusion_matrix"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 0: Get sample data, remap assets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pull in sample data and remap assets for ease of comparison\n",
+    "\n",
+    "om_df = pd.read_csv('example_data/example_ML_ticket_data.csv')\n",
+    "col_dict = {\n",
+    "    \"data\" : \"CompletionDesc\",\n",
+    "    \"eventstart\" : \"Date_EventStart\",\n",
+    "    \"save_data_column\" : \"processed_data\",\n",
+    "    \"save_date_column\" : \"processed_date\",\n",
+    "    \"attribute_col\" : \"Asset\",\n",
+    "    \"predicted_col\" : \"Keyword_Asset\",\n",
+    "    \"remapping_col_from\": \"in\",\n",
+    "    \"remapping_col_to\": \"out_\"\n",
+    "}\n",
+    "\n",
+    "# remap assets\n",
+    "remapping_df = pd.read_csv('example_data/remappings_asset.csv')\n",
+    "remapping_df['out_'] = remapping_df['out_'].replace({'met station': 'met',\n",
+    "                                                     'energy storage': 'battery',\n",
+    "                                                     'energy meter': 'meter'})\n",
+    "om_df = utils.remap_attributes(om_df, remapping_df, col_dict, allow_missing_mappings=True)\n",
+    "om_df.head()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 1: Text preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preprocessing steps\n",
+    "om_df[col_dict['attribute_col']] = om_df.apply(lambda row: row[col_dict['attribute_col']].lower(), axis=1)\n",
+    "om_df = preprocess.preprocessor(om_df, lst_stopwords=[], col_dict=col_dict, print_info=False, extract_dates_only=False)\n",
+    "\n",
+    "DATA_COL = col_dict['data']\n",
+    "om_df[DATA_COL] = om_df['processed_data']\n",
+    "\n",
+    "# replace terms (mapping files are read relative to the examples folder, like the ticket data)\n",
+    "equipment_df = pd.read_csv('example_data/mappings_equipment.csv')\n",
+    "pv_terms_df = pd.read_csv('example_data/mappings_pv_terms.csv')\n",
+    "pv_reference_df = pd.concat([equipment_df, pv_terms_df])\n",
+    "om_df = utils.remap_words_in_text(om_df=om_df, remapping_df=pv_reference_df, remapping_col_dict=col_dict)\n",
+    "\n",
+    "om_df.head()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 2: Search for keywords to use as labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add asset labels from keyword reference dataframe\n",
+    "om_df = get_attributes_from_keywords(\n",
+    "    om_df=om_df, col_dict=col_dict, reference_df=equipment_df,\n",
+    "    reference_col_dict={'reference_col_from': 'in', 'reference_col_to': 'out_'})\n",
+    "om_df.head()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 3: Metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get accuracy measures and count metrics\n",
+    "PREDICT_COL = col_dict['predicted_col']\n",
+    "LABEL_COL = col_dict['attribute_col']\n",
+    "\n",
+    "# entries with some keyword of interest, over all entries\n",
+    "label_count = om_df[PREDICT_COL].count() / len(om_df)\n",
+    "\n",
+    "# replace 'Other' values with 'Unknown'\n",
+    "om_df[LABEL_COL] = om_df[LABEL_COL].replace('other', 'unknown')\n",
+    "# replace NaN values to use accuracy score\n",
+    "om_df[[LABEL_COL, PREDICT_COL]] = om_df[[LABEL_COL, PREDICT_COL]].fillna('unknown')\n",
+    "acc_score = accuracy_score(y_true=om_df[LABEL_COL], y_pred=om_df[PREDICT_COL])\n",
+    "\n",
+    "msg = f'{label_count:.2%} of entries had a keyword of interest, with {acc_score:.2%} accuracy.'\n",
+    "print(msg)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 4: Visualization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot confusion matrix\n",
+    "title = 'Confusion Matrix of Actual and Predicted Asset Labels'\n",
+    "visualize_classification_confusion_matrix(om_df, col_dict, title)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pvops/text/classify.py b/pvops/text/classify.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import copy
 
+from pvops.text.preprocess import get_keywords_of_interest
 
 def classification_deployer(
     X,
@@ -187,3 +188,53 @@ def classification_deployer(
             best_gs_instance = gs_clf
 
     return pd.concat(rows, axis=1).T, best_gs_instance.best_estimator_
+
+def get_attributes_from_keywords(om_df, col_dict, reference_df, reference_col_dict):
+    """Label each row of a dataframe with the keyword categories found in its text.
+
+    The ``data`` column is searched for the keywords of ``reference_df`` and the matching
+    categories are stored in ``predicted_col`` (e.g. 'inverter' for a log containing 'inv').
+    A summary of the entries that matched several categories is printed.
+
+    Parameters
+    ----------
+    om_df : pd.DataFrame
+        Dataframe to search for keywords of interest, must include the ``data`` column.
+        It is left unmodified; the labels are added to a copy.
+    col_dict : dict of {str : str}
+        A dictionary that contains the column names needed:
+
+        - data : string, should be assigned to associated column which stores the tokenized text logs
+        - predicted_col : string, will be used to create keyword search label column
+    reference_df : DataFrame
+        Holds columns that define the reference dictionary to search for keywords of interest,
+        Note: This function can currently only handle single words, no n-gram functionality.
+    reference_col_dict : dict of {str : str}
+        A dictionary that contains the column names that describes how
+        referencing is going to be done
+
+        - reference_col_from : string, column name in reference_df holding the possible
+          input reference values. Example: pd.Series(['inverter', 'invert', 'inv'])
+        - reference_col_to : string, column name in reference_df holding the output reference
+          values of interest. Example: pd.Series(['inverter', 'inverter', 'inverter'])
+
+    Returns
+    -------
+    om_df: pd.DataFrame
+        Copy of the input with ``predicted_col`` added and one row per found category: rows
+        matching several categories are duplicated, rows matching none hold NaN.
+    """
+    predicted_col = col_dict['predicted_col']
+    om_df = om_df.copy()  # work on a copy so the caller's dataframe is not mutated
+    om_df[predicted_col] = om_df[col_dict['data']].apply(
+        get_keywords_of_interest, reference_df=reference_df, reference_col_dict=reference_col_dict
+    )
+
+    # each multi-category now in its own row, some logs have multiple equipment issues
+    multiple_keywords_df = om_df[om_df[predicted_col].str.len() > 1]
+    om_df = om_df.explode(predicted_col)
+
+    msg = f'{len(multiple_keywords_df)} entries had multiple keywords of interest. Reference: {multiple_keywords_df.index} in original dataframe.'
+    print(msg)
+
+    return om_df
diff --git a/pvops/text/preprocess.py b/pvops/text/preprocess.py
@@ -449,3 +449,47 @@ def text_remove_numbers_stopwords(document, lst_stopwords):
     document = " ".join(document)
 
     return document
+
+
+def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
+    """Find the keyword categories of a reference dataframe present in tokenized text.
+
+    Each token equal to an input keyword of the reference is mapped to that keyword's
+    category. For example, with 'inv' mapped to 'inverter', the tokens ['inv', 'tripped']
+    give ['inverter']. Only whole tokens are compared, not substrings.
+
+    Parameters
+    ----------
+    document_tok : list of str
+        Tokenized text. A plain string is split on whitespace first, and missing
+        text (None or NaN) yields an empty list.
+    reference_df : DataFrame
+        Holds columns that define the reference dictionary to search for keywords of interest,
+        Note: This function can currently only handle single words, no n-gram functionality.
+    reference_col_dict : dict of {str : str}
+        A dictionary that contains the column names that describes how
+        referencing is going to be done
+
+        - reference_col_from : string, column name in reference_df holding the possible
+          input reference values. Example: pd.Series(['inverter', 'invert', 'inv'])
+        - reference_col_to : string, column name in reference_df holding the output reference
+          values of interest. Example: pd.Series(['inverter', 'inverter', 'inverter'])
+
+    Returns
+    -------
+    included_keywords : list of str
+        Unique categories from reference_df found in document_tok, sorted so that the
+        order is reproducible; empty when nothing matches.
+    """
+    REFERENCE_COL_FROM = reference_col_dict["reference_col_from"]
+    REFERENCE_COL_TO = reference_col_dict["reference_col_to"]
+    reference_dict = dict(zip(reference_df[REFERENCE_COL_FROM], reference_df[REFERENCE_COL_TO]))
+    if document_tok is None or isinstance(document_tok, float):
+        return []  # missing text (None/NaN) holds no keywords
+    if isinstance(document_tok, str):
+        # a raw string would otherwise be scanned character by character
+        document_tok = document_tok.split()
+
+    # keywords of interest; key=str keeps sorting safe if categories mix types
+    overlap_keywords = reference_dict.keys() & document_tok
+    return sorted({reference_dict[x] for x in overlap_keywords}, key=str)
diff --git a/pvops/text/utils.py b/pvops/text/utils.py
diff --git a/pvops/text/visualize.py b/pvops/text/visualize.py