This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit 500d132

Add Twitter NLP sentiment analysis example to accompany the corresponding video and social push (#311)
* Add Twitter NLP sentiment analysis example to accompany the corresponding video and social push
* minor fixes
* updates from reviews
* make style
* update sentiment analysis models
* update for make style
1 parent 0b5f6a0 commit 500d132

File tree

5 files changed: +344 −1 lines changed

examples/twitter-nlp/README.md

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
<!--
Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Twitter NLP Inference Examples

This directory contains examples for scraping, processing, and classifying Twitter data
using the DeepSparse engine for a >=10x increase in inference performance on commodity CPUs.

## Installation

The dependencies for this example can be installed using `pip`:
```bash
pip3 install -r requirements.txt
```

## Sentiment Analysis Example

The `analyze_sentiment.py` script analyzes and classifies tweets as either positive or negative
based on their contents.
For example, you can analyze the general sentiment of crypto or other common topics across Twitter.

To use it, first gather the desired number of tweets for your topic(s) and save them as a text file to use with `analyze_sentiment.py`.
The script expects one tweet per row, with each tweet formatted as a JSON object containing a `"tweet"` key that maps to the text content.
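For illustration, a minimal sketch of that file format and how it can be parsed (the tweet texts here are hypothetical, not from a real scrape):

```python
import json

# Two example lines of a tweets file: each line is a standalone
# JSON object with a "tweet" key mapping to the text content.
lines = [
    '{"tweet": "Loving the new release!"}',
    '{"tweet": "Not a fan of the latest update."}',
]

# Parse one tweet per line, the same shape analyze_sentiment.py expects
tweets = [json.loads(line)["tweet"] for line in lines]
print(tweets)
```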

An example script, `scrape.py`, is given to show this in action.
Note that it uses the Twint library, which does not abide by Twitter's terms of service.
The script is given as an example only; users are expected to use Twitter's developer pathways and APIs in place of this script.
```bash
python scrape.py --topic '#crypto' --total_tweets 1000
```

Next, use `analyze_sentiment.py` along with sparsified sentiment analysis models from the [SparseZoo](https://sparsezoo.neuralmagic.com/?domain=nlp&sub_domain=sentiment_analysis&page=1)
to performantly analyze the general sentiment across the gathered tweets:
```bash
python analyze_sentiment.py \
    --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/12layer_pruned80_quant-none-vnni" \
    --tweets_file "#crypto.txt"
```
examples/twitter-nlp/analyze_sentiment.py

Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa

"""
Script to analyze the sentiment of a given file of tweets from Twitter
in batch processing mode.

##########
Command help:
Usage: analyze_sentiment.py [OPTIONS]

  Analyze the sentiment of the tweets given in the tweets_file and print out
  the results.

Options:
  --model_path TEXT       The path to the sentiment analysis model to
                          load. Either a model.onnx file, a model folder
                          containing the model.onnx and supporting files, or a
                          SparseZoo model stub.
  --tweets_file TEXT      The path to the tweets json txt file to analyze
                          sentiment for.
  --batch_size INTEGER    The batch size to process the tweets with. A higher
                          batch size may increase performance at the expense
                          of memory resources and individual latency.
  --total_tweets INTEGER  The total number of tweets to analyze from the
                          tweets_file. Defaults to None which will run through
                          all tweets contained in the file.
  --help                  Show this message and exit.

##########
Example running a sparse, quantized sentiment analysis model:
python analyze_sentiment.py \
    --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/12layer_pruned80_quant-none-vnni" \
    --tweets_file /PATH/TO/OUTPUT/FROM/scrape.py

##########
Example running a dense, unoptimized sentiment analysis model:
python analyze_sentiment.py \
    --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/base-none" \
    --tweets_file /PATH/TO/OUTPUT/FROM/scrape.py
"""

import json
from itertools import cycle, islice
from typing import Any, Dict, List, Optional

import click

from deepsparse.transformers import pipeline
from rich import print


def _load_tweets(tweets_file: str):
    # Read one JSON object (one tweet) per line
    tweets = []
    with open(tweets_file, "r") as file:
        for line in file.readlines():
            tweets.append(json.loads(line))

    return tweets


def _prep_data(tweets: List[Dict], total_num: int) -> List[str]:
    # If total_num is set, cycle through the tweets until total_num is reached
    if total_num:
        tweets = islice(cycle(tweets), total_num)

    return [tweet["tweet"].strip().replace("\n", "") for tweet in tweets]


def _batched_model_input(tweets: List[str], batch_size: int) -> Optional[List[str]]:
    # Pop the next full batch off the front of tweets;
    # returns None once fewer than batch_size tweets remain
    if batch_size > len(tweets):
        return None

    batched = tweets[0:batch_size]
    del tweets[0:batch_size]

    return batched


def _classified_positive(sentiment: Dict[str, Any]):
    return sentiment["label"] == "LABEL_1"


def _display_results(batch, sentiments):
    for text, sentiment in zip(batch, sentiments):
        color = "green" if _classified_positive(sentiment) else "magenta"
        print(f"[{color}]{text}[/{color}]")


@click.command()
@click.option(
    "--model_path",
    type=str,
    help="The path to the sentiment analysis model to load. "
    "Either a model.onnx file, a model folder containing the model.onnx "
    "and supporting files, or a SparseZoo model stub.",
)
@click.option(
    "--tweets_file",
    type=str,
    help="The path to the tweets json txt file to analyze sentiment for.",
)
@click.option(
    "--batch_size",
    type=int,
    default=16,
    help="The batch size to process the tweets with. "
    "A higher batch size may increase performance at the expense of memory resources "
    "and individual latency.",
)
@click.option(
    "--total_tweets",
    type=int,
    default=None,
    help="The total number of tweets to analyze from the tweets_file. "
    "Defaults to None which will run through all tweets contained in the file.",
)
def analyze_tweets_sentiment(
    model_path: str, tweets_file: str, batch_size: int, total_tweets: int
):
    """
    Analyze the sentiment of the tweets given in the tweets_file and
    print out the results.
    """
    text_pipeline = pipeline(
        task="text-classification",
        model_path=model_path,
        batch_size=batch_size,
    )
    tweets = _load_tweets(tweets_file)
    tweets = _prep_data(tweets, total_tweets)
    tot_sentiments = []

    while True:
        batch = _batched_model_input(tweets, batch_size)
        if batch is None:
            break
        sentiments = text_pipeline(batch)
        _display_results(batch, sentiments)
        tot_sentiments.extend(sentiments)

    num_positive = sum(
        [1 if _classified_positive(sent) else 0 for sent in tot_sentiments]
    )
    num_negative = sum(
        [1 if not _classified_positive(sent) else 0 for sent in tot_sentiments]
    )
    print("\n\n\n")
    print("###########################################################################")
    print(f"Completed analyzing {len(tot_sentiments)} tweets for sentiment.")

    if num_positive >= num_negative:
        print(
            f"[green]General sentiment is positive with "
            f"{100*num_positive/float(len(tot_sentiments)):.0f}% in favor.[/green]"
        )
    else:
        print(
            f"[magenta]General sentiment is negative with "
            f"{100*num_negative/float(len(tot_sentiments)):.0f}% against.[/magenta]"
        )
    print("###########################################################################")


if __name__ == "__main__":
    analyze_tweets_sentiment()
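The `_batched_model_input` helper above pops fixed-size batches off the front of the tweet list and stops once a full batch can no longer be formed, so any final partial batch is dropped. A standalone sketch of that pattern, with the model call left out and hypothetical item strings:

```python
from typing import List, Optional


def batched_input(items: List[str], batch_size: int) -> Optional[List[str]]:
    # Pop the next full batch off the front; None once fewer
    # than batch_size items remain (partial batch is dropped)
    if batch_size > len(items):
        return None
    batch = items[:batch_size]
    del items[:batch_size]
    return batch


items = [f"tweet {i}" for i in range(5)]
batches = []
while True:
    batch = batched_input(items, 2)
    if batch is None:
        break
    batches.append(batch)

print(batches)  # two full batches of 2; "tweet 4" is left unprocessed
```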
examples/twitter-nlp/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
click==8.0.1
deepsparse>=0.11
git+https://github.com/twintproject/twint@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
rich==12.2.0

examples/twitter-nlp/scrape.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Twitter scraping script using Twint.
Give a topic, or multiple, and it will pull down the desired number of tweets
that match.
Writes the results as JSON to the given output_file.
If None given, will write the results under a new file named after the given topic.


##########
Command help:
Usage: scrape.py [OPTIONS]

  Twitter scraping script using Twint. Give a topic, or multiple, and it will
  pull down the desired number of tweets that match. Writes the results as
  JSON to the given output_file. If None given, will write the results under a
  new file named after the given topic.

Options:
  -t, --topic TEXT        The topics to scrape twitter for, either keywords or
                          hashtags. For example: '--topic #crypto'. Multiple
                          topics can be used as well, for example: '-t #crypto
                          -t #bitcoin'
  --total_tweets INTEGER  The total number of tweets to gather from Twitter.
                          Note, the API used from Twitter has a maximum date
                          range of around 1 week.
  --output_file TEXT      The output file to write the tweets to. If not
                          supplied, will create a new file using the topics as
                          names.
  --help                  Show this message and exit.

##########
Example command for scraping Twitter for #crypto tweets:
python scrape.py --topic '#crypto' --total_tweets 1000
"""

from typing import List, Optional

import click
import twint


@click.command()
@click.option(
    "--topic",
    "-t",
    multiple=True,
    help="The topics to scrape twitter for, either keywords or hashtags. "
    "For example: '--topic #crypto'. "
    "Multiple topics can be used as well, for example: '-t #crypto -t #bitcoin'",
)
@click.option(
    "--total_tweets",
    type=int,
    default=100,
    help="The total number of tweets to gather from Twitter. "
    "Note, the API used from Twitter has a maximum date range of around 1 week.",
)
@click.option(
    "--output_file",
    type=str,
    default=None,
    help="The output file to write the tweets to. "
    "If not supplied, will create a new file using the topics as names.",
)
def scrape_tweets(topic: List[str], total_tweets: int, output_file: Optional[str]):
    """
    Twitter scraping script using Twint.
    Give a topic, or multiple, and it will pull down the desired number of tweets
    that match.
    Writes the results as JSON lines as text to the given output_file.
    If None given, will write the results under a new file named after the given topic.
    """
    print(
        "WARNING: Twint does not abide by Twitter's terms of service. "
        "The script listed here is given only as an example for searching. "
        "Users should use Twitter's accepted APIs and developer console for search."
    )
    config = twint.Config()
    topics_str = " ".join(
        [f"({top})" if top.startswith("#") else top for top in topic]
    )  # wrap hashtag topics in parentheses for the search query
    config.Custom_query = (
        f"{topics_str} min_faves:2 lang:en -filter:links -filter:replies "
    )
    config.Limit = total_tweets
    config.Store_json = True
    config.Output = f"{'_'.join(topic)}.txt" if not output_file else output_file

    print(f"Scraping {total_tweets} tweets")
    twint.run.Search(config)
    print(f"Finished scraping, tweets written to {config.Output}")


if __name__ == "__main__":
    scrape_tweets()
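The custom query above wraps hashtag topics in parentheses before joining them with Twitter search operators; a standalone sketch of that formatting step (the topic list is hypothetical):

```python
topics = ["#crypto", "#bitcoin", "ethereum"]

# Wrap hashtag topics in parentheses, as scrape.py does when building
# its Custom_query string for the Twitter search
topics_str = " ".join(f"({top})" if top.startswith("#") else top for top in topics)
query = f"{topics_str} min_faves:2 lang:en -filter:links -filter:replies"
print(query)  # (#crypto) (#bitcoin) ethereum min_faves:2 lang:en -filter:links -filter:replies
```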

setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ ensure_newline_before_comments = True
 force_grid_wrap = 0
 include_trailing_comma = True
 known_first_party = deepsparse,sparsezoo
-known_third_party = bs4,requests,packaging,setuptools,numpy,onnx,onnxruntime,flask,flask_cors,tqdm,transformers,pydantic,click,yaml
+known_third_party = bs4,requests,packaging,setuptools,numpy,onnx,onnxruntime,flask,flask_cors,tqdm,transformers,pydantic,click,yaml,twint,colorama
 sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER

 line_length = 88
