Merge pull request #31 from vsoch/add/headers-exporters

vsoch · web-flow · commit 14a5b78aca17 · 2019-05-06T11:51:59.000-04:00
Adding user agent header, regex for URL watcher tasks
diff --git a/.github/AUTHORS.md b/.github/AUTHORS.md
@@ -0,0 +1,7 @@
+# Maintainers
+
+ - [@vsoch](https://www.github.com/vsoch)
+
+# Contributors
+
+ - [@SCHKN](https://www.github.com/SCHKN)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ Critical items to know are:
  - changed behaviour
 
 ## [master](https://github.com/vsoch/watchme/tree/master)
+ - Adding option for regular expression for URL watchers, user agent header (0.0.16)
  - requests is missing from install dependencies (0.0.15)
  - small bug fixes (0.0.14)
  - added headers, params, and json args for post and get urls. (0.0.13)
diff --git a/docs/_docs/examples/index.md b/docs/_docs/examples/index.md
@@ -5,8 +5,9 @@ permalink: /examples/index.html
 order: 1
 ---
 
-We will have more examples and details, but for now, here are the example watcher
-repos:
+## Repository Examples
+
+Here you can find example watcher repos:
 
  - [system](https://github.com/vsoch/watchme-system) for system, sensors, users, and networking monitoring using psutils tasks.
  - [air-quality](https://github.com/vsoch/watchme-air-quality) for watching a metric across a few cities.
@@ -17,3 +18,29 @@ For either of the above, you can easily install and activate the watcher to run
 your machine! See [here](https://vsoch.github.io/watchme/getting-started/#how-do-i-get-a-watcher).
 For specific details about creating the watchers in question, see the README markdowns
 in the repositories.
+
+## Configuration Examples
+
+The following example configurations are contributed by users over time. If you
+have an example to contribute, please [open an issue](https://www.github.com/{{ site.repo }}/issues)
+to share it.
+
+### URL Watchers
+
+The following are examples for [URL watchers](https://vsoch.github.io/watchme/watchers/urls/).
+In the following example, the user is using the `get_url_selection` task to extract
+a number (note the regular expression) from the text resulting from selecting the
+class `.local-temp`. For this version of WatchMe the User-Agent header was not
+automatically added, so the user added it here as a `header_*` parameter.
+
+```
+[task-temperature]
+url = https://www.accuweather.com/en/lu/luxembourg/228714/weather-forecast/228714
+selection = .local-temp
+get_text = true
+func = get_url_selection
+active = true
+type = urls
+regex = [0-9]+
+header_user-agent = Mozilla/5.0
+```
diff --git a/docs/_docs/watcher-tasks/urls.md b/docs/_docs/watcher-tasks/urls.md
@@ -38,6 +38,16 @@ A urls task has the following parameters shared across functions.
 | url  | Yes     |undefined|url@https://www.reddit.com/r/hpc| validated starts with http |
 | func | No    |get_task |func@download_task| must be defined in tasks.py |
 
+
+#### Task Headers
+
+For some tasks, you can add one or more headers to the request by specifying `header_<name>`.
+For example, to add the header "Token" I could do `header_Token=123456`.
+By default, each task has the User-Agent header added, as it typically helps.
+If you want to disable this, set `header_User-Agent` to be empty, or change
+it to something else.
+
+
 #### Lists of URL Parameters
 
 For the "Get" and "Get with selection" tasks, you might want to include url parameters. For example,
@@ -69,7 +79,6 @@ or to skip the third page call (page=3) for the name parameter, just leave it em
 url_param_name@V,V,,V,V,V,V
 ```
 
-
 ## Tasks Available
 
  - [Get Task](#1-get-a-url-task) appropriate if you want to perform a GET (e.g., download a page)
@@ -117,7 +126,6 @@ If you specify "save_as" to be json, you will get a results.json unless you spec
 file name. 
 
 
-
 ### 2. Post to a URL Task
 
 This task will post to get changes from a URL, ideal for watching restful API
diff --git a/watchme/version.py b/watchme/version.py
@@ -6,7 +6,7 @@
 # with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-__version__ = "0.0.15"
+__version__ = "0.0.16"
 AUTHOR = 'Vanessa Sochat'
 AUTHOR_EMAIL = 'vsochat@stanford.edu'
 NAME = 'watchme'
diff --git a/watchme/watchers/urls/helpers.py b/watchme/watchers/urls/helpers.py
@@ -30,10 +30,13 @@ def get_params(kwargs, key='url_param_'):
     names = [x for x in kwargs if x.startswith(key)]
     for n in range(len(names)):
         name = names[n]
+
         # Params are split by commas, with index corresponding to list index
         paramlist = kwargs.get(name).split(',')
+
         # Remove the "url_param"
         name = name.replace(key, '', 1)
+
         # Update the dictionary of dictionaries
         for i in range(len(paramlist)):
 
@@ -54,27 +57,75 @@ def get_params(kwargs, key='url_param_'):
     return params
 
 
+def parse_success_response(response, kwargs):
+    '''parse a successful response of 200, meaning we honor the user
+       request to return json, search for a regular expression, or return
+       raw text. This is used by the basic GET/POST functions. For parsing
+       with beautiful soup, see "get_results" and "get_url_selection"
+
+       Parameters
+       ==========
+       response: the requests (200) response
+       kwargs: dictionary of keyword arguments provided to function
+    '''
+    result = None
+    save_as = kwargs.get('save_as', 'json')
+    regex = kwargs.get('regex')
+
+    # Returning the result as json will detect dictionary, and save json
+    if save_as == "json":
+        result = response.json()
+
+    # As an alternative, search for a regular expression
+    elif regex not in ["", None]:
+        match = re.search(regex, response.text)
+        result = match.group()
+
+    # Otherwise, we return text
+    else:
+        result = response.text
+    return result
+
+
 def get_headers(kwargs):
-    '''Get a single set of headers from the kwargs dict.
+    '''Get a single set of headers from the kwargs dict. A user agent is added
+       as it is helpful in most cases.
 
        Parameters
        ==========
        kwargs: the dictionary of keyword arguments that may contain url
                parameters (format is url_param_<name>
     '''
-    headers = {}
+    headers = {"User-Agent": "Mozilla/5.0"}
 
     for key, value in kwargs.items():
         if key.startswith('header_'):
             name = key.replace('header_', '', 1)
-            headers[name] = value
+
+            # The header is defined with a value
+            if value != None:
+                headers[name] = value
+
+            # If the user wants to remove the User-Agent (or any) header
+            elif value == None and name in headers:
+                del headers[name]
 
     return headers
 
 
-def get_results(url, selector, func=None, attributes=None, params={}, get_text=False, headers={}):
-    '''given a url, a function, an optional selector, optional attributes, and a set (dict)
-       of parameters, perform a request.
+def get_results(url, 
+                selector,
+                func=None,
+                attributes=None,
+                params={},
+                get_text=False,
+                headers={},
+                regex=None):
+
+    '''given a url, a function, an optional selector, optional attributes, 
+       and a set (dict) of parameters, perform a request. This function is
+       used if the calling function needs special parsing of the html with
+       beautiful soup. If only a post/get is needed, this is not necessary.
 
        Parameters
        ==========
@@ -103,6 +154,11 @@ def get_results(url, selector, func=None, attributes=None, params={}, get_text=F
             if attributes != None:
                 [results.append(entry.get(x)) for x in attributes]
 
+            # Second priority for regular expression on text
+            elif regex not in [None, ""]:
+                match = re.search(regex, entry.text)
+                results.append(match.group())
+
             # Does the user want to get text?
             elif get_text == True:
                 results.append(entry.text)
diff --git a/watchme/watchers/urls/tasks.py b/watchme/watchers/urls/tasks.py
@@ -13,10 +13,12 @@
 from .helpers import (
     get_params, 
     get_results, 
-    get_headers
+    get_headers,
+    parse_success_response
 )
 from requests.exceptions import HTTPError
 import os
+import re
 import tempfile
 import requests
 
@@ -30,6 +32,10 @@ def get_task(url, **kwargs):
 
        REQUIRED:
            url: a url to return the page for
+
+       OPTIONAL
+           regex: a regular expression to search the text for (not used w/ json)
+           save_as: return the result to save as json
     '''
     results = []
     paramsets = get_params(kwargs)
@@ -39,16 +45,9 @@ def get_task(url, **kwargs):
         response = requests.get(url, params=params, headers=headers)
 
         if response.status_code == 200:
-            save_as = kwargs.get('save_as')
-
-            # Returning the result as json will detect dictionary, and save json
-            if save_as == "json":
-                result = response.json()
-
-            # Otherwise, we return text
-            else:
-                result = response.text
 
+            # Parse the response per the user's request
+            result = parse_success_response(response, kwargs)
             results.append(result)
 
     results = [x for x in results if x]
@@ -81,19 +80,15 @@ def post_task(url, **kwargs):
         response = requests.post(url, json=params, headers=headers)
         if response.status_code == 200:
 
-            save_as = kwargs.get('save_as', 'json')
-
-            # Returning the result as json will detect dictionary, and save json
-            if save_as == "json":
-                result = response.json()
-
-            # Otherwise, we return text
-            else:
-                result = response.text
+            # Parse the response per the user's request
+            result = parse_success_response(response, kwargs)
+            results.append(result)
 
         else:
             bot.error("%s: %s" %(response.status_code, response.reason))
 
+    results = [x for x in results if x]
+
     # Return None if no results found
     if len(results) == 0:
         results = None
@@ -179,6 +174,9 @@ def get_url_selection(url, **kwargs):
     if kwargs.get('get_text') != None:
         get_text = True
 
+    # Are we searching for a regular expression in the result?
+    regex = kwargs.get('regex')
+
     # Does the user want to get one or more attributes?
     attributes = kwargs.get('attributes', None)
     if attributes != None:
@@ -198,7 +196,8 @@ def get_url_selection(url, **kwargs):
                                headers=headers,
                                attributes=attributes,
                                params=params,
-                               get_text=get_text)
+                               get_text=get_text,
+                               regex=regex)
 
     # No results
     if len(results) == 0: