@@ -30,10 +30,13 @@ def get_params(kwargs, key='url_param_'):
30
30
names = [x for x in kwargs if x .startswith (key )]
31
31
for n in range (len (names )):
32
32
name = names [n ]
33
+
33
34
# Params are split by commas, with index corresponding to list index
34
35
paramlist = kwargs .get (name ).split (',' )
36
+
35
37
# Remove the "url_param"
36
38
name = name .replace (key , '' , 1 )
39
+
37
40
# Update the dictionary of dictionaries
38
41
for i in range (len (paramlist )):
39
42
@@ -54,27 +57,75 @@ def get_params(kwargs, key='url_param_'):
54
57
return params
55
58
56
59
60
def parse_success_response(response, kwargs):
    '''parse a successful response of 200, meaning we honor the user
       request to return json, search for a regular expression, or return
       raw text. This is used by the basic GET/POST functions. For parsing
       with beautiful soup, see "get_results" and "get_url_selection"

       The checks are applied in priority order: json (the default when
       "save_as" is not provided), then a regular expression, then raw text.

       Parameters
       ==========
       response: the requests (200) response
       kwargs: dictionary of keyword arguments provided to function. The
               keys read here are "save_as" (defaults to "json") and "regex"

       Returns
       =======
       the parsed json, the first regular expression match in the response
       text (None if the expression does not match), or the response text.
    '''
    save_as = kwargs.get('save_as', 'json')
    regex = kwargs.get('regex')

    # Returning the result as json will detect dictionary, and save json
    if save_as == "json":
        return response.json()

    # As an alternative, search for a regular expression
    if regex not in ["", None]:
        match = re.search(regex, response.text)

        # re.search returns None when nothing matches, so guard before
        # calling group() instead of raising an AttributeError
        return match.group() if match is not None else None

    # Otherwise, we return text
    return response.text
88
+
89
+
57
90
def get_headers(kwargs):
    '''Get a single set of headers from the kwargs dict. A user agent is added
       as it is helpful in most cases.

       Parameters
       ==========
       kwargs: the dictionary of keyword arguments that may contain headers
               (format is header_<name>). A value of None removes the
               header, which is how the default User-Agent can be dropped.

       Returns
       =======
       a dictionary of header names (the "header_" prefix removed) and values
    '''
    headers = {"User-Agent": "Mozilla/5.0"}

    for key, value in kwargs.items():
        if not key.startswith('header_'):
            continue

        name = key.replace('header_', '', 1)

        # If the user wants to remove the User-Agent (or any) header.
        # pop with a default is a no-op when the header is not present.
        if value is None:
            headers.pop(name, None)

        # The header is defined with a value
        else:
            headers[name] = value

    return headers
73
114
74
115
75
- def get_results (url , selector , func = None , attributes = None , params = {}, get_text = False , headers = {}):
76
- '''given a url, a function, an optional selector, optional attributes, and a set (dict)
77
- of parameters, perform a request.
116
+ def get_results (url ,
117
+ selector ,
118
+ func = None ,
119
+ attributes = None ,
120
+ params = {},
121
+ get_text = False ,
122
+ headers = {},
123
+ regex = None ):
124
+
125
+ '''given a url, a function, an optional selector, optional attributes,
126
+ and a set (dict) of parameters, perform a request. This function is
127
+ used if the calling function needs special parsing of the html with
128
+ beautiful soup. If only a post/get is needed, this is not necessary.
78
129
79
130
Parameters
80
131
==========
@@ -103,6 +154,11 @@ def get_results(url, selector, func=None, attributes=None, params={}, get_text=F
103
154
if attributes != None :
104
155
[results .append (entry .get (x )) for x in attributes ]
105
156
157
+ # Second priority for regular expression on text
158
+ elif regex not in [None , "" ]:
159
+ match = re .search (regex , entry .text )
160
+ results .append (match .group ())
161
+
106
162
# Does the user want to get text?
107
163
elif get_text == True :
108
164
results .append (entry .text )
0 commit comments