1
1
import hashlib
2
+ import inspect
2
3
import json
4
+ import ssl
3
5
from datetime import date , datetime , timedelta
4
6
from typing import Dict , List , Tuple
5
7
6
8
import certifi
7
- import requests
9
+ import httpx
10
+ from charset_normalizer import from_bytes
8
11
9
12
from juriscraper .lib .date_utils import (
10
13
fix_future_year_typo ,
20
23
set_response_encoding ,
21
24
)
22
25
from juriscraper .lib .log_tools import make_default_logger
23
- from juriscraper .lib .network_utils import SSLAdapter
24
26
from juriscraper .lib .string_utils import (
25
27
CaseNameTweaker ,
26
28
clean_string ,
27
29
harmonize ,
28
30
trunc ,
29
31
)
30
- from juriscraper .lib .test_utils import MockRequest
31
32
32
33
logger = make_default_logger ()
33
34
@@ -39,7 +40,7 @@ class AbstractSite:
39
40
Should not contain lists that can't be sorted by the _date_sort function.
40
41
"""
41
42
42
- def __init__ (self , cnt = None , ** kwargs ):
43
+ def __init__ (self , cnt = None , user_agent = "Juriscraper" , ** kwargs ):
43
44
super ().__init__ ()
44
45
45
46
# Computed metadata
@@ -50,11 +51,23 @@ def __init__(self, cnt=None, **kwargs):
50
51
self .downloader_executed = False
51
52
self .cookies = {}
52
53
self .cnt = cnt or CaseNameTweaker ()
54
+ self .user_agent = user_agent
55
+
56
+ # Attribute to reference a function passed by the caller,
57
+ # which takes a single argument, the Site object, after
58
+ # each GET or POST request. Intended for saving the response for
59
+ # debugging purposes.
60
+ self .save_response = kwargs .pop ("save_response_fn" , None )
61
+
62
+ kwargs .pop ("backscrape_start" , None )
63
+ kwargs .pop ("backscrape_end" , None )
64
+ kwargs .pop ("days_interval" , None )
65
+ kwargs .setdefault ("http2" , True )
66
+ kwargs .setdefault ("verify" , certifi .where ())
53
67
self .request = {
54
- "verify" : certifi .where (),
55
- "session" : requests .session (),
68
+ "session" : httpx .AsyncClient (** kwargs ),
56
69
"headers" : {
57
- "User-Agent" : "Juriscraper" ,
70
+ "User-Agent" : self . user_agent ,
58
71
# Disable CDN caching on sites like SCOTUS (ahem)
59
72
"Cache-Control" : "no-cache, max-age=0, must-revalidate" ,
60
73
# backwards compatibility with HTTP/1.0 caches
@@ -66,12 +79,6 @@ def __init__(self, cnt=None, **kwargs):
66
79
"url" : None ,
67
80
}
68
81
69
- # Attribute to reference a function passed by the caller,
70
- # which takes a single argument, the Site object, after
71
- # each GET or POST request. Intended for saving the response for
72
- # debugging purposes.
73
- self .save_response = kwargs .get ("save_response_fn" )
74
-
75
82
# Some courts will block Juriscraper or Courtlistener's user-agent
76
83
# or may need special headers. This flag lets the caller know it
77
84
# should use the modified `self.request["headers"]`
@@ -86,8 +93,8 @@ def __init__(self, cnt=None, **kwargs):
86
93
self ._req_attrs = []
87
94
self ._all_attrs = []
88
95
89
async def __aenter__(self):
    """Enter an ``async with`` block.

    :return: This site object, so ``async with Site() as site`` binds it
    """
    return self

async def __aexit__(self, exc_type=None, exc_value=None, traceback=None):
    """Close the HTTP session when an ``async with`` block exits.

    The async context manager protocol always passes the three exception
    arguments, so they must be accepted here. They default to None so that
    a direct ``await site.__aexit__()`` call keeps working.

    :param exc_type: Type of the exception raised in the block, or None
    :param exc_value: The exception instance raised in the block, or None
    :param traceback: Traceback of that exception, or None
    :return: None, so any exception raised inside the block propagates
    """
    await self.close_session()
91
98
92
99
def __str__ (self ):
93
100
out = []
@@ -105,9 +112,9 @@ def __getitem__(self, i):
105
112
def __len__(self):
    """Return the number of entries in ``self.case_names``."""
    names = self.case_names
    return len(names)
107
114
108
async def close_session(self):
    """Close the client stored in ``self.request["session"]``, if any.

    Does nothing when the stored session is falsy (e.g. None).
    """
    session = self.request["session"]
    if not session:
        return
    await session.aclose()
111
118
112
119
def _make_item (self , i ):
113
120
"""Using i, convert a single item into a dict. This is effectively a
@@ -127,20 +134,15 @@ def dump_html(self, element):
127
134
"""Use this for debugging purposes"""
128
135
print (get_html_from_element (element ))
129
136
130
- def disable_certificate_verification (self ):
131
- """Scrapers that require this due to website misconfiguration
132
- should be checked periodically--calls to this method from
133
- site scrapers should be removed when no longer necessary.
134
- """
135
- self .request ["verify" ] = False
136
-
137
137
def set_custom_adapter(self, cipher: str):
    """Build an SSL context restricted to a court-required cipher.

    Intended for out of date court systems that only negotiate specific
    ciphers. This method does not modify ``self.request["session"]``; it
    only builds and returns the context.
    NOTE(review): presumably the caller must hand the returned context to
    the HTTP client (e.g. as its ``verify`` setting) for the cipher to
    take effect -- confirm against the scrapers that call this.

    :param cipher: The court required cipher, as an OpenSSL cipher string
    :return: An ``ssl.SSLContext`` that loads the certifi CA bundle and is
        limited to ``cipher``
    :raises ssl.SSLError: If no cipher can be selected from ``cipher``
    """
    ctx = ssl.create_default_context(cafile=certifi.where())
    ctx.set_ciphers(cipher)
    return ctx
144
146
145
147
def test_mode_enabled (self ):
146
148
return self .method == "LOCAL"
@@ -151,18 +153,25 @@ def to_json(self):
151
153
default = json_date_handler ,
152
154
)
153
155
154
- def parse (self ):
156
+ async def parse (self ):
155
157
if not self .downloader_executed :
156
158
# Run the downloader if it hasn't been run already
157
- self .html = self ._download ()
159
+ self .html = await self ._download ()
158
160
159
161
# Process the available html (optional)
160
- self ._process_html ()
162
+ if inspect .iscoroutinefunction (self ._process_html ):
163
+ await self ._process_html ()
164
+ else :
165
+ self ._process_html ()
161
166
162
167
# Set the attribute to the return value from _get_foo()
163
168
# e.g., this does self.case_names = _get_case_names()
164
169
for attr in self ._all_attrs :
165
- self .__setattr__ (attr , getattr (self , f"_get_{ attr } " )())
170
+ get_attr = getattr (self , f"_get_{ attr } " )
171
+ if inspect .iscoroutinefunction (get_attr ):
172
+ self .__setattr__ (attr , await get_attr ())
173
+ else :
174
+ self .__setattr__ (attr , get_attr ())
166
175
167
176
self ._clean_attributes ()
168
177
if "case_name_shorts" in self ._all_attrs :
@@ -350,7 +359,7 @@ def _make_html_tree(self, text):
350
359
"""
351
360
return get_html_parsed_text (text )
352
361
353
- def _download (self , request_dict = {}):
362
+ async def _download (self , request_dict = {}):
354
363
"""Download the latest version of Site"""
355
364
self .downloader_executed = True
356
365
if self .method == "POST" :
@@ -364,14 +373,12 @@ def _download(self, request_dict={}):
364
373
else :
365
374
logger .info (f"Now downloading case page at: { self .url } " )
366
375
367
- self ._process_request_parameters (request_dict )
368
-
369
376
if self .test_mode_enabled ():
370
- self ._request_url_mock (self .url )
377
+ await self ._request_url_mock (self .url )
371
378
elif self .method == "GET" :
372
- self ._request_url_get (self .url )
379
+ await self ._request_url_get (self .url )
373
380
elif self .method == "POST" :
374
- self ._request_url_post (self .url )
381
+ await self ._request_url_post (self .url )
375
382
376
383
self ._post_process_response ()
377
384
return self ._return_response_text_object ()
@@ -385,46 +392,64 @@ def _process_html(self):
385
392
"""
386
393
pass
387
394
388
- def _process_request_parameters (self , parameters = {}):
389
- """Hook for processing injected parameter overrides"""
390
- if parameters .get ("verify" ) is not None :
391
- self .request ["verify" ] = parameters ["verify" ]
392
- del parameters ["verify" ]
393
- self .request ["parameters" ].update (parameters )
394
-
395
async def _request_url_get(self, url):
    """Execute GET request and assign appropriate request dictionary
    values

    The response is stored in ``self.request["response"]``. When a
    ``save_response`` callback is set, it is called with this site object
    after the request completes.

    :param url: The URL to request
    :return: None
    """
    self.request["url"] = url
    # Defaults go first so that entries in request["parameters"] override
    # them instead of raising a duplicate-keyword TypeError. Redirects are
    # enabled explicitly because httpx, unlike requests, does not follow
    # them by default.
    options = {"timeout": 60, "follow_redirects": True}
    options.update(self.request["parameters"])
    self.request["response"] = await self.request["session"].get(
        url,
        headers=self.request["headers"],
        **options,
    )
    if self.save_response:
        self.save_response(self)
409
408
410
async def _request_url_post(self, url):
    """Execute POST request and assign appropriate request dictionary values

    ``self.parameters`` is sent as the form data. The response is stored
    in ``self.request["response"]``. When a ``save_response`` callback is
    set, it is called with this site object after the request completes.

    :param url: The URL to post to
    :return: None
    """
    self.request["url"] = url
    # Defaults go first so that entries in request["parameters"] override
    # them instead of raising a duplicate-keyword TypeError. Redirects are
    # enabled explicitly because httpx, unlike requests, does not follow
    # them by default.
    options = {"timeout": 60, "follow_redirects": True}
    options.update(self.request["parameters"])
    self.request["response"] = await self.request["session"].post(
        url,
        headers=self.request["headers"],
        data=self.parameters,
        **options,
    )
    if self.save_response:
        self.save_response(self)
423
421
424
async def _request_url_mock(self, url):
    """Execute mock request, used for testing

    Instead of touching the network, the file at ``self.mock_url`` is
    served as a 200 response through an httpx mock transport, and that
    response is stored in ``self.request["response"]``.

    :param url: The URL recorded in ``self.request["url"]``
    :return: None
    :raises httpx.RequestError: If the mock file cannot be opened or read
    """
    self.request["url"] = url

    def handler(request: httpx.Request):
        """Build the mocked response from the local file."""
        try:
            with open(self.mock_url, mode="rb") as stream:
                content = stream.read()
        except OSError as e:
            raise httpx.RequestError(str(e)) from e

        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to charset detection. best() is
            # None when nothing matches, which would otherwise become the
            # literal text "None".
            match = from_bytes(content).best()
            if match is not None:
                text = str(match)
            else:
                text = content.decode("utf-8", errors="replace")

        r = httpx.Response(
            status_code=200,
            request=request,
            text=text,
        )
        if self.mock_url.endswith("json"):
            r.headers["content-type"] = "application/json"
        return r

    transport = httpx.MockTransport(handler)
    # The throwaway client is closed as soon as the response is fetched.
    async with httpx.AsyncClient(transport=transport) as mock_client:
        self.request["response"] = await mock_client.get(url=self.url)
428
453
429
454
def _post_process_response (self ):
430
455
"""Cleanup to response object"""
@@ -452,9 +477,8 @@ def _return_response_text_object(self):
452
477
)
453
478
return html_tree
454
479
455
- def _get_html_tree_by_url (self , url , parameters = {}):
456
- self ._process_request_parameters (parameters )
457
- self ._request_url_get (url )
480
+ async def _get_html_tree_by_url (self , url , parameters = {}):
481
+ await self ._request_url_get (url )
458
482
self ._post_process_response ()
459
483
tree = self ._return_response_text_object ()
460
484
tree .make_links_absolute (url )
0 commit comments