Skip to content

Commit f3708ea

Browse files
committed
Migrate http client from requests to httpx async client
1 parent 3703d9f commit f3708ea

File tree

243 files changed

+6169
-6177
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

243 files changed

+6169
-6177
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ Releases are also tagged in git, if that's helpful.
1212

1313
## Coming up
1414

15+
- Migrate from requests to httpx AsyncClient. #739
16+
- Migrate sync calls to async. #739
1517

1618
## Current
1719

juriscraper/AbstractSite.py

Lines changed: 73 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import hashlib
2+
import inspect
23
import json
4+
import ssl
35
from datetime import date, datetime, timedelta
46
from typing import Dict, List, Tuple
57

68
import certifi
7-
import requests
9+
import httpx
10+
from charset_normalizer import from_bytes
811

912
from juriscraper.lib.date_utils import (
1013
fix_future_year_typo,
@@ -20,14 +23,12 @@
2023
set_response_encoding,
2124
)
2225
from juriscraper.lib.log_tools import make_default_logger
23-
from juriscraper.lib.network_utils import SSLAdapter
2426
from juriscraper.lib.string_utils import (
2527
CaseNameTweaker,
2628
clean_string,
2729
harmonize,
2830
trunc,
2931
)
30-
from juriscraper.lib.test_utils import MockRequest
3132

3233
logger = make_default_logger()
3334

@@ -39,7 +40,7 @@ class AbstractSite:
3940
Should not contain lists that can't be sorted by the _date_sort function.
4041
"""
4142

42-
def __init__(self, cnt=None, **kwargs):
43+
def __init__(self, cnt=None, user_agent="Juriscraper", **kwargs):
4344
super().__init__()
4445

4546
# Computed metadata
@@ -50,11 +51,23 @@ def __init__(self, cnt=None, **kwargs):
5051
self.downloader_executed = False
5152
self.cookies = {}
5253
self.cnt = cnt or CaseNameTweaker()
54+
self.user_agent = user_agent
55+
56+
# Attribute to reference a function passed by the caller,
57+
# which takes a single argument, the Site object, after
58+
# each GET or POST request. Intended for saving the response for
59+
# debugging purposes.
60+
self.save_response = kwargs.pop("save_response_fn", None)
61+
62+
kwargs.pop("backscrape_start", None)
63+
kwargs.pop("backscrape_end", None)
64+
kwargs.pop("days_interval", None)
65+
kwargs.setdefault("http2", True)
66+
kwargs.setdefault("verify", certifi.where())
5367
self.request = {
54-
"verify": certifi.where(),
55-
"session": requests.session(),
68+
"session": httpx.AsyncClient(**kwargs),
5669
"headers": {
57-
"User-Agent": "Juriscraper",
70+
"User-Agent": self.user_agent,
5871
# Disable CDN caching on sites like SCOTUS (ahem)
5972
"Cache-Control": "no-cache, max-age=0, must-revalidate",
6073
# backwards compatibility with HTTP/1.0 caches
@@ -66,12 +79,6 @@ def __init__(self, cnt=None, **kwargs):
6679
"url": None,
6780
}
6881

69-
# Attribute to reference a function passed by the caller,
70-
# which takes a single argument, the Site object, after
71-
# each GET or POST request. Intended for saving the response for
72-
# debugging purposes.
73-
self.save_response = kwargs.get("save_response_fn")
74-
7582
# Some courts will block Juriscraper or Courtlistener's user-agent
7683
or may need special headers. This flag lets the caller know it
7784
# should use the modified `self.request["headers"]`
@@ -86,8 +93,8 @@ def __init__(self, cnt=None, **kwargs):
8693
self._req_attrs = []
8794
self._all_attrs = []
8895

89-
def __del__(self):
90-
self.close_session()
96+
async def __aexit__(self):
97+
await self.close_session()
9198

9299
def __str__(self):
93100
out = []
@@ -105,9 +112,9 @@ def __getitem__(self, i):
105112
def __len__(self):
106113
return len(self.case_names)
107114

108-
def close_session(self):
115+
async def close_session(self):
109116
if self.request["session"]:
110-
self.request["session"].close()
117+
await self.request["session"].aclose()
111118

112119
def _make_item(self, i):
113120
"""Using i, convert a single item into a dict. This is effectively a
@@ -127,20 +134,15 @@ def dump_html(self, element):
127134
"""Use this for debugging purposes"""
128135
print(get_html_from_element(element))
129136

130-
def disable_certificate_verification(self):
131-
"""Scrapers that require this due to website misconfiguration
132-
should be checked periodically--calls to this method from
133-
site scrapers should be removed when no longer necessary.
134-
"""
135-
self.request["verify"] = False
136-
137137
def set_custom_adapter(self, cipher: str):
138-
"""Set Custom SSL/TLS Adapter for out of date court systems
138+
"""Set Custom SSL/TLS cipher for out of date court systems
139139
140140
:param cipher: The court required cipher
141141
:return: None
142142
"""
143-
self.request["session"].mount("https://", SSLAdapter(ciphers=cipher))
143+
ctx = ssl.create_default_context(cafile=certifi.where())
144+
ctx.set_ciphers(cipher)
145+
return ctx
144146

145147
def test_mode_enabled(self):
146148
return self.method == "LOCAL"
@@ -151,18 +153,25 @@ def to_json(self):
151153
default=json_date_handler,
152154
)
153155

154-
def parse(self):
156+
async def parse(self):
155157
if not self.downloader_executed:
156158
# Run the downloader if it hasn't been run already
157-
self.html = self._download()
159+
self.html = await self._download()
158160

159161
# Process the available html (optional)
160-
self._process_html()
162+
if inspect.iscoroutinefunction(self._process_html):
163+
await self._process_html()
164+
else:
165+
self._process_html()
161166

162167
# Set the attribute to the return value from _get_foo()
163168
# e.g., this does self.case_names = _get_case_names()
164169
for attr in self._all_attrs:
165-
self.__setattr__(attr, getattr(self, f"_get_{attr}")())
170+
get_attr = getattr(self, f"_get_{attr}")
171+
if inspect.iscoroutinefunction(get_attr):
172+
self.__setattr__(attr, await get_attr())
173+
else:
174+
self.__setattr__(attr, get_attr())
166175

167176
self._clean_attributes()
168177
if "case_name_shorts" in self._all_attrs:
@@ -350,7 +359,7 @@ def _make_html_tree(self, text):
350359
"""
351360
return get_html_parsed_text(text)
352361

353-
def _download(self, request_dict={}):
362+
async def _download(self, request_dict={}):
354363
"""Download the latest version of Site"""
355364
self.downloader_executed = True
356365
if self.method == "POST":
@@ -364,14 +373,12 @@ def _download(self, request_dict={}):
364373
else:
365374
logger.info(f"Now downloading case page at: {self.url}")
366375

367-
self._process_request_parameters(request_dict)
368-
369376
if self.test_mode_enabled():
370-
self._request_url_mock(self.url)
377+
await self._request_url_mock(self.url)
371378
elif self.method == "GET":
372-
self._request_url_get(self.url)
379+
await self._request_url_get(self.url)
373380
elif self.method == "POST":
374-
self._request_url_post(self.url)
381+
await self._request_url_post(self.url)
375382

376383
self._post_process_response()
377384
return self._return_response_text_object()
@@ -385,46 +392,59 @@ def _process_html(self):
385392
"""
386393
pass
387394

388-
def _process_request_parameters(self, parameters={}):
389-
"""Hook for processing injected parameter overrides"""
390-
if parameters.get("verify") is not None:
391-
self.request["verify"] = parameters["verify"]
392-
del parameters["verify"]
393-
self.request["parameters"].update(parameters)
394-
395-
def _request_url_get(self, url):
395+
async def _request_url_get(self, url):
396396
"""Execute GET request and assign appropriate request dictionary
397397
values
398398
"""
399399
self.request["url"] = url
400-
self.request["response"] = self.request["session"].get(
400+
self.request["response"] = await self.request["session"].get(
401401
url,
402402
headers=self.request["headers"],
403-
verify=self.request["verify"],
404403
timeout=60,
405404
**self.request["parameters"],
406405
)
407406
if self.save_response:
408407
self.save_response(self)
409408

410-
def _request_url_post(self, url):
409+
async def _request_url_post(self, url):
411410
"""Execute POST request and assign appropriate request dictionary values"""
412411
self.request["url"] = url
413-
self.request["response"] = self.request["session"].post(
412+
self.request["response"] = await self.request["session"].post(
414413
url,
415414
headers=self.request["headers"],
416-
verify=self.request["verify"],
417415
data=self.parameters,
418416
timeout=60,
419417
**self.request["parameters"],
420418
)
421419
if self.save_response:
422420
self.save_response(self)
423421

424-
def _request_url_mock(self, url):
422+
async def _request_url_mock(self, url):
425423
"""Execute mock request, used for testing"""
426424
self.request["url"] = url
427-
self.request["response"] = MockRequest(url=self.url).get()
425+
426+
def handler(request: httpx.Request):
427+
try:
428+
with open(self.mock_url, mode="rb") as stream:
429+
content = stream.read()
430+
try:
431+
text = content.decode("utf-8")
432+
except:
433+
text = str(from_bytes(content).best())
434+
r = httpx.Response(
435+
status_code=200,
436+
request=request,
437+
text=text,
438+
)
439+
if self.mock_url.endswith("json"):
440+
r.headers["content-type"] = "application/json"
441+
except OSError as e:
442+
raise httpx.RequestError(str(e))
443+
return r
444+
445+
transport = httpx.MockTransport(handler)
446+
mock_client = httpx.AsyncClient(transport=transport)
447+
self.request["response"] = await mock_client.get(url=self.url)
428448

429449
def _post_process_response(self):
430450
"""Cleanup to response object"""
@@ -452,9 +472,8 @@ def _return_response_text_object(self):
452472
)
453473
return html_tree
454474

455-
def _get_html_tree_by_url(self, url, parameters={}):
456-
self._process_request_parameters(parameters)
457-
self._request_url_get(url)
475+
async def _get_html_tree_by_url(self, url, parameters={}):
476+
await self._request_url_get(url)
458477
self._post_process_response()
459478
tree = self._return_response_text_object()
460479
tree.make_links_absolute(url)

juriscraper/DeferringList.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

juriscraper/OpinionSiteLinear.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class OpinionSiteLinear(OpinionSite):
4141
"joined_by",
4242
"other_date",
4343
"attorney",
44+
"title",
45+
"precedential_status",
4446
}
4547

4648
def __init__(self, *args, **kwargs):

juriscraper/OpinionSiteLinearWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

juriscraper/OpinionSiteWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

juriscraper/OralArgumentSiteLinearWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

0 commit comments

Comments
 (0)