Skip to content

Commit 8dd8337

Browse files
committed
Migrate HTTP client from requests to httpx async client
1 parent e5b458d commit 8dd8337

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+467
-378
lines changed

juriscraper/AbstractSite.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from datetime import date, datetime
44

55
import certifi
6-
import requests
6+
import httpx
77

88
from juriscraper.lib.date_utils import fix_future_year_typo, json_date_handler
99
from juriscraper.lib.exceptions import InsanityException
@@ -33,7 +33,7 @@ class AbstractSite:
3333
Should not contain lists that can't be sorted by the _date_sort function.
3434
"""
3535

36-
def __init__(self, cnt=None):
36+
def __init__(self, cnt=None, user_agent="Juriscraper", **kwargs):
3737
super().__init__()
3838

3939
# Computed metadata
@@ -44,10 +44,12 @@ def __init__(self, cnt=None):
4444
self.downloader_executed = False
4545
self.cookies = {}
4646
self.cnt = cnt or CaseNameTweaker()
47+
self.user_agent = user_agent
48+
kwargs.setdefault("http2", True)
4749
self.request = {
4850
"verify": certifi.where(),
49-
"session": requests.session(),
50-
"headers": {"User-Agent": "Juriscraper"},
51+
"session": httpx.AsyncClient(**kwargs),
52+
"headers": {"User-Agent": self.user_agent},
5153
# Disable CDN caching on sites like SCOTUS (ahem)
5254
"cache-control": "no-cache, no-store, max-age=1",
5355
"parameters": {},
@@ -65,8 +67,8 @@ def __init__(self, cnt=None):
6567
self._req_attrs = []
6668
self._all_attrs = []
6769

68-
def __del__(self):
69-
self.close_session()
70+
async def __aexit__(self):
71+
await self.close_session()
7072

7173
def __str__(self):
7274
out = []
@@ -84,9 +86,9 @@ def __getitem__(self, i):
8486
def __len__(self):
8587
return len(self.case_names)
8688

87-
def close_session(self):
89+
async def close_session(self):
8890
if self.request["session"]:
89-
self.request["session"].close()
91+
await self.request["session"].aclose()
9092

9193
def _make_item(self, i):
9294
"""Using i, convert a single item into a dict. This is effectively a
@@ -344,23 +346,23 @@ def _process_request_parameters(self, parameters={}):
344346
del parameters["verify"]
345347
self.request["parameters"] = parameters
346348

347-
def _request_url_get(self, url):
349+
async def _request_url_get(self, url):
348350
"""Execute GET request and assign appropriate request dictionary
349351
values
350352
"""
351353
self.request["url"] = url
352-
self.request["response"] = self.request["session"].get(
354+
self.request["response"] = await self.request["session"].get(
353355
url,
354356
headers=self.request["headers"],
355357
verify=self.request["verify"],
356358
timeout=60,
357359
**self.request["parameters"],
358360
)
359361

360-
def _request_url_post(self, url):
362+
async def _request_url_post(self, url):
361363
"""Execute POST request and assign appropriate request dictionary values"""
362364
self.request["url"] = url
363-
self.request["response"] = self.request["session"].post(
365+
self.request["response"] = await self.request["session"].post(
364366
url,
365367
headers=self.request["headers"],
366368
verify=self.request["verify"],

juriscraper/DeferringList.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import asyncio
2+
import inspect
3+
14
from juriscraper.AbstractSite import logger
25

36

@@ -42,7 +45,13 @@ def __getitem__(self, item):
4245
logger.info(
4346
f"Getting deferred value from seed: {self._data[item]}"
4447
)
45-
new_val = self._fetching_function(self._data[item])
48+
if inspect.isawaitable(self._fetching_function):
49+
loop = asyncio.get_event_loop()
50+
new_val = loop.run_until_complete(
51+
self._fetching_function(self._data[item])
52+
)
53+
else:
54+
new_val = self._fetching_function(self._data[item])
4655
self._data[item] = new_val
4756
self._fetched_items[item] = True
4857
return new_val

juriscraper/OpinionSiteLinearWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

juriscraper/OpinionSiteWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

juriscraper/OralArgumentSiteLinearWebDriven.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
77
super().__init__(*args, **kwargs)
88
WebDriven.__init__(self, args, kwargs)
99

10+
async def __aexit__(self):
11+
await self.close_session()
12+
1013
def __del__(self):
11-
self.close_session()
1214
self.close_webdriver_session()

juriscraper/fdsys/FDSysSite.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,20 @@
55
from datetime import date
66
from pprint import pprint
77

8-
import requests
8+
import httpx
9+
from httpx import InvalidURL
910
from lxml import etree
10-
from requests.exceptions import MissingSchema
1111

1212
from juriscraper.AbstractSite import AbstractSite
1313

1414

15-
def get_tree(url):
15+
async def get_tree(url, **kwargs):
1616
try:
17-
response = requests.get(url, stream=True)
18-
response.raw.decode_content = True
19-
return etree.parse(response.raw)
20-
except MissingSchema:
17+
kwargs.setdefault("http2", True)
18+
async with httpx.AsyncClient(**kwargs) as client:
19+
response = await client.get(url)
20+
return etree.parse(await response.aread())
21+
except InvalidURL:
2122
return etree.parse(url)
2223

2324

@@ -160,23 +161,25 @@ def __getitem__(self, i):
160161
def __len__(self):
161162
return len(xpath(self.html, "//s:loc/text()"))
162163

163-
def save_mods_file(self, url):
164+
async def save_mods_file(self, url, **kwargs):
164165
mods_url = FDSysModsContent._get_mods_file_url(url)
165166
name = "-".join(mods_url.split("/")[-2].split("-")[1:])
166-
with open(f"./examples/2006/{name}.xml", "w") as handle:
167-
response = requests.get(mods_url, stream=True)
168-
for block in response.iter_content(1024):
169-
handle.write(block)
170-
171-
def _download(self, request_dict={}):
167+
with open(f"./examples/2006/{name}.xml", "wb") as handle:
168+
kwargs.setdefault("http2", True)
169+
async with httpx.AsyncClient(**kwargs) as client:
170+
async with client.stream("GET", mods_url) as response:
171+
async for block in response.aiter_bytes():
172+
handle.write(block)
173+
174+
async def _download(self, request_dict={}):
172175
"""
173176
it actually builds an XML tree
174177
"""
175-
return get_tree(self.url)
178+
return await get_tree(self.url)
176179

177-
def _download_backwards(self, year):
180+
async def _download_backwards(self, year):
178181
self.url = self.base_url.format(year=year)
179-
self.html = self._download()
182+
self.html = await self._download()
180183
if self.html is not None:
181184
# Setting status is important because it prevents the download
182185
# function from being run a second time by the parse method.
@@ -185,10 +188,10 @@ def _download_backwards(self, year):
185188
def _check_sanity(self):
186189
pass
187190

188-
def parse(self):
191+
async def parse(self):
189192
if self.status is None:
190193
# Run the downloader if it hasn't been run already
191-
self.html = self._download()
194+
self.html = await self._download()
192195
return self
193196

194197

juriscraper/fdsys/scrape_court_names.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1+
import asyncio
12
import json
23
from pprint import pprint
34

4-
import requests
5+
import httpx
56
from lxml import etree, html
67

78

8-
def get_court_names():
9-
response = requests.get("https://www.courtlistener.com/api/jurisdictions/")
9+
async def get_court_names(**kwargs):
10+
kwargs.setdefault("http2", True)
11+
async with httpx.AsyncClient(**kwargs) as client:
12+
response = await client.get(
13+
"https://www.courtlistener.com/api/jurisdictions/"
14+
)
1015
tree = html.fromstring(response.text)
1116

1217
data = dict()
@@ -21,13 +26,14 @@ def get_court_names():
2126
json.dump(data, f)
2227

2328

24-
def get_fdsys_court_names():
25-
response = requests.get(
26-
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml",
27-
stream=True,
28-
)
29-
response.raw.decode_content = True
30-
tree = etree.parse(response.raw)
29+
async def get_fdsys_court_names(**kwargs):
30+
kwargs.setdefault("http2", True)
31+
async with httpx.AsyncClient(**kwargs) as client:
32+
response = await client.get(
33+
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml"
34+
)
35+
tree = etree.parse(await response.aread())
36+
3137
data = dict()
3238

3339
for url in tree.xpath(
@@ -47,4 +53,4 @@ def get_fdsys_court_names():
4753

4854
if __name__ == "__main__":
4955
# get_court_names()
50-
get_fdsys_court_names()
56+
asyncio.run(get_fdsys_court_names())

juriscraper/lasc/http.py

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,20 @@
1-
import requests
1+
import httpx
22
from lxml.html import fromstring
33

44
from ..lib.log_tools import make_default_logger
55

6-
requests.packages.urllib3.disable_warnings(
7-
requests.packages.urllib3.exceptions.InsecureRequestWarning
8-
)
9-
106
logger = make_default_logger()
117

128

13-
class LASCSession(requests.Session):
9+
class LASCSession(httpx.AsyncClient):
1410
"""
15-
A requests.Session object with special tooling to handle the Los Angeles
11+
A httpx.AsyncClient object with special tooling to handle the Los Angeles
1612
Superior Court Media Access portal.
1713
"""
1814

19-
def __init__(self, username=None, password=None):
15+
def __init__(
16+
self, username=None, password=None, user_agent="Juriscraper", **kwargs
17+
):
2018
"""
2119
Instantiate a new LASC HTTP Session with some Juriscraper defaults.
2220
This method requires credentials from the media access portal.
@@ -25,7 +23,7 @@ def __init__(self, username=None, password=None):
2523
:param password: MAP password
2624
:return: A LASCSession object
2725
"""
28-
super().__init__()
26+
super().__init__(**kwargs)
2927

3028
self.html = None
3129

@@ -53,34 +51,35 @@ def __init__(self, username=None, password=None):
5351
"password": password,
5452
"request_type": "RESPONSE",
5553
}
54+
self.user_agent = user_agent
5655
self.headers = {
5756
"Origin": ms_base_url,
58-
"User-Agent": "Juriscraper",
57+
"User-Agent": self.user_agent,
5958
}
6059

61-
def get(self, url, auto_login=False, **kwargs):
62-
"""Overrides request.Session.get with session retry logic.
60+
async def get(self, url, auto_login=False, **kwargs):
61+
"""Overrides httpx.AsyncClient.get with session retry logic.
6362
6463
:param url: url string to GET
6564
:param auto_login: Whether the auto-login procedure should happen.
66-
:return: requests.Response
65+
:return: httpx.Response
6766
"""
6867
kwargs.setdefault("timeout", 30)
6968
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})
7069

71-
return super().get(url, **kwargs)
70+
return await super().get(url, **kwargs)
7271

73-
def post(self, url, auto_login=False, **kwargs):
74-
"""Overrides request.Session.post with session retry logic.
72+
async def post(self, url, auto_login=False, **kwargs):
73+
"""Overrides httpx.AsyncClient.post with session retry logic.
7574
7675
:param url: url string to GET
7776
:param auto_login: Whether the auto-login procedure should happen.
78-
:return: requests.Response
77+
:return: httpx.Response
7978
"""
8079
kwargs.setdefault("timeout", 30)
8180
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})
8281

83-
return super().post(url, **kwargs)
82+
return await super().post(url, **kwargs)
8483

8584
@staticmethod
8685
def _parse_new_html_for_keys(r):
@@ -89,7 +88,7 @@ def _parse_new_html_for_keys(r):
8988
This method parses the HTML after the first login page and identifies
9089
the parameter values required for the next step.
9190
92-
:param r: A request.Response object
91+
:param r: A httpx.Response object
9392
:return: A dict containing the needed keys
9493
"""
9594
html = fromstring(r.text)
@@ -103,7 +102,7 @@ def _parse_new_html_for_keys(r):
103102
def _check_login(r):
104103
"""Check that the login succeeded
105104
106-
:param r: A request.Response object
105+
:param r: A httpx.Response object
107106
:return: None
108107
:raises LASCLoginException
109108
"""
@@ -121,7 +120,7 @@ def _check_login(r):
121120
def _update_header_token(self, r):
122121
self.headers["X-CSRF-TOKEN"] = r.text.split("csrf")[1].split('"')[2]
123122

124-
def login(self):
123+
async def login(self):
125124
"""Log into the LASC Media Access Portal
126125
The process is tricky, requiring two GET requests, each of which
127126
returns HTML or JSON that is parsed for values to send in a subsequent
@@ -326,20 +325,20 @@ def login(self):
326325
"""
327326

328327
logger.info("Logging into MAP has begun")
329-
r = self.get(self.login_url)
328+
r = await self.get(self.login_url)
330329
self._update_header_token(r)
331330

332331
# Call part one of Microsoft login API
333-
r = self.post(self.api_url1, data=self.login_data)
332+
r = await self.post(self.api_url1, data=self.login_data)
334333
self._check_login(r)
335334

336335
# Call part two of Microsoft login API - Redirect
337-
r = self.get(self.api_url2)
336+
r = await self.get(self.api_url2)
338337

339338
# Finalize login with post into LA MAP site
340339
parsed_keys = self._parse_new_html_for_keys(r)
341340

342-
self.post(self.signin_url, data=parsed_keys)
341+
await self.post(self.signin_url, data=parsed_keys)
343342

344343
logger.info("Successfully Logged into MAP")
345344

0 commit comments

Comments
 (0)