Skip to content

1496 add rate limit base method #1497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The following changes are not yet released, but are code complete:
Features:
- Add error handling for scrapers with expected results #1447
- Add a check to verify ACMS user data is loaded before querying attachment pages #1495
- Add support for configurable working hours and rate limits in all scrapers #1496

Changes:
- Expanded ACMS URL matching to support both HTTP and HTTPS protocols.
Expand Down
23 changes: 23 additions & 0 deletions juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import hashlib
import json
import sys
from datetime import date, datetime, timedelta

import certifi
import pytz
import requests

from juriscraper.lib.date_utils import (
Expand Down Expand Up @@ -79,6 +81,14 @@ def __init__(self, cnt=None, **kwargs):
# indicates whether the scraper should have results or not to raise an error
self.should_have_results = False

# Default working hours, can be overridden by subclasses
self.working_hours = (0, 24)
# use print(pytz.all_timezones) to get a list of time zones
self.time_zone = "America/Los_Angeles"

# indicates the rate limit for the scraper, in seconds
self.rate_limit = 0

# Sub-classed metadata
self.court_id = None
self.url = None
Expand Down Expand Up @@ -354,8 +364,21 @@ def _make_html_tree(self, text):
"""
return get_html_parsed_text(text)

def is_within_working_hours(self) -> bool:
    """Check whether the current time falls inside the scraper's working hours.

    The window is ``self.working_hours``, an ``(start_hour, end_hour)`` tuple
    of whole hours evaluated in ``self.time_zone``. ``end_hour`` is exclusive,
    so the default ``(0, 24)`` always returns True. A window whose start is
    later than its end (e.g. ``(22, 6)``) is treated as wrapping past
    midnight; the original comparison made such a window always False.

    :return: True if within working hours, False otherwise.
    """
    now = datetime.now(pytz.timezone(self.time_zone)).time()
    start, end = self.working_hours
    if start <= end:
        # Normal same-day window, e.g. (9, 17).
        return start <= now.hour < end
    # Overnight window, e.g. (22, 6): active 22:00-23:59 and 00:00-05:59.
    return now.hour >= start or now.hour < end

def _download(self, request_dict=None):
"""Download the latest version of Site"""
if not self.is_within_working_hours():
raise sys.exit("Attempted to download outside of working hours.")

if request_dict is None:
request_dict = {}
self.downloader_executed = True
Expand Down
3 changes: 3 additions & 0 deletions sample_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from collections import defaultdict
from datetime import datetime
from optparse import OptionParser
from time import sleep
from urllib import parse

import requests
Expand Down Expand Up @@ -177,6 +178,8 @@ def get_binary_content(download_url: str, site, exceptions) -> bytes:

# Note that we do a GET even if site.method is POST. This is
# deliberate.

sleep(site.rate_limit)
r = s.get(
download_url,
verify=has_cipher, # WA has a certificate we don't understand
Expand Down
Loading