diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json index 1967b2d9d6..a6fc0cb958 100755 --- a/uk_bin_collection/tests/input.json +++ b/uk_bin_collection/tests/input.json @@ -879,7 +879,7 @@ "url": "https://environmentfirst.co.uk/house.php?uprn=100060055444", "wiki_command_url_override": "https://environmentfirst.co.uk/house.php?uprn=XXXXXXXXXX", "wiki_name": "Environment First", - "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property\u2014you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this." + "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property—you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this." }, "EppingForestDistrictCouncil": { "postcode": "IG9 6EP", @@ -1754,14 +1754,14 @@ "LAD24CD": "E06000012" }, "NorthHertfordshireDistrictCouncil": { - "house_number": "Stewards Flat", - "postcode": "SG5 1PZ", - "skip_get_url": true, - "url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address", - "web_driver": "http://selenium:4444", - "wiki_name": "North Hertfordshire", - "wiki_note": "Pass a postcode (with space) and house_number/name. The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.", - "LAD24CD": "E07000099" + "house_number": "Stewards Flat", + "postcode": "SG5 1PZ", + "skip_get_url": true, + "url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address", + "web_driver": "http://selenium:4444", + "wiki_name": "North Hertfordshire", + "wiki_note": "Pass a postcode (with space) and house_number/name. 
The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.", + "LAD24CD": "E07000099" }, "NorthKestevenDistrictCouncil": { "skip_get_url": true, @@ -2877,5 +2877,14 @@ "wiki_name": "York", "wiki_note": "Provide your UPRN.", "LAD24CD": "E06000014" + }, + "CneSiarCouncil": { + "LAD24CD": "S12000013", + "paon": "Back", + "postcode": "HS2 0LQ", + "skip_get_url": true, + "url": "https://www.cne-siar.gov.uk/bins-and-recycling", + "wiki_name": "Comhairle nan Eilean Siar", + "wiki_note": "Pass village/area name as paon. Postcode optional (HS1-5 = Lewis/Harris, HS6-9 = Uist/Barra). Scrapes static HTML schedule pages." } } \ No newline at end of file diff --git a/uk_bin_collection/uk_bin_collection/councils/CneSiarCouncil.py b/uk_bin_collection/uk_bin_collection/councils/CneSiarCouncil.py new file mode 100644 index 0000000000..54f5c0eee9 --- /dev/null +++ b/uk_bin_collection/uk_bin_collection/councils/CneSiarCouncil.py @@ -0,0 +1,412 @@ +import re +from datetime import datetime + +import requests +from bs4 import BeautifulSoup + +from uk_bin_collection.uk_bin_collection.common import * +from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass + + +class CouncilClass(AbstractGetBinDataClass): + """ + Comhairle nan Eilean Siar (Western Isles Council). + + Area-based collections with no postcode lookup. The user provides a + village, street, or area name via the paon parameter. The scraper + searches the council's schedule pages to find matching routes and + returns upcoming collection dates for all bin types. 
+ + Lewis & Harris bin types: + - Organic Food & Garden Waste / Mixed Recycling (Blue Bin) - 3-weekly + - Non-Recyclable Waste (Grey bin, purple sticker) - 3-weekly + - Glass (Green Bin) - 9-weekly + + Uist & Barra bin types: + - Residual Waste (Black Bin) - fortnightly + - Recycling - Paper/Card (Green sticker) - fortnightly (alternating) + - Recycling - Plastic/Tin (Blue sticker) - fortnightly (alternating) + """ + + BASE_URL = "https://www.cne-siar.gov.uk" + + # Lewis & Harris schedule pages: (bin_type_label, base_path, day_slugs) + LH_SCHEDULES = [ + ( + "Organic Food & Garden Waste / Mixed Recycling (Blue Bin)", + "/bins-and-recycling/waste-recycling-collections-lewis-and-harris" + "/organic-food-and-garden-waste-and-mixed-recycling-blue-bin", + ["monday-collections", "tuesday-collections", + "wednesday-collections", "thursday-collections", + "friday-collections"], + ), + ( + "Non-Recyclable Waste (Grey Bin)", + "/bins-and-recycling/waste-recycling-collections-lewis-and-harris" + "/non-recyclable-waste-grey-bin-purple-sticker", + ["monday-collections", "tuesday-collections", + "wednesday-collections", "thursday-collections", + "friday-collections"], + ), + ( + "Glass (Green Bin)", + "/bins-and-recycling/waste-recycling-collections-lewis-and-harris" + "/glass-green-bin-collections", + ["thursday-collections", "friday-collections"], + ), + ] + + # Uist & Barra schedule pages: (bin_type_label, path, day_slugs) + # Residual uses tables with simple date cells. + # Recycling uses tables with Paper/Card and Plastic/Tin sub-types. 
+    UB_RESIDUAL = (
+        "Residual Waste (Black Bin)",
+        "/bins-and-recycling/uist-and-barra"
+        "/waste-recycling-collections-uist-and-barra/residual-bins-black-bins",
+        ["tuesday-collections", "thursday-collections"],
+    )
+
+    # NOTE(review): unlike UB_RESIDUAL, this path has no "/uist-and-barra"
+    # segment after "/bins-and-recycling" - confirm against the live site.
+    UB_RECYCLING = (
+        None,  # bin type determined per-cell (Paper/Card or Plastic/Tin)
+        "/bins-and-recycling/waste-recycling-collections-uist-and-barra"
+        "/recycling-bins-blue-and-green",
+        ["monday-collections", "tuesday-collections",
+         "wednesday-collections"],
+    )
+
+    # HS1-HS2 = Stornoway/Lewis, HS3-HS5 = Harris,
+    # HS6-HS8 = Uist, HS9 = Barra. If a recognised postcode district is
+    # provided we can skip the other region's pages entirely, saving
+    # HTTP requests.
+    LH_POSTCODES = {"HS1", "HS2", "HS3", "HS4", "HS5"}
+    UB_POSTCODES = {"HS6", "HS7", "HS8", "HS9"}
+
+    def parse_data(self, page: str, **kwargs) -> dict:
+        """Collect upcoming collections for the area named by ``paon``.
+
+        Every schedule page of the selected region(s) is handed to the
+        matching page parser, which appends its findings to a shared
+        result dict. A page whose parser raises is skipped so that one
+        broken page does not hide results from the others.
+
+        Args:
+            page: Not used by this method; schedule pages are fetched here.
+            **kwargs: ``paon`` (required) is the village, street or area
+                name to search for. ``postcode`` (optional) restricts the
+                search to one region when its first three characters are a
+                district listed in ``LH_POSTCODES`` or ``UB_POSTCODES``;
+                otherwise both regions are searched.
+
+        Returns:
+            ``{"bins": [...]}`` with duplicate ``(type, collectionDate)``
+            entries removed and the rest sorted by collection date.
+
+        Raises:
+            ValueError: If ``paon`` is missing or blank, or if no page
+                produced a collection for it. In the latter case the last
+                page failure, if any, is chained as ``__cause__``.
+        """
+        user_paon = kwargs.get("paon")
+        # A whitespace-only value is treated as missing: an empty search
+        # term cannot identify an area.
+        search_term = (user_paon or "").strip().lower()
+        if not search_term:
+            raise ValueError(
+                "A village, street, or area name is required (paon parameter). "
+                "Examples: 'Back', 'Leverburgh', 'Castlebay', 'Manor', "
+                "'Goathill', 'Balivanich'. Check your area at "
+                "https://www.cne-siar.gov.uk/bins-and-recycling"
+            )
+
+        bindata = {"bins": []}
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/138.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.9",
+        }
+
+        # The postcode key may be present with value None, so coalesce
+        # before calling str methods on it.
+        user_postcode = (kwargs.get("postcode") or "").strip().upper()
+        pc_district = user_postcode[:3].rstrip()
+        search_lh = pc_district in self.LH_POSTCODES
+        search_ub = pc_district in self.UB_POSTCODES
+        if not (search_lh or search_ub):
+            # No postcode, or a district we do not recognise: search both
+            # regions rather than none.
+            search_lh = search_ub = True
+
+        # One job per schedule page: (parser, url, bin type, extra kwargs).
+        # Order matches the page order: Lewis & Harris, then Uist & Barra
+        # residual, then Uist & Barra recycling.
+        jobs = []
+        if search_lh:
+            for bin_type, base_path, day_slugs in self.LH_SCHEDULES:
+                for slug in day_slugs:
+                    jobs.append((
+                        self._parse_lh_accordion_page,
+                        f"{self.BASE_URL}{base_path}/{slug}",
+                        bin_type,
+                        {},
+                    ))
+        if search_ub:
+            res_label, res_path, res_slugs = self.UB_RESIDUAL
+            for slug in res_slugs:
+                jobs.append((
+                    self._parse_ub_table_page,
+                    f"{self.BASE_URL}{res_path}/{slug}",
+                    res_label,
+                    {"is_recycling": False},
+                ))
+            # The recycling label is None: the bin type is chosen per cell.
+            rec_label, rec_path, rec_slugs = self.UB_RECYCLING
+            for slug in rec_slugs:
+                jobs.append((
+                    self._parse_ub_table_page,
+                    f"{self.BASE_URL}{rec_path}/{slug}",
+                    rec_label,
+                    {"is_recycling": True},
+                ))
+
+        last_error = None
+        # The context manager closes the session's connections on exit.
+        with requests.Session() as session:
+            session.headers.update(headers)
+            for parser, url, bin_type, extra in jobs:
+                try:
+                    parser(
+                        session, url, bin_type, search_term, bindata, **extra
+                    )
+                except Exception as exc:
+                    # Deliberately best effort, but remember the failure so
+                    # it can be reported if nothing at all was found.
+                    last_error = exc
+
+        if not bindata["bins"]:
+            raise ValueError(
+                f"No collection area found matching '{user_paon}'. "
+                "Try a village name (e.g. 'Back', 'Leverburgh', 'Castlebay') "
+                "or street name (e.g. 'Goathill', 'Manor'). "
+                "Check https://www.cne-siar.gov.uk/bins-and-recycling"
+            ) from last_error
+
+        # De-duplicate on (type, date), keeping the first occurrence, then
+        # sort chronologically.
+        unique_bins = {}
+        for entry in bindata["bins"]:
+            unique_bins.setdefault(
+                (entry["type"], entry["collectionDate"]), entry
+            )
+        bindata["bins"] = sorted(
+            unique_bins.values(),
+            key=lambda entry: datetime.strptime(
+                entry["collectionDate"], date_format
+            ),
+        )
+        return bindata
+
+    def _parse_lh_accordion_page(
+        self,
+        session: requests.Session,
+        url: str,
+        bin_type: str,
+        search_term: str,
+        bindata: dict,
+    ):
+        """Parse a Lewis & Harris accordion-style day page.
+
+        Structure: .accordion-pane elements containing:
+        - Area name in