From c3ed383c2e73037024727f777cd1a1ca7d1664b3 Mon Sep 17 00:00:00 2001
From: InertiaUK <mark@keepcomputing.co.uk>
Date: Fri, 22 May 2026 15:09:15 +0100
Subject: [PATCH 1/2] feat: add orkney islands council scraper

Uses Jadu FAQ search to match street to collection area. Mainland areas
(01-15) have embedded Google Calendar iCal feeds with dated events and
RRULE recurrence. Island areas return day-of-week. Handles EXDATE
exclusions and RECURRENCE-ID overrides for holiday changes.
Pure HTTP - no Selenium needed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 uk_bin_collection/tests/input.json            |  27 +-
 .../councils/OrkneyIslandsCouncil.py          | 415 ++++++++++++++++++
 2 files changed, 433 insertions(+), 9 deletions(-)
 create mode 100644 uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py

diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
index 1967b2d9d6..91e87957ec 100755
--- a/uk_bin_collection/tests/input.json
+++ b/uk_bin_collection/tests/input.json
@@ -879,7 +879,7 @@
         "url": "https://environmentfirst.co.uk/house.php?uprn=100060055444",
         "wiki_command_url_override": "https://environmentfirst.co.uk/house.php?uprn=XXXXXXXXXX",
         "wiki_name": "Environment First",
-        "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property\u2014you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this."
+        "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property—you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this."
     },
     "EppingForestDistrictCouncil": {
         "postcode": "IG9 6EP",
@@ -1754,14 +1754,14 @@
         "LAD24CD": "E06000012"
     },
     "NorthHertfordshireDistrictCouncil": {
-            "house_number": "Stewards Flat",
-            "postcode": "SG5 1PZ",
-            "skip_get_url": true,
-            "url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address",
-            "web_driver": "http://selenium:4444",
-            "wiki_name": "North Hertfordshire",
-            "wiki_note": "Pass a postcode (with space) and house_number/name. The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.",
-            "LAD24CD": "E07000099"
+        "house_number": "Stewards Flat",
+        "postcode": "SG5 1PZ",
+        "skip_get_url": true,
+        "url": "https://waste.nc.north-herts.gov.uk/w/webpage/find-bin-collection-day-input-address",
+        "web_driver": "http://selenium:4444",
+        "wiki_name": "North Hertfordshire",
+        "wiki_note": "Pass a postcode (with space) and house_number/name. The scraper performs the Liberty Create typeahead lookup and matches house_number as a case-insensitive substring.",
+        "LAD24CD": "E07000099"
     },
     "NorthKestevenDistrictCouncil": {
         "skip_get_url": true,
@@ -2877,5 +2877,14 @@
         "wiki_name": "York",
         "wiki_note": "Provide your UPRN.",
         "LAD24CD": "E06000014"
+    },
+    "OrkneyIslandsCouncil": {
+        "LAD24CD": "S12000023",
+        "paon": "Albert Street",
+        "postcode": "KW15 1HP",
+        "skip_get_url": true,
+        "url": "https://www.orkney.gov.uk/our-services/waste-and-recycling/household-waste-and-recycling/",
+        "wiki_name": "Orkney Islands Council",
+        "wiki_note": "Pass street name as paon. Mainland areas use Google Calendar iCal feeds. Island areas return day-of-week."
     }
 }
\ No newline at end of file
diff --git a/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py b/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py
new file mode 100644
index 0000000000..ef660c98e7
--- /dev/null
+++ b/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py
@@ -0,0 +1,415 @@
+import base64
+import re
+from datetime import datetime
+from urllib.parse import parse_qs, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+from uk_bin_collection.uk_bin_collection.common import *
+from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
+
+
+class CouncilClass(AbstractGetBinDataClass):
+    """
+    Concrete classes have to implement all abstract operations of the
+    base class. They can also override some operations with a default
+    implementation.
+    """
+
+    BASE_URL = "https://www.orkney.gov.uk/our-services/waste-and-recycling/household-waste-and-recycling/"
+
+    def parse_data(self, page: str, **kwargs) -> dict:
+        user_paon = kwargs.get("paon")
+        if not user_paon:
+            raise ValueError(
+                "A street name or area name is required (paon parameter). "
+                "Search at https://www.orkney.gov.uk/mybins to find your area."
+            )
+
+        bindata = {"bins": []}
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/138.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-GB,en;q=0.9",
+        }
+
+        session = requests.Session()
+        session.headers.update(headers)
+
+        # Step 1: Search the FAQ system with the street/area name.
+        # The MyBins page is a Jadu FAQ module. Searching with
+        # faqsearchOperator=AND matches the phrase within FAQ answers
+        # which list the streets covered by each collection area.
+        search_url = self.BASE_URL
+        response = session.get(
+            search_url,
+            params={
+                "faqsearch": user_paon,
+                "faqsearchOperator": "AND",
+            },
+        )
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Check if we landed directly on a single result (FAQ detail page)
+        # or got a list of matching areas. A detail page has .faq-detail
+        # with .faq-answer-detail inside; a list page has multiple links
+        # with ?id= params.
+        faq_detail = soup.find("div", class_="faq-answer-detail")
+        if faq_detail:
+            # Single result -- we're already on the detail page
+            return self._parse_faq_detail(faq_detail, session, bindata)
+
+        # Multiple results or search results -- find area links
+        result_links = soup.find_all("a", href=re.compile(r"\?id=\d+"))
+        if not result_links:
+            raise ValueError(
+                f"No collection area found for '{user_paon}'. "
+                "Try searching with a street name (e.g. 'Albert Street') "
+                "or island name (e.g. 'Sanday'). Check your area at "
+                "https://www.orkney.gov.uk/mybins"
+            )
+
+        # Use the first matching result
+        first_link = result_links[0]
+        href = first_link.get("href", "")
+        faq_id = re.search(r"\?id=(\d+)", href)
+        if not faq_id:
+            raise ValueError("Could not extract FAQ ID from search results.")
+
+        # Step 2: Fetch the FAQ detail page for the matched area
+        detail_url = f"{self.BASE_URL}?id={faq_id.group(1)}"
+        response = session.get(detail_url)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        faq_detail = soup.find("div", class_="faq-answer-detail")
+        if not faq_detail:
+            raise ValueError(
+                "Could not find collection details on the area page."
+            )
+
+        return self._parse_faq_detail(faq_detail, session, bindata)
+
+    def _parse_faq_detail(
+        self,
+        faq_detail,
+        session: requests.Session,
+        bindata: dict,
+    ) -> dict:
+        """Parse a FAQ detail page. Two formats exist:
+        - Mainland areas (01-15): embedded Google Calendar iframe with
+          actual dated events via iCal feed.
+        - Island areas: plain text saying 'Your collection day is <Day>.'
+        """
+
+        # Try to find a Google Calendar embed (mainland areas)
+        # The calendar ID is in the iframe src or the print link's
+        # data-calendar-source attribute (base64-encoded).
+        calendar_id = self._extract_calendar_id(faq_detail)
+        if calendar_id:
+            return self._parse_google_calendar(calendar_id, session, bindata)
+
+        # Fall back to island format: "Your collection day is <Day>."
+        return self._parse_island_day(faq_detail, bindata)
+
+    def _extract_calendar_id(self, faq_detail) -> str:
+        """Extract the Google Calendar ID from the FAQ detail HTML.
+        Looks for the data-calendar-source attribute on the print link
+        first, then falls back to parsing the iframe src parameter.
+        """
+
+        # Method 1: data-calendar-source attribute (base64-encoded cal ID)
+        print_link = faq_detail.find("a", class_="calendarLink")
+        if print_link and print_link.get("data-calendar-source"):
+            try:
+                return base64.b64decode(
+                    print_link["data-calendar-source"]
+                ).decode("utf-8")
+            except Exception:
+                pass
+
+        # Method 2: Parse the print link href for the src= parameter
+        if print_link and print_link.get("href"):
+            parsed = urlparse(print_link["href"])
+            params = parse_qs(parsed.query)
+            src_list = params.get("src", [])
+            if src_list:
+                try:
+                    return base64.b64decode(src_list[0]).decode("utf-8")
+                except Exception:
+                    pass
+
+        # Method 3: Parse the iframe src for the src= parameter
+        iframe = faq_detail.find("iframe")
+        if iframe and iframe.get("src"):
+            parsed = urlparse(iframe["src"])
+            params = parse_qs(parsed.query)
+            src_list = params.get("src", [])
+            if src_list:
+                try:
+                    return base64.b64decode(src_list[0]).decode("utf-8")
+                except Exception:
+                    pass
+
+        return ""
+
+    def _parse_google_calendar(
+        self,
+        calendar_id: str,
+        session: requests.Session,
+        bindata: dict,
+    ) -> dict:
+        """Fetch the public iCal feed for a Google Calendar and parse
+        the VEVENT entries into bin collection dates."""
+
+        ical_url = (
+            f"https://calendar.google.com/calendar/ical/"
+            f"{calendar_id}/public/basic.ics"
+        )
+        response = session.get(ical_url)
+        response.raise_for_status()
+
+        ical_text = response.text
+        now = datetime.now()
+
+        # Parse VEVENT blocks from the iCal data.
+        # Events use RRULE for recurring collections and may have
+        # EXDATE entries for skipped weeks (e.g. Christmas).
+        # We expand recurrences manually for the next ~6 months.
+        events = self._expand_ical_events(ical_text, now)
+
+        for event_date, summary in events:
+            # Strip the "Area XX - " prefix for cleaner bin type names
+            bin_type = re.sub(r"^Area \d+ - ", "", summary).strip()
+            if not bin_type:
+                continue
+
+            bindata["bins"].append(
+                {
+                    "type": bin_type,
+                    "collectionDate": event_date.strftime(date_format),
+                }
+            )
+
+        if not bindata["bins"]:
+            raise ValueError(
+                "No upcoming collection dates found in the calendar. "
+                "The council may not have published schedules yet."
+            )
+
+        bindata["bins"].sort(
+            key=lambda x: datetime.strptime(
+                x.get("collectionDate"), date_format
+            )
+        )
+
+        return bindata
+
+    def _expand_ical_events(
+        self, ical_text: str, now: datetime
+    ) -> list:
+        """Parse iCal text and expand recurring events into concrete
+        (date, summary) tuples for the next 6 months from now.
+
+        Handles:
+        - Single events (DTSTART;VALUE=DATE)
+        - Recurring events (RRULE with FREQ=WEEKLY, INTERVAL, BYDAY)
+        - Exception dates (EXDATE;VALUE=DATE)
+        - Recurrence overrides (RECURRENCE-ID)
+        """
+        from datetime import timedelta
+
+        horizon = now + timedelta(days=180)
+        results = []
+
+        # Split into VEVENT blocks
+        vevent_pattern = re.compile(
+            r"BEGIN:VEVENT\r?\n(.*?)END:VEVENT", re.DOTALL
+        )
+
+        # First pass: collect recurrence overrides (events with
+        # RECURRENCE-ID that replace a specific occurrence)
+        overrides = {}
+        for match in vevent_pattern.finditer(ical_text):
+            block = match.group(1)
+            if "RECURRENCE-ID" not in block:
+                continue
+            uid = self._ical_field(block, "UID")
+            rec_date = self._ical_date(block, "RECURRENCE-ID")
+            summary = self._ical_field(block, "SUMMARY")
+            dtstart = self._ical_date(block, "DTSTART")
+            if uid and rec_date:
+                overrides[(uid, rec_date)] = (dtstart or rec_date, summary)
+
+        # Second pass: process base events
+        for match in vevent_pattern.finditer(ical_text):
+            block = match.group(1)
+            if "RECURRENCE-ID" in block:
+                continue
+
+            uid = self._ical_field(block, "UID")
+            summary = self._ical_field(block, "SUMMARY")
+            dtstart = self._ical_date(block, "DTSTART")
+            if not dtstart or not summary:
+                continue
+
+            # Collect EXDATE values (excluded dates)
+            exdates = set()
+            for line in block.split("\n"):
+                line = line.strip()
+                if line.startswith("EXDATE"):
+                    date_val = self._parse_date_value(
+                        line.split(":", 1)[-1].strip()
+                    )
+                    if date_val:
+                        exdates.add(date_val)
+
+            rrule_line = self._ical_field(block, "RRULE")
+            if rrule_line:
+                # Parse RRULE parameters
+                rrule_params = {}
+                for part in rrule_line.split(";"):
+                    if "=" in part:
+                        k, v = part.split("=", 1)
+                        rrule_params[k] = v
+
+                freq = rrule_params.get("FREQ", "")
+                interval = int(rrule_params.get("INTERVAL", "1"))
+
+                if freq == "WEEKLY":
+                    step = timedelta(weeks=interval)
+                    current = dtstart
+
+                    while current <= horizon:
+                        if current >= now and current not in exdates:
+                            # Check for override
+                            override = overrides.get((uid, current))
+                            if override:
+                                o_date, o_summary = override
+                                if o_date and o_date >= now:
+                                    results.append(
+                                        (o_date, o_summary or summary)
+                                    )
+                            else:
+                                results.append((current, summary))
+                        current += step
+                else:
+                    # Non-weekly recurrence (unlikely for Orkney)
+                    if now <= dtstart <= horizon:
+                        results.append((dtstart, summary))
+            else:
+                # Single (non-recurring) event
+                if now <= dtstart <= horizon:
+                    # Check for override
+                    override = overrides.get((uid, dtstart))
+                    if override:
+                        o_date, o_summary = override
+                        if o_date and o_date >= now:
+                            results.append((o_date, o_summary or summary))
+                    else:
+                        results.append((dtstart, summary))
+
+        # Also add any overrides whose base events may have already
+        # been processed but the override date is in our window
+        # (these are handled above, but standalone overrides for
+        # past base events won't be caught)
+        for (uid, rec_date), (o_date, o_summary) in overrides.items():
+            if o_date and now <= o_date <= horizon:
+                # Only add if not already present
+                if not any(d == o_date and s == o_summary for d, s in results):
+                    results.append((o_date, o_summary))
+
+        return sorted(results, key=lambda x: x[0])
+
+    def _ical_field(self, block: str, field_name: str) -> str:
+        """Extract a simple iCal field value, handling fields with
+        parameters (e.g. DTSTART;VALUE=DATE:20241209) and iCal line
+        folding (continuation lines starting with a space/tab).
+        Also unescapes iCal backslash sequences."""
+        lines = block.split("\n")
+        result = None
+        for i, line in enumerate(lines):
+            stripped = line.rstrip("\r")
+            if result is not None:
+                # Check for continuation line (starts with space or tab)
+                if stripped.startswith(" ") or stripped.startswith("\t"):
+                    result += stripped[1:]
+                    continue
+                else:
+                    break
+            if stripped.startswith(field_name):
+                result = stripped.split(":", 1)[-1]
+
+        if result is None:
+            return ""
+
+        # Unescape iCal sequences: \, -> , and \n -> newline
+        result = result.replace("\\,", ",").replace("\\n", "\n")
+        return result.strip()
+
+    def _ical_date(self, block: str, field_name: str):
+        """Extract a date from an iCal field, returning a datetime
+        object or None."""
+        raw = self._ical_field(block, field_name)
+        return self._parse_date_value(raw)
+
+    def _parse_date_value(self, raw: str):
+        """Parse a raw iCal date value like '20241209' or
+        '20241209T000000Z' into a datetime."""
+        if not raw:
+            return None
+        # Strip any trailing whitespace/carriage returns
+        raw = raw.strip().replace("\r", "")
+        try:
+            if len(raw) == 8:
+                return datetime.strptime(raw, "%Y%m%d")
+            elif "T" in raw:
+                return datetime.strptime(raw[:8], "%Y%m%d")
+            else:
+                return datetime.strptime(raw[:8], "%Y%m%d")
+        except ValueError:
+            return None
+
+    def _parse_island_day(self, faq_detail, bindata: dict) -> dict:
+        """Parse island-format FAQ answers that just state a collection
+        day name (e.g. 'Your collection day is Thursday.')."""
+
+        text = faq_detail.get_text(" ", strip=True)
+
+        # Look for "Your collection day is <DayName>"
+        day_match = re.search(
+            r"collection day is\s+(\w+)", text, re.IGNORECASE
+        )
+        if not day_match:
+            raise ValueError(
+                "Could not determine collection day from the page. "
+                "This area may use a format not yet supported."
+            )
+
+        day_name = day_match.group(1).strip()
+
+        # Validate it's a real day name
+        if day_name not in days_of_week:
+            raise ValueError(
+                f"Unrecognised collection day: '{day_name}'."
+            )
+
+        # Island collections are general waste only (single stream).
+        # The council page doesn't specify bin types for islands --
+        # they have a single weekly collection.
+        collection_date = get_next_day_of_week(day_name)
+
+        bindata["bins"].append(
+            {
+                "type": "General Waste",
+                "collectionDate": collection_date,
+            }
+        )
+
+        return bindata

From fc96993ae779f9c68e37d6f8c89617a1b4f0ef78 Mon Sep 17 00:00:00 2001
From: InertiaUK <mark@keepcomputing.co.uk>
Date: Sat, 23 May 2026 12:31:43 +0100
Subject: [PATCH 2/2] fix: address CodeRabbit review feedback

---
 uk_bin_collection/tests/input.json                           | 2 +-
 .../uk_bin_collection/councils/OrkneyIslandsCouncil.py       | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/uk_bin_collection/tests/input.json b/uk_bin_collection/tests/input.json
index 91e87957ec..ac3ae25650 100755
--- a/uk_bin_collection/tests/input.json
+++ b/uk_bin_collection/tests/input.json
@@ -879,7 +879,7 @@
         "url": "https://environmentfirst.co.uk/house.php?uprn=100060055444",
         "wiki_command_url_override": "https://environmentfirst.co.uk/house.php?uprn=XXXXXXXXXX",
         "wiki_name": "Environment First",
-        "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property—you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this."
+        "wiki_note": "For properties with collections managed by Environment First, such as Lewes and Eastbourne. Replace the XXXXXXXXXX with the UPRN of your property - you can use [FindMyAddress](https://www.findmyaddress.co.uk/search) to find this."
     },
     "EppingForestDistrictCouncil": {
         "postcode": "IG9 6EP",
diff --git a/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py b/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py
index ef660c98e7..1e149e0254 100644
--- a/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py
+++ b/uk_bin_collection/uk_bin_collection/councils/OrkneyIslandsCouncil.py
@@ -51,6 +51,7 @@ def parse_data(self, page: str, **kwargs) -> dict:
                 "faqsearch": user_paon,
                 "faqsearchOperator": "AND",
             },
+            timeout=30,
         )
         response.raise_for_status()
 
@@ -84,7 +85,7 @@ def parse_data(self, page: str, **kwargs) -> dict:
 
         # Step 2: Fetch the FAQ detail page for the matched area
         detail_url = f"{self.BASE_URL}?id={faq_id.group(1)}"
-        response = session.get(detail_url)
+        response = session.get(detail_url, timeout=30)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, "html.parser")
@@ -172,7 +173,7 @@ def _parse_google_calendar(
             f"https://calendar.google.com/calendar/ical/"
             f"{calendar_id}/public/basic.ics"
         )
-        response = session.get(ical_url)
+        response = session.get(ical_url, timeout=30)
         response.raise_for_status()
 
         ical_text = response.text