Skip to content

Commit 53307cd

Browse files
committed
Support transformation of LibGuide sub-pages
Why these changes are being introduced: It was determined that we were not crawling LibGuides sub-pages in Browsertrix. Once they started rolling into Transmogrifier for transformation into TIMDEX records, it became clear we'd need to do a little work to handle them. How this addresses that need: * Update the LibGuides API URL to include `?expand=pages` * this adds a `.pages` node to the main/parent guides API data * Interleave these sub-pages with the main guides in the API data, allowing the transform to find and utilize them as well * Because of the increased crawl scope, filter out additional directory guides that have `g=176063` in the URL Side effects of this change: * Transmogrifier can transform sub-pages crawled from libguides.mit.edu, resulting in an increased TIMDEX record count for the `libguides` source Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-449
1 parent 9cb7864 commit 53307cd

File tree

3 files changed

+95
-13
lines changed

3 files changed

+95
-13
lines changed

tests/sources/json/test_libguides.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
# ruff: noqa: E501, PLC0415, S301, SLF001
1+
# ruff: noqa: E501, PLC0415, PLR2004, S301, SLF001
22

33
import base64
4-
from unittest.mock import PropertyMock, patch
4+
from unittest.mock import MagicMock, PropertyMock, patch
55

66
import pandas as pd
77
import pytest
88

99
from transmogrifier import models
10+
from transmogrifier.sources.json.libguides import LibGuidesAPIClient
1011

1112

1213
@pytest.fixture(autouse=True)
@@ -342,3 +343,58 @@ def test_libguides_record_is_excluded_when_guide_type_not_allowed(libguides_tran
342343
libguides_transformer._allowed_guides_df = None
343344

344345
assert libguides_transformer.record_is_excluded(source_record)
346+
347+
348+
def test_libguides_api_client_fetch_guides_expands_sub_pages_into_rows():
349+
"""Test that sub-pages from the API are expanded into their own DataFrame rows.
350+
351+
Sub-pages inherit parent guide columns (like type_label, status_label, group_id)
352+
so they are treated identically to root pages throughout the transformation pipeline.
353+
"""
354+
mock_api_response = [
355+
{
356+
"id": 100,
357+
"name": "Root Guide",
358+
"url": "https://libguides.mit.edu/c.php?g=100",
359+
"friendly_url": "https://libguides.mit.edu/rootguide",
360+
"type_label": "General Purpose Guide",
361+
"status_label": "Published",
362+
"group_id": 0,
363+
"pages": [
364+
{
365+
"id": 200,
366+
"name": "Sub Page",
367+
"url": "https://libguides.mit.edu/c.php?g=100&p=200",
368+
"friendly_url": None,
369+
}
370+
],
371+
}
372+
]
373+
374+
client = LibGuidesAPIClient()
375+
mock_response = MagicMock()
376+
mock_response.json.return_value = mock_api_response
377+
378+
with patch(
379+
"transmogrifier.sources.json.libguides.requests.get",
380+
return_value=mock_response,
381+
):
382+
df = client.fetch_guides("fake-token")
383+
384+
# 1 guide + 1 sub-page = 2 rows
385+
assert len(df) == 2
386+
387+
# root page untouched
388+
root_row = df[df["id"] == 100].iloc[0]
389+
assert root_row["url"] == "https://libguides.mit.edu/c.php?g=100"
390+
assert root_row["friendly_url"] == "https://libguides.mit.edu/rootguide"
391+
392+
# sub-page has its own row
393+
sub_row = df[df["id"] == 200].iloc[0]
394+
assert sub_row["url"] == "https://libguides.mit.edu/c.php?g=100&p=200"
395+
assert sub_row["friendly_url"] is None
396+
397+
# sub-page inherits parent types and statuses
398+
assert sub_row["type_label"] == "General Purpose Guide"
399+
assert sub_row["status_label"] == "Published"
400+
assert sub_row["group_id"] == 0

transmogrifier/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@
140140
"LIBGUIDES_TOKEN_URL", "https://lgapi-us.libapps.com/1.2/oauth/token"
141141
)
142142
LIBGUIDES_GUIDES_URL = os.getenv(
143-
"LIBGUIDES_GUIDES_URL", "https://lgapi-us.libapps.com/1.2/guides"
143+
"LIBGUIDES_GUIDES_URL", "https://lgapi-us.libapps.com/1.2/guides?expand=pages"
144144
)
145145
LIBGUIDES_API_TOKEN = os.getenv("LIBGUIDES_API_TOKEN")
146146
LIBGUIDES_CLIENT_ID = os.getenv("LIBGUIDES_CLIENT_ID")

transmogrifier/sources/json/libguides.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import re
44
from collections import defaultdict
55
from functools import lru_cache
6+
from urllib.parse import urlparse
67

78
import pandas as pd
89
import requests
@@ -34,7 +35,8 @@
3435
32635, # Records retention schedules
3536
]
3637
EXCLUDED_URL_REGEX = [
37-
r".*libguides.mit.edu/directory.*",
38+
r".*libguides.mit.edu/directory.*", # staff directory main page
39+
r".*libguides.mit.edu/c.php\?g=176063.*", # staff directory sub-pages
3840
]
3941

4042

@@ -84,21 +86,40 @@ def get_api_token(self) -> str:
8486
return payload.get("access_token")
8587

8688
def fetch_guides(self, token: str) -> pd.DataFrame:
87-
"""Retrieve metadata for all LibGuides."""
89+
"""Retrieve metadata for all LibGuides.
90+
91+
Each guide may contain a 'pages' key with a list of sub-page dicts. These
92+
sub-pages are expanded into their own rows in the returned DataFrame, inheriting
93+
any columns from the parent guide that the sub-page does not have.
94+
"""
8895
logger.debug("Retrieving all guides from Libguides API.")
8996
headers = {"Authorization": f"Bearer {token}"}
9097
response = requests.get(LIBGUIDES_GUIDES_URL, headers=headers, timeout=60)
9198
response.raise_for_status()
9299
guides = response.json()
93-
return pd.DataFrame(guides)
100+
101+
all_rows: list[dict] = []
102+
for guide in guides:
103+
pages = guide.get("pages", [])
104+
all_rows.append(guide)
105+
for page in pages:
106+
# inherit parent columns, then overlay page-specific columns
107+
page_row = {**guide, **page}
108+
all_rows.append(page_row)
109+
110+
return pd.DataFrame(all_rows)
94111

95112
def get_guide_by_url(self, url: str) -> pd.Series:
96113
"""Get metadata for a single guide via a URL."""
114+
# strip GET parameter preview=...; duplicate for base URL
115+
url = re.sub(r"&preview=[^&]*", "", url)
116+
97117
matches = self.api_guides_df[
98118
(self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url)
99119
]
100120
if len(matches) == 1:
101121
return matches.iloc[0]
122+
102123
raise ValueError(f"Found {len(matches)} guide ids for URL: {url}, expecting one.")
103124

104125

@@ -169,13 +190,18 @@ def record_is_excluded(self, source_record: dict) -> bool:
169190
This method utilizes multiple private methods which check for specific things. If
170191
any of them return True, the record is excluded.
171192
"""
172-
return any(
173-
[
174-
self._excluded_per_allowed_rules(source_record),
175-
self._excluded_per_missing_html(source_record),
176-
]
193+
return (
194+
self._excluded_per_non_libguides_domain(source_record)
195+
or self._excluded_per_allowed_rules(source_record)
196+
or self._excluded_per_missing_html(source_record)
177197
)
178198

199+
@staticmethod
200+
def _excluded_per_non_libguides_domain(source_record: dict) -> bool:
201+
"""Exclude a record if the captured URL is not from libguides.mit.edu."""
202+
parsed = urlparse(source_record["url"])
203+
return parsed.hostname != "libguides.mit.edu"
204+
179205
def _excluded_per_allowed_rules(self, source_record: dict) -> bool:
180206
"""Exclude a record if not present in allowed guides dataframe."""
181207
source_link = self.get_source_link(source_record)
@@ -235,8 +261,8 @@ def get_source_link(cls, source_record: dict) -> str:
235261
"""Use the 'friendly' URL from LibGuides API data."""
236262
url = source_record["url"]
237263
guide = cls.api_client.get_guide_by_url(url)
238-
friendly_url = guide.get("friendly_url", "").strip()
239-
return friendly_url or url
264+
friendly_url = guide.get("friendly_url") or ""
265+
return friendly_url.strip() or url
240266

241267
@classmethod
242268
def get_source_record_id(cls, source_record: dict) -> str:

0 commit comments

Comments
 (0)