Skip to content

Commit 53307cd

Browse files
committed
Support transformation of LibGuide sub-pages
Why these changes are being introduced: It was determined that we were not crawling LibGuides sub-pages in Browsertrix. Once they started rolling into Transmogrifier for transformation into TIMDEX records, it became clear we'd need to do a little work to handle them. How this addresses that need: * Update the LibGuides API URL to include `?expand=pages` * this adds a `.pages` node to the main/parent guides API data * Interleave these sub-pages with the main guides in the API data, allowing the transform to find and utilize them as well * Because of the increased crawl scope, filter out additional directory guides that have `g=176063` in the URL Side effects of this change: * Transmogrifier can transform sub-pages crawled from libguides.mit.edu, resulting in an increased TIMDEX record count for the `libguides` source Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-449
1 parent 9cb7864 commit 53307cd

File tree

3 files changed

+95
-13
lines changed

3 files changed

+95
-13
lines changed

tests/sources/json/test_libguides.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
# ruff: noqa: E501, PLC0415, S301, SLF001
1+
# ruff: noqa: E501, PLC0415, PLR2004, S301, SLF001
22

33
import base64
4-
from unittest.mock import PropertyMock, patch
4+
from unittest.mock import MagicMock, PropertyMock, patch
55

66
import pandas as pd
77
import pytest
88

99
from transmogrifier import models
10+
from transmogrifier.sources.json.libguides import LibGuidesAPIClient
1011

1112

1213
@pytest.fixture(autouse=True)
@@ -342,3 +343,58 @@ def test_libguides_record_is_excluded_when_guide_type_not_allowed(libguides_tran
342343
libguides_transformer._allowed_guides_df = None
343344

344345
assert libguides_transformer.record_is_excluded(source_record)
346+
347+
348+
def test_libguides_api_client_fetch_guides_expands_sub_pages_into_rows():
349+
"""Test that sub-pages from the API are expanded into their own DataFrame rows.
350+
351+
Sub-pages inherit parent guide columns (like type_label, status_label, group_id)
352+
so they are treated identically to root pages throughout the transformation pipeline.
353+
"""
354+
mock_api_response = [
355+
{
356+
"id": 100,
357+
"name": "Root Guide",
358+
"url": "https://libguides.mit.edu/c.php?g=100",
359+
"friendly_url": "https://libguides.mit.edu/rootguide",
360+
"type_label": "General Purpose Guide",
361+
"status_label": "Published",
362+
"group_id": 0,
363+
"pages": [
364+
{
365+
"id": 200,
366+
"name": "Sub Page",
367+
"url": "https://libguides.mit.edu/c.php?g=100&p=200",
368+
"friendly_url": None,
369+
}
370+
],
371+
}
372+
]
373+
374+
client = LibGuidesAPIClient()
375+
mock_response = MagicMock()
376+
mock_response.json.return_value = mock_api_response
377+
378+
with patch(
379+
"transmogrifier.sources.json.libguides.requests.get",
380+
return_value=mock_response,
381+
):
382+
df = client.fetch_guides("fake-token")
383+
384+
# 1 guide + 1 sub-page = 2 rows
385+
assert len(df) == 2
386+
387+
# root page untouched
388+
root_row = df[df["id"] == 100].iloc[0]
389+
assert root_row["url"] == "https://libguides.mit.edu/c.php?g=100"
390+
assert root_row["friendly_url"] == "https://libguides.mit.edu/rootguide"
391+
392+
# sub-page has its own row
393+
sub_row = df[df["id"] == 200].iloc[0]
394+
assert sub_row["url"] == "https://libguides.mit.edu/c.php?g=100&p=200"
395+
assert sub_row["friendly_url"] is None
396+
397+
# sub-page inherits parent types and statuses
398+
assert sub_row["type_label"] == "General Purpose Guide"
399+
assert sub_row["status_label"] == "Published"
400+
assert sub_row["group_id"] == 0

transmogrifier/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@
140140
"LIBGUIDES_TOKEN_URL", "https://lgapi-us.libapps.com/1.2/oauth/token"
141141
)
142142
LIBGUIDES_GUIDES_URL = os.getenv(
143-
"LIBGUIDES_GUIDES_URL", "https://lgapi-us.libapps.com/1.2/guides"
143+
"LIBGUIDES_GUIDES_URL", "https://lgapi-us.libapps.com/1.2/guides?expand=pages"
144144
)
145145
LIBGUIDES_API_TOKEN = os.getenv("LIBGUIDES_API_TOKEN")
146146
LIBGUIDES_CLIENT_ID = os.getenv("LIBGUIDES_CLIENT_ID")

transmogrifier/sources/json/libguides.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import re
44
from collections import defaultdict
55
from functools import lru_cache
6+
from urllib.parse import urlparse
67

78
import pandas as pd
89
import requests
@@ -34,7 +35,8 @@
3435
32635, # Records retention schedules
3536
]
3637
EXCLUDED_URL_REGEX = [
37-
r".*libguides.mit.edu/directory.*",
38+
r".*libguides.mit.edu/directory.*", # staff directory main page
39+
r".*libguides.mit.edu/c.php\?g=176063.*", # staff directory sub-pages
3840
]
3941

4042

@@ -84,21 +86,40 @@ def get_api_token(self) -> str:
8486
return payload.get("access_token")
8587

8688
def fetch_guides(self, token: str) -> pd.DataFrame:
87-
"""Retrieve metadata for all LibGuides."""
89+
"""Retrieve metadata for all LibGuides.
90+
91+
Each guide may contain a 'pages' key with a list of sub-page dicts. These
92+
sub-pages are expanded into their own rows in the returned DataFrame, inheriting
93+
any columns from the parent guide that the sub-page does not have.
94+
"""
8895
logger.debug("Retrieving all guides from Libguides API.")
8996
headers = {"Authorization": f"Bearer {token}"}
9097
response = requests.get(LIBGUIDES_GUIDES_URL, headers=headers, timeout=60)
9198
response.raise_for_status()
9299
guides = response.json()
93-
return pd.DataFrame(guides)
100+
101+
all_rows: list[dict] = []
102+
for guide in guides:
103+
pages = guide.get("pages", [])
104+
all_rows.append(guide)
105+
for page in pages:
106+
# inherit parent columns, then overlay page-specific columns
107+
page_row = {**guide, **page}
108+
all_rows.append(page_row)
109+
110+
return pd.DataFrame(all_rows)
94111

95112
def get_guide_by_url(self, url: str) -> pd.Series:
96113
"""Get metadata for a single guide via a URL."""
114+
# strip GET parameter preview=...; duplicate for base URL
115+
url = re.sub(r"&preview=[^&]*", "", url)
116+
97117
matches = self.api_guides_df[
98118
(self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url)
99119
]
100120
if len(matches) == 1:
101121
return matches.iloc[0]
122+
102123
raise ValueError(f"Found {len(matches)} guide ids for URL: {url}, expecting one.")
103124

104125

@@ -169,13 +190,18 @@ def record_is_excluded(self, source_record: dict) -> bool:
169190
This method utilizes multiple private methods which check for specific things. If
170191
any of them return True, the record is excluded.
171192
"""
172-
return any(
173-
[
174-
self._excluded_per_allowed_rules(source_record),
175-
self._excluded_per_missing_html(source_record),
176-
]
193+
return (
194+
self._excluded_per_non_libguides_domain(source_record)
195+
or self._excluded_per_allowed_rules(source_record)
196+
or self._excluded_per_missing_html(source_record)
177197
)
178198

199+
@staticmethod
200+
def _excluded_per_non_libguides_domain(source_record: dict) -> bool:
201+
"""Exclude a record if the captured URL is not from libguides.mit.edu."""
202+
parsed = urlparse(source_record["url"])
203+
return parsed.hostname != "libguides.mit.edu"
204+
179205
def _excluded_per_allowed_rules(self, source_record: dict) -> bool:
180206
"""Exclude a record if not present in allowed guides dataframe."""
181207
source_link = self.get_source_link(source_record)
@@ -235,8 +261,8 @@ def get_source_link(cls, source_record: dict) -> str:
235261
"""Use the 'friendly' URL from LibGuides API data."""
236262
url = source_record["url"]
237263
guide = cls.api_client.get_guide_by_url(url)
238-
friendly_url = guide.get("friendly_url", "").strip()
239-
return friendly_url or url
264+
friendly_url = guide.get("friendly_url") or ""
265+
return friendly_url.strip() or url
240266

241267
@classmethod
242268
def get_source_record_id(cls, source_record: dict) -> str:

0 commit comments

Comments
 (0)