Skip to content

Commit fb64fa6

Browse files
committed
Parse full HTML from mitlibwebsite source records
Why these changes are being introduced: Now that browsertrix-harvester is including full HTML + response headers in the source record available to Transmogrifier, we can do two things: 1. Parse metadata for mitlibwebsite TIMDEX records from the original, full HTML in a more opinionated fashion than we could in browsertrix-harvester. 2. Extract good, meaningful full-text from the full HTML to use for the new `fulltext` field. How this addresses that need: Expects a new `html_base64` field in the browsertrix-harvester source records. Uses this to extract metadata and full-text for the record. Side effects of this change: * Full-text is now available in the TIMDEX record for the mitlibwebsite source. * If needed, this HTML parsing could be utilized to extract more granular, source specific metadata in the future. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-259
1 parent 2e88aa5 commit fb64fa6

File tree

6 files changed

+98
-11
lines changed

6 files changed

+98
-11
lines changed

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ disallow_untyped_calls = true
66
disallow_untyped_defs = true
77
exclude = ["tests/", "output/"]
88

9+
[[tool.mypy.overrides]]
10+
module = ["bs4", "bs4.*"]
11+
ignore_missing_imports = true
12+
913
[tool.pytest.ini_options]
1014
log_level = "INFO"
1115

tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<html>
2+
<head>
3+
<meta property="og:title" content="Search | MIT Libraries">
4+
<meta property="og:description"
5+
content="Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]">
6+
</head>
7+
<body>
8+
<header>
9+
<h1>Not Helpful</h1>
10+
</header>
11+
<content>
12+
<p>Hello World!</p>
13+
</content>
14+
<footer>
15+
<h1>Also Not Helpful</h1>
16+
</footer>
17+
</body>
18+
</html>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<html>
2+
<head>
3+
<meta property="og:title" content="Search | MIT Libraries">
4+
</head>
5+
<body>
6+
<header>
7+
<h1>Not Helpful</h1>
8+
</header>
9+
<content>
10+
<p>Hello World!</p>
11+
</content>
12+
<footer>
13+
<h1>Also Not Helpful</h1>
14+
</footer>
15+
</body>
16+
</html>

tests/sources/json/test_mitlibwebsite.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
11
# ruff: noqa: RUF001
2+
import base64
23
from unittest.mock import MagicMock, patch
34

45
import transmogrifier.models as timdex
56
from transmogrifier.sources.json.mitlibwebsite import MITLibWebsite
67

78

8-
def create_mitlibwebsite_source_record_stub() -> dict:
9+
def create_mitlibwebsite_source_record_stub(
    html_filepath="tests/fixtures/mitlibwebsite/website.html",
) -> dict:
    """Build a minimal mitlibwebsite source record for tests.

    Reads the fixture HTML at *html_filepath* and embeds it as a base64
    encoded string, mirroring the record shape browsertrix-harvester emits
    (url, cdx_title, html_base64, response_headers).
    """
    with open(html_filepath) as fixture:
        page_html = fixture.read()

    encoded_html = base64.b64encode(page_html.encode()).decode()
    return {
        "url": "https://libraries.mit.edu/search/",
        "cdx_title": "Search | MIT Libraries",
        "html_base64": encoded_html,
        "response_headers": {},
    }
1421

1522

@@ -45,6 +52,7 @@ def test_mitlibwebsite_transform_returns_timdex_record(mitlibwebsite_records):
4552
summary=[
4653
"Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501
4754
],
55+
fulltext=timdex_record.fulltext,
4856
)
4957

5058

@@ -119,13 +127,14 @@ def test_mitlibwebsite_get_links_success():
119127
def test_mitlibwebsite_get_summary_success():
120128
source_record = create_mitlibwebsite_source_record_stub()
121129
assert MITLibWebsite.get_summary(source_record) == [
122-
"Use this page to learn about different ways you can search the MIT Libraries' offerings." # noqa: E501
130+
"Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501
123131
]
124132

125133

126134
def test_mitlibwebsite_get_summary_returns_none_if_og_description_is_none():
127-
source_record = create_mitlibwebsite_source_record_stub()
128-
source_record["og_description"] = None
135+
source_record = create_mitlibwebsite_source_record_stub(
136+
html_filepath="tests/fixtures/mitlibwebsite/website_missing_og_description.html"
137+
)
129138
assert MITLibWebsite.get_summary(source_record) is None
130139

131140

transmogrifier/sources/json/mitlibwebsite.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import base64
12
import hashlib
23
import logging
4+
from functools import lru_cache
5+
6+
from bs4 import BeautifulSoup, Tag
37

48
import transmogrifier.models as timdex
59
from transmogrifier.sources.jsontransformer import JSONTransformer
@@ -10,6 +14,30 @@
1014

1115
class MITLibWebsite(JSONTransformer):
1216

17+
@classmethod
@lru_cache(maxsize=8)
def parse_html(cls, html_base64: str) -> Tag:
    """Decode a base64 encoded HTML payload and return the parsed soup.

    For this mitlibwebsite source, the <header> and <footer> elements
    directly under <body> are removed before returning, as they are not
    helpful for any metadata or fulltext purposes.

    Results are memoized per unique base64 string via an LRU cache, so the
    HTML is parsed at most once per record even when several field methods
    need it. Maxsize is set to 8 to ensure the cache is large enough for
    8 concurrent transformations if threading is used (increase if needed
    for more threads).
    """
    decoded_bytes = base64.b64decode(html_base64)
    soup = BeautifulSoup(decoded_bytes, "html.parser")

    # strip page chrome that would pollute metadata and fulltext extraction
    for selector in ("body > header", "body > footer"):
        if element := soup.select_one(selector):
            element.decompose()

    return soup
40+
1341
@classmethod
1442
def get_main_titles(cls, source_record: dict) -> list[str]:
1543
"""
@@ -81,12 +109,24 @@ def get_dates(self, _source_record: dict) -> list[timdex.Date]:
81109
def get_format(self, _source_record: dict) -> str:
82110
return "electronic resource"
83111

112+
def get_fulltext(self, source_record: dict) -> str:
    """Extract meaningful full-text from the record's HTML.

    Relies on parse_html() having removed <header>/<footer>, so the text
    returned is the page's main content with whitespace collapsed to
    single spaces.
    """
    return self.parse_html(source_record["html_base64"]).get_text(
        separator=" ", strip=True
    )
115+
84116
@classmethod
85117
def get_links(cls, source_record: dict) -> list[timdex.Link]:
86118
return [timdex.Link(url=source_record["url"], kind="Website")]
87119

88120
@classmethod
89121
def get_summary(cls, source_record: dict) -> list[str] | None:
90-
if og_description := source_record.get("og_description"):
91-
return [og_description]
92-
return None
122+
html_soup = cls.parse_html(source_record["html_base64"])
123+
124+
og_tag = html_soup.find("meta", attrs={"property": "og:description"})
125+
if not og_tag:
126+
return None
127+
128+
content = og_tag.get("content", "").strip()
129+
if content == "":
130+
return None
131+
132+
return [content]

0 commit comments

Comments
 (0)