Skip to content

Commit fb64fa6

Browse files
committed
Parse full HTML from mitlibwebsite source records
Why these changes are being introduced: Now that browsertrix-harvester is including full HTML + response headers in the source record available to Transmogrifier, we can do two things: 1. Parse metadata for mitlibwebsite TIMDEX records from the original, full HTML in a more opinionated fashion than we could in browsertrix-harvester. 2. Extract good, meaningful full-text from the full HTML to use for the new `fulltext` field. How this addresses that need: Expects a new `html_base64` field in the browsertrix-harvester source records. Uses this to extract metadata and full-text for the record. Side effects of this change: * Full-text is now available in the TIMDEX record for the mitlibwebsite source. * If needed, this HTML parsing could be utilized to extract more granular, source specific metadata in the future. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-259
1 parent 2e88aa5 commit fb64fa6

File tree

6 files changed

+98
-11
lines changed

6 files changed

+98
-11
lines changed

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ disallow_untyped_calls = true
66
disallow_untyped_defs = true
77
exclude = ["tests/", "output/"]
88

9+
[[tool.mypy.overrides]]
10+
module = ["bs4", "bs4.*"]
11+
ignore_missing_imports = true
12+
913
[tool.pytest.ini_options]
1014
log_level = "INFO"
1115

tests/fixtures/mitlibwebsite/mitlibwebsite_records.jsonl

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<html>
2+
<head>
3+
<meta property="og:title" content="Search | MIT Libraries">
4+
<meta property="og:description"
5+
content="Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]">
6+
</head>
7+
<body>
8+
<header>
9+
<h1>Not Helpful</h1>
10+
</header>
11+
<content>
12+
<p>Hello World!</p>
13+
</content>
14+
<footer>
15+
<h1>Also Not Helpful</h1>
16+
</footer>
17+
</body>
18+
</html>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<html>
2+
<head>
3+
<meta property="og:title" content="Search | MIT Libraries">
4+
</head>
5+
<body>
6+
<header>
7+
<h1>Not Helpful</h1>
8+
</header>
9+
<content>
10+
<p>Hello World!</p>
11+
</content>
12+
<footer>
13+
<h1>Also Not Helpful</h1>
14+
</footer>
15+
</body>
16+
</html>

tests/sources/json/test_mitlibwebsite.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
11
# ruff: noqa: RUF001
2+
import base64
23
from unittest.mock import MagicMock, patch
34

45
import transmogrifier.models as timdex
56
from transmogrifier.sources.json.mitlibwebsite import MITLibWebsite
67

78

8-
def create_mitlibwebsite_source_record_stub() -> dict:
9+
def create_mitlibwebsite_source_record_stub(
    html_filepath="tests/fixtures/mitlibwebsite/website.html",
) -> dict:
    """Build a minimal mitlibwebsite source record for tests.

    Reads the fixture HTML at *html_filepath* and embeds it as a base64
    encoded string, mirroring the record shape browsertrix-harvester emits
    (url, cdx_title, html_base64, response_headers).
    """
    with open(html_filepath) as fixture:
        page_html = fixture.read()

    encoded_html = base64.b64encode(page_html.encode()).decode()
    return {
        "url": "https://libraries.mit.edu/search/",
        "cdx_title": "Search | MIT Libraries",
        "html_base64": encoded_html,
        "response_headers": {},
    }
1421

1522

@@ -45,6 +52,7 @@ def test_mitlibwebsite_transform_returns_timdex_record(mitlibwebsite_records):
4552
summary=[
4653
"Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501
4754
],
55+
fulltext=timdex_record.fulltext,
4856
)
4957

5058

@@ -119,13 +127,14 @@ def test_mitlibwebsite_get_links_success():
119127
def test_mitlibwebsite_get_summary_success():
120128
source_record = create_mitlibwebsite_source_record_stub()
121129
assert MITLibWebsite.get_summary(source_record) == [
122-
"Use this page to learn about different ways you can search the MIT Libraries' offerings." # noqa: E501
130+
"Use this page to learn about different ways you can search the MIT Libraries’ offerings. Use the Default Quick Search Our Quick Search is the default search on the Libraries’ homepage. This collects results from different library search tools and sorts the results into 4 categories: Books and media Articles and book chapters Archives and manuscript collections Our library website and guides The tool will search the 4 categories and present the top results from each category. It is useful to see the full breadth of what MIT Libraries has on a particular topic or author. Go straight to our […]" # noqa: E501
123131
]
124132

125133

126134
def test_mitlibwebsite_get_summary_returns_none_if_og_description_is_none():
127-
source_record = create_mitlibwebsite_source_record_stub()
128-
source_record["og_description"] = None
135+
source_record = create_mitlibwebsite_source_record_stub(
136+
html_filepath="tests/fixtures/mitlibwebsite/website_missing_og_description.html"
137+
)
129138
assert MITLibWebsite.get_summary(source_record) is None
130139

131140

transmogrifier/sources/json/mitlibwebsite.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
import base64
12
import hashlib
23
import logging
4+
from functools import lru_cache
5+
6+
from bs4 import BeautifulSoup, Tag
37

48
import transmogrifier.models as timdex
59
from transmogrifier.sources.jsontransformer import JSONTransformer
@@ -10,6 +14,30 @@
1014

1115
class MITLibWebsite(JSONTransformer):
1216

17+
@classmethod
@lru_cache(maxsize=8)
def parse_html(cls, html_base64: str) -> Tag:
    """Decode a base64 encoded HTML payload and return the parsed soup.

    For this mitlibwebsite source, the <header> and <footer> elements
    directly under <body> are removed before returning, as they are not
    helpful for any metadata or fulltext purposes.

    Results are memoized per unique base64 string via an LRU cache, so the
    HTML is parsed at most once per record even when several field methods
    need it. Maxsize is set to 8 to ensure the cache is large enough for
    8 concurrent transformations if threading is used (increase if needed
    for more threads).
    """
    decoded_bytes = base64.b64decode(html_base64)
    soup = BeautifulSoup(decoded_bytes, "html.parser")

    # strip page chrome that would pollute metadata and fulltext extraction
    for selector in ("body > header", "body > footer"):
        if element := soup.select_one(selector):
            element.decompose()

    return soup
40+
1341
@classmethod
1442
def get_main_titles(cls, source_record: dict) -> list[str]:
1543
"""
@@ -81,12 +109,24 @@ def get_dates(self, _source_record: dict) -> list[timdex.Date]:
81109
def get_format(self, _source_record: dict) -> str:
82110
return "electronic resource"
83111

112+
def get_fulltext(self, source_record: dict) -> str:
    """Extract meaningful full-text from the record's HTML.

    Relies on parse_html() having removed <header>/<footer>, so the text
    returned is the page's main content with whitespace collapsed to
    single spaces.
    """
    return self.parse_html(source_record["html_base64"]).get_text(
        separator=" ", strip=True
    )
115+
84116
@classmethod
85117
def get_links(cls, source_record: dict) -> list[timdex.Link]:
86118
return [timdex.Link(url=source_record["url"], kind="Website")]
87119

88120
@classmethod
89121
def get_summary(cls, source_record: dict) -> list[str] | None:
90-
if og_description := source_record.get("og_description"):
91-
return [og_description]
92-
return None
122+
html_soup = cls.parse_html(source_record["html_base64"])
123+
124+
og_tag = html_soup.find("meta", attrs={"property": "og:description"})
125+
if not og_tag:
126+
return None
127+
128+
content = og_tag.get("content", "").strip()
129+
if content == "":
130+
return None
131+
132+
return [content]

0 commit comments

Comments
 (0)