Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions build_search_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Build searchIndex.json for AlcHepNet website search functionality.
Indexes all active HTML pages (excludes backups, templates, and components).
"""

import os
import re
import json
from pathlib import Path

SRC_DIR = Path(__file__).parent / "src"  # website root: all indexable HTML lives under here
OUTPUT_FILE = SRC_DIR / "searchIndex.json"  # generated index consumed by the site's search UI
PUBLICATIONS_JSON = SRC_DIR / "Publishing" / "doc" / "all_publications.json"  # publication data source
PUBLICATIONS_URL = "Publishing/publications.html"  # page that publication search hits link to

# Pages to exclude from index: filename patterns matched case-insensitively
# (see should_exclude). Covers backup copies, shared HTML components that are
# injected into pages at runtime (navbar/footer/search modal), and templates.
EXCLUDE_PATTERNS = [
r"\.backup",
r"-backup\.html$",
r"\.bk\.html$",
r"-bk\.html$",
r"navbar\.html$",
r"footer\.html$",
r"search-modal\.html$",
r"template",
r"page-template",
]


def should_exclude(filepath: Path) -> bool:
    """Return True if *filepath*'s name matches any exclusion pattern.

    Patterns from EXCLUDE_PATTERNS are matched case-insensitively against
    the bare filename only (not the full path).
    """
    filename = filepath.name
    return any(
        re.search(pattern, filename, re.IGNORECASE)
        for pattern in EXCLUDE_PATTERNS
    )


def strip_html(html: str) -> str:
    """Strip HTML tags from *html*, decode entities, and collapse whitespace.

    Script and style blocks are removed wholesale (their contents are code,
    not searchable prose). Returns plain text with runs of whitespace
    collapsed to single spaces and surrounding whitespace trimmed.
    """
    # Local import: the parameter is named `html`, which would shadow a
    # module-level `import html`.
    from html import unescape

    # Remove script and style blocks entirely, contents included.
    text = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
    # Remove all remaining HTML tags.
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities (&amp;, &#39;, ...) — the docstring promised this but
    # the original implementation never did it, so entities leaked into the
    # search index verbatim.
    text = unescape(text)
    # Collapse whitespace.
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def extract_title(html: str, filepath: Path) -> str:
    """Return the page title for indexing.

    Prefers the contents of the first <title> element (tags stripped);
    falls back to a title-cased version of the filename stem with hyphens
    replaced by spaces.
    """
    title_match = re.search(r"<title>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    if title_match is not None:
        return strip_html(title_match.group(1))
    return filepath.stem.replace("-", " ").title()


def extract_content(html: str) -> str:
    """Return the searchable text of a page.

    Prefers the contents of <main>; falls back to <body> (with script
    blocks removed first); returns "" when neither element is present.
    """
    flags = re.DOTALL | re.IGNORECASE

    main_match = re.search(r"<main[^>]*>(.*?)</main>", html, flags)
    if main_match is not None:
        return strip_html(main_match.group(1))

    body_match = re.search(r"<body[^>]*>(.*?)</body>", html, flags)
    if body_match is not None:
        inner = re.sub(r"<script[^>]*>.*?</script>", " ", body_match.group(1), flags=flags)
        return strip_html(inner)

    return ""


def get_relative_url(filepath: Path) -> str:
    """Return *filepath* as a URL path relative to SRC_DIR.

    Backslashes are normalized to forward slashes so the output is a
    valid URL on Windows as well.
    """
    relative_path = str(filepath.relative_to(SRC_DIR))
    return relative_path.replace("\\", "/")


def load_publications_index_entries():
    """Build search-index entries from the publications JSON data.

    Produces one entry per publication year (newest first), each pointing
    at the publications page. Returns an empty list when the JSON file is
    missing, unreadable, or contains no usable publication text.
    """
    if not PUBLICATIONS_JSON.exists():
        return []

    try:
        with open(PUBLICATIONS_JSON, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Best-effort: a broken publications file should not abort the build.
        print(f"Warning: Could not read publications JSON {PUBLICATIONS_JSON}: {e}")
        return []

    by_year = data.get("publicationsByYear", {})
    entries = []

    # Newest years first (year keys sort correctly as 4-digit strings).
    for year in sorted(by_year, reverse=True):
        texts = []
        for pub in by_year.get(year, []):
            text = pub.get("content", "").strip()
            if text:
                texts.append(text)
        if not texts:
            continue

        # Snippet previews at most the first two publications, clipped to 250 chars.
        snippet = " ".join(texts[:2])
        if len(snippet) > 250:
            snippet = snippet[:250] + "..."

        entries.append({
            "title": f"AlcHepNet Publications ({year})",
            "url": PUBLICATIONS_URL,
            "snippet": snippet,
            "content": f"AlcHepNet Publications {year}. " + " ".join(texts),
        })

    return entries


def build_index():
    """Build the full search index.

    Walks every HTML file under SRC_DIR (sorted, recursively), skipping
    excluded pages, and produces one entry per page with title, relative
    URL, a 250-char snippet, and the full searchable text. Publication
    entries from the JSON data are appended at the end.
    """
    entries = []

    for page in sorted(SRC_DIR.rglob("*.html")):
        if should_exclude(page):
            continue

        try:
            markup = page.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            # Skip unreadable files rather than failing the whole build.
            print(f"Warning: Could not read {page}: {e}")
            continue

        title = extract_title(markup, page)
        # Fall back to the title so every entry has something searchable.
        text = extract_content(markup) or title

        snippet = text if len(text) <= 250 else text[:250] + "..."

        entries.append({
            "title": title,
            "url": get_relative_url(page),
            "snippet": snippet,
            "content": text,
        })

    entries.extend(load_publications_index_entries())

    return entries


def main():
    """Build the search index and write it to OUTPUT_FILE as pretty-printed JSON."""
    print("Building search index...")
    entries = build_index()
    print(f"Indexed {len(entries)} pages")

    # ensure_ascii=False keeps non-ASCII text readable in the JSON output.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        json.dump(entries, out, ensure_ascii=False, indent=2)

    print(f"Written to {OUTPUT_FILE}")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion src/Biorepo/biorepository.html
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ <h3 class="section-title">Biorepository</h3>
<div class="row featurette">
<div class="col lead">
<p>
The AlcHepNet Biorepository is a part of the Indiana University Data Coordinating Center (DCC). Its primary function is to support biospecimen collection for ongoing AlcHepNet studies. The DCC oversees the operation of the biorepository in partnership with the <a href="https://indianabiobank.org" target="_blank" rel="noopener noreferrer">Indiana Biobank (IB)</a>, which is a state-of-the-art facility for biospecimen storage, tracking, and distribution. The <a href="team.html">biorepository team</a> uses a comprehensive, searchable database to manage all biospecimens and the associated metadata. The DCC provides a link between stored biospecimen data to facilitate research.
The AlcHepNet Biorepository is a part of the Indiana University Data Coordinating Center (DCC). Its primary function is to support biospecimen collection for ongoing AlcHepNet studies. The DCC oversees the operation of the biorepository in partnership with the <a href="https://indianabiobank.org" target="_blank" rel="noopener noreferrer">Indiana Biobank (IB)</a>, which is a state-of-the-art facility for biospecimen storage, tracking, and distribution. The <a href="team.html">biorepository team</a> uses a comprehensive, searchable database to manage all biospecimens and the associated metadata. The DCC provides access to stored biospecimens, and the linkage between biospecimens and clinical data to facilitate research.
</p>

</div>
Expand Down
2 changes: 1 addition & 1 deletion src/Clinical/itaald-trial.html
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ <h3 class="section-title"><u>I</u>ntegrated <u>T</u>herapies for <u>A</u>lcohol
</p>

<p>
The primary objective of the study is to determine whether subjects receiving F-652 followed by enhanced AUD treatments will have better alcohol and liver-related outcomes at 6 months compared to those receiving prednisone plus usual care for AUD. Patients assigned to the AUD treatment will receive Acamprosate and counseling, while those assigned to the AUD usual care will receive brief advice and referral to a 12-step program.
The primary objective of the study is to determine whether subjects receiving F-652 (recombinant IL-22) followed by enhanced AUD treatments will have better alcohol and liver-related outcomes at 6 months compared to those receiving prednisone plus usual care for AUD. Patients assigned to the AUD treatment will receive Acamprosate and counseling, while those assigned to the AUD usual care will receive brief advice and referral to a 12-step program.
</p>

<p>
Expand Down
2 changes: 1 addition & 1 deletion src/Data_Access/design.html
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ <h3 class="section-title">Design of ARDaC</h3>

<h4>1. The ARDaC Data Warehouse</h4>
<p>
The heterogeneous clinical data, biosample information, and omics data information will be extracted from the randomized clinical trial, the observational study, and all other alcohol-associated hepatitis (AH) research projects, standardized according to the ARDaC Data Standard, harmonized according to the ARDaC Common Data Model, and hosted in a central ARDaC Data Warehouse. Specifically, the novel ARDaC Common Data Model is derived from and compatible with the Genomics Data Common (GDC) Data Model and is compliant with the FAIR Principles so that AlcHepNet multimodal data will be findable, accessible, interoperable, and reusable. The ARDaC Data Warehouse is the data source for the ARDaC web application, which is open to the public, as well as for regular reporting and customized services within the AlcHepNet consortium. A graph-based provenance model is used for comprehensive data dependency and version control. The ARDaC digital entities, including the standards, data model, data, metadata, scripts, and codes, are attributable, trackable, and reproducible.
The heterogeneous clinical data, biosample information, and omics data information will be extracted from the randomized clinical trial and all other alcohol-associated hepatitis (AH) research projects, standardized according to the ARDaC Data Standard, harmonized according to the ARDaC Common Data Model, and hosted in a central ARDaC Data Warehouse. Specifically, the novel ARDaC Common Data Model is derived from and compatible with the Genomics Data Common (GDC) Data Model and is compliant with the FAIR Principles so that AlcHepNet multimodal data will be findable, accessible, interoperable, and reusable. The ARDaC Data Warehouse is the data source for the ARDaC web application, which is open to the public, as well as for regular reporting and customized services within the AlcHepNet consortium. A graph-based provenance model is used for comprehensive data dependency and version control. The ARDaC digital entities, including the standards, data model, data, metadata, scripts, and codes, are attributable, trackable, and reproducible.
</p>

<h4>2. The ARDaC web application</h4>
Expand Down
Loading
Loading