Skip to content

Commit 17fef8d

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent 72b3840 commit 17fef8d

File tree

8 files changed

+196
-12
lines changed

8 files changed

+196
-12
lines changed

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ def build_doc_release(self, release, force=False, interactive=False):
132132
if self.verbosity >= 1:
133133
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
134134

135+
release.sync_from_sitemap()
136+
135137
# checkout_dir is shared for all languages.
136138
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
137139
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(

docs/models.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -34,8 +35,16 @@
3435
START_SEL,
3536
STOP_SEL,
3637
TSEARCH_CONFIG_LANGUAGES,
38+
DocumentationCategory,
39+
fetch_html,
3740
get_document_search_vector,
3841
)
42+
from .utils import extract_inner_html
43+
44+
45+
def get_search_config(lang):
    """
    Return the PostgreSQL text search configuration for a language code.

    Only the first two characters of ``lang`` are used for the lookup in
    ``TSEARCH_CONFIG_LANGUAGES``; a prefix that is not listed there falls
    back to ``DEFAULT_TEXT_SEARCH_CONFIG``.
    """
    language_prefix = lang[:2]
    return TSEARCH_CONFIG_LANGUAGES.get(language_prefix, DEFAULT_TEXT_SEARCH_CONFIG)
3948

4049

4150
class DocumentReleaseQuerySet(models.QuerySet):
@@ -175,7 +184,7 @@ def sync_to_db(self, decoded_documents):
175184
the database. Deletes all the release's documents first then
176185
reinserts them as needed.
177186
"""
178-
self.documents.all().delete()
187+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
179188

180189
# Read excluded paths from robots.docs.txt.
181190
robots_path = settings.BASE_DIR.joinpath(
@@ -206,16 +215,54 @@ def sync_to_db(self, decoded_documents):
206215
path=document_path,
207216
title=html.unescape(strip_tags(document["title"])),
208217
metadata=document,
209-
config=TSEARCH_CONFIG_LANGUAGES.get(
210-
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
211-
),
218+
config=get_search_config(self.lang),
212219
)
213-
for document in self.documents.all():
220+
for document in self.documents.exclude(
221+
metadata__parents=DocumentationCategory.WEBSITE
222+
):
214223
document.metadata["breadcrumbs"] = list(
215224
Document.objects.breadcrumbs(document).values("title", "path")
216225
)
217226
document.save(update_fields=("metadata",))
218227

228+
def sync_from_sitemap(self):
    """
    Store the pages listed in the website sitemaps as documents.

    Only a release whose language is "en" and whose ``release`` is unset
    is synced; any other release returns immediately. Each sitemap URL is
    fetched and saved through ``update_or_create`` keyed on this release
    and the URL. Pages that cannot be fetched, or for which extracting the
    <main> or <h1> content raises ``ValueError``, are skipped.
    """
    # Imported locally, as in the original code (presumably to avoid a
    # circular import at module load -- TODO confirm).
    from djangoproject.urls.www import sitemaps

    if not (self.lang == "en" and not self.release):
        return

    website = DocumentationCategory.WEBSITE
    for sitemap_class in sitemaps.values():
        for entry in sitemap_class().get_urls():
            location = entry["location"]
            try:
                markup = fetch_html(location)
            except requests.RequestException:
                # Best effort: one unreachable page must not abort the sync.
                continue
            try:
                body = extract_inner_html(markup, tag="main")
                heading = extract_inner_html(markup, tag="h1")
            except ValueError:
                # The page does not have exactly one <main> and one <h1>.
                continue
            metadata = {
                "body": body,
                "breadcrumbs": [
                    {
                        "path": website,
                        "title": "Website",
                    },
                ],
                "parents": website,
                "title": heading,
                "toc": "",
            }
            Document.objects.update_or_create(
                release=self,
                path=location,
                defaults={
                    "title": heading,
                    "metadata": metadata,
                    "config": get_search_config(self.lang),
                },
            )
265+
219266

220267
def _clean_document_path(path):
221268
# We have to be a bit careful to reverse-engineer the correct
@@ -228,7 +275,9 @@ def _clean_document_path(path):
228275

229276

230277
def document_url(doc):
231-
if doc.path:
278+
if doc.metadata.get("parents") == DocumentationCategory.WEBSITE:
279+
return doc.path
280+
elif doc.path:
232281
kwargs = {
233282
"lang": doc.release.lang,
234283
"version": doc.release.version,
@@ -273,6 +322,14 @@ def search(self, query_text, release, document_category=None):
273322
config=models.F("config"),
274323
)
275324
base_filter = Q(release_id=release.id)
325+
if release.lang == "en" and release.version != "dev":
326+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
327+
"dev", "en"
328+
)
329+
base_filter |= Q(
330+
release_id=dev_release.id,
331+
metadata__parents=DocumentationCategory.WEBSITE,
332+
)
276333
if document_category:
277334
base_filter &= Q(metadata__parents__startswith=document_category)
278335
base_qs = (

docs/search.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import requests
12
from django.contrib.postgres.search import SearchVector
23
from django.db.models import TextChoices
34
from django.db.models.fields.json import KeyTextTransform
@@ -67,10 +68,41 @@ class DocumentationCategory(TextChoices):
6768
TOPICS = "topics", _("Using Django")
6869
HOWTO = "howto", _("How-to guides")
6970
RELEASE_NOTES = "releases", _("Release notes")
71+
WEBSITE = "website", _("Django Website")
7072

7173
@classmethod
def parse(cls, value, default=None):
    """
    Convert ``value`` into a member of this class.

    Returns ``cls(value)`` when the conversion succeeds. When it raises
    ``ValueError``, ``default`` is returned instead (``None`` unless the
    caller supplies another fallback).
    """
    try:
        return cls(value)
    except ValueError:
        # Honour the caller-supplied fallback rather than a fixed None.
        return default
79+
80+
81+
def fetch_html(url, timeout=10):
    """
    Return the body of ``url`` as text when the response is 200 text/html.

    The request is sent with browser-like User-Agent, Accept and
    Accept-Language headers. ``requests.RequestException`` is raised when
    the status code is not 200 or when the Content-Type header does not
    contain "text/html".
    """
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
        ),
        "Accept": "text/html",
        "Accept-Language": "en-US,en;q=0.9",
    }
    response = requests.get(url, headers=browser_headers, timeout=timeout)

    # Guard clauses: reject anything that is not a successful HTML response.
    if response.status_code != 200:
        raise requests.RequestException(
            f"Failed to fetch {url}, status code: {response.status_code}"
        )

    content_type = response.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        raise requests.RequestException(f"Unexpected Content-Type: {content_type}")

    return response.text

docs/templates/docs/search_results.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ <h2>{% translate "No search query given" %}</h2>
4343
{% for result in page.object_list %}
4444
<dt>
4545
<h2 class="result-title">
46-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
46+
<a href="{{ result.get_absolute_url }}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
4747
</h2>
4848
<span class="meta breadcrumbs">
4949
{% for breadcrumb in result.breadcrumbs %}
50-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=breadcrumb.path host 'docs' %}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
50+
<a href="{{ result.get_absolute_url }}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
5151
{% endfor %}
5252
</span>
5353
</dt>
@@ -60,7 +60,7 @@ <h2 class="result-title">
6060
<ul class="code-links">
6161
{% for name, value in result_code_links.items %}
6262
<li>
63-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}#{{ value.full_path }}">
63+
<a href="{{ result.get_absolute_url }}#{{ value.full_path }}">
6464
<div>
6565
<code>{{ name }}</code>
6666
{% if value.module_path %}<div class="meta">{{ value.module_path }}</div>{% endif %}

docs/tests/test_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ def test_get_available_languages_by_version(self):
173173
class DocumentManagerTest(TestCase):
174174
@classmethod
175175
def setUpTestData(cls):
176+
DocumentRelease.objects.create(lang="en")
176177
cls.release = DocumentRelease.objects.create(
177178
release=Release.objects.create(version="1.2.3"),
178179
)

docs/tests/test_utils.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from django.test import SimpleTestCase
55

6-
from ..utils import get_doc_path, sanitize_for_trigram
6+
from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram
77

88

99
class TestUtils(SimpleTestCase):
@@ -38,3 +38,38 @@ def test_sanitize_for_trigram(self):
3838
]:
3939
with self.subTest(query=query):
4040
self.assertEqual(sanitize_for_trigram(query), sanitized_query)
41+
42+
def test_extract_inner_html(self):
    """extract_inner_html() returns the content found inside <main>."""
    for html, expected_output in [
        ("<main><p>Hello</p></main>", "<p>Hello</p>"),
        (
            '<header>Test</header><main id="app" class="container">'
            "<h1>Title</h1></main>",
            "<h1>Title</h1>",
        ),
        # Character references come back decoded in the extracted text.
        ("<main>&amp; &lt; &gt; &#169;</main>", "& < > ©"),
        ("<main></main>", ""),
        ("<main>Hello world</main>", "Hello world"),
        ("<main><h1>Hi</h1>Text<p>Bye</p></main>", "<h1>Hi</h1>Text<p>Bye</p>"),
    ]:
        # One sub-test per case so a failure names the offending input and
        # the remaining cases still run.
        with self.subTest(html=html):
            self.assertEqual(extract_inner_html(html, tag="main"), expected_output)
56+
57+
def test_extract_inner_html_multiple_same_tags_raises(self):
    """Two sibling <main> elements are rejected with a ValueError."""
    markup = "<main>One main</main><main id='dupe'>Two main</main>"
    expected_message = "<main> occurs more than once in HTML."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(markup, tag="main")
64+
65+
def test_extract_inner_html_multiple_same_tags_nested_raises(self):
    """A <main> element nested inside another <main> is rejected."""
    markup = "<main>One main<main id='dupe'>Two main</main></main>"
    expected_message = "Nested <main> tags are not allowed."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(markup, tag="main")
72+
73+
def test_extract_inner_html_tag_not_found_raises(self):
    """A document without the requested tag is rejected with a ValueError."""
    markup = "<p>Test</p>"
    expected_message = "<main> not found in HTML."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(markup, tag="main")

docs/tests/test_views.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_search_type_filter_all(self):
9191
)
9292
self.assertEqual(response.status_code, 200)
9393
self.assertContains(
94-
response, "4 results for <em>generic</em> in version 5.1", html=True
94+
response, "5 results for <em>generic</em> in version 5.1", html=True
9595
)
9696
self.assertContains(response, self.active_filter, count=1)
9797
self.assertContains(response, f"{self.active_filter}All</a>", html=True)
@@ -122,7 +122,7 @@ def test_search_category_filter_invalid_doc_categories(self):
122122
)
123123
self.assertEqual(response.status_code, 200)
124124
self.assertContains(
125-
response, "4 results for <em>generic</em> in version 5.1", html=True
125+
response, "5 results for <em>generic</em> in version 5.1", html=True
126126
)
127127
self.assertContains(response, self.active_filter, count=1)
128128
self.assertContains(response, f"{self.active_filter}All</a>", html=True)

docs/utils.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
import unicodedata
3+
from html.parser import HTMLParser
34

45
from django.conf import settings
56
from django.http import Http404
@@ -92,3 +93,59 @@ def get_module_path(name, full_path):
9293
if full_path.endswith(name_suffix):
9394
return full_path.removesuffix(name_suffix)
9495
return None
96+
97+
98+
class SingleTagInnerHTMLExtractor(HTMLParser):
    """
    Collect the inner HTML of one target tag while parsing a document.

    After ``feed()`` and ``close()``, ``tag_count`` holds the number of
    start tags seen for the target and ``inner_html`` the captured
    fragments, which the caller joins. Meeting a target start tag while
    already capturing raises ``ValueError``.

    Text is captured as delivered to ``handle_data()``. The parser is
    created with HTMLParser's default ``convert_charrefs=True``, so
    character references arrive already decoded (``&amp;`` is captured
    as ``&``).
    """

    def __init__(self, target_tag):
        super().__init__()
        self.target_tag = target_tag.lower()
        # True while the parser is between the target's start and end tags.
        self.capturing = False
        # Captured fragments, in document order.
        self.inner_html = []
        # Number of start tags seen for the target tag.
        self.tag_count = 0

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == self.target_tag:
            self.tag_count += 1
            if self.capturing:
                # Nested target tag not allowed.
                raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.")
            self.capturing = True
        elif self.capturing:
            # Re-emit the tag exactly as written, attributes included.
            self.inner_html.append(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == self.target_tag:
            # A self-closing target opens and closes at once, which is what
            # HTMLParser's default implementation does.
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)
        elif self.capturing:
            # The default implementation would also call handle_endtag() and
            # append a spurious closing tag ("<br/>" became "<br/></br>");
            # the source text of the tag is already complete.
            self.inner_html.append(self.get_starttag_text())

    def handle_endtag(self, tag):
        tag = tag.lower()
        if self.capturing:
            if tag == self.target_tag:
                self.capturing = False
            else:
                self.inner_html.append(f"</{tag}>")

    def handle_data(self, data):
        if self.capturing:
            self.inner_html.append(data)

    # The two handlers below are only invoked by HTMLParser when
    # convert_charrefs is False; with the default used here, references are
    # decoded and delivered through handle_data() instead.
    def handle_entityref(self, name):
        if self.capturing:
            self.inner_html.append(f"&{name};")

    def handle_charref(self, name):
        if self.capturing:
            self.inner_html.append(f"&#{name};")
136+
137+
138+
def extract_inner_html(html, tag):
    """
    Return the inner HTML of ``tag``, which must occur exactly once.

    Raises ``ValueError`` when ``tag`` is absent from ``html`` or appears
    more than once.
    """
    extractor = SingleTagInnerHTMLExtractor(tag)
    extractor.feed(html)
    extractor.close()

    occurrences = extractor.tag_count
    if not occurrences:
        raise ValueError(f"<{tag}> not found in HTML.")
    if occurrences != 1:
        raise ValueError(f"<{tag}> occurs more than once in HTML.")

    return "".join(extractor.inner_html)

0 commit comments

Comments
 (0)