Skip to content

Commit cad349f

Browse files
committed
Refactor URL content fetching and centralize Jina Reader logic
The changes extract and centralize the common URL content-fetching logic used by the Project and ProjectPage models into standalone utility functions in model_utils.py. This refactoring:
- Introduces get_html_content() and get_markdown_content() utility functions to handle URL fetching and Jina Reader API interactions
- Removes duplicate code from the Project and ProjectPage models
- Improves error handling and logging for content-fetching operations
- Reduces code complexity and makes the content-fetching logic more maintainable and reusable

The changes also remove the redundant title/description fields from the ProjectPage model, since they are now handled through the centralized content fetching.
1 parent d7511c0 commit cad349f

File tree

2 files changed

+104
-105
lines changed

2 files changed

+104
-105
lines changed

core/model_utils.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
import random
22
import string
33

4+
import requests
5+
6+
from seo_blog_bot import settings
7+
from seo_blog_bot.utils import get_seo_blog_bot_logger
8+
9+
logger = get_seo_blog_bot_logger(__name__)
10+
411

512
def generate_random_key():
613
characters = string.ascii_letters + string.digits
@@ -39,3 +46,47 @@ def run_agent_synchronously(agent, input_string, deps=None):
3946
return result
4047
except Exception as e:
4148
raise RuntimeError(f"Agent execution failed: {str(e)}") from e
49+
50+
51+
def get_html_content(url):
    """Download the raw HTML body of ``url``.

    This is a best-effort fetch: any ``requests`` failure (connection error,
    timeout, non-2xx status) is logged as a warning and an empty string is
    returned instead of raising.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or ``""`` when the request failed.
    """
    try:
        page = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they hit the handler below.
        page.raise_for_status()
        return page.text
    except requests.exceptions.RequestException as exc:
        logger.warning(
            "Could not fetch HTML content",
            error=str(exc),
            url=url,
        )
        return ""
65+
66+
67+
def get_markdown_content(url):
    """Fetch a page's title, description and content through Jina Reader.

    The target ``url`` is appended to the ``https://r.jina.ai/`` endpoint and
    requested with a JSON ``Accept`` header and the API key from
    ``settings.JINA_READER_API_KEY``.

    Args:
        url: Address of the page to read.

    Returns:
        A ``(title, description, content)`` tuple of strings taken from the
        ``"data"`` object of the JSON response. The title is truncated to 500
        characters. Missing or ``null`` fields are returned as ``""``.
        ``("", "", "")`` is returned when the request fails or the response
        body is not valid JSON; the failure is logged as an error.
    """
    jina_url = f"https://r.jina.ai/{url}"
    headers = {
        "Accept": "application/json",
        "Authorization": f"Bearer {settings.JINA_READER_API_KEY}",
    }

    try:
        response = requests.get(jina_url, headers=headers, timeout=30)
        response.raise_for_status()
        payload = response.json()
    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        logger.error(
            "Error fetching content from Jina Reader",
            error=str(e),
            url=url,
        )
        return ("", "", "")

    # Guard against a non-object payload or a null/non-object "data" entry so
    # the lookups below cannot raise AttributeError.
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        data = {}

    # `or ""` normalises explicit JSON nulls, which dict.get's default does
    # not cover (slicing None would raise TypeError).
    return (
        (data.get("title") or "")[:500],
        data.get("description") or "",
        data.get("content") or "",
    )

core/models.py

Lines changed: 53 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import requests
2-
from django.conf import settings
31
from django.contrib.auth.models import User
42
from django.db import models, transaction
53
from django.urls import reverse
@@ -8,7 +6,7 @@
86

97
from core.base_models import BaseModel
108
from core.choices import Category, ContentType, Language, ProfileStates, ProjectPageType, ProjectStyle, ProjectType
11-
from core.model_utils import generate_random_key, run_agent_synchronously
9+
from core.model_utils import generate_random_key, get_html_content, get_markdown_content, run_agent_synchronously
1210
from core.prompts import GENERATE_CONTENT_SYSTEM_PROMPTS, TITLE_SUGGESTION_SYSTEM_PROMPTS
1311
from core.schemas import (
1412
BlogPostContent,
@@ -214,59 +212,35 @@ def get_page_content(self):
214212
Fetch page content using Jina Reader API and update the project.
215213
Returns the content if successful, raises ValueError otherwise.
216214
"""
217-
try:
218-
html_response = requests.get(self.url, timeout=30)
219-
html_response.raise_for_status()
220-
html_content = html_response.text
221-
except requests.exceptions.RequestException as e:
222-
logger.error(
223-
"[Page Content] Error fetching HTML content",
224-
error=str(e),
225-
project_name=self.name,
226-
project_url=self.url,
227-
)
228-
html_content = ""
229-
230-
jina_url = f"https://r.jina.ai/{self.url}"
231-
headers = {"Accept": "application/json", "Authorization": f"Bearer {settings.JINA_READER_API_KEY}"}
232-
try:
233-
response = requests.get(jina_url, headers=headers, timeout=30)
234-
response.raise_for_status()
235-
236-
data = response.json().get("data", {})
237-
238-
self.date_scraped = timezone.now()
239-
self.title = data.get("title", "")[:500]
240-
self.description = data.get("description", "")
241-
self.markdown_content = data.get("content", "")
242-
self.html_content = html_content
243-
244-
self.save(
245-
update_fields=[
246-
"date_scraped",
247-
"title",
248-
"description",
249-
"markdown_content",
250-
"html_content",
251-
]
252-
)
215+
html_content = get_html_content(self.url)
216+
title, description, markdown_content = get_markdown_content(self.url)
253217

254-
logger.info(
255-
"[Page Content] Successfully fetched content",
256-
project_name=self.name,
257-
project_url=self.url,
258-
)
218+
if not title or not description or not markdown_content:
219+
return False
259220

260-
return True
221+
self.date_scraped = timezone.now()
222+
self.title = title
223+
self.description = description
224+
self.markdown_content = markdown_content
225+
self.html_content = html_content
261226

262-
except requests.exceptions.RequestException as e:
263-
logger.error(
264-
"[Page Content] Error fetching content from Jina Reader",
265-
error=str(e),
266-
project_name=self.name,
267-
project_url=self.url,
268-
)
269-
return False
227+
self.save(
228+
update_fields=[
229+
"date_scraped",
230+
"title",
231+
"description",
232+
"markdown_content",
233+
"html_content",
234+
]
235+
)
236+
237+
logger.info(
238+
"[Page Content] Successfully fetched content",
239+
project_name=self.name,
240+
project_url=self.url,
241+
)
242+
243+
return True
270244

271245
def analyze_content(self):
272246
"""
@@ -624,8 +598,6 @@ class ProjectPage(BaseModel):
624598
project = models.ForeignKey(Project, null=True, blank=True, on_delete=models.CASCADE, related_name="project_pages")
625599

626600
url = models.URLField(max_length=200)
627-
title = models.CharField(max_length=255)
628-
description = models.TextField(blank=True)
629601
html_content = models.TextField(blank=True, default="")
630602

631603
# Content from Jina Reader
@@ -651,59 +623,35 @@ def get_page_content(self):
651623
Fetch page content using Jina Reader API and update the project.
652624
Returns the content if successful, raises ValueError otherwise.
653625
"""
654-
try:
655-
html_response = requests.get(self.url, timeout=30)
656-
html_response.raise_for_status()
657-
html_content = html_response.text
658-
except requests.exceptions.RequestException as e:
659-
logger.error(
660-
"[Page Content] Error fetching HTML content",
661-
error=str(e),
662-
project_name=self.title,
663-
project_url=self.url,
664-
)
665-
html_content = ""
666-
667-
jina_url = f"https://r.jina.ai/{self.url}"
668-
headers = {"Accept": "application/json", "Authorization": f"Bearer {settings.JINA_READER_API_KEY}"}
669-
try:
670-
response = requests.get(jina_url, headers=headers, timeout=30)
671-
response.raise_for_status()
672-
673-
data = response.json().get("data", {})
674-
675-
self.date_scraped = timezone.now()
676-
self.title = data.get("title", "")[:500]
677-
self.description = data.get("description", "")
678-
self.markdown_content = data.get("content", "")
679-
self.html_content = html_content
680-
681-
self.save(
682-
update_fields=[
683-
"date_scraped",
684-
"title",
685-
"description",
686-
"markdown_content",
687-
"html_content",
688-
]
689-
)
626+
html_content = get_html_content(self.url)
627+
title, description, markdown_content = get_markdown_content(self.url)
690628

691-
logger.info(
692-
"[Page Content] Successfully fetched content",
693-
project_name=self.title,
694-
project_url=self.url,
695-
)
629+
if not title or not description or not markdown_content:
630+
return False
696631

697-
return True
632+
self.date_scraped = timezone.now()
633+
self.title = title
634+
self.description = description
635+
self.markdown_content = markdown_content
636+
self.html_content = html_content
698637

699-
except requests.exceptions.RequestException as e:
700-
logger.error(
701-
"[Page Content] Error fetching content from Jina Reader",
702-
error=str(e),
703-
project_name=self.title,
704-
project_url=self.url,
705-
)
706-
return False
638+
self.save(
639+
update_fields=[
640+
"date_scraped",
641+
"title",
642+
"description",
643+
"markdown_content",
644+
"html_content",
645+
]
646+
)
647+
648+
logger.info(
649+
"[Page Content] Successfully fetched content",
650+
project_name=self.project.name,
651+
project_url=self.url,
652+
)
653+
654+
return True
707655

708656
def analyze_content(self):
709657
"""

0 commit comments

Comments
 (0)