Refactor URL content fetching and centralize Jina Reader logic

rasulkireev · rasulkireev · commit cad349f301e4 · 2025-03-09T11:50:22.000+03:00
This change extracts the URL content-fetching logic duplicated in the Project
and ProjectPage models into standalone utility functions in
core/model_utils.py. This refactoring:

- Introduces get_html_content() and get_markdown_content() utility functions to
  handle URL fetching and Jina Reader API interactions
- Removes duplicate code from Project and ProjectPage models
- Improves error handling and logging for content fetching operations
- Reduces code complexity and makes the content fetching logic more
  maintainable and reusable

This change also removes the redundant title and description fields from the
ProjectPage model, since they are now handled through the centralized content
fetching.
diff --git a/core/model_utils.py b/core/model_utils.py
@@ -1,6 +1,13 @@
 import random
 import string
 
+import requests
+
+from seo_blog_bot import settings
+from seo_blog_bot.utils import get_seo_blog_bot_logger
+
+logger = get_seo_blog_bot_logger(__name__)
+
 
 def generate_random_key():
     characters = string.ascii_letters + string.digits
@@ -39,3 +46,47 @@ def run_agent_synchronously(agent, input_string, deps=None):
         return result
     except Exception as e:
         raise RuntimeError(f"Agent execution failed: {str(e)}") from e
+
+
+def get_html_content(url):
+    """Fetch ``url`` and return the response body, or "" if the request fails."""
+    try:
+        page = requests.get(url, timeout=30)
+        page.raise_for_status()
+        return page.text
+    except requests.exceptions.RequestException as exc:
+        # Best effort: a failed fetch is logged and reported as empty content.
+        logger.warning(
+            "Could not fetch HTML content",
+            error=str(exc),
+            url=url,
+        )
+        return ""
+
+
+def get_markdown_content(url):
+    """Read ``url`` through Jina Reader; return (title, description, content).
+
+    The title is capped at 500 characters. Any failure is logged and yields
+    ("", "", ""); null or missing fields come back as "".
+    """
+    jina_url = f"https://r.jina.ai/{url}"
+    headers = {
+        "Accept": "application/json",
+        "Authorization": f"Bearer {settings.JINA_READER_API_KEY}",
+    }
+
+    try:
+        response = requests.get(jina_url, headers=headers, timeout=30)
+        response.raise_for_status()
+        # A JSON null for "data" or a field must not break the slice or callers.
+        data = response.json().get("data") or {}
+        return (
+            (data.get("title") or "")[:500],
+            data.get("description") or "",
+            data.get("content") or "",
+        )
+    except (requests.exceptions.RequestException, ValueError) as e:
+        # ValueError: body was not JSON (older requests raise it directly).
+        logger.error("Error fetching content from Jina Reader", error=str(e), url=url)
+        return ("", "", "")
diff --git a/core/models.py b/core/models.py
@@ -1,5 +1,3 @@
-import requests
-from django.conf import settings
 from django.contrib.auth.models import User
 from django.db import models, transaction
 from django.urls import reverse
@@ -8,7 +6,7 @@
 
 from core.base_models import BaseModel
 from core.choices import Category, ContentType, Language, ProfileStates, ProjectPageType, ProjectStyle, ProjectType
-from core.model_utils import generate_random_key, run_agent_synchronously
+from core.model_utils import generate_random_key, get_html_content, get_markdown_content, run_agent_synchronously
 from core.prompts import GENERATE_CONTENT_SYSTEM_PROMPTS, TITLE_SUGGESTION_SYSTEM_PROMPTS
 from core.schemas import (
     BlogPostContent,
@@ -214,59 +212,35 @@ def get_page_content(self):
         Fetch page content using Jina Reader API and update the project.
         Returns the content if successful, raises ValueError otherwise.
         """
-        try:
-            html_response = requests.get(self.url, timeout=30)
-            html_response.raise_for_status()
-            html_content = html_response.text
-        except requests.exceptions.RequestException as e:
-            logger.error(
-                "[Page Content] Error fetching HTML content",
-                error=str(e),
-                project_name=self.name,
-                project_url=self.url,
-            )
-            html_content = ""
-
-        jina_url = f"https://r.jina.ai/{self.url}"
-        headers = {"Accept": "application/json", "Authorization": f"Bearer {settings.JINA_READER_API_KEY}"}
-        try:
-            response = requests.get(jina_url, headers=headers, timeout=30)
-            response.raise_for_status()
-
-            data = response.json().get("data", {})
-
-            self.date_scraped = timezone.now()
-            self.title = data.get("title", "")[:500]
-            self.description = data.get("description", "")
-            self.markdown_content = data.get("content", "")
-            self.html_content = html_content
-
-            self.save(
-                update_fields=[
-                    "date_scraped",
-                    "title",
-                    "description",
-                    "markdown_content",
-                    "html_content",
-                ]
-            )
+        html_content = get_html_content(self.url)
+        title, description, markdown_content = get_markdown_content(self.url)
 
-            logger.info(
-                "[Page Content] Successfully fetched content",
-                project_name=self.name,
-                project_url=self.url,
-            )
+        if not title or not description or not markdown_content:
+            return False
 
-            return True
+        self.date_scraped = timezone.now()
+        self.title = title
+        self.description = description
+        self.markdown_content = markdown_content
+        self.html_content = html_content
 
-        except requests.exceptions.RequestException as e:
-            logger.error(
-                "[Page Content] Error fetching content from Jina Reader",
-                error=str(e),
-                project_name=self.name,
-                project_url=self.url,
-            )
-            return False
+        self.save(
+            update_fields=[
+                "date_scraped",
+                "title",
+                "description",
+                "markdown_content",
+                "html_content",
+            ]
+        )
+
+        logger.info(
+            "[Page Content] Successfully fetched content",
+            project_name=self.name,
+            project_url=self.url,
+        )
+
+        return True
 
     def analyze_content(self):
         """
@@ -624,8 +598,6 @@ class ProjectPage(BaseModel):
     project = models.ForeignKey(Project, null=True, blank=True, on_delete=models.CASCADE, related_name="project_pages")
 
     url = models.URLField(max_length=200)
-    title = models.CharField(max_length=255)
-    description = models.TextField(blank=True)
     html_content = models.TextField(blank=True, default="")
 
     # Content from Jina Reader
@@ -651,59 +623,35 @@ def get_page_content(self):
         Fetch page content using Jina Reader API and update the project.
         Returns the content if successful, raises ValueError otherwise.
         """
-        try:
-            html_response = requests.get(self.url, timeout=30)
-            html_response.raise_for_status()
-            html_content = html_response.text
-        except requests.exceptions.RequestException as e:
-            logger.error(
-                "[Page Content] Error fetching HTML content",
-                error=str(e),
-                project_name=self.title,
-                project_url=self.url,
-            )
-            html_content = ""
-
-        jina_url = f"https://r.jina.ai/{self.url}"
-        headers = {"Accept": "application/json", "Authorization": f"Bearer {settings.JINA_READER_API_KEY}"}
-        try:
-            response = requests.get(jina_url, headers=headers, timeout=30)
-            response.raise_for_status()
-
-            data = response.json().get("data", {})
-
-            self.date_scraped = timezone.now()
-            self.title = data.get("title", "")[:500]
-            self.description = data.get("description", "")
-            self.markdown_content = data.get("content", "")
-            self.html_content = html_content
-
-            self.save(
-                update_fields=[
-                    "date_scraped",
-                    "title",
-                    "description",
-                    "markdown_content",
-                    "html_content",
-                ]
-            )
+        html_content = get_html_content(self.url)
+        title, description, markdown_content = get_markdown_content(self.url)
 
-            logger.info(
-                "[Page Content] Successfully fetched content",
-                project_name=self.title,
-                project_url=self.url,
-            )
+        if not title or not description or not markdown_content:
+            return False
 
-            return True
+        self.date_scraped = timezone.now()
+        self.title = title
+        self.description = description
+        self.markdown_content = markdown_content
+        self.html_content = html_content
 
-        except requests.exceptions.RequestException as e:
-            logger.error(
-                "[Page Content] Error fetching content from Jina Reader",
-                error=str(e),
-                project_name=self.title,
-                project_url=self.url,
-            )
-            return False
+        self.save(
+            update_fields=[
+                "date_scraped",
+                "title",
+                "description",
+                "markdown_content",
+                "html_content",
+            ]
+        )
+
+        logger.info(
+            "[Page Content] Successfully fetched content",
+            project_name=self.project.name,
+            project_url=self.url,
+        )
+
+        return True
 
     def analyze_content(self):
         """