1- import requests
2- from django .conf import settings
31from django .contrib .auth .models import User
42from django .db import models , transaction
53from django .urls import reverse
86
97from core .base_models import BaseModel
108from core .choices import Category , ContentType , Language , ProfileStates , ProjectPageType , ProjectStyle , ProjectType
11- from core .model_utils import generate_random_key , run_agent_synchronously
9+ from core .model_utils import generate_random_key , get_html_content , get_markdown_content , run_agent_synchronously
1210from core .prompts import GENERATE_CONTENT_SYSTEM_PROMPTS , TITLE_SUGGESTION_SYSTEM_PROMPTS
1311from core .schemas import (
1412 BlogPostContent ,
@@ -214,59 +212,35 @@ def get_page_content(self):
214212 Fetch page content using Jina Reader API and update the project.
215213 Returns True if the content was fetched and saved, False otherwise.
216214 """
217- try :
218- html_response = requests .get (self .url , timeout = 30 )
219- html_response .raise_for_status ()
220- html_content = html_response .text
221- except requests .exceptions .RequestException as e :
222- logger .error (
223- "[Page Content] Error fetching HTML content" ,
224- error = str (e ),
225- project_name = self .name ,
226- project_url = self .url ,
227- )
228- html_content = ""
229-
230- jina_url = f"https://r.jina.ai/{ self .url } "
231- headers = {"Accept" : "application/json" , "Authorization" : f"Bearer { settings .JINA_READER_API_KEY } " }
232- try :
233- response = requests .get (jina_url , headers = headers , timeout = 30 )
234- response .raise_for_status ()
235-
236- data = response .json ().get ("data" , {})
237-
238- self .date_scraped = timezone .now ()
239- self .title = data .get ("title" , "" )[:500 ]
240- self .description = data .get ("description" , "" )
241- self .markdown_content = data .get ("content" , "" )
242- self .html_content = html_content
243-
244- self .save (
245- update_fields = [
246- "date_scraped" ,
247- "title" ,
248- "description" ,
249- "markdown_content" ,
250- "html_content" ,
251- ]
252- )
215+ html_content = get_html_content (self .url )
216+ title , description , markdown_content = get_markdown_content (self .url )
253217
254- logger .info (
255- "[Page Content] Successfully fetched content" ,
256- project_name = self .name ,
257- project_url = self .url ,
258- )
218+ if not title or not description or not markdown_content :
219+ return False
259220
260- return True
221+ self .date_scraped = timezone .now ()
222+ self .title = title
223+ self .description = description
224+ self .markdown_content = markdown_content
225+ self .html_content = html_content
261226
262- except requests .exceptions .RequestException as e :
263- logger .error (
264- "[Page Content] Error fetching content from Jina Reader" ,
265- error = str (e ),
266- project_name = self .name ,
267- project_url = self .url ,
268- )
269- return False
227+ self .save (
228+ update_fields = [
229+ "date_scraped" ,
230+ "title" ,
231+ "description" ,
232+ "markdown_content" ,
233+ "html_content" ,
234+ ]
235+ )
236+
237+ logger .info (
238+ "[Page Content] Successfully fetched content" ,
239+ project_name = self .name ,
240+ project_url = self .url ,
241+ )
242+
243+ return True
270244
271245 def analyze_content (self ):
272246 """
@@ -624,8 +598,6 @@ class ProjectPage(BaseModel):
624598 project = models .ForeignKey (Project , null = True , blank = True , on_delete = models .CASCADE , related_name = "project_pages" )
625599
626600 url = models .URLField (max_length = 200 )
627- title = models .CharField (max_length = 255 )
628- description = models .TextField (blank = True )
629601 html_content = models .TextField (blank = True , default = "" )
630602
631603 # Content from Jina Reader
@@ -651,59 +623,35 @@ def get_page_content(self):
651623 Fetch page content using Jina Reader API and update this project page.
652624 Returns True if the content was fetched and saved, False otherwise.
653625 """
654- try :
655- html_response = requests .get (self .url , timeout = 30 )
656- html_response .raise_for_status ()
657- html_content = html_response .text
658- except requests .exceptions .RequestException as e :
659- logger .error (
660- "[Page Content] Error fetching HTML content" ,
661- error = str (e ),
662- project_name = self .title ,
663- project_url = self .url ,
664- )
665- html_content = ""
666-
667- jina_url = f"https://r.jina.ai/{ self .url } "
668- headers = {"Accept" : "application/json" , "Authorization" : f"Bearer { settings .JINA_READER_API_KEY } " }
669- try :
670- response = requests .get (jina_url , headers = headers , timeout = 30 )
671- response .raise_for_status ()
672-
673- data = response .json ().get ("data" , {})
674-
675- self .date_scraped = timezone .now ()
676- self .title = data .get ("title" , "" )[:500 ]
677- self .description = data .get ("description" , "" )
678- self .markdown_content = data .get ("content" , "" )
679- self .html_content = html_content
680-
681- self .save (
682- update_fields = [
683- "date_scraped" ,
684- "title" ,
685- "description" ,
686- "markdown_content" ,
687- "html_content" ,
688- ]
689- )
626+ html_content = get_html_content (self .url )
627+ title , description , markdown_content = get_markdown_content (self .url )
690628
691- logger .info (
692- "[Page Content] Successfully fetched content" ,
693- project_name = self .title ,
694- project_url = self .url ,
695- )
629+ if not title or not description or not markdown_content :
630+ return False
696631
697- return True
632+ self .date_scraped = timezone .now ()
633+ self .title = title
634+ self .description = description
635+ self .markdown_content = markdown_content
636+ self .html_content = html_content
698637
699- except requests .exceptions .RequestException as e :
700- logger .error (
701- "[Page Content] Error fetching content from Jina Reader" ,
702- error = str (e ),
703- project_name = self .title ,
704- project_url = self .url ,
705- )
706- return False
638+ self .save (
639+ update_fields = [
640+ "date_scraped" ,
641+ "title" ,
642+ "description" ,
643+ "markdown_content" ,
644+ "html_content" ,
645+ ]
646+ )
647+
648+ logger .info (
649+ "[Page Content] Successfully fetched content" ,
650+ project_name = self .project .name ,
651+ project_url = self .url ,
652+ )
653+
654+ return True
707655
708656 def analyze_content (self ):
709657 """
0 commit comments