Skip to content

Commit 52d1a71

Browse files
committed
Add OpenAI-powered content parsing improvement feature
1 parent d05bfe8 commit 52d1a71

File tree

2 files changed

+133
-0
lines changed

2 files changed

+133
-0
lines changed

backend/extractor/readability.go

Lines changed: 108 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
178178
return rb, nil
179179
}
180180

181+
// ContentParsedWrong handles the logic for when content is parsed incorrectly
182+
func (f *UReadability) ContentParsedWrong(ctx context.Context, urlStr string) (string, error) {
183+
// Extract content using the current method
184+
originalContent, err := f.Extract(ctx, urlStr)
185+
if err != nil {
186+
return "", fmt.Errorf("failed to extract content: %v", err)
187+
}
188+
189+
// Get CSS selector from ChatGPT
190+
selector, err := f.getChatGPTSelector(ctx, urlStr)
191+
if err != nil {
192+
return "", fmt.Errorf("failed to get CSS selector: %v", err)
193+
}
194+
195+
// Get the HTML body
196+
body, err := f.getHTMLBody(urlStr)
197+
if err != nil {
198+
return "", fmt.Errorf("failed to get HTML body: %v", err)
199+
}
200+
201+
// Extract content using the new selector
202+
newContent, err := f.extractContentWithSelector(body, selector)
203+
if err != nil {
204+
return "", fmt.Errorf("failed to extract content with new selector: %v", err)
205+
}
206+
207+
// Compare original and new content
208+
if strings.TrimSpace(originalContent.Content) != strings.TrimSpace(newContent) {
209+
// Contents are different, create a new rule
210+
rule := datastore.Rule{
211+
Author: "",
212+
Domain: f.extractDomain(urlStr),
213+
Content: selector,
214+
TestURLs: []string{urlStr},
215+
Enabled: true,
216+
}
217+
218+
_, err = f.Rules.Save(ctx, rule)
219+
if err != nil {
220+
return "", fmt.Errorf("failed to save new rule: %v", err)
221+
}
222+
223+
return fmt.Sprintf("new custom rule with DOM %s created", selector), nil
224+
}
225+
226+
return "default rule is good, no need to create the custom one", nil
227+
}
228+
229+
func (f *UReadability) getChatGPTSelector(ctx context.Context, urlStr string) (string, error) {
230+
client := openai.NewClient(f.OpenAIKey)
231+
resp, err := client.CreateChatCompletion(
232+
ctx,
233+
openai.ChatCompletionRequest{
234+
Model: openai.GPT4o,
235+
Messages: []openai.ChatCompletionMessage{
236+
{
237+
Role: openai.ChatMessageRoleSystem,
238+
Content: "You are a helpful assistant that provides CSS selectors for extracting main content from web pages.",
239+
},
240+
{
241+
Role: openai.ChatMessageRoleUser,
242+
Content: fmt.Sprintf("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else.", urlStr),
243+
},
244+
},
245+
},
246+
)
247+
248+
if err != nil {
249+
return "", err
250+
}
251+
252+
return resp.Choices[0].Message.Content, nil
253+
}
254+
255+
func (f *UReadability) getHTMLBody(urlStr string) (string, error) {
256+
//nolint:gosec
257+
resp, err := http.Get(urlStr)
258+
if err != nil {
259+
return "", err
260+
}
261+
defer resp.Body.Close()
262+
263+
body, err := io.ReadAll(resp.Body)
264+
if err != nil {
265+
return "", err
266+
}
267+
268+
return string(body), nil
269+
}
270+
271+
func (f *UReadability) extractContentWithSelector(body, selector string) (string, error) {
272+
doc, err := goquery.NewDocumentFromReader(strings.NewReader(body))
273+
if err != nil {
274+
return "", err
275+
}
276+
277+
content := doc.Find(selector).Text()
278+
return content, nil
279+
}
280+
281+
func (f *UReadability) extractDomain(urlStr string) string {
282+
u, err := url.Parse(urlStr)
283+
if err != nil {
284+
return ""
285+
}
286+
return u.Hostname()
287+
}
288+
181289
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
182290
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
183291
// and at last tries to use general readability parser

backend/rest/server.go

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@ func (s *Server) routes(frontendDir string) chi.Router {
8282
r.Get("/content/v1/parser", s.extractArticleEmulateReadability)
8383
r.Post("/extract", s.extractArticle)
8484
r.Post("/auth", s.authFake)
85+
r.Get("/content-parsed-wrong", s.contentParsedWrong)
8586

8687
r.Group(func(protected chi.Router) {
8788
protected.Use(basicAuth("ureadability", s.Credentials))
@@ -391,6 +392,30 @@ func (s *Server) authFake(w http.ResponseWriter, r *http.Request) {
391392
render.JSON(w, r, JSON{"pong": t.Format("20060102150405")})
392393
}
393394

395+
func (s *Server) contentParsedWrong(w http.ResponseWriter, r *http.Request) {
396+
if s.Readability.OpenAIKey == "" {
397+
render.Status(r, http.StatusBadRequest)
398+
render.JSON(w, r, JSON{"error": "OpenAI key is not set"})
399+
return
400+
}
401+
402+
exampleURL := r.URL.Query().Get("url")
403+
if exampleURL == "" {
404+
render.Status(r, http.StatusBadRequest)
405+
render.JSON(w, r, JSON{"error": "url parameter is required"})
406+
return
407+
}
408+
409+
message, err := s.Readability.ContentParsedWrong(r.Context(), exampleURL)
410+
if err != nil {
411+
render.Status(r, http.StatusInternalServerError)
412+
render.JSON(w, r, JSON{"error": err.Error()})
413+
return
414+
}
415+
416+
render.JSON(w, r, JSON{"message": message})
417+
}
418+
394419
func getBid(id string) primitive.ObjectID {
395420
bid, err := primitive.ObjectIDFromHex(id)
396421
if err != nil {

0 commit comments

Comments
 (0)