@@ -178,6 +178,114 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
178
178
return rb , nil
179
179
}
180
180
181
+ // ContentParsedWrong handles the logic for when content is parsed incorrectly
182
+ func (f * UReadability ) ContentParsedWrong (ctx context.Context , urlStr string ) (string , error ) {
183
+ // Extract content using the current method
184
+ originalContent , err := f .Extract (ctx , urlStr )
185
+ if err != nil {
186
+ return "" , fmt .Errorf ("failed to extract content: %v" , err )
187
+ }
188
+
189
+ // Get CSS selector from ChatGPT
190
+ selector , err := f .getChatGPTSelector (ctx , urlStr )
191
+ if err != nil {
192
+ return "" , fmt .Errorf ("failed to get CSS selector: %v" , err )
193
+ }
194
+
195
+ // Get the HTML body
196
+ body , err := f .getHTMLBody (urlStr )
197
+ if err != nil {
198
+ return "" , fmt .Errorf ("failed to get HTML body: %v" , err )
199
+ }
200
+
201
+ // Extract content using the new selector
202
+ newContent , err := f .extractContentWithSelector (body , selector )
203
+ if err != nil {
204
+ return "" , fmt .Errorf ("failed to extract content with new selector: %v" , err )
205
+ }
206
+
207
+ // Compare original and new content
208
+ if strings .TrimSpace (originalContent .Content ) != strings .TrimSpace (newContent ) {
209
+ // Contents are different, create a new rule
210
+ rule := datastore.Rule {
211
+ Author : "" ,
212
+ Domain : f .extractDomain (urlStr ),
213
+ Content : selector ,
214
+ TestURLs : []string {urlStr },
215
+ Enabled : true ,
216
+ }
217
+
218
+ _ , err = f .Rules .Save (ctx , rule )
219
+ if err != nil {
220
+ return "" , fmt .Errorf ("failed to save new rule: %v" , err )
221
+ }
222
+
223
+ return fmt .Sprintf ("new custom rule with DOM %s created" , selector ), nil
224
+ }
225
+
226
+ return "default rule is good, no need to create the custom one" , nil
227
+ }
228
+
229
+ func (f * UReadability ) getChatGPTSelector (ctx context.Context , urlStr string ) (string , error ) {
230
+ client := openai .NewClient (f .OpenAIKey )
231
+ resp , err := client .CreateChatCompletion (
232
+ ctx ,
233
+ openai.ChatCompletionRequest {
234
+ Model : openai .GPT4o ,
235
+ Messages : []openai.ChatCompletionMessage {
236
+ {
237
+ Role : openai .ChatMessageRoleSystem ,
238
+ Content : "You are a helpful assistant that provides CSS selectors for extracting main content from web pages." ,
239
+ },
240
+ {
241
+ Role : openai .ChatMessageRoleUser ,
242
+ Content : fmt .Sprintf ("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else." , urlStr ),
243
+ },
244
+ },
245
+ },
246
+ )
247
+
248
+ if err != nil {
249
+ return "" , err
250
+ }
251
+
252
+ return resp .Choices [0 ].Message .Content , nil
253
+ }
254
+
255
+ func (f * UReadability ) getHTMLBody (urlStr string ) (string , error ) {
256
+ //nolint:gosec
257
+ resp , err := http .Get (urlStr )
258
+ if err != nil {
259
+ return "" , err
260
+ }
261
+ defer resp .Body .Close ()
262
+
263
+ body , err := io .ReadAll (resp .Body )
264
+ if err != nil {
265
+ return "" , err
266
+ }
267
+
268
+ return string (body ), nil
269
+ }
270
+
271
+ func (f * UReadability ) extractContentWithSelector (body , selector string ) (string , error ) {
272
+ doc , err := goquery .NewDocumentFromReader (strings .NewReader (body ))
273
+ if err != nil {
274
+ return "" , err
275
+ }
276
+
277
+ content := doc .Find (selector ).Text ()
278
+ return content , nil
279
+ }
280
+
281
+ func (f * UReadability ) extractDomain (urlStr string ) string {
282
+ u , err := url .Parse (urlStr )
283
+ if err != nil {
284
+ return ""
285
+ }
286
+ return u .Hostname ()
287
+ }
288
+
181
289
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
182
290
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
183
291
// and at last tries to use general readability parser
0 commit comments