@@ -14,6 +14,7 @@ import (
14
14
"github.com/PuerkitoBio/goquery"
15
15
log "github.com/go-pkgz/lgr"
16
16
"github.com/mauidude/go-readability"
17
+ "github.com/sashabaranov/go-openai"
17
18
"go.mongodb.org/mongo-driver/bson/primitive"
18
19
19
20
"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
@@ -33,6 +34,7 @@ type UReadability struct {
33
34
TimeOut time.Duration
34
35
SnippetSize int
35
36
Rules Rules
37
+ OpenAIKey string
36
38
}
37
39
38
40
// Response from api calls
@@ -128,6 +130,113 @@ func (f UReadability) Extract(ctx context.Context, reqURL string) (rb *Response,
128
130
return rb , nil
129
131
}
130
132
133
+ // ContentParsedWrong handles the logic for when content is parsed incorrectly
134
+ func (f UReadability ) ContentParsedWrong (ctx context.Context , urlStr string ) (string , error ) {
135
+ // Extract content using the current method
136
+ originalContent , err := f .Extract (ctx , urlStr )
137
+ if err != nil {
138
+ return "" , fmt .Errorf ("failed to extract content: %v" , err )
139
+ }
140
+
141
+ // Get CSS selector from ChatGPT
142
+ selector , err := f .getChatGPTSelector (ctx , urlStr )
143
+ if err != nil {
144
+ return "" , fmt .Errorf ("failed to get CSS selector: %v" , err )
145
+ }
146
+
147
+ // Get the HTML body
148
+ body , err := f .getHTMLBody (urlStr )
149
+ if err != nil {
150
+ return "" , fmt .Errorf ("failed to get HTML body: %v" , err )
151
+ }
152
+
153
+ // Extract content using the new selector
154
+ newContent , err := f .extractContentWithSelector (body , selector )
155
+ if err != nil {
156
+ return "" , fmt .Errorf ("failed to extract content with new selector: %v" , err )
157
+ }
158
+
159
+ // Compare original and new content
160
+ if strings .TrimSpace (originalContent .Content ) != strings .TrimSpace (newContent ) {
161
+ // Contents are different, create a new rule
162
+ rule := datastore.Rule {
163
+ Author : "" ,
164
+ Domain : f .extractDomain (urlStr ),
165
+ Content : selector ,
166
+ TestURLs : []string {urlStr },
167
+ Enabled : true ,
168
+ }
169
+
170
+ _ , err = f .Rules .Save (ctx , rule )
171
+ if err != nil {
172
+ return "" , fmt .Errorf ("failed to save new rule: %v" , err )
173
+ }
174
+
175
+ return fmt .Sprintf ("new custom rule with DOM %s created" , selector ), nil
176
+ }
177
+
178
+ return "default rule is good, no need to create the custom one" , nil
179
+ }
180
+
181
+ func (f UReadability ) getChatGPTSelector (ctx context.Context , urlStr string ) (string , error ) {
182
+ client := openai .NewClient (f .OpenAIKey )
183
+ resp , err := client .CreateChatCompletion (
184
+ ctx ,
185
+ openai.ChatCompletionRequest {
186
+ Model : openai .GPT4o ,
187
+ Messages : []openai.ChatCompletionMessage {
188
+ {
189
+ Role : openai .ChatMessageRoleSystem ,
190
+ Content : "You are a helpful assistant that provides CSS selectors for extracting main content from web pages." ,
191
+ },
192
+ {
193
+ Role : openai .ChatMessageRoleUser ,
194
+ Content : fmt .Sprintf ("Given the URL %s, identify the CSS selector that can be used to extract the main content of the article. This typically includes elements like 'article', 'main', or specific classes. Return only this selector and nothing else." , urlStr ),
195
+ },
196
+ },
197
+ },
198
+ )
199
+
200
+ if err != nil {
201
+ return "" , err
202
+ }
203
+
204
+ return resp .Choices [0 ].Message .Content , nil
205
+ }
206
+
207
+ func (f UReadability ) getHTMLBody (urlStr string ) (string , error ) {
208
+ resp , err := http .Get (urlStr )
209
+ if err != nil {
210
+ return "" , err
211
+ }
212
+ defer resp .Body .Close ()
213
+
214
+ body , err := io .ReadAll (resp .Body )
215
+ if err != nil {
216
+ return "" , err
217
+ }
218
+
219
+ return string (body ), nil
220
+ }
221
+
222
+ func (f UReadability ) extractContentWithSelector (body , selector string ) (string , error ) {
223
+ doc , err := goquery .NewDocumentFromReader (strings .NewReader (body ))
224
+ if err != nil {
225
+ return "" , err
226
+ }
227
+
228
+ content := doc .Find (selector ).Text ()
229
+ return content , nil
230
+ }
231
+
232
+ func (f UReadability ) extractDomain (urlStr string ) string {
233
+ u , err := url .Parse (urlStr )
234
+ if err != nil {
235
+ return ""
236
+ }
237
+ return u .Hostname ()
238
+ }
239
+
131
240
// gets content from raw body string, both content (text only) and rich (with html tags)
132
241
func (f UReadability ) getContent (ctx context.Context , body , reqURL string ) (content , rich string , err error ) {
133
242
// general parser
0 commit comments