diff --git a/apps/studio/.env.example b/apps/studio/.env.example
index 11d4b66d55..d7dc5668d5 100644
--- a/apps/studio/.env.example
+++ b/apps/studio/.env.example
@@ -4,4 +4,6 @@ VITE_SUPABASE_ANON_KEY=
 VITE_MIXPANEL_TOKEN=
 
 # Add your keys here to use Anthropic directly
-VITE_ANTHROPIC_API_KEY=
\ No newline at end of file
+VITE_ANTHROPIC_API_KEY=
+# Add your Firecrawl API key here
+VITE_FIRECRAWL_API_KEY=
\ No newline at end of file
diff --git a/apps/studio/electron/main/chat/index.ts b/apps/studio/electron/main/chat/index.ts
index 66590255b9..a6de28cc93 100644
--- a/apps/studio/electron/main/chat/index.ts
+++ b/apps/studio/electron/main/chat/index.ts
@@ -25,6 +25,7 @@ import { z } from 'zod';
 import { mainWindow } from '..';
 import { PersistentStorage } from '../storage';
 import { initModel } from './llmProvider';
+import { extractUrls } from '@onlook/ai/src/tools/helpers';
 
 class LlmManager {
     private static instance: LlmManager;
@@ -59,6 +60,34 @@
         return LlmManager.instance;
     }
 
+    private async processUrls(content: string): Promise<string> {
+        const urls = extractUrls(content);
+        if (urls.length === 0) {
+            return content;
+        }
+
+        try {
+            // Ask the model to fetch the referenced pages via the crawl tool;
+            // maxSteps: 2 lets the tool-call round trip complete with a text answer.
+            const result = await streamText({
+                model: await initModel(LLMProvider.ANTHROPIC, CLAUDE_MODELS.SONNET, {
+                    requestType: StreamRequestType.SUGGESTIONS,
+                }),
+                messages: [{ role: 'user', content }],
+                tools: { crawl_urls: chatToolSet.crawl_urls },
+                maxSteps: 2,
+                maxTokens: 4000,
+            });
+            // result.text resolves only after the stream is fully consumed.
+            const crawledContent = await result.text;
+
+            return `Original request:\n${content}\n\nReferenced content from URLs:\n${crawledContent}`;
+        } catch (error) {
+            console.error('Error processing URLs:', error);
+            return content;
+        }
+    }
+
     public async stream(
         messages: CoreMessage[],
         requestType: StreamRequestType,
@@ -67,6 +96,10 @@
             skipSystemPrompt?: boolean;
         },
     ): Promise<CompletedStreamResponse> {
+        const lastUserMessage = messages.findLast((m) => m.role === 'user');
+        if (lastUserMessage && typeof lastUserMessage.content === 'string') {
+            lastUserMessage.content = await this.processUrls(lastUserMessage.content);
+        }
         const { abortController, skipSystemPrompt } = options || {};
         this.abortController = abortController || new AbortController();
         try {
diff --git a/apps/studio/package.json b/apps/studio/package.json
index 2286c72927..5070e7d23b 100644
--- a/apps/studio/package.json
+++ b/apps/studio/package.json
@@ -53,6 +53,7 @@
         "@emotion/react": "^11.13.3",
         "@emotion/styled": "^11.13.0",
         "@fontsource-variable/inter": "^5.1.0",
+        "@mendable/firecrawl-js": "^1.24.0",
         "@onlook/foundation": "*",
         "@onlook/supabase": "*",
         "@onlook/ui": "*",
diff --git a/packages/ai/src/tools/crawler.ts b/packages/ai/src/tools/crawler.ts
new file mode 100644
index 0000000000..ac9b693022
--- /dev/null
+++ b/packages/ai/src/tools/crawler.ts
@@ -0,0 +1,100 @@
+import FirecrawlApp from '@mendable/firecrawl-js';
+
+export interface CrawlOptions {
+    limit?: number;
+    scrapeOptions?: {
+        formats?: (
+            | 'markdown'
+            | 'html'
+            | 'rawHtml'
+            | 'content'
+            | 'links'
+            | 'screenshot'
+            | 'screenshot@fullPage'
+            | 'extract'
+            | 'json'
+            | 'changeTracking'
+        )[];
+    };
+}
+
+export interface CrawlerResponse {
+    success: boolean;
+    error?: string;
+    data: Array<{
+        html?: string;
+        markdown?: string;
+    }>;
+}
+
+export interface CrawledContent {
+    markdown?: string;
+    html?: string;
+}
+
+// Runtime type guard for responses coming back from the Firecrawl API.
+export function validateCrawlerResponse(response: unknown): response is CrawlerResponse {
+    if (!response || typeof response !== 'object') {
+        return false;
+    }
+    if (!('success' in response) || typeof response.success !== 'boolean') {
+        return false;
+    }
+    if (!('data' in response) || !Array.isArray(response.data)) {
+        return false;
+    }
+    if (response.data.length === 0) {
+        return false;
+    }
+    const firstItem = response.data[0];
+    return (
+        typeof firstItem === 'object' &&
+        firstItem !== null &&
+        ('html' in firstItem || 'markdown' in firstItem) &&
+        (firstItem.html === undefined || typeof firstItem.html === 'string') &&
+        (firstItem.markdown === undefined || typeof firstItem.markdown === 'string')
+    );
+}
+
+export class CrawlerService {
+    private static instance: CrawlerService;
+    private app: FirecrawlApp;
+
+    private constructor() {
+        const apiKey = import.meta.env.VITE_FIRECRAWL_API_KEY;
+        if (!apiKey) {
+            throw new Error(
+                'VITE_FIRECRAWL_API_KEY is not defined. Please provide a valid API key.',
+            );
+        }
+        this.app = new FirecrawlApp({ apiKey });
+    }
+
+    static getInstance(): CrawlerService {
+        if (!this.instance) {
+            this.instance = new CrawlerService();
+        }
+        return this.instance;
+    }
+
+    async crawlUrl(
+        url: string,
+        options: CrawlOptions = {
+            limit: 100,
+            scrapeOptions: {
+                formats: ['markdown', 'html'],
+            },
+        },
+    ) {
+        try {
+            const response = await this.app.crawlUrl(url, options);
+            if (!response.success) {
+                throw new Error(`Failed to crawl: ${response.error}`);
+            }
+            return response;
+        } catch (error) {
+            console.error('Error during crawling:', error);
+            throw error;
+        }
+    }
+}
diff --git a/packages/ai/src/tools/helpers.ts b/packages/ai/src/tools/helpers.ts
index 877e1ccb95..2c734079e1 100644
--- a/packages/ai/src/tools/helpers.ts
+++ b/packages/ai/src/tools/helpers.ts
@@ -40,3 +40,31 @@
         return { success: false, error: error instanceof Error ? error.message : 'Unknown error' };
     }
 }
+
+export function extractUrls(text: string): string[] {
+    // Absolute http/https URLs.
+    const httpPattern =
+        /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;
+    // Scheme-less URLs starting with www.
+    const wwwPattern =
+        /(?:^|\s)www\.[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;
+    // Markdown links: [text](url)
+    const markdownPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
+
+    const httpUrls = text.match(httpPattern) || [];
+    const wwwUrls = (text.match(wwwPattern) || []).map((url) => 'https://' + url.trim());
+    const markdownUrls = Array.from(text.matchAll(markdownPattern), (match) => match[2]);
+
+    // Normalize (prepend https:// when the scheme is missing), validate, dedupe.
+    const validUrls: string[] = [];
+    for (const url of [...httpUrls, ...wwwUrls, ...markdownUrls]) {
+        const fullUrl = url.startsWith('http') ? url : `https://${url}`;
+        try {
+            new URL(fullUrl);
+            validUrls.push(fullUrl);
+        } catch {
+            // Not a valid URL — skip it.
+        }
+    }
+    return Array.from(new Set(validUrls));
+}
diff --git a/packages/ai/src/tools/index.ts b/packages/ai/src/tools/index.ts
index d16e1ead8e..fb89e11b28 100644
--- a/packages/ai/src/tools/index.ts
+++ b/packages/ai/src/tools/index.ts
@@ -4,6 +4,7 @@ import { readFile } from 'fs/promises';
 import { z } from 'zod';
 import { ONLOOK_PROMPT } from '../prompt/onlook';
 import { getAllFiles } from './helpers';
+import { CrawlerService } from './crawler';
 
 export const listFilesTool = tool({
     description: 'List all files in the current directory, including subdirectories',
@@ -130,8 +131,64 @@
     return strReplaceEditorTool;
 };
 
+export const crawlUrlTool = tool({
+    description: 'Crawl webpage content from provided URL',
+    parameters: z.object({
+        urls: z.array(z.string()).describe('Array of URLs to crawl'),
+        options: z
+            .object({
+                limit: z.number().optional(),
+                scrapeOptions: z
+                    .object({
+                        formats: z
+                            .array(
+                                z.enum([
+                                    'markdown',
+                                    'html',
+                                    'rawHtml',
+                                    'content',
+                                    'links',
+                                    'screenshot',
+                                    'screenshot@fullPage',
+                                    'extract',
+                                    'json',
+                                    'changeTracking',
+                                ]),
+                            )
+                            .optional(),
+                    })
+                    .optional(),
+            })
+            .optional(),
+    }),
+    execute: async ({ urls, options }) => {
+        try {
+            const crawler = CrawlerService.getInstance();
+            const results = await Promise.all(
+                urls.map(async (url) => {
+                    try {
+                        // CrawlerService.crawlUrl throws when the crawl fails,
+                        // so a resolved call here is always a success.
+                        const result = await crawler.crawlUrl(url, options);
+                        return { url, data: result.data[0] };
+                    } catch (error) {
+                        return {
+                            url,
+                            error: error instanceof Error ? error.message : 'Unknown error',
+                        };
+                    }
+                }),
+            );
+            return results;
+        } catch (error) {
+            return `Error: ${error instanceof Error ? error.message : 'Unknown error'}`;
+        }
+    },
+});
+
 export const chatToolSet: ToolSet = {
     list_files: listFilesTool,
     read_files: readFilesTool,
     onlook_instructions: onlookInstructionsTool,
+    crawl_urls: crawlUrlTool,
 };
diff --git a/packages/models/src/chat/message/context.ts b/packages/models/src/chat/message/context.ts
index 868bda8211..966622b889 100644
--- a/packages/models/src/chat/message/context.ts
+++ b/packages/models/src/chat/message/context.ts
@@ -4,6 +4,7 @@ export enum MessageContextType {
     IMAGE = 'image',
     ERROR = 'error',
     PROJECT = 'project',
+    LINK = 'link',
 }
 
 type BaseMessageContext = {
@@ -38,9 +39,15 @@
     path: string;
 };
 
+export type LinkMessageContext = BaseMessageContext & {
+    type: MessageContextType.LINK;
+    url: string;
+};
+
 export type ChatMessageContext =
     | FileMessageContext
     | HighlightMessageContext
     | ImageMessageContext
     | ErrorMessageContext
-    | ProjectMessageContext;
+    | ProjectMessageContext
+    | LinkMessageContext;