diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md index ae1d64a..83a795a 100644 --- a/scrapegraph-js/README.md +++ b/scrapegraph-js/README.md @@ -55,6 +55,91 @@ const prompt = 'What does the company do?'; ## 🎯 Examples +### Scrape - Get HTML Content + +#### Basic Scrape + +```javascript +import { scrape } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com'; + +(async () => { + try { + const response = await scrape(apiKey, url); + console.log('HTML content:', response.html); + console.log('Status:', response.status); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + +#### Scrape with Heavy JavaScript Rendering + +```javascript +import { scrape } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com'; + +(async () => { + try { + const response = await scrape(apiKey, url, { + renderHeavyJs: true + }); + console.log('HTML content with JS rendering:', response.html); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + +#### Scrape with Custom Headers + +```javascript +import { scrape } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com'; + +(async () => { + try { + const response = await scrape(apiKey, url, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Cookie': 'session=123' + } + }); + console.log('HTML content with custom headers:', response.html); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + +#### Get Scrape Request Status + +```javascript +import { getScrapeRequest } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const requestId = 'your-request-id'; + +(async () => { + try { + const response = await getScrapeRequest(apiKey, requestId); + console.log('Request status:', response.status); + if (response.status === 'completed') { + console.log('HTML content:', response.html); + } + } catch (error) { + console.error('Error:', error); + } +})(); +``` + ### Scraping Websites #### Basic Scraping @@ -395,6 +480,99 @@ const feedbackText = 'This is a test feedback message.'; })(); ``` +## πŸ”§ Available Functions + +### Scrape + +#### `scrape(apiKey, url, options)` + +Converts a webpage into HTML format with optional JavaScript rendering. + +**Parameters:** +- `apiKey` (string): Your ScrapeGraph AI API key +- `url` (string): The URL of the webpage to convert +- `options` (object, optional): Configuration options + - `renderHeavyJs` (boolean, optional): Whether to render heavy JavaScript (default: false) + - `headers` (object, optional): Custom headers to send with the request + +**Returns:** Promise that resolves to an object containing: +- `html`: The HTML content of the webpage +- `status`: Request status ('completed', 'processing', 'failed') +- `scrape_request_id`: Unique identifier for the request +- `error`: Error message if the request failed + +**Example:** +```javascript +const response = await scrape(apiKey, 'https://example.com', { + renderHeavyJs: true, + headers: { 'User-Agent': 'Custom Agent' } +}); +``` + +#### `getScrapeRequest(apiKey, requestId)` + +Retrieves the status or result of a previous scrape request. + +**Parameters:** +- `apiKey` (string): Your ScrapeGraph AI API key +- `requestId` (string): The unique identifier for the scrape request + +**Returns:** Promise that resolves to the request result object. 
+ +**Example:** +```javascript +const result = await getScrapeRequest(apiKey, 'request-id-here'); +``` + +### Smart Scraper + +#### `smartScraper(apiKey, url, prompt, schema, numberOfScrolls, totalPages, cookies)` + +Extracts structured data from websites using AI-powered scraping. + +**Parameters:** +- `apiKey` (string): Your ScrapeGraph AI API key +- `url` (string): The URL of the website to scrape +- `prompt` (string): Natural language prompt describing what to extract +- `schema` (object, optional): Zod schema for structured output +- `numberOfScrolls` (number, optional): Number of scrolls for infinite scroll pages +- `totalPages` (number, optional): Number of pages to scrape +- `cookies` (object, optional): Cookies for authentication + +### Search Scraper + +#### `searchScraper(apiKey, prompt, url, numResults, headers, outputSchema)` + +Searches and extracts information from multiple web sources using AI. + +### Crawl API + +#### `crawl(apiKey, url, prompt, dataSchema, extractionMode, cacheWebsite, depth, maxPages, sameDomainOnly, sitemap, batchSize)` + +Starts a crawl job to extract structured data from a website and its linked pages. + +### Markdownify + +#### `markdownify(apiKey, url, headers)` + +Converts a webpage into clean, well-structured markdown format. + +### Agentic Scraper + +#### `agenticScraper(apiKey, url, steps, useSession, userPrompt, outputSchema, aiExtraction)` + +Performs automated actions on webpages using step-by-step instructions. + +### Utility Functions + +#### `getCredits(apiKey)` + +Retrieves your current credit balance and usage statistics. + +#### `sendFeedback(apiKey, requestId, rating, feedbackText)` + +Submits feedback for a specific request. + ## πŸ“š Documentation For detailed documentation, visit [docs.scrapegraphai.com](https://docs.scrapegraphai.com) diff --git a/scrapegraph-js/examples/scrape_advanced_example.js b/scrapegraph-js/examples/scrape_advanced_example.js new file mode 100644 index 0000000..50b1093 --- /dev/null +++ b/scrapegraph-js/examples/scrape_advanced_example.js @@ -0,0 +1,524 @@ +/** + * Advanced example demonstrating comprehensive usage of the Scrape API with the scrapegraph-js SDK. + * + * This example shows how to: + * 1. Set up the client for Scrape with various configurations + * 2. Handle different types of websites and rendering modes + * 3. Implement error handling and retry logic + * 4. Process multiple websites concurrently + * 5. Save and analyze HTML content with detailed metadata + * 6. Use custom headers and cookies for authentication + * 7. Compare different rendering modes + * + * Requirements: + * - Node.js 16+ + * - scrapegraph-js + * - A valid API key + * + * Usage: + * node scrape_advanced_example.js + */ + +import { scrape, getScrapeRequest } from '../index.js'; +import fs from 'fs/promises'; +import path from 'path'; + +// Configuration +const API_KEY = process.env.SGAI_API_KEY || 'your-api-key-here'; +const OUTPUT_DIR = 'scrape_advanced_output'; + +/** + * Scrape processor with advanced features + */ +class ScrapeProcessor { + constructor(apiKey) { + this.apiKey = apiKey; + this.retryDelays = [1000, 2000, 4000]; // Exponential backoff delays + } + + /** + * Get HTML content from a website using the Scrape API with retry logic. 
+ * + * @param {string} websiteUrl - The URL of the website to get HTML from + * @param {Object} options - Options for the scrape request + * @returns {Object} The API response with additional metadata + */ + async scrapeWebsite(websiteUrl, options = {}) { + const { renderHeavyJs = false, headers = {}, maxRetries = 3 } = options; + + const jsMode = renderHeavyJs ? 'with heavy JS rendering' : 'without JS rendering'; + console.log(`🌐 Getting HTML content from: ${websiteUrl}`); + console.log(`πŸ”§ Mode: ${jsMode}`); + + for (let attempt = 0; attempt < maxRetries; attempt++) { + try { + const startTime = Date.now(); + const result = await scrape(this.apiKey, websiteUrl, { + renderHeavyJs, + headers + }); + const executionTime = (Date.now() - startTime) / 1000; + + console.log(`βœ… Success! Execution time: ${executionTime.toFixed(2)} seconds`); + return { + ...result, + executionTime, + attempts: attempt + 1 + }; + + } catch (error) { + console.error(`❌ Attempt ${attempt + 1} failed: ${error.message}`); + if (attempt < maxRetries - 1) { + const waitTime = this.retryDelays[attempt] || 2000; + console.log(`⏳ Waiting ${waitTime}ms before retry...`); + await new Promise(resolve => setTimeout(resolve, waitTime)); + } else { + console.error(`πŸ’₯ All ${maxRetries} attempts failed for ${websiteUrl}`); + throw error; + } + } + } + } + + /** + * Process multiple websites concurrently. + * + * @param {Array} websites - Array of website configurations + * @param {number} maxConcurrency - Maximum number of concurrent requests + * @returns {Array} Results for each website + */ + async processWebsiteBatch(websites, maxConcurrency = 3) { + const results = []; + + // Process websites in batches to control concurrency + for (let i = 0; i < websites.length; i += maxConcurrency) { + const batch = websites.slice(i, i + maxConcurrency); + const batchPromises = batch.map(website => + this.processSingleWebsite(website) + ); + + const batchResults = await Promise.allSettled(batchPromises); + + // Process batch results + batchResults.forEach((result, index) => { + const website = batch[index]; + if (result.status === 'fulfilled') { + results.push({ + website: website.url, + success: true, + data: result.value + }); + } else { + results.push({ + website: website.url, + success: false, + error: result.reason.message + }); + } + }); + + // Add a small delay between batches to be respectful to the API + if (i + maxConcurrency < websites.length) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + return results; + } + + /** + * Process a single website with comprehensive analysis. + * + * @param {Object} website - Website configuration object + * @returns {Object} Processing results + */ + async processSingleWebsite(website) { + const { url, name, renderHeavyJs = false, description, headers = {} } = website; + + console.log(`\nπŸ” Processing: ${description}`); + console.log(`πŸ“ URL: ${url}`); + console.log(`βš™οΈ Render Heavy JS: ${renderHeavyJs}`); + + try { + // Get HTML content + const result = await this.scrapeWebsite(url, { + renderHeavyJs, + headers + }); + + // Analyze the HTML content + const analysis = this.analyzeHtmlContent(result.html); + + // Save the HTML content + const filename = `${name}_${renderHeavyJs ? 
'js' : 'nojs'}`; + const savedFile = await this.saveHtmlContent(result.html, filename); + + // Create comprehensive result object + const processedResult = { + website: url, + name, + description, + renderHeavyJs, + success: true, + requestId: result.scrape_request_id, + status: result.status, + executionTime: result.executionTime, + attempts: result.attempts, + analysis, + savedFile, + metadata: { + timestamp: new Date().toISOString(), + userAgent: headers['User-Agent'] || 'Default', + hasCustomHeaders: Object.keys(headers).length > 0 + } + }; + + console.log(`βœ… Successfully processed ${url}`); + console.log(`πŸ“Š Analysis: ${analysis.totalLength.toLocaleString()} chars, ${analysis.lines.toLocaleString()} lines`); + console.log(`πŸ’Ύ Saved to: ${savedFile}`); + + return processedResult; + + } catch (error) { + console.error(`❌ Failed to process ${url}: ${error.message}`); + return { + website: url, + name, + description, + renderHeavyJs, + success: false, + error: error.message, + timestamp: new Date().toISOString() + }; + } + } + + /** + * Analyze HTML content and provide detailed statistics. + * + * @param {string} htmlContent - The HTML content to analyze + * @returns {Object} Detailed analysis of the HTML content + */ + analyzeHtmlContent(htmlContent) { + if (!htmlContent) { + return { + totalLength: 0, + lines: 0, + hasDoctype: false, + hasHtmlTag: false, + hasHeadTag: false, + hasBodyTag: false, + scriptTags: 0, + styleTags: 0, + divTags: 0, + pTags: 0, + imgTags: 0, + linkTags: 0, + aTags: 0, + spanTags: 0, + tableTags: 0, + formTags: 0, + inputTags: 0, + buttonTags: 0, + metaTags: 0, + titleTags: 0, + h1Tags: 0, + h2Tags: 0, + h3Tags: 0, + h4Tags: 0, + h5Tags: 0, + h6Tags: 0, + listTags: 0, + codeTags: 0, + preTags: 0, + blockquoteTags: 0, + iframeTags: 0, + canvasTags: 0, + svgTags: 0, + videoTags: 0, + audioTags: 0, + embedTags: 0, + objectTags: 0, + paramTags: 0, + sourceTags: 0, + trackTags: 0, + mapTags: 0, + areaTags: 0, + baseTags: 0, + bdoTags: 0, + brTags: 0, + hrTags: 0, + imgTags: 0, + inputTags: 0, + linkTags: 0, + metaTags: 0, + paramTags: 0, + sourceTags: 0, + trackTags: 0, + wbrTags: 0 + }; + } + + const stats = { + totalLength: htmlContent.length, + lines: htmlContent.split('\n').length, + hasDoctype: htmlContent.trim().startsWith(' r.success).length, + failed: results.filter(r => !r.success).length, + timestamp: new Date().toISOString(), + apiKey: this.apiKey.substring(0, 8) + '...' 
+ }, + results: results, + statistics: { + averageExecutionTime: 0, + totalExecutionTime: 0, + averageAttempts: 0, + totalAttempts: 0 + } + }; + + // Calculate statistics + const successfulResults = results.filter(r => r.success); + if (successfulResults.length > 0) { + report.statistics.averageExecutionTime = + successfulResults.reduce((sum, r) => sum + (r.executionTime || 0), 0) / successfulResults.length; + report.statistics.totalExecutionTime = + successfulResults.reduce((sum, r) => sum + (r.executionTime || 0), 0); + report.statistics.averageAttempts = + successfulResults.reduce((sum, r) => sum + (r.attempts || 1), 0) / successfulResults.length; + report.statistics.totalAttempts = + successfulResults.reduce((sum, r) => sum + (r.attempts || 1), 0); + } + + // Save report as JSON + const reportFile = path.join(outputDir, 'processing_report.json'); + await fs.writeFile(reportFile, JSON.stringify(report, null, 2), 'utf8'); + + // Save summary as text + const summaryFile = path.join(outputDir, 'summary.txt'); + const summaryText = this.formatSummary(report); + await fs.writeFile(summaryFile, summaryText, 'utf8'); + + console.log(`\nπŸ“Š Report generated:`); + console.log(` πŸ“„ JSON: ${reportFile}`); + console.log(` πŸ“ Summary: ${summaryFile}`); + + return { reportFile, summaryFile }; + } + + /** + * Format the summary report as readable text. + * + * @param {Object} report - The processing report + * @returns {string} Formatted summary text + */ + formatSummary(report) { + const { summary, statistics } = report; + + let text = 'SCRAPE API PROCESSING REPORT\n'; + text += '='.repeat(50) + '\n\n'; + text += `Generated: ${summary.timestamp}\n`; + text += `Total Websites: ${summary.totalWebsites}\n`; + text += `Successful: ${summary.successful}\n`; + text += `Failed: ${summary.failed}\n`; + text += `Success Rate: ${((summary.successful / summary.totalWebsites) * 100).toFixed(1)}%\n\n`; + + if (summary.successful > 0) { + text += `PERFORMANCE STATISTICS\n`; + text += '-'.repeat(30) + '\n'; + text += `Average Execution Time: ${statistics.averageExecutionTime.toFixed(2)}s\n`; + text += `Total Execution Time: ${statistics.totalExecutionTime.toFixed(2)}s\n`; + text += `Average Attempts: ${statistics.averageAttempts.toFixed(1)}\n`; + text += `Total Attempts: ${statistics.totalAttempts}\n\n`; + } + + text += `DETAILED RESULTS\n`; + text += '-'.repeat(30) + '\n'; + + report.results.forEach((result, index) => { + text += `${index + 1}. ${result.website}\n`; + text += ` Status: ${result.success ? 'βœ… Success' : '❌ Failed'}\n`; + if (result.success) { + text += ` Execution Time: ${result.executionTime?.toFixed(2)}s\n`; + text += ` Attempts: ${result.attempts}\n`; + text += ` Saved: ${result.savedFile}\n`; + } else { + text += ` Error: ${result.error}\n`; + } + text += '\n'; + }); + + return text; + } +} + +/** + * Main function demonstrating advanced Scrape API usage. 
+ */ +async function main() { + // Example websites to test with different configurations + const testWebsites = [ + { + url: 'https://example.com', + name: 'example', + renderHeavyJs: false, + description: 'Simple static website', + headers: {} + }, + { + url: 'https://httpbin.org/html', + name: 'httpbin_html', + renderHeavyJs: false, + description: 'HTTP testing service', + headers: {} + }, + { + url: 'https://httpbin.org/user-agent', + name: 'httpbin_user_agent', + renderHeavyJs: false, + description: 'User agent testing with custom headers', + headers: { + 'User-Agent': 'Custom Scraper Bot/1.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + } + } + ]; + + console.log('πŸš€ Advanced Scrape API Example with scrapegraph-js SDK'); + console.log('='.repeat(70)); + + // Check API key + if (!API_KEY || API_KEY === 'your-api-key-here') { + console.error('❌ Please set your SGAI_API_KEY environment variable'); + console.error('Example: export SGAI_API_KEY=your_api_key_here'); + process.exit(1); + } + + console.log('βœ… API key configured'); + console.log(`πŸ“Š Processing ${testWebsites.length} websites with advanced features\n`); + + try { + // Initialize the processor + const processor = new ScrapeProcessor(API_KEY); + + // Process websites with controlled concurrency + const results = await processor.processWebsiteBatch(testWebsites, 2); + + // Generate comprehensive report + await processor.generateReport(results); + + // Display final summary + const successful = results.filter(r => r.success).length; + const failed = results.filter(r => !r.success).length; + + console.log('\n🎯 FINAL SUMMARY'); + console.log('='.repeat(30)); + console.log(`βœ… Successful: ${successful}`); + console.log(`❌ Failed: ${failed}`); + console.log(`πŸ“Š Success Rate: ${((successful / results.length) * 100).toFixed(1)}%`); + console.log(`πŸ“ Output saved to: ${OUTPUT_DIR}/`); + + if (failed > 0) { + console.log('\n❌ Failed websites:'); + results.filter(r => !r.success).forEach(result => { + console.log(` - ${result.website}: ${result.error}`); + }); + } + + console.log('\nβœ… Advanced scrape example completed successfully'); + + } catch (error) { + console.error('πŸ’₯ Fatal error:', error.message); + process.exit(1); + } +} + +// Run the example +if (import.meta.url === `file://${process.argv[1]}`) { + main().catch(error => { + console.error('❌ Fatal error:', error.message); + process.exit(1); + }); +} diff --git a/scrapegraph-js/examples/scrape_example.js b/scrapegraph-js/examples/scrape_example.js new file mode 100644 index 0000000..15bf3b1 --- /dev/null +++ b/scrapegraph-js/examples/scrape_example.js @@ -0,0 +1,205 @@ +/** + * Example demonstrating how to use the Scrape API with the scrapegraph-js SDK. + * + * This example shows how to: + * 1. Set up the API request for Scrape + * 2. Make the API call to get HTML content from a website + * 3. Handle the response and save the HTML content + * 4. Demonstrate both regular and heavy JS rendering modes + * 5. Display the results and metadata + * + * Requirements: + * - Node.js 16+ + * - scrapegraph-js + * - A valid API key + * + * Usage: + * node scrape_example.js + */ + +import { scrape, getScrapeRequest } from '../index.js'; +import fs from 'fs/promises'; +import path from 'path'; + +// Configuration +const API_KEY = process.env.SGAI_API_KEY || 'your-api-key-here'; +const OUTPUT_DIR = 'scrape_output'; + +/** + * Get HTML content from a website using the Scrape API. 
+ * + * @param {string} websiteUrl - The URL of the website to get HTML from + * @param {Object} options - Options for the scrape request + * @returns {Object} The API response containing HTML content and metadata + */ +async function scrapeWebsite(websiteUrl, options = {}) { + const { renderHeavyJs = false, headers = {} } = options; + + const jsMode = renderHeavyJs ? 'with heavy JS rendering' : 'without JS rendering'; + console.log(`Getting HTML content from: ${websiteUrl}`); + console.log(`Mode: ${jsMode}`); + + const startTime = Date.now(); + + try { + const result = await scrape(API_KEY, websiteUrl, { + renderHeavyJs, + headers + }); + + const executionTime = (Date.now() - startTime) / 1000; + console.log(`Execution time: ${executionTime.toFixed(2)} seconds`); + + return result; + } catch (error) { + console.error(`Error: ${error.message}`); + throw error; + } +} + +/** + * Save HTML content to a file. + * + * @param {string} htmlContent - The HTML content to save + * @param {string} filename - The name of the file (without extension) + * @param {string} outputDir - The directory to save the file in + * @returns {string} Path to the saved file + */ +async function saveHtmlContent(htmlContent, filename, outputDir = OUTPUT_DIR) { + // Create output directory if it doesn't exist + try { + await fs.mkdir(outputDir, { recursive: true }); + } catch (error) { + if (error.code !== 'EEXIST') { + throw error; + } + } + + // Save HTML file + const htmlFile = path.join(outputDir, `${filename}.html`); + await fs.writeFile(htmlFile, htmlContent, 'utf8'); + + console.log(`HTML content saved to: ${htmlFile}`); + return htmlFile; +} + +/** + * Analyze HTML content and provide basic statistics. + * + * @param {string} htmlContent - The HTML content to analyze + * @returns {Object} Basic statistics about the HTML content + */ +function analyzeHtmlContent(htmlContent) { + const stats = { + totalLength: htmlContent.length, + lines: htmlContent.split('\n').length, + hasDoctype: htmlContent.trim().startsWith(' { + console.error('❌ Fatal error:', error.message); + process.exit(1); + }); +} diff --git a/scrapegraph-js/examples/scrape_polling_example.js b/scrapegraph-js/examples/scrape_polling_example.js new file mode 100644 index 0000000..87820e9 --- /dev/null +++ b/scrapegraph-js/examples/scrape_polling_example.js @@ -0,0 +1,288 @@ +/** + * Example demonstrating how to use Scrape with polling for results. + * + * This example shows how to: + * 1. Make a scrape request + * 2. Poll for results until completion + * 3. Handle different status responses + * 4. Implement timeout and retry logic + * + * Requirements: + * - Node.js 16+ + * - scrapegraph-js + * - A valid API key + * + * Usage: + * node scrape_polling_example.js + */ + +import { scrape, getScrapeRequest } from '../index.js'; +import fs from 'fs/promises'; +import path from 'path'; + +// Configuration +const API_KEY = process.env.SGAI_API_KEY || 'your-api-key-here'; +const OUTPUT_DIR = 'scrape_polling_output'; +const POLLING_INTERVAL = 2000; // 2 seconds +const MAX_POLLING_TIME = 300000; // 5 minutes +const MAX_RETRIES = 3; + +/** + * Wait for a specified amount of time. + * + * @param {number} ms - Milliseconds to wait + * @returns {Promise} Promise that resolves after the specified time + */ +function wait(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Poll for scrape results until completion or timeout. 
+ * + * @param {string} apiKey - Your API key + * @param {string} requestId - The request ID to poll for + * @param {Object} options - Polling options + * @returns {Object} The final result + */ +async function pollScrapeResult(apiKey, requestId, options = {}) { + const { + interval = POLLING_INTERVAL, + maxTime = MAX_POLLING_TIME, + maxRetries = MAX_RETRIES + } = options; + + console.log(`πŸ” Polling for scrape result: ${requestId}`); + console.log(`⏱️ Polling interval: ${interval}ms`); + console.log(`⏰ Max polling time: ${maxTime / 1000}s`); + + const startTime = Date.now(); + let attempt = 0; + + while (true) { + try { + attempt++; + console.log(`\nπŸ“‘ Polling attempt ${attempt}...`); + + const result = await getScrapeRequest(apiKey, requestId); + + console.log(`πŸ“Š Status: ${result.status}`); + + if (result.status === 'completed') { + console.log('βœ… Scrape request completed successfully!'); + return result; + } else if (result.status === 'failed') { + console.error(`❌ Scrape request failed: ${result.error || 'Unknown error'}`); + throw new Error(`Scrape request failed: ${result.error || 'Unknown error'}`); + } else if (result.status === 'processing') { + console.log('⏳ Request is still processing...'); + + // Check if we've exceeded the maximum polling time + if (Date.now() - startTime > maxTime) { + throw new Error(`Polling timeout after ${maxTime / 1000}s`); + } + + // Wait before the next poll + console.log(`⏳ Waiting ${interval / 1000}s before next poll...`); + await wait(interval); + + } else { + console.log(`ℹ️ Unknown status: ${result.status}`); + + // Check if we've exceeded the maximum polling time + if (Date.now() - startTime > maxTime) { + throw new Error(`Polling timeout after ${maxTime / 1000}s`); + } + + // Wait before the next poll + await wait(interval); + } + + } catch (error) { + console.error(`❌ Polling error: ${error.message}`); + + // Check if we've exceeded the maximum polling time + if (Date.now() - startTime > maxTime) { + throw new Error(`Polling timeout after ${maxTime / 1000}s`); + } + + // Check if we've exceeded the maximum retries + if (attempt >= maxRetries) { + throw new Error(`Max retries (${maxRetries}) exceeded`); + } + + // Wait before retry + console.log(`⏳ Waiting ${interval / 1000}s before retry...`); + await wait(interval); + } + } +} + +/** + * Save HTML content to a file. + * + * @param {string} htmlContent - The HTML content to save + * @param {string} filename - The name of the file (without extension) + * @param {string} outputDir - The directory to save the file in + * @returns {string} Path to the saved file + */ +async function saveHtmlContent(htmlContent, filename, outputDir = OUTPUT_DIR) { + try { + await fs.mkdir(outputDir, { recursive: true }); + } catch (error) { + if (error.code !== 'EEXIST') { + throw error; + } + } + + const htmlFile = path.join(outputDir, `${filename}.html`); + await fs.writeFile(htmlFile, htmlContent, 'utf8'); + + console.log(`πŸ’Ύ HTML content saved to: ${htmlFile}`); + return htmlFile; +} + +/** + * Analyze HTML content and provide basic statistics. 
+ * + * @param {string} htmlContent - The HTML content to analyze + * @returns {Object} Basic statistics about the HTML content + */ +function analyzeHtmlContent(htmlContent) { + if (!htmlContent) { + return { + totalLength: 0, + lines: 0, + hasDoctype: false, + hasHtmlTag: false, + hasHeadTag: false, + hasBodyTag: false, + scriptTags: 0, + styleTags: 0, + divTags: 0, + pTags: 0, + imgTags: 0, + linkTags: 0 + }; + } + + const stats = { + totalLength: htmlContent.length, + lines: htmlContent.split('\n').length, + hasDoctype: htmlContent.trim().startsWith(' { + console.error('❌ Fatal error:', error.message); + process.exit(1); + }); +} diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js index 47ee6ce..e41f6d7 100644 --- a/scrapegraph-js/index.js +++ b/scrapegraph-js/index.js @@ -1,6 +1,7 @@ export { agenticScraper, getAgenticScraperRequest } from './src/agenticScraper.js'; export { smartScraper, getSmartScraperRequest } from './src/smartScraper.js'; export { markdownify, getMarkdownifyRequest } from './src/markdownify.js'; +export { scrape, getScrapeRequest } from './src/scrape.js'; export { searchScraper, getSearchScraperRequest } from './src/searchScraper.js'; export { getCredits } from './src/credits.js'; export { sendFeedback } from './src/feedback.js'; diff --git a/scrapegraph-js/src/scrape.js b/scrapegraph-js/src/scrape.js new file mode 100644 index 0000000..09c0a9a --- /dev/null +++ b/scrapegraph-js/src/scrape.js @@ -0,0 +1,130 @@ +import axios from 'axios'; +import handleError from './utils/handleError.js'; + +/** + * Converts a webpage into HTML format with optional JavaScript rendering. + * + * @param {string} apiKey - Your ScrapeGraph AI API key. + * @param {string} url - The URL of the webpage to be converted. + * @param {Object} options - Optional configuration options. + * @param {boolean} options.renderHeavyJs - Whether to render heavy JavaScript (defaults to false). + * @param {Object} options.headers - Optional custom headers to send with the request. + * @returns {Promise} A promise that resolves to the HTML content and metadata. + * @throws {Error} Throws an error if the HTTP request fails. 
+ * + * @example + * // Basic usage: + * const apiKey = 'your-api-key'; + * const url = 'https://example.com'; + * + * try { + * const result = await scrape(apiKey, url); + * console.log('HTML content:', result.html); + * console.log('Status:', result.status); + * } catch (error) { + * console.error('Error:', error); + * } + * + * @example + * // With JavaScript rendering: + * const result = await scrape(apiKey, url, { + * renderHeavyJs: true + * }); + * + * @example + * // With custom headers: + * const result = await scrape(apiKey, url, { + * renderHeavyJs: false, + * headers: { + * 'User-Agent': 'Custom Agent', + * 'Cookie': 'session=123' + * } + * }); + */ +export async function scrape(apiKey, url, options = {}) { + const { + renderHeavyJs = false, + headers: customHeaders = {} + } = options; + + const endpoint = 'https://api.scrapegraphai.com/v1/scrape'; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + 'Content-Type': 'application/json', + ...customHeaders + }; + + const payload = { + website_url: url, + render_heavy_js: renderHeavyJs, + }; + + // Only include headers in payload if they are provided + if (Object.keys(customHeaders).length > 0) { + payload.headers = customHeaders; + } + + try { + const response = await axios.post(endpoint, payload, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} + +/** + * Retrieves the status or result of a scrape request. + * + * @param {string} apiKey - Your ScrapeGraph AI API key. + * @param {string} requestId - The unique identifier for the scrape request. + * @returns {Promise} A promise that resolves to an object containing: + * - status: The current status of the request ('pending', 'completed', 'failed') + * - html: The HTML content when status is 'completed' + * - scrape_request_id: The request identifier + * - error: Error message if the request failed (when status is 'failed') + * - created_at: Timestamp of when the request was created + * - completed_at: Timestamp of when the request was completed (if applicable) + * @throws {Error} Throws an error if the HTTP request fails or if the API key is invalid + * + * @example + * // Example usage: + * const apiKey = 'your-api-key'; + * const requestId = 'previously-obtained-request-id'; + * + * try { + * const result = await getScrapeRequest(apiKey, requestId); + * if (result.status === 'completed') { + * console.log('HTML content:', result.html); + * console.log('Request ID:', result.scrape_request_id); + * } else if (result.status === 'pending') { + * console.log('HTML conversion is still in progress'); + * } else { + * console.log('HTML conversion failed:', result.error); + * } + * } catch (error) { + * console.error('Error fetching HTML:', error); + * } + * + * @note The HTML content includes: + * - Full HTML structure with DOCTYPE + * - Head section with meta tags, title, and styles + * - Body content with all elements + * - JavaScript code (if renderHeavyJs was enabled) + * - CSS styles and formatting + * - Images, links, and other media elements + */ +export async function getScrapeRequest(apiKey, requestId) { + const endpoint = 'https://api.scrapegraphai.com/v1/scrape/' + requestId; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + }; + + try { + const response = await axios.get(endpoint, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} diff --git a/scrapegraph-js/test/scrape_test.js b/scrapegraph-js/test/scrape_test.js new file mode 100644 index 
0000000..cc7e909 --- /dev/null +++ b/scrapegraph-js/test/scrape_test.js @@ -0,0 +1,451 @@ +import { scrape, getScrapeRequest } from '../index.js'; +import 'dotenv/config'; + +/** + * Test suite for Scrape functionality + * This file demonstrates usage and validates the Scrape parameters + */ + +// Mock API key for testing (replace with real key for actual testing) +const API_KEY = process.env.SGAI_APIKEY || 'test-api-key'; + +/** + * Test input validation for scrape + */ +function testInputValidation() { + console.log('πŸ§ͺ Testing Scrape Input Validation'); + console.log('='.repeat(50)); + + const testCases = [ + { + name: 'Valid inputs - basic', + apiKey: 'valid-key', + url: 'https://example.com', + options: {}, + expected: true, + description: 'All valid parameters with default options' + }, + { + name: 'Valid inputs - with heavy JS', + apiKey: 'valid-key', + url: 'https://example.com', + options: { renderHeavyJs: true }, + expected: true, + description: 'Valid parameters with heavy JS rendering' + }, + { + name: 'Valid inputs - with headers', + apiKey: 'valid-key', + url: 'https://example.com', + options: { + headers: { 'User-Agent': 'Test Agent' } + }, + expected: true, + description: 'Valid parameters with custom headers' + }, + { + name: 'Valid inputs - with all options', + apiKey: 'valid-key', + url: 'https://example.com', + options: { + renderHeavyJs: true, + headers: { 'User-Agent': 'Test Agent' } + }, + expected: true, + description: 'Valid parameters with all options enabled' + }, + { + name: 'Invalid URL - no protocol', + apiKey: 'valid-key', + url: 'example.com', + options: {}, + expected: false, + description: 'URL without http/https protocol' + }, + { + name: 'Invalid URL - relative path', + apiKey: 'valid-key', + url: '/path/to/page', + options: {}, + expected: false, + description: 'Relative path instead of absolute URL' + }, + { + name: 'Invalid URL - empty string', + apiKey: 'valid-key', + url: '', + options: {}, + expected: false, + description: 'Empty URL string' + }, + { + name: 'Invalid URL - null', + apiKey: 'valid-key', + url: null, + options: {}, + expected: false, + description: 'Null URL' + }, + { + name: 'Empty API key', + apiKey: '', + url: 'https://example.com', + options: {}, + expected: false, + description: 'Empty API key string' + }, + { + name: 'Invalid API key type', + apiKey: 123, + url: 'https://example.com', + options: {}, + expected: false, + description: 'API key as number instead of string' + } + ]; + + let passed = 0; + let total = testCases.length; + + testCases.forEach((testCase, index) => { + console.log(`\n${index + 1}. 
${testCase.name}`); + console.log(` Description: ${testCase.description}`); + + try { + // Validate inputs + const isValid = validateScrapeInputs( + testCase.apiKey, + testCase.url, + testCase.options + ); + + if (isValid === testCase.expected) { + console.log(` βœ… PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: ${isValid}`); + } + } catch (error) { + if (!testCase.expected) { + console.log(` βœ… PASSED (Expected error: ${error.message})`); + passed++; + } else { + console.log(` ❌ FAILED - Unexpected error: ${error.message}`); + } + } + }); + + console.log(`\nπŸ“Š Input Validation Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Validate scrape function inputs + */ +function validateScrapeInputs(apiKey, url, options) { + // Check API key + if (!apiKey || typeof apiKey !== 'string' || apiKey.trim() === '') { + throw new Error('Invalid API key'); + } + + // Check URL + if (!url || typeof url !== 'string' || url.trim() === '') { + throw new Error('Invalid URL'); + } + + // Check URL format + if (!url.startsWith('http://') && !url.startsWith('https://')) { + throw new Error('URL must start with http:// or https://'); + } + + // Check options + if (options && typeof options !== 'object') { + throw new Error('Options must be an object'); + } + + // Check renderHeavyJs option + if (options.renderHeavyJs !== undefined && typeof options.renderHeavyJs !== 'boolean') { + throw new Error('renderHeavyJs must be a boolean'); + } + + // Check headers option + if (options.headers !== undefined && typeof options.headers !== 'object') { + throw new Error('Headers must be an object'); + } + + return true; +} + +/** + * Test scrape function with mock data + */ +async function testScrapeFunction() { + console.log('\nπŸ§ͺ Testing Scrape Function (Mock)'); + console.log('='.repeat(50)); + + try { + // Mock the scrape function to avoid actual API calls during testing + const mockScrape = async (apiKey, url, options = {}) => { + // Simulate API delay + await new Promise(resolve => setTimeout(resolve, 100)); + + // Return mock response + return { + status: 'completed', + scrape_request_id: 'mock-request-id-12345', + html: 'Mock Page

</title></head><body><h1>Mock Content</h1></body></html>
', + created_at: new Date().toISOString(), + completed_at: new Date().toISOString() + }; + }; + + console.log('1. Testing basic scrape call...'); + const result1 = await mockScrape(API_KEY, 'https://example.com'); + console.log(` βœ… Status: ${result1.status}`); + console.log(` βœ… Request ID: ${result1.scrape_request_id}`); + console.log(` βœ… HTML length: ${result1.html.length} characters`); + + console.log('\n2. Testing scrape with heavy JS rendering...'); + const result2 = await mockScrape(API_KEY, 'https://example.com', { renderHeavyJs: true }); + console.log(` βœ… Status: ${result2.status}`); + console.log(` βœ… Request ID: ${result2.scrape_request_id}`); + + console.log('\n3. Testing scrape with custom headers...'); + const result3 = await mockScrape(API_KEY, 'https://example.com', { + headers: { 'User-Agent': 'Test Bot' } + }); + console.log(` βœ… Status: ${result3.status}`); + console.log(` βœ… Request ID: ${result3.scrape_request_id}`); + + console.log('\nβœ… All scrape function tests passed'); + return true; + + } catch (error) { + console.error(`❌ Scrape function test failed: ${error.message}`); + return false; + } +} + +/** + * Test getScrapeRequest function with mock data + */ +async function testGetScrapeRequestFunction() { + console.log('\nπŸ§ͺ Testing GetScrapeRequest Function (Mock)'); + console.log('='.repeat(50)); + + try { + // Mock the getScrapeRequest function + const mockGetScrapeRequest = async (apiKey, requestId) => { + // Simulate API delay + await new Promise(resolve => setTimeout(resolve, 50)); + + // Return mock response + return { + status: 'completed', + scrape_request_id: requestId, + html: 'Retrieved Page

</title></head><body><h1>Retrieved Content</h1></body></html>
', + created_at: new Date().toISOString(), + completed_at: new Date().toISOString() + }; + }; + + console.log('1. Testing getScrapeRequest with valid request ID...'); + const result1 = await mockGetScrapeRequest(API_KEY, 'test-request-123'); + console.log(` βœ… Status: ${result1.status}`); + console.log(` βœ… Request ID: ${result1.scrape_request_id}`); + console.log(` βœ… HTML length: ${result1.html.length} characters`); + + console.log('\n2. Testing getScrapeRequest with different request ID...'); + const result2 = await mockGetScrapeRequest(API_KEY, 'another-request-456'); + console.log(` βœ… Status: ${result2.status}`); + console.log(` βœ… Request ID: ${result2.scrape_request_id}`); + + console.log('\nβœ… All getScrapeRequest function tests passed'); + return true; + + } catch (error) { + console.error(`❌ GetScrapeRequest function test failed: ${error.message}`); + return false; + } +} + +/** + * Test error handling + */ +function testErrorHandling() { + console.log('\nπŸ§ͺ Testing Error Handling'); + console.log('='.repeat(50)); + + let passed = 0; + let total = 0; + + // Test 1: Invalid API key + total++; + try { + validateScrapeInputs('', 'https://example.com', {}); + console.log('1. Empty API key test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('1. Empty API key test: βœ… PASSED'); + passed++; + } + + // Test 2: Invalid URL + total++; + try { + validateScrapeInputs('valid-key', 'invalid-url', {}); + console.log('2. Invalid URL test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('2. Invalid URL test: βœ… PASSED'); + passed++; + } + + // Test 3: Invalid options + total++; + try { + validateScrapeInputs('valid-key', 'https://example.com', 'invalid-options'); + console.log('3. Invalid options test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('3. Invalid options test: βœ… PASSED'); + passed++; + } + + // Test 4: Invalid renderHeavyJs + total++; + try { + validateScrapeInputs('valid-key', 'https://example.com', { renderHeavyJs: 'invalid' }); + console.log('4. Invalid renderHeavyJs test: ❌ FAILED (should have thrown error)'); + } catch (error) { + console.log('4. 
Invalid renderHeavyJs test: βœ… PASSED'); + passed++; + } + + console.log(`\nπŸ“Š Error Handling Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Test URL validation + */ +function testUrlValidation() { + console.log('\nπŸ§ͺ Testing URL Validation'); + console.log('='.repeat(50)); + + const testUrls = [ + { url: 'https://example.com', expected: true, description: 'HTTPS URL' }, + { url: 'http://example.com', expected: true, description: 'HTTP URL' }, + { url: 'https://sub.example.com', expected: true, description: 'Subdomain HTTPS' }, + { url: 'https://example.com/path', expected: true, description: 'HTTPS with path' }, + { url: 'https://example.com?param=value', expected: true, description: 'HTTPS with query params' }, + { url: 'https://example.com#fragment', expected: true, description: 'HTTPS with fragment' }, + { url: 'example.com', expected: false, description: 'No protocol' }, + { url: '/path/to/page', expected: false, description: 'Relative path' }, + { url: 'ftp://example.com', expected: false, description: 'FTP protocol' }, + { url: '', expected: false, description: 'Empty string' }, + { url: null, expected: false, description: 'Null value' }, + { url: undefined, expected: false, description: 'Undefined value' } + ]; + + let passed = 0; + let total = testUrls.length; + + testUrls.forEach((testCase, index) => { + console.log(`${index + 1}. ${testCase.description}: ${testCase.url}`); + + try { + if (testCase.url) { + const isValid = testCase.url.startsWith('http://') || testCase.url.startsWith('https://'); + if (isValid === testCase.expected) { + console.log(` βœ… PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: ${isValid}`); + } + } else { + if (!testCase.expected) { + console.log(` βœ… PASSED`); + passed++; + } else { + console.log(` ❌ FAILED - Expected: ${testCase.expected}, Got: false`); + } + } + } catch (error) { + if (!testCase.expected) { + console.log(` βœ… PASSED (Expected error)`); + passed++; + } else { + console.log(` ❌ FAILED - Unexpected error: ${error.message}`); + } + } + }); + + console.log(`\nπŸ“Š URL Validation Results: ${passed}/${total} tests passed`); + return passed === total; +} + +/** + * Run all tests + */ +async function runAllTests() { + console.log('πŸš€ Starting Scrape Test Suite'); + console.log('='.repeat(60)); + console.log(`πŸ”‘ API Key: ${API_KEY.substring(0, 8)}...`); + console.log(`⏰ Timestamp: ${new Date().toISOString()}\n`); + + const tests = [ + { name: 'Input Validation', fn: testInputValidation }, + { name: 'Scrape Function', fn: testScrapeFunction }, + { name: 'GetScrapeRequest Function', fn: testGetScrapeRequestFunction }, + { name: 'Error Handling', fn: testErrorHandling }, + { name: 'URL Validation', fn: testUrlValidation } + ]; + + let passed = 0; + let total = tests.length; + + for (const test of tests) { + try { + const result = await test.fn(); + if (result) { + passed++; + } + } catch (error) { + console.error(`❌ Test '${test.name}' failed with error: ${error.message}`); + } + console.log('\n' + '-'.repeat(60)); + } + + console.log('\n🎯 FINAL TEST RESULTS'); + console.log('='.repeat(30)); + console.log(`βœ… Passed: ${passed}`); + console.log(`❌ Failed: ${total - passed}`); + console.log(`πŸ“Š Success Rate: ${((passed / total) * 100).toFixed(1)}%`); + + if (passed === total) { + console.log('\nπŸŽ‰ All tests passed! Scrape functionality is working correctly.'); + return 0; + } else { + console.log('\n⚠️ Some tests failed. 
Please review the output above.'); + return 1; + } +} + +// Run tests if this file is executed directly +if (import.meta.url === `file://${process.argv[1]}`) { + runAllTests() + .then(exitCode => { + process.exit(exitCode); + }) + .catch(error => { + console.error('πŸ’₯ Fatal error during test execution:', error.message); + process.exit(1); + }); +} + +export { + testInputValidation, + testScrapeFunction, + testGetScrapeRequestFunction, + testErrorHandling, + testUrlValidation, + runAllTests +}; diff --git a/scrapegraph-py/examples/async/async_scrape_example.py b/scrapegraph-py/examples/async/async_scrape_example.py new file mode 100644 index 0000000..0a6c227 --- /dev/null +++ b/scrapegraph-py/examples/async/async_scrape_example.py @@ -0,0 +1,278 @@ +""" +Async example demonstrating how to use the Scrape API with the scrapegraph-py SDK. + +This example shows how to: +1. Set up the async client for Scrape +2. Make async API calls to get HTML content from websites +3. Handle responses and save HTML content +4. Demonstrate both regular and heavy JS rendering modes +5. Process multiple websites concurrently + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- aiofiles (for async file operations) +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +import json +import os +import time +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +async def scrape_website( + client: AsyncClient, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, +) -> dict: + """ + Get HTML content from a website using the async Scrape API. + + Args: + client: The async scrapegraph-py client instance + website_url: The URL of the website to get HTML from + render_heavy_js: Whether to render heavy JavaScript (defaults to False) + headers: Optional headers to send with the request + + Returns: + dict: A dictionary containing the HTML content and metadata + + Raises: + Exception: If the API request fails + """ + js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" + print(f"Getting HTML content from: {website_url}") + print(f"Mode: {js_mode}") + + start_time = time.time() + + try: + result = await client.scrape( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + execution_time = time.time() - start_time + print(f"Execution time: {execution_time:.2f} seconds") + return result + except Exception as e: + print(f"Error: {str(e)}") + raise + + +async def save_html_content( + html_content: str, filename: str, output_dir: str = "async_scrape_output" +): + """ + Save HTML content to a file asynchronously. + + Args: + html_content: The HTML content to save + filename: The name of the file (without extension) + output_dir: The directory to save the file in + """ + # Create output directory if it doesn't exist + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Save HTML file + html_file = output_path / f"{filename}.html" + + # Use asyncio to run file I/O in a thread pool + await asyncio.to_thread( + lambda: html_file.write_text(html_content, encoding="utf-8") + ) + + print(f"HTML content saved to: {html_file}") + return html_file + + +def analyze_html_content(html_content: str) -> dict: + """ + Analyze HTML content and provide basic statistics. 
+ + Args: + html_content: The HTML content to analyze + + Returns: + dict: Basic statistics about the HTML content + """ + stats = { + "total_length": len(html_content), + "lines": len(html_content.splitlines()), + "has_doctype": html_content.strip().startswith(" dict: + """ + Process a single website and return results. + + Args: + client: The async client instance + website: Website configuration dictionary + + Returns: + dict: Processing results + """ + print(f"\nProcessing: {website['description']}") + print("-" * 40) + + try: + # Get HTML content + result = await scrape_website( + client=client, + website_url=website["url"], + render_heavy_js=website["render_heavy_js"], + ) + + # Display response metadata + print(f"Request ID: {result.get('scrape_request_id', 'N/A')}") + print(f"Status: {result.get('status', 'N/A')}") + print(f"Error: {result.get('error', 'None')}") + + # Analyze HTML content + html_content = result.get("html", "") + if html_content: + stats = analyze_html_content(html_content) + print(f"\nHTML Content Analysis:") + print(f" Total length: {stats['total_length']:,} characters") + print(f" Lines: {stats['lines']:,}") + print(f" Has DOCTYPE: {stats['has_doctype']}") + print(f" Has HTML tag: {stats['has_html_tag']}") + print(f" Has Head tag: {stats['has_head_tag']}") + print(f" Has Body tag: {stats['has_body_tag']}") + print(f" Script tags: {stats['script_tags']}") + print(f" Style tags: {stats['style_tags']}") + print(f" Div tags: {stats['div_tags']}") + print(f" Paragraph tags: {stats['p_tags']}") + print(f" Image tags: {stats['img_tags']}") + print(f" Link tags: {stats['link_tags']}") + + # Save HTML content + filename = f"{website['name']}_{'js' if website['render_heavy_js'] else 'nojs'}" + saved_file = await save_html_content(html_content, filename) + + # Show first 500 characters as preview + preview = html_content[:500].replace("\n", " ").strip() + print(f"\nHTML Preview (first 500 chars):") + print(f" {preview}...") + + return { + "success": True, + "website": website["url"], + "saved_file": str(saved_file), + "stats": stats, + "preview": preview + } + else: + print("No HTML content received") + return { + "success": False, + "website": website["url"], + "error": "No HTML content received" + } + + except Exception as e: + print(f"Error processing {website['url']}: {str(e)}") + return { + "success": False, + "website": website["url"], + "error": str(e) + } + + +async def main(): + """ + Main async function demonstrating Scrape API usage. 
+ """ + # Example websites to test + test_websites = [ + { + "url": "https://example.com", + "name": "example", + "render_heavy_js": False, + "description": "Simple static website", + }, + { + "url": "https://httpbin.org/html", + "name": "httpbin_html", + "render_heavy_js": False, + "description": "HTTP testing service", + }, + ] + + print("Async Scrape API Example with scrapegraph-py SDK") + print("=" * 60) + + # Initialize the async client + try: + async with AsyncClient.from_env() as client: + print("βœ… Async client initialized successfully") + + # Process websites concurrently + print(f"\nπŸš€ Processing {len(test_websites)} websites concurrently...") + + tasks = [ + process_website(client, website) + for website in test_websites + ] + + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Display summary + print(f"\nπŸ“Š Processing Summary") + print("=" * 40) + + successful = 0 + for result in results: + if isinstance(result, Exception): + print(f"❌ Exception occurred: {result}") + elif result["success"]: + successful += 1 + print(f"βœ… {result['website']}: {result['saved_file']}") + else: + print(f"❌ {result['website']}: {result.get('error', 'Unknown error')}") + + print(f"\n🎯 Results: {successful}/{len(test_websites)} websites processed successfully") + + except Exception as e: + print(f"❌ Failed to initialize async client: {str(e)}") + print("Make sure you have SGAI_API_KEY in your .env file") + return + + print("\nβœ… Async processing completed") + + +if __name__ == "__main__": + # Run the async main function + asyncio.run(main()) diff --git a/scrapegraph-py/examples/async/steps/async_step_by_step_scrape_example.py b/scrapegraph-py/examples/async/steps/async_step_by_step_scrape_example.py new file mode 100644 index 0000000..41894b9 --- /dev/null +++ b/scrapegraph-py/examples/async/steps/async_step_by_step_scrape_example.py @@ -0,0 +1,184 @@ +""" +Async step-by-step example demonstrating how to use the Scrape API with the scrapegraph-py async SDK. + +This example shows the basic async workflow: +1. Initialize the async client +2. Make a scrape request asynchronously +3. Handle the response +4. Save the HTML content +5. 
Basic analysis + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- aiohttp +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +import os +from pathlib import Path +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +async def step_1_initialize_async_client(): + """Step 1: Initialize the scrapegraph-py async client.""" + print("πŸ”‘ Step 1: Initializing async client...") + + try: + # Initialize async client using environment variable + client = AsyncClient.from_env() + print("βœ… Async client initialized successfully") + return client + except Exception as e: + print(f"❌ Failed to initialize async client: {str(e)}") + print("Make sure you have SGAI_API_KEY in your .env file") + raise + + +async def step_2_make_async_scrape_request(client, url, render_js=False): + """Step 2: Make a scrape request asynchronously.""" + print(f"\n🌐 Step 2: Making async scrape request to {url}") + print(f"πŸ”§ Render heavy JS: {render_js}") + + try: + # Make the scrape request asynchronously + result = await client.scrape( + website_url=url, + render_heavy_js=render_js + ) + print("βœ… Async scrape request completed successfully") + return result + except Exception as e: + print(f"❌ Async scrape request failed: {str(e)}") + raise + + +def step_3_handle_response(result): + """Step 3: Handle and analyze the response.""" + print(f"\nπŸ“Š Step 3: Analyzing response...") + + # Check if we got HTML content + html_content = result.get("html", "") + if not html_content: + print("❌ No HTML content received") + return None + + # Basic response analysis + print(f"βœ… Received HTML content") + print(f"πŸ“ Content length: {len(html_content):,} characters") + print(f"πŸ“„ Lines: {len(html_content.splitlines()):,}") + + # Check for common HTML elements + has_doctype = html_content.strip().startswith(" 0: + print(f" {element}: {count}") + + # Check for JavaScript and CSS + has_js = elements["script"] > 0 + has_css = elements["style"] > 0 + + print(f"\n🎨 Content types:") + print(f" JavaScript: {'Yes' if has_js else 'No'}") + print(f" CSS: {'Yes' if has_css else 'No'}") + + return elements + + +async def main(): + """Main function demonstrating async step-by-step scrape usage.""" + print("πŸš€ Async Step-by-Step Scrape API Example") + print("=" * 55) + + # Test URL + test_url = "https://example.com" + + try: + # Step 1: Initialize async client + async with AsyncClient.from_env() as client: + print("βœ… Async client initialized successfully") + + # Step 2: Make async scrape request + result = await step_2_make_async_scrape_request(client, test_url, render_js=False) + + # Step 3: Handle response + html_content = step_3_handle_response(result) + if not html_content: + print("❌ Cannot proceed without HTML content") + return + + # Step 4: Save content + filename = "async_example_website" + saved_file = step_4_save_html_content(html_content, filename) + + # Step 5: Basic analysis + elements = step_5_basic_analysis(html_content) + + # Summary + print(f"\n🎯 Summary:") + print(f"βœ… Successfully processed {test_url} asynchronously") + print(f"πŸ’Ύ HTML saved to: {saved_file}") + print(f"πŸ“Š Analyzed {len(html_content):,} characters of HTML content") + + print("βœ… Async client closed successfully") + + except Exception as e: + print(f"\nπŸ’₯ Error occurred: {str(e)}") + print("Check your API key and internet connection") + + +if __name__ == "__main__": + 
asyncio.run(main()) diff --git a/scrapegraph-py/examples/sync/htmlfy_output/example_nojs.html b/scrapegraph-py/examples/sync/htmlfy_output/example_nojs.html new file mode 100644 index 0000000..5a9b52f --- /dev/null +++ b/scrapegraph-py/examples/sync/htmlfy_output/example_nojs.html @@ -0,0 +1,46 @@
+<!doctype html>
+<html>
+<head>
+    <title>Example Domain</title>
+</head>
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is for use in illustrative examples in documents. You may use this
+    domain in literature without prior coordination or asking for permission.</p>
+    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>
diff --git a/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html b/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html new file mode 100644 index 0000000..05a40c6 --- /dev/null +++ b/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html @@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+      <h1>Herman Melville - Moby-Dick</h1>
+      <div>
+        <p>
+ Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture. Often he would be surrounded by an eager circle, all waiting to be served; holding boat-spades, pike-heads, harpoons, and lances, and jealously watching his every sooty movement, as he toiled. Nevertheless, this old man's was a patient hammer wielded by a patient arm. No murmur, no impatience, no petulance did come from him. Silent, slow, and solemn; bowing over still further his chronically broken back, he toiled away, as if toil were life itself, and the heavy beating of his hammer the heavy beating of his heart. And so it was.β€”Most miserable! A peculiar walk in this old man, a certain slight but painful appearing yawing in his gait, had at an early period of the voyage excited the curiosity of the mariners. And to the importunity of their persisted questionings he had finally given in; and so it came to pass that every one now knew the shameful story of his wretched fate. Belated, and not innocently, one bitter winter's midnight, on the road running between two country towns, the blacksmith half-stupidly felt the deadly numbness stealing over him, and sought refuge in a leaning, dilapidated barn. The issue was, the loss of the extremities of both feet. Out of this revelation, part by part, at last came out the four acts of the gladness, and the one long, and as yet uncatastrophied fifth act of the grief of his life's drama. He was an old man, who, at the age of nearly sixty, had postponedly encountered that thing in sorrow's technicals called ruin. He had been an artisan of famed excellence, and with plenty to do; owned a house and garden; embraced a youthful, daughter-like, loving wife, and three blithe, ruddy children; every Sunday went to a cheerful-looking church, planted in a grove. But one night, under cover of darkness, and further concealed in a most cunning disguisement, a desperate burglar slid into his happy home, and robbed them all of everything. And darker yet to tell, the blacksmith himself did ignorantly conduct this burglar into his family's heart. It was the Bottle Conjuror! Upon the opening of that fatal cork, forth flew the fiend, and shrivelled up his home. Now, for prudent, most wise, and economic reasons, the blacksmith's shop was in the basement of his dwelling, but with a separate entrance to it; so that always had the young and loving healthy wife listened with no unhappy nervousness, but with vigorous pleasure, to the stout ringing of her young-armed old husband's hammer; whose reverberations, muffled by passing through the floors and walls, came up to her, not unsweetly, in her nursery; and so, to stout Labor's iron lullaby, the blacksmith's infants were rocked to slumber. Oh, woe on woe! Oh, Death, why canst thou not sometimes be timely? 
+Hadst thou taken this old blacksmith to thyself ere his full ruin came upon him, then had the young widow had a delicious grief, and her orphans a truly venerable, legendary sire to dream of in their after years; and all of them a care-killing competency.
+        </p>
+      </div>
+  </body>
+</html>
\ No newline at end of file
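The fixture above captures the static Moby-Dick page served by `https://httpbin.org/html`. For reference, a minimal sketch of how a fixture like this could be regenerated with the new synchronous client; the output path and overall flow are illustrative, not part of this diff:

```python
import os
from pathlib import Path

from scrapegraph_py import Client

# Assumes SGAI_API_KEY is available in the environment
client = Client(api_key=os.getenv("SGAI_API_KEY"))

# Fetch the page without heavy JavaScript rendering, matching the "_nojs" fixture name
result = client.scrape(website_url="https://httpbin.org/html", render_heavy_js=False)

output = Path("examples/sync/htmlfy_output/httpbin_html_nojs.html")
output.parent.mkdir(parents=True, exist_ok=True)
output.write_text(result["html"], encoding="utf-8")

client.close()
```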
diff --git a/scrapegraph-py/examples/sync/scrape_example.py b/scrapegraph-py/examples/sync/scrape_example.py
new file mode 100644
index 0000000..552d79f
--- /dev/null
+++ b/scrapegraph-py/examples/sync/scrape_example.py
@@ -0,0 +1,217 @@
+"""
+Example demonstrating how to use the Scrape API with the scrapegraph-py SDK.
+
+This example shows how to:
+1. Set up the client for Scrape
+2. Make the API call to get HTML content from a website
+3. Handle the response and save the HTML content
+4. Demonstrate both regular and heavy JS rendering modes
+5. Display the results and metadata
+
+Requirements:
+- Python 3.7+
+- scrapegraph-py
+- python-dotenv
+- A .env file with your SGAI_API_KEY
+
+Example .env file:
+SGAI_API_KEY=your_api_key_here
+"""
+
+import json
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+from dotenv import load_dotenv
+
+from scrapegraph_py import Client
+
+# Load environment variables from .env file
+load_dotenv()
+
+
+def scrape_website(
+    client: Client,
+    website_url: str,
+    render_heavy_js: bool = False,
+    headers: Optional[dict[str, str]] = None,
+) -> dict:
+    """
+    Get HTML content from a website using the Scrape API.
+
+    Args:
+        client: The scrapegraph-py client instance
+        website_url: The URL of the website to get HTML from
+        render_heavy_js: Whether to render heavy JavaScript (defaults to False)
+        headers: Optional headers to send with the request
+
+    Returns:
+        dict: A dictionary containing the HTML content and metadata
+
+    Raises:
+        Exception: If the API request fails
+    """
+    js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering"
+    print(f"Getting HTML content from: {website_url}")
+    print(f"Mode: {js_mode}")
+
+    start_time = time.time()
+
+    try:
+        result = client.scrape(
+            website_url=website_url,
+            render_heavy_js=render_heavy_js,
+            headers=headers,
+        )
+        execution_time = time.time() - start_time
+        print(f"Execution time: {execution_time:.2f} seconds")
+        return result
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        raise
+
+
+def save_html_content(
+    html_content: str, filename: str, output_dir: str = "scrape_output"
+):
+    """
+    Save HTML content to a file.
+
+    Args:
+        html_content: The HTML content to save
+        filename: The name of the file (without extension)
+        output_dir: The directory to save the file in
+    """
+    # Create output directory if it doesn't exist
+    output_path = Path(output_dir)
+    output_path.mkdir(exist_ok=True)
+
+    # Save HTML file
+    html_file = output_path / f"{filename}.html"
+    with open(html_file, "w", encoding="utf-8") as f:
+        f.write(html_content)
+
+    print(f"HTML content saved to: {html_file}")
+    return html_file
+
+
+def analyze_html_content(html_content: str) -> dict:
+    """
+    Analyze HTML content and provide basic statistics.
+
+    Args:
+        html_content: The HTML content to analyze
+
+    Returns:
+        dict: Basic statistics about the HTML content
+    """
+    stats = {
+        "total_length": len(html_content),
+        "lines": len(html_content.splitlines()),
+        "has_doctype": html_content.strip().startswith("<!DOCTYPE"),
+        if count > 0:
+            print(f"  {element}: {count}")
+
+    # Check for JavaScript and CSS
+    has_js = elements["script"] > 0
+    has_css = elements["style"] > 0
+
+    print(f"\n🎨 Content types:")
+    print(f"  JavaScript: {'Yes' if has_js else 'No'}")
+    print(f"  CSS: {'Yes' if has_css else 'No'}")
+
+    return elements
+
+
+def main():
+    """Main function demonstrating step-by-step scrape usage."""
+    print("πŸš€ Step-by-Step Scrape API Example")
+    print("=" * 50)
+
+    # Test URL
+    test_url = "https://example.com"
+
+    try:
+        # Step 1: Initialize client
+        client = step_1_initialize_client()
+
+        # Step 2: Make scrape request
+        result = step_2_make_scrape_request(client, test_url, render_js=False)
+
+        # Step 3: Handle response
+        html_content = step_3_handle_response(result)
+        if not html_content:
+            print("❌ Cannot proceed without HTML content")
+            return
+
+        # Step 4: Save content
+        filename = "example_website"
+        saved_file = step_4_save_html_content(html_content, filename)
+
+        # Step 5: Basic analysis
+        elements = step_5_basic_analysis(html_content)
+
+        # Summary
+        print(f"\n🎯 Summary:")
+        print(f"βœ… Successfully processed {test_url}")
+        print(f"πŸ’Ύ HTML saved to: {saved_file}")
+        print(f"πŸ“Š Analyzed {len(html_content):,} characters of HTML content")
+
+        # Close client
+        client.close()
+        print("πŸ”’ Client closed successfully")
+
+    except Exception as e:
+        print(f"\nπŸ’₯ Error occurred: {str(e)}")
+        print("Check your API key and internet connection")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index e98e6a7..b965838 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -14,6 +14,7 @@
 )
 from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest
 from scrapegraph_py.models.feedback import FeedbackRequest
+from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest
 from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from scrapegraph_py.models.searchscraper import (
     GetSearchScraperRequest,
@@ -175,6 +176,50 @@ async def get_markdownify(self, request_id: str):
         logger.info(f"✨ Successfully retrieved result for request {request_id}")
         return result
 
+    async def scrape(
+        self,
+        website_url: str,
+        render_heavy_js: bool = False,
+        headers: Optional[dict[str, str]] = None,
+    ):
+        """Send a scrape request to get HTML content from a website
+
+        Args:
+            website_url: The URL of the website to get HTML from
+            render_heavy_js: Whether to render heavy JavaScript (defaults to False)
+            headers: Optional headers to send with the request
+        """
+        logger.info(f"πŸ” Starting scrape request for {website_url}")
+        logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}")
+        if headers:
+            logger.debug("πŸ”§ Using custom headers")
+
+        request = ScrapeRequest(
+            website_url=website_url,
+            render_heavy_js=render_heavy_js,
+            headers=headers,
+        )
+        logger.debug("βœ… Request validation passed")
+
+        result = await self._make_request(
+            "POST", f"{API_BASE_URL}/scrape", json=request.model_dump()
+        )
+        logger.info("✨ Scrape request completed successfully")
+        return result
+
+    async def get_scrape(self, request_id: str):
+        """Get the result of a previous scrape request"""
+        logger.info(f"πŸ” Fetching scrape result for request {request_id}")
+
+        # Validate input using Pydantic model
+        GetScrapeRequest(request_id=request_id)
+        logger.debug("βœ… Request ID validation passed")
+
+        result = await self._make_request(
+            "GET", f"{API_BASE_URL}/scrape/{request_id}")
+        logger.info(f"✨ Successfully retrieved result for request {request_id}")
+        return result
+
     async def smartscraper(
         self,
         user_prompt: str,
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index f78620d..6e1d37a 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -15,6 +15,7 @@
 )
 from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest
 from scrapegraph_py.models.feedback import FeedbackRequest
+from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest
 from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from scrapegraph_py.models.searchscraper import (
     GetSearchScraperRequest,
@@ -182,6 +183,49 @@ def get_markdownify(self, request_id: str):
         logger.info(f"✨ Successfully retrieved result for request {request_id}")
         return result
 
+    def scrape(
+        self,
+        website_url: str,
+        render_heavy_js: bool = False,
+        headers: Optional[dict[str, str]] = None,
+    ):
+        """Send a scrape request to get HTML content from a website
+
+        Args:
+            website_url: The URL of the website to get HTML from
+            render_heavy_js: Whether to render heavy JavaScript (defaults to False)
+            headers: Optional headers to send with the request
+        """
+        logger.info(f"πŸ” Starting scrape request for {website_url}")
+        logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}")
+        if headers:
+            logger.debug("πŸ”§ Using custom headers")
+
+        request = ScrapeRequest(
+            website_url=website_url,
+            render_heavy_js=render_heavy_js,
+            headers=headers,
+        )
+        logger.debug("βœ… Request validation passed")
+
+        result = self._make_request(
+            "POST", f"{API_BASE_URL}/scrape", json=request.model_dump()
+        )
+        logger.info("✨ Scrape request completed successfully")
+        return result
+
+    def get_scrape(self, request_id: str):
+        """Get the result of a previous scrape request"""
+        logger.info(f"πŸ” Fetching scrape result for request {request_id}")
+
+        # Validate input using Pydantic model
+        GetScrapeRequest(request_id=request_id)
+        logger.debug("βœ… Request ID validation passed")
+
+        result = self._make_request("GET", f"{API_BASE_URL}/scrape/{request_id}")
+        logger.info(f"✨ Successfully retrieved result for request {request_id}")
+        return result
+
     def smartscraper(
         self,
         user_prompt: str,
diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py
index cbde5de..e4b68a8 100644
--- a/scrapegraph-py/scrapegraph_py/models/__init__.py
+++ b/scrapegraph-py/scrapegraph_py/models/__init__.py
@@ -1,6 +1,7 @@
 from .agenticscraper import AgenticScraperRequest, GetAgenticScraperRequest
 from .crawl import CrawlRequest, GetCrawlRequest
 from .feedback import FeedbackRequest
+from .scrape import GetScrapeRequest, ScrapeRequest
 from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest
 from .searchscraper import GetSearchScraperRequest, SearchScraperRequest
 from .smartscraper import GetSmartScraperRequest, SmartScraperRequest
@@ -11,6 +12,8 @@
     "CrawlRequest",
     "GetCrawlRequest",
     "FeedbackRequest",
+    "GetScrapeRequest",
+    "ScrapeRequest",
     "GetMarkdownifyRequest",
     "MarkdownifyRequest",
     "GetSearchScraperRequest",
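For quick reference, a minimal usage sketch of the `scrape`/`get_scrape` methods added to both clients above. The URL, header values, and polling flow are illustrative; the constructor and response keys mirror the tests later in this diff:

```python
import asyncio

from scrapegraph_py import AsyncClient, Client

# Synchronous client
client = Client(api_key="your-api-key")
result = client.scrape(website_url="https://example.com", render_heavy_js=False)
if result["status"] != "completed":
    # Fetch the result later using the request id
    result = client.get_scrape(result["scrape_request_id"])
print(len(result["html"]))
client.close()


# Asynchronous client
async def fetch_html() -> str:
    async with AsyncClient(api_key="your-api-key") as async_client:
        response = await async_client.scrape(
            website_url="https://example.com",
            headers={"User-Agent": "Mozilla/5.0"},
        )
        return response["html"]

html = asyncio.run(fetch_html())
```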
diff --git a/scrapegraph-py/scrapegraph_py/models/scrape.py b/scrapegraph-py/scrapegraph_py/models/scrape.py
new file mode 100644
index 0000000..cceffb1
--- /dev/null
+++ b/scrapegraph-py/scrapegraph_py/models/scrape.py
@@ -0,0 +1,55 @@
+# Models for scrape endpoint
+
+from typing import Optional
+from uuid import UUID
+
+from pydantic import BaseModel, Field, model_validator
+
+
+class ScrapeRequest(BaseModel):
+    website_url: str = Field(..., example="https://scrapegraphai.com/")
+    render_heavy_js: bool = Field(
+        False,
+        description="Whether to render heavy JavaScript (defaults to False)",
+    )
+    headers: Optional[dict[str, str]] = Field(
+        None,
+        example={
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36",
+            "Cookie": "cookie1=value1; cookie2=value2",
+        },
+        description="Optional headers to send with the request, including cookies "
+        "and user agent",
+    )
+
+    @model_validator(mode="after")
+    def validate_url(self) -> "ScrapeRequest":
+        if self.website_url is None or not self.website_url.strip():
+            raise ValueError("Website URL cannot be empty")
+        if not (
+            self.website_url.startswith("http://")
+            or self.website_url.startswith("https://")
+        ):
+            raise ValueError("Invalid URL")
+        return self
+
+    def model_dump(self, *args, **kwargs) -> dict:
+        # Set exclude_none=True to exclude None values from serialization
+        kwargs.setdefault("exclude_none", True)
+        return super().model_dump(*args, **kwargs)
+
+
+class GetScrapeRequest(BaseModel):
+    """Request model for get_scrape endpoint"""
+
+    request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000")
+
+    @model_validator(mode="after")
+    def validate_request_id(self) -> "GetScrapeRequest":
+        try:
+            # Validate the request_id is a valid UUID
+            UUID(self.request_id)
+        except ValueError:
+            raise ValueError("request_id must be a valid UUID")
+        return self
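A short sketch of how these request models behave, based on the validators and the `model_dump` override above; the values are illustrative:

```python
from scrapegraph_py.models.scrape import GetScrapeRequest, ScrapeRequest

# None-valued fields are dropped on serialization, so "headers" is omitted here.
payload = ScrapeRequest(website_url="https://example.com").model_dump()
assert payload == {"website_url": "https://example.com", "render_heavy_js": False}

# Only http:// and https:// URLs pass the model validator
# (pydantic v2 ValidationError subclasses ValueError).
try:
    ScrapeRequest(website_url="ftp://example.com")
except ValueError as exc:
    print(f"Rejected URL: {exc}")

# get_scrape request IDs must be valid UUIDs.
try:
    GetScrapeRequest(request_id="not-a-uuid")
except ValueError as exc:
    print(f"Rejected request_id: {exc}")
```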
+""" + +import os +import sys +from pathlib import Path + +# Add the src directory to the path +sys.path.insert(0, str(Path(__file__).parent / "scrapegraph_py")) + +from models.scrape import ScrapeRequest, GetScrapeRequest + + +def test_scrape_models(): + """Test Scrape model validation""" + print("πŸ§ͺ Testing Scrape models...") + + # Test valid requests + try: + request = ScrapeRequest( + website_url="https://example.com", + render_heavy_js=False + ) + print("βœ… Basic Scrape request validation passed") + + request_with_headers = ScrapeRequest( + website_url="https://example.com", + render_heavy_js=True, + headers={"User-Agent": "Test Agent"} + ) + print("βœ… Scrape request with headers validation passed") + + except Exception as e: + print(f"❌ Scrape request validation failed: {e}") + return False + + # Test invalid requests + try: + ScrapeRequest(website_url="") + print("❌ Empty URL should have failed validation") + return False + except ValueError: + print("βœ… Empty URL validation correctly failed") + + try: + ScrapeRequest(website_url="invalid-url") + print("❌ Invalid URL should have failed validation") + return False + except ValueError: + print("βœ… Invalid URL validation correctly failed") + + # Test GetScrapeRequest + try: + get_request = GetScrapeRequest( + request_id="123e4567-e89b-12d3-a456-426614174000" + ) + print("βœ… Get Scrape request validation passed") + except Exception as e: + print(f"❌ Get Scrape request validation failed: {e}") + return False + + try: + GetScrapeRequest(request_id="invalid-uuid") + print("❌ Invalid UUID should have failed validation") + return False + except ValueError: + print("βœ… Invalid UUID validation correctly failed") + + print("βœ… All Scrape model tests passed!") + return True + + +def test_scrape_model_serialization(): + """Test Scrape model serialization""" + print("\nπŸ§ͺ Testing Scrape model serialization...") + + try: + # Test basic serialization + request = ScrapeRequest( + website_url="https://example.com", + render_heavy_js=False + ) + data = request.model_dump() + + assert "website_url" in data + assert "render_heavy_js" in data + assert "headers" not in data # Should be excluded as None + print("βœ… Basic serialization test passed") + + # Test serialization with headers + request_with_headers = ScrapeRequest( + website_url="https://example.com", + render_heavy_js=True, + headers={"User-Agent": "Test Agent"} + ) + data_with_headers = request_with_headers.model_dump() + + assert data_with_headers["headers"] == {"User-Agent": "Test Agent"} + print("βœ… Serialization with headers test passed") + + print("βœ… All serialization tests passed!") + return True + + except Exception as e: + print(f"❌ Serialization test failed: {e}") + return False + + +def main(): + """Run all Scrape tests""" + print("πŸš€ Scrape Integration Tests") + print("=" * 40) + + tests = [ + test_scrape_models, + test_scrape_model_serialization, + ] + + passed = 0 + total = len(tests) + + for test in tests: + if test(): + passed += 1 + print() + + print("πŸ“Š Test Results") + print("=" * 20) + print(f"Passed: {passed}/{total}") + + if passed == total: + print("πŸŽ‰ All tests passed!") + return 0 + else: + print("❌ Some tests failed!") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index c663814..f620289 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -549,3 +549,253 @@ async def 
@@ -549,3 +549,253 @@ async def test_crawl_markdown_mode_validation(mock_api_key):
         "Data schema should not be provided when extraction_mode=False" in str(e)
     )
+
+
+# ============================================================================
+# ASYNC SCRAPE TESTS
+# ============================================================================
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_basic(mock_api_key):
+    """Test basic async scrape request"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "completed",
+                "html": "<html><body><h1>Example Page</h1><p>This is HTML content.</p></body></html>",
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(website_url="https://example.com")
+            assert response["status"] == "completed"
+            assert "html" in response
+            assert "<h1>Example Page</h1>" in response["html"]
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_with_heavy_js(mock_api_key):
+    """Test async scrape request with heavy JavaScript rendering"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "completed",
+                "html": "<div>JavaScript rendered content</div>",
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(
+                website_url="https://example.com",
+                render_heavy_js=True
+            )
+            assert response["status"] == "completed"
+            assert "html" in response
+            assert "JavaScript rendered content" in response["html"]
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_with_headers(mock_api_key):
+    """Test async scrape request with custom headers"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "completed",
+                "html": "<html><body>Content with custom headers</body></html>",
+            },
+        )
+
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+            "Cookie": "session=123"
+        }
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(
+                website_url="https://example.com",
+                headers=headers
+            )
+            assert response["status"] == "completed"
+            assert "html" in response
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_with_all_options(mock_api_key):
+    """Test async scrape request with all options enabled"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "completed",
+                "html": "<div>Full featured content</div>",
+            },
+        )
+
+        headers = {
+            "User-Agent": "Custom Agent",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        }
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(
+                website_url="https://example.com",
+                render_heavy_js=True,
+                headers=headers
+            )
+            assert response["status"] == "completed"
+            assert "html" in response
+
+
+@pytest.mark.asyncio
+async def test_async_get_scrape(mock_api_key, mock_uuid):
+    """Test async get scrape result"""
+    with aioresponses() as mocked:
+        mocked.get(
+            f"https://api.scrapegraphai.com/v1/scrape/{mock_uuid}",
+            payload={
+                "scrape_request_id": mock_uuid,
+                "status": "completed",
+                "html": "<html><body>Retrieved HTML content</body></html>",
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.get_scrape(mock_uuid)
+            assert response["status"] == "completed"
+            assert response["scrape_request_id"] == mock_uuid
+            assert "html" in response
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_error_response(mock_api_key):
+    """Test async scrape error response handling"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "error": "Website not accessible",
+                "status": "error"
+            },
+            status=400
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            with pytest.raises(Exception):
+                await client.scrape(website_url="https://inaccessible-site.com")
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_processing_status(mock_api_key):
+    """Test async scrape processing status response"""
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "processing",
+                "message": "Scrape job started"
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(website_url="https://example.com")
+            assert response["status"] == "processing"
+            assert "scrape_request_id" in response
+
+
+@pytest.mark.asyncio
+async def test_async_scrape_complex_html_response(mock_api_key):
+    """Test async scrape with complex HTML response"""
+    complex_html = """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <title>Complex Page</title>
+    </head>
+    <body>
+        <header>
+            <nav>Home</nav>
+        </header>
+        <main>
+            <h1>Welcome</h1>
+            <p>This is a complex HTML page with multiple elements.</p>
+            <img src="image.jpg" alt="Sample image">
+            <table>
+                <tr><td>Data 1</td><td>Data 2</td></tr>
+            </table>
+        </main>
+    </body>
+    </html>
+    """
+
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/scrape",
+            payload={
+                "scrape_request_id": str(uuid4()),
+                "status": "completed",
+                "html": complex_html,
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.scrape(website_url="https://complex-example.com")
+            assert response["status"] == "completed"
+            assert "html" in response
+            assert "<!DOCTYPE html>" in response["html"]
+            assert "Complex Page" in response["html"]
+            assert "
+    </body>
+    </html>
+    """
+
+    responses.add(
+        responses.POST,
+        "https://api.scrapegraphai.com/v1/scrape",
+        json={
+            "scrape_request_id": str(uuid4()),
+            "status": "completed",
+            "html": complex_html,
+        },
+    )
+
+    with Client(api_key=mock_api_key) as client:
+        response = client.scrape(website_url="https://complex-example.com")
+        assert response["status"] == "completed"
+        assert "html" in response
+        assert "<!DOCTYPE html>" in response["html"]
+        assert "Complex Page" in response["html"]
+        assert "