"""amazon_search_scraper.py — scrape Amazon search results via the Crawlbase
Crawling API ('amazon-serp' scraper) and export the products to CSV.

(Original file: 72 lines / 2.37 KB; web-page chrome and line-number gutter
from the scraped copy removed so the module is valid Python.)
"""
from crawlbase import CrawlingAPI
import pandas as pd
import json
# Initialize the Crawling API with your Crawlbase token
# NOTE(review): 'CRAWLBASE_JS_TOKEN' is a placeholder — substitute a real
# token (ideally loaded from an environment variable, not hard-coded).
api = CrawlingAPI({ 'token': 'CRAWLBASE_JS_TOKEN' })
# URL of the Amazon search page you want to scrape (query: k=games)
amazon_search_url = 'https://www.amazon.com/s?k=games'
# options for Crawling API: ask Crawlbase to run its Amazon SERP scraper
# server-side so the response body is pre-parsed JSON, not raw HTML
options = {
'scraper': 'amazon-serp'
}
# List to store the scraped product information; scrape_url() appends to it
# across all pages, and the driver below writes it out as a CSV at the end
product_data = []
def scrape_url(url):
    """Scrape one Amazon search-results page through the Crawlbase API.

    Fetches *url* with the module-level ``api`` client and ``options``
    (the 'amazon-serp' scraper), appends one dict per product to the
    module-level ``product_data`` list, and returns the scraper's
    pagination info so the caller can walk subsequent pages.

    Parameters:
        url: Full Amazon search URL (may include a ``&page=N`` suffix).

    Returns:
        The ``pagination`` dict from the scraper result (or None if the
        scraper omitted it), or None when the HTTP status is not 200.
    """
    response = api.get(url, options)
    # Guard clause: anything but 200 means the page was not retrieved.
    if response['status_code'] != 200:
        print("Failed to retrieve the page. Status code:", response['status_code'])
        return None
    # BUG FIX: JSON interchange text is UTF-8 (RFC 8259). Decoding with
    # latin1 silently mojibakes any non-ASCII product name; utf-8 is correct.
    response_json = json.loads(response['body'].decode('utf-8'))
    # Crawlbase wraps the scraper output under the 'body' key.
    scraper_result = response_json['body']
    # Copy just the fields we export, defaulting missing ones to "".
    fields = (
        "url", "name", "asin", "image", "price",
        "isPrime", "offer", "customerReview", "customerReviewCount",
    )
    for product in scraper_result.get("products", []):
        product_data.append({field: product.get(field, "") for field in fields})
    # Pagination info (e.g. totalPages) drives the multi-page loop below.
    return scraper_result.get("pagination")
# Scrape page 1; scrape_url hands back the scraper's pagination block
# (or None on failure) so we know how many more pages exist.
pagination_info = scrape_url(amazon_search_url)
# Only walk further pages when pagination data actually came back.
if pagination_info:
    last_page = pagination_info.get('totalPages', 1)
    # Page 1 is already in product_data, so continue from page 2.
    for page_number in range(2, last_page + 1):
        scrape_url(f'{amazon_search_url}&page={page_number}')
# Dump everything collected (all pages) into a single CSV, no index column.
df = pd.DataFrame(product_data)
df.to_csv("amazon_products.csv", index=False)