diff --git a/WebScraper (Google Search API)/.gitignore b/WebScraper (Google Search API)/.gitignore
new file mode 100644
index 00000000..2eea525d
--- /dev/null
+++ b/WebScraper (Google Search API)/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/WebScraper (Google Search API)/README.md b/WebScraper (Google Search API)/README.md
new file mode 100644
index 00000000..266054d9
--- /dev/null
+++ b/WebScraper (Google Search API)/README.md
@@ -0,0 +1,47 @@
+# 🌐 Python Web Surfing
+This Python project allows you to scrape search results from the web using the ```Google API``` and a ```Google Custom Search Engine ID```, extract useful information, and perform basic data analysis using the ```Gemini API```. It is designed to be reliable, modular, and easy to run from the command line.
+
+---
+
+## ✅ Functionalities Implemented
+
+1. **Extracting Titles, URLs, and Snippets**
+   - Scrapes and saves the title, URL, and snippet/description from search results.
+
+2. **Taking Dynamic Input (Query from Command Line)**
+   - Run the scraper with any search query directly from the command line:
+   ```bash
+   python scraper.py "<query>"
+   ```
+   For example:
+   ```bash
+   python scraper.py "AI in healthcare"
+   ```
+
+3. **Saving Results to a CSV File**
+   - Results are saved in a separate CSV file for each query.
+
+4. **Running in Headless Mode (No Browser Required)**
+   - Because results come from the Custom Search API (via the Custom Search Engine ID), no browser is launched at all.
+
+5. **Crawling Multiple Pages**
+   - The scraper can crawl multiple pages of search results (the free-tier Google API returns at most 10 results per request).
+
+6. **Adding Logs**
+   - Logs are stored in ```data/logs/```.
+
+7. **Data Summarizer**
+   - Summarizes all the results that were fetched and stores the summary in the ```data_analysis``` folder.
+
+## ⚡ How to Run
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Run the scraper:
+```bash
+python scraper.py "<query>"
+```
+## 💡 Notes
+- Ensure your ```Google API``` key, ```Google Custom Search Engine ID```, and ```Gemini API``` key are set in a ```.env``` file (loaded via ```python-dotenv```; an example layout is shown below).
+- Logs are automatically created for debugging and tracking scraping activity.
\ No newline at end of file
diff --git a/WebScraper (Google Search API)/data_analysis/AI in healthcare_summary.txt b/WebScraper (Google Search API)/data_analysis/AI in healthcare_summary.txt
new file mode 100644
index 00000000..e26a8f97
--- /dev/null
+++ b/WebScraper (Google Search API)/data_analysis/AI in healthcare_summary.txt
@@ -0,0 +1,7 @@
+Artificial intelligence is a powerful and disruptive technology with the potential to
+fundamentally transform medicine and healthcare delivery. AI systems analyze patient data to
+predict health risks, diagnose diseases, and develop personalized treatment plans, thereby
+assisting clinicians with decision-making. By creating more efficient workflows and enabling
+better self-management of chronic illnesses, AI aims to make healthcare more personalized,
+accessible, and effective. Ultimately, the goal is to improve patient care, achieve better
+strategic outcomes, and potentially save lives.
\ No newline at end of file
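The script loads its credentials with `python-dotenv`, so the three keys mentioned in the README's notes belong in a `.env` file at the project root (which the new `.gitignore` keeps out of version control). A minimal example layout, using the variable names that `scraper.py` reads and placeholder values:

```env
# .env — placeholder values, replace with your own credentials
GOOGLE_API_KEY=your-google-api-key
CUSTOM_SEARCH_ENGINE_ID=your-custom-search-engine-id
GEMINI_API=your-gemini-api-key
```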
diff --git a/WebScraper (Google Search API)/requirements.txt b/WebScraper (Google Search API)/requirements.txt
new file mode 100644
index 00000000..9d3ffc2a
--- /dev/null
+++ b/WebScraper (Google Search API)/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+python-dotenv
+requests
+google-generativeai
\ No newline at end of file
diff --git a/WebScraper (Google Search API)/scraper.py b/WebScraper (Google Search API)/scraper.py
new file mode 100644
index 00000000..bf17324a
--- /dev/null
+++ b/WebScraper (Google Search API)/scraper.py
@@ -0,0 +1,112 @@
+import csv
+import logging
+import sys
+import os
+import requests
+from datetime import datetime
+from dotenv import load_dotenv
+import pandas as pd
+import textwrap
+
+os.environ['GRPC_VERBOSITY'] = 'NONE'  # silence gRPC log noise before importing the Gemini SDK
+import google.generativeai as genai
+
+
+# Setup logging
+os.makedirs("data/logs", exist_ok=True)
+logging.basicConfig(
+    filename=f"data/logs/scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+load_dotenv()
+
+API_KEY = os.getenv("GOOGLE_API_KEY")
+CX = os.getenv("CUSTOM_SEARCH_ENGINE_ID")
+gemini_api = os.getenv("GEMINI_API")
+
+def scrape_google(query, num_results=10):
+    url = "https://www.googleapis.com/customsearch/v1"
+    params = {"key": API_KEY, "cx": CX, "q": query, "num": num_results}
+
+    logging.info(f"Fetching results for query: {query}")
+    try:
+        req = requests.get(url, params=params)
+        req.raise_for_status()
+        data = req.json()
+        results = []
+
+        for item in data.get("items", []):
+            results.append({
+                "Title": item.get("title", ""),
+                "URL": item.get("link", ""),
+                "Snippet": item.get("snippet", "")
+            })
+
+        logging.info(f"Fetched {len(results)} results for query: {query}")
+        return results
+
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Request failed: {e}")
+        return []
+
+def save_results(results, filename):
+    if not results:
+        logging.warning("No results to save.")
+        print("❌ No results to save.")
+        return
+
+    os.makedirs("data", exist_ok=True)
+    with open(filename, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=["Title", "URL", "Snippet"])
+        writer.writeheader()
+        writer.writerows(results)
+    logging.info(f"Saved results to {filename}")
+    print(f"✅ Saved {len(results)} results to {filename}")
+
+def summarize(query):
+    filename = f"./data/{query}.csv"
+    df = pd.read_csv(filename)
+    texts_combined = "\n\n".join(df["Snippet"].astype(str).tolist())
+    PROMPT = f'''
+    You are an expert text summarizer. I will provide you with multiple short text excerpts.
+    Your task is to read all of them and produce a single, concise summary that captures the
+    key ideas, themes, and main points across all excerpts.
+
+    Make the summary clear, coherent, and around 3–5 sentences long.
+
+    Texts:
+    {texts_combined}
+
+    Output only the final summary.
+    '''
+    genai.configure(api_key=gemini_api)
+    model = genai.GenerativeModel('gemini-2.5-pro')
+    response = model.generate_content(PROMPT)
+
+    wrapped_text = textwrap.fill(response.text, width=95)
+
+    folder_path = "data_analysis"
+    os.makedirs(folder_path, exist_ok=True)
+    summary_file_path = os.path.join(folder_path, f"{query}_summary.txt")
+
+    with open(summary_file_path, "w", encoding="utf-8") as f:
+        f.write(wrapped_text)
+
+    print(f"✅ Summary saved to {summary_file_path}")
+
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print('Usage: python scraper.py "<query>"')
+        sys.exit(1)
+
+    query = " ".join(sys.argv[1:])  # join so multi-word queries also work without quotes
+    logging.info(f"Starting scrape for query: {query}")
+
+    data = scrape_google(query)
+    save_results(data, f"./data/{query}.csv")
+
+    summarize(query)
\ No newline at end of file
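Item 5 of the README mentions crawling multiple pages, while `scrape_google` above issues a single request (the Custom Search JSON API returns at most 10 results per call). Below is a minimal sketch of how pagination could be layered on top using the API's `start` offset; the helper name `scrape_google_pages` and its `pages`/`per_page` parameters are illustrative and not part of the patch:

```python
import logging
import requests

def scrape_google_pages(query, api_key, cx, pages=3, per_page=10):
    """Fetch several result pages by advancing the Custom Search `start` offset."""
    url = "https://www.googleapis.com/customsearch/v1"
    results = []
    for page in range(pages):
        start = page * per_page + 1  # Custom Search result indices are 1-based
        params = {"key": api_key, "cx": cx, "q": query,
                  "num": per_page, "start": start}
        try:
            resp = requests.get(url, params=params, timeout=15)
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            logging.error(f"Request for page {page + 1} failed: {e}")
            break
        items = resp.json().get("items", [])
        if not items:  # no further results, stop early
            break
        for item in items:
            results.append({
                "Title": item.get("title", ""),
                "URL": item.get("link", ""),
                "Snippet": item.get("snippet", ""),
            })
    return results
```

If adopted, the `__main__` block could call `scrape_google_pages(query, API_KEY, CX)` in place of `scrape_google(query)` and pass the combined list to `save_results` unchanged, since the dictionary keys already match the CSV field names.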