diff --git a/Youtube-trend-analysis/app.py b/Youtube-trend-analysis/app.py index 44c966a63..3d6936d4b 100644 --- a/Youtube-trend-analysis/app.py +++ b/Youtube-trend-analysis/app.py @@ -18,6 +18,8 @@ bright_data_api_key = os.getenv("BRIGHT_DATA_API_KEY") +os.makedirs("transcripts", exist_ok=True) + @st.cache_resource def load_llm(): @@ -124,6 +126,45 @@ def start_analysis(): channel_scrapped_output = get_output(bright_data_api_key, status['snapshot_id'], format="json") + status_container.info("Processing transcripts...") + st.session_state.all_files = [] + error_files = [] + + for i in tqdm(range(len(channel_scrapped_output[0]))): + youtube_video_id = channel_scrapped_output[0][i]["shortcode"] + + os.makedirs("transcripts", exist_ok=True) + + file = f"transcripts/{youtube_video_id}.txt" + st.session_state.all_files.append(file) + + with open(file, "w", encoding="utf-8") as f: + transcript = channel_scrapped_output[0][i].get( + "formatted_transcript", [] + ) + if isinstance(transcript, list): + for entry in transcript: + text = entry.get("text", "") + start = entry.get("start_time", 0.0) + end = entry.get("end_time", 0.0) + line = f"({start:.2f}-{end:.2f}): {text}\n" + f.write(line) + else: + f.write(str(transcript)) + error_files.append(i) + del st.session_state.all_files[-1] + + if error_files: + for idx in error_files: + youtube_video_id = channel_scrapped_output[0][idx]["shortcode"] + file = f"transcripts/{youtube_video_id}.txt" + if os.path.exists(file): + os.remove(file) + print(f"Removed file: {file}") + else: + print(f"File not found: {file}") + + st.session_state.channel_scrapped_output = channel_scrapped_output st.markdown("## YouTube Videos Extracted") # Create a container for the carousel @@ -147,44 +188,34 @@ def start_analysis(): # Check if we still have videos to display if video_idx < num_videos: - with cols[col_idx]: - st.video(channel_scrapped_output[0][video_idx]['url']) + if video_idx in error_files: + continue - status_container.info("Processing transcripts...") - st.session_state.all_files = [] - # Calculate transcripts - for i in tqdm(range(len(channel_scrapped_output[0]))): - - - # save transcript to file - youtube_video_id = channel_scrapped_output[0][i]['shortcode'] - - file = "transcripts/" + youtube_video_id + ".txt" - st.session_state.all_files.append(file) - - with open(file, "w") as f: - for j in range(len(channel_scrapped_output[0][i]['formatted_transcript'])): - text = channel_scrapped_output[0][i]['formatted_transcript'][j]['text'] - start_time = channel_scrapped_output[0][i]['formatted_transcript'][j]['start_time'] - end_time = channel_scrapped_output[0][i]['formatted_transcript'][j]['end_time'] - f.write(f"({start_time:.2f}-{end_time:.2f}): {text}\n") - - f.close() + with cols[col_idx]: + st.video(channel_scrapped_output[0][video_idx]["url"]) - st.session_state.channel_scrapped_output = channel_scrapped_output - status_container.success("Scraping complete! We shall now analyze the videos and report trends...") + status_container.success("Scraping complete! Analyzing trends...") else: status_container.error(f"Scraping failed with status: {status}") if status['status'] == "ready": + file_contents = [] + for file in st.session_state.all_files: + with open(file, "r", encoding="utf-8") as f: + content = f.read() + file_contents.append(content) + + merge_content = "\n\n".join(file_contents) + status_container = st.empty() with st.spinner('The agent is analyzing the videos... This may take a moment.'): # create crew st.session_state.crew = create_agents_and_tasks() - st.session_state.response = st.session_state.crew.kickoff(inputs={"file_paths": ", ".join(st.session_state.all_files)}) - + st.session_state.response = st.session_state.crew.kickoff( + inputs={"file_contents": merge_content} + ) # =========================== diff --git a/Youtube-trend-analysis/brightdata_scrapper.py b/Youtube-trend-analysis/brightdata_scrapper.py index 78832500f..2ee22b401 100644 --- a/Youtube-trend-analysis/brightdata_scrapper.py +++ b/Youtube-trend-analysis/brightdata_scrapper.py @@ -105,9 +105,9 @@ def get_output(api_key, snapshot_id, format="json"): f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}?format={format}" ] - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8') - if result.returncode == 0: + if result.returncode == 0 and result.stdout: json_lines = result.stdout.strip().split("\n") print(json_lines) json_objects = [json.loads(line) for line in json_lines] diff --git a/Youtube-trend-analysis/config.yaml b/Youtube-trend-analysis/config.yaml index 818795a6b..b608e1063 100644 --- a/Youtube-trend-analysis/config.yaml +++ b/Youtube-trend-analysis/config.yaml @@ -2,7 +2,7 @@ agents: - name: analysis_agent role: "YouTube Transcript Analyzer" goal: > - Analyze the transcripts of several videos located in {file_paths}. + Analyze the transcripts of several videos using {file_contents}. Break down the analysis into structured sections, including: 1. Key topics discussed. 2. Emerging trends or patterns across multiple transcripts. @@ -34,7 +34,7 @@ agents: tasks: - name: analysis_task description: > - Conduct a fine-grained analysis of the transcripts of several videos located in {file_paths}. + Conduct a fine-grained analysis of the transcripts of several videos using {file_contents}. Break the analysis into the following sections: 1. Key topics and themes discussed in the videos. 2. Emerging trends or patterns across multiple videos.