4 changes: 3 additions & 1 deletion .gitignore
@@ -17,4 +17,6 @@ dist/
build/
*.egg-info/
helpers/*
json_template/

python_env
33 changes: 33 additions & 0 deletions backend/add_reviews_method.py
@@ -0,0 +1,33 @@
import json
import os
import pandas as pd


# Folder containing the raw per-movie review CSVs.
directory = r'c:\Users\Nathan Palamuttam\Downloads\2_reviews_per_movie_raw'

csv_names = []

# Drop the last nine characters of each filename (".csv" plus a short suffix)
# to approximate the movie title used for matching below.
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        processed_name = filename[:-9]
        print(processed_name)
        csv_names.append((processed_name, filename))


with open('init.json', 'r') as file:
    data = json.load(file)

# Attach the first ten reviews from the matching CSV to each movie entry.
for item in data:
    for item1 in csv_names:
        if item['title'] is not None and item['title'] in item1[0]:
            csv_file_path = os.path.join(directory, item1[1])
            df = pd.read_csv(csv_file_path)
            reviews = df['review'].head(10).tolist()
            item['reviews'] = reviews
            print(item['title'])
            print(reviews)

with open('init.json', 'w') as file:
    json.dump(data, file, indent=4)



172 changes: 162 additions & 10 deletions backend/app.py
@@ -1,11 +1,22 @@
import json
import os
from flask import Flask, render_template, request
from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from helpers.MySQLDatabaseHandler import MySQLDatabaseHandler
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import numpy as np

# ROOT_PATH for linking with all your files.


# ROOT_PATH for linking with all your files.
# Feel free to use a config.py or settings.py with a global export variable
os.environ['ROOT_PATH'] = os.path.abspath(os.path.join("..",os.curdir))

@@ -18,29 +29,170 @@
# Assuming your JSON data is stored in a file named 'init.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)
episodes_df = pd.DataFrame(data['episodes'])
reviews_df = pd.DataFrame(data['reviews'])
# episodes_df = pd.DataFrame(data['episodes'])
# reviews_df = pd.DataFrame(data['reviews'])
movies_df = pd.DataFrame(data)

app = Flask(__name__)
CORS(app)

# Sample search using json with pandas
def json_search(query):
    matches = []
    merged_df = pd.merge(episodes_df, reviews_df, left_on='id', right_on='id', how='inner')
    matches = merged_df[merged_df['title'].str.lower().str.contains(query.lower())]
    matches_filtered = matches[['title', 'descr', 'imdb_rating']]
    matches = movies_df[movies_df['title'].str.lower().str.contains(query.lower()) | movies_df['original_title'].str.lower().str.contains(query.lower())]
    matches_filtered = matches[['title', 'overview', 'vote_average', 'reviews', 'image', 'popularity']]  # Adjusted to match relevant fields in the new JSON
    matches_filtered_json = matches_filtered.to_json(orient='records')
    return matches_filtered_json

def genre_search(genre):
    matches = movies_df[movies_df['genres'].apply(lambda g: genre.lower() in (genre_name.lower() for genre_name in g))]
    matches_filtered = matches[['title', 'overview', 'vote_average']]
    matches_filtered_json = matches_filtered.to_json(orient='records')
    return matches_filtered_json

def filter_movies_by_genre(genre):
    def is_genre_present(genres_str, genre):
        try:
            genres_list = ast.literal_eval(genres_str)
            for genre_dict in genres_list:
                if genre.lower() == genre_dict['name'].lower():
                    return True
        except ValueError:
            return False
        return False

    matches = movies_df[movies_df['genres'].apply(lambda g: is_genre_present(g, genre))]
    matches_filtered = matches[['title', 'overview', 'vote_average', 'reviews', 'image', 'popularity']]
    matches_filtered_json = matches_filtered.to_json(orient='records')
    return matches_filtered_json

import math
def compute_similarities(overviews, query):
    combined_texts = overviews + [query]
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(combined_texts)
    cosine_similarities = linear_kernel(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    similarity_scores = [(score, idx) for idx, score in enumerate(cosine_similarities)]
    similarity_scores = sorted(similarity_scores, reverse=True)
    top_matches = similarity_scores[:50]
    return top_matches
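A minimal sketch of how compute_similarities could be exercised on its own, assuming a small list of illustrative overviews (the sample texts below are made up, not from the dataset):

# Example only: rank a few hypothetical overviews against a free-text query.
sample_overviews = [
    "A retired assassin is pulled back in for one last job.",
    "Two astronauts are stranded after their station is destroyed.",
    "A chef rebuilds his life by opening a food truck.",
]
for score, idx in compute_similarities(sample_overviews, "astronauts lost in space"):
    print(round(score, 3), sample_overviews[idx])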

def tokenize(text):
    if text is None:
        return ""
    return text.lower().split()

def build_vocabulary(descriptions):
    vocab = set()
    for description in descriptions:
        vocab.update(tokenize(description))
    return list(vocab)

def vectorize(text, vocabulary):
    word_counts = {word: 0 for word in vocabulary}
    for word in tokenize(text):
        if word in word_counts:
            word_counts[word] += 1
    return [word_counts[word] for word in vocabulary]

def cosine_similarity(vec1, vec2):
    dot_product = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(v**2 for v in vec1))
    magnitude2 = math.sqrt(sum(v**2 for v in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0
    return dot_product / (magnitude1 * magnitude2)
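For reference, a short sketch of how these hand-rolled bag-of-words helpers compose end to end (the descriptions below are made up for illustration):

# Example only: score a query against two toy descriptions over a shared vocabulary.
descriptions = ["space station rescue thriller", "a romantic comedy set in Paris"]
vocab = build_vocabulary(descriptions)
query_vec = vectorize("thriller about a rescue on a space station", vocab)
scores = [cosine_similarity(query_vec, vectorize(d, vocab)) for d in descriptions]
print(scores)  # the first description should score higher than the second
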
def genresuggests(genre):
    genres = set()
    for item in data:
        genre_list = json.loads(item['genres'].replace("'", "\""))
        # Collect every stored genre name that contains the queried genre as a substring.
        for genre_dict in genre_list:
            if genre.lower() in genre_dict['name'].lower():
                genres.add(genre_dict['name'])

    return jsonify(list(genres))


train_data = pd.read_csv('train.tsv.zip', sep='\t')
train_data['full_sentence'] = train_data.groupby('SentenceId')['Phrase'].transform(lambda x: ' '.join(x))
train_data = train_data.drop_duplicates('SentenceId').reset_index(drop=True)

model = make_pipeline(TfidfVectorizer(stop_words='english'), MultinomialNB())
model.fit(train_data['full_sentence'], train_data['Sentiment'])
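Assuming train.tsv.zip carries phrase-level ordinal sentiment labels (roughly 0 = negative through 4 = positive), the fitted pipeline can be spot-checked directly on raw strings:

# Example only: higher predicted labels should indicate more positive sentiment.
print(model.predict([
    "a wonderful, heartfelt film with terrific performances",
    "a dull, lifeless sequel that wastes its cast",
]))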

def classify_and_score_reviews(json_data):
    movies_df = pd.read_json(json_data)
    results = []
    for index, row in movies_df.iterrows():
        movie_reviews = row['reviews']
        if movie_reviews:
            sentiments = model.predict(movie_reviews)
            if len(sentiments) > 0:
                average_sentiment = np.mean(sentiments)
                results.append((row['title'], average_sentiment))
            else:
                results.append((row['title'], None))
        else:
            results.append((row['title'], None))

    return dict(results)
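A rough usage sketch, assuming the dataset contains an "Action" genre; the result maps each title to the mean predicted sentiment of its stored reviews:

# Example only: score the reviews of every Action movie.
action_json = filter_movies_by_genre("Action")
title_to_sentiment = classify_and_score_reviews(action_json)
for title, score in list(title_to_sentiment.items())[:5]:
    print(title, score)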


@app.route("/")
def home():
    return render_template('base.html', title="sample html")

def compute_cosine_similarities(texts):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)
    cosine_similarities = linear_kernel(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
    return cosine_similarities


@app.route("/episodes")
def episodes_search():
text = request.args.get("title")
return json_search(text)
query= request.args.get("query")
review=request.args.get("review")
print("Reivew", review)
movies_df = pd.read_json('init.json')

pure_json= filter_movies_by_genre(text)
json_text= json.loads(pure_json)
sentiment_scores = classify_and_score_reviews(pure_json)

overviews = [overview['overview'] if overview['overview'] is not None else "" for overview in json_text]
texts = overviews + ([query] if query is not None else [''])
query_sim= compute_cosine_similarities(texts)
reviews= [str(text['reviews']) if text['reviews'] is not None else "" for text in json_text]
review_text= reviews+([review] if review is not None else [''])
reviews_similarities=compute_cosine_similarities(review_text)
cosine_similarity=None
if review != "" and query != "":
cosine_similarity= (query_sim+reviews_similarities)/2
if review != "" and query =="":
cosine_similarity= reviews_similarities
if review == "" and query !="":
cosine_similarity=query_sim
else:
cosine_similarity=query_sim
movie_scores = list(enumerate(cosine_similarity))
sorted_movie_scores = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:20]
combined_scores = [(index,value, sentiment_scores[json_text[index]['title']]) for index,value in sorted_movie_scores]
combined_scores_sorted = (sorted(combined_scores, key=lambda x: x[2] , reverse=True))
filtered_movies = [json_text[int(index)] for index,first,second in combined_scores_sorted]
return filtered_movies
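For manual testing, the endpoint can be exercised like this, assuming the requests package is installed and the dev server started by app.run() below is listening on its default port:

# Example only: query the /episodes endpoint with a genre, a plot query, and a review query.
import requests

resp = requests.get(
    "http://localhost:5000/episodes",
    params={"title": "Action", "query": "space adventure", "review": "thrilling"},
)
print(resp.status_code, len(resp.json()))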

@app.route('/genre_suggestions')
def genre_suggestions():
    query = request.args.get('query', '').lower()
    genres = set()

    for item in data:
        genre_list = json.loads(item['genres'].replace("'", "\""))
        for genre in genre_list:
            if query in genre['name'].lower():
                genres.add(genre['name'])

    return jsonify(list(genres))

if 'DB_NAME' not in os.environ:
    app.run(debug=True, host="0.0.0.0", port=5000)

6 changes: 6 additions & 0 deletions backend/checkjson.py
@@ -0,0 +1,6 @@
import json

with open('init.json', 'r') as file:
    data = json.load(file)
for item in data:
    print(item['popularity'], item['image'])