-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_incoming.py
More file actions
83 lines (65 loc) · 2.69 KB
/
process_incoming.py
File metadata and controls
83 lines (65 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics.pairwise import cosine_similarity
import requests
from openai import OpenAI
# OpenAI client used by inference_Openai below.
client = OpenAI(
    # SECURITY NOTE(review): never commit a real API key in source code —
    # load it from the OPENAI_API_KEY environment variable or a secrets
    # manager instead. This placeholder must be replaced before running.
    api_key="enter_your_api_key"
)
# Load the precomputed subtitle-chunk DataFrame.
# NOTE(review): assumed to contain at least the columns 'embedding',
# 'title', 'number', 'start', 'end', 'text' (see their use below) — confirm
# against the script that produced embedding.joblib.
df = joblib.load('embedding.joblib')
def create_embedding(text_list):
    """Return one embedding vector per string in *text_list*.

    Posts to a locally running Ollama server's /api/embed endpoint using
    the bge-m3 model.
    https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings

    Args:
        text_list: list of strings to embed.

    Returns:
        list of embedding vectors (one list of floats per input string).

    Raises:
        requests.RequestException: if the server is unreachable, times out,
            or responds with an HTTP error status.
        KeyError: if the response JSON lacks an 'embeddings' key.
    """
    r = requests.post(
        "http://localhost:11434/api/embed",
        json={
            "model": "bge-m3",
            "input": text_list,
        },
        # Fail fast instead of hanging forever when Ollama is not running.
        timeout=120,
    )
    # Surface HTTP errors explicitly rather than as a confusing KeyError
    # when indexing the (error-shaped) JSON below.
    r.raise_for_status()
    return r.json()['embeddings']
def inference(prompt):
    """Generate a completion for *prompt* via the local Ollama server.

    Args:
        prompt: the full prompt string to send to the model.

    Returns:
        The decoded JSON response dict; the generated text is under the
        'response' key (the caller below does inference(prompt)["response"]).

    Raises:
        requests.RequestException: if the server is unreachable, times out,
            or responds with an HTTP error status.
    """
    r = requests.post(
        "http://localhost:11434/api/generate",
        json={
            # "model": "deepdeek-r1",
            "model": "llama3.2",
            "prompt": prompt,
            "stream": False,
        },
        # Generation can be slow, but still bound it so a dead server
        # does not hang the script forever.
        timeout=300,
    )
    r.raise_for_status()
    # BUG FIX: the original never returned anything, so the caller's
    # inference(prompt)["response"] raised TypeError on NoneType.
    return r.json()
def inference_Openai(prompt):
    """Generate a completion for *prompt* using the OpenAI Responses API.

    Args:
        prompt: the full prompt string to send to the model.

    Returns:
        str: the model's generated text.
    """
    print("Thinking...")
    response = client.responses.create(
        model="gpt-5",
        input=prompt,
        store=True,
    )
    # BUG FIX: the Responses API object exposes the concatenated text
    # directly as `response.output_text`; the original's
    # `response.response.output_text` raised AttributeError on every call.
    return response.output_text
# ---- Retrieval-augmented Q&A over the course subtitle embeddings ----
incoming_query = input("Ask a Question: ")
question_embedding = create_embedding([incoming_query])[0]

# Stack the per-row embedding lists into one (n_chunks, dim) matrix.
embedding_matrix = np.vstack(df['embedding'].values)

# Cosine similarity of the question against every subtitle chunk.
similarities = cosine_similarity(embedding_matrix, [question_embedding]).flatten()

top_results = 5
top_indices = similarities.argsort()[::-1][0:top_results]
# BUG FIX: argsort() yields POSITIONAL indices, so use iloc, not loc.
# .loc looks rows up by label and returns wrong rows (or raises) whenever
# df does not carry a default RangeIndex.
new_df = df.iloc[top_indices]

prompt = f'''I am teaching web development in my Sigma web development course. Here are video subtitle chunks containing video title, video number, start time in seconds, end time in seconds, the text at that time:
{new_df[["title", "number", "start", "end", "text"]].to_json(orient="records")}
-------------------------------------------------------------------------------
"{incoming_query}"
User asked this question related to the video chunks, you have to answer in a human way (dont mention the above format, its just for you) where and how much content is taught in which video (in which video and at what timestamp) and guide the user to go to that particular video. If user asks unrelated question, tell him that you can only answer questions related to the course
'''

# Persist the assembled prompt for debugging/inspection.
with open("prompt.txt", "w") as f:
    f.write(prompt)

# Local-model path, kept for reference:
# response = inference(prompt)["response"]
response = inference_Openai(prompt)
print(response)

with open("response.txt", "w") as f:
    f.write(response)