-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrag.py
More file actions
135 lines (114 loc) · 4.34 KB
/
rag.py
File metadata and controls
135 lines (114 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import chromadb
from google import genai
import os
# Initialize ChromaDB locally
# PersistentClient stores its index on disk under ./chroma_db, so indexed
# material survives process restarts (unlike the in-memory client).
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Single shared collection used by every function in this module.
collection = chroma_client.get_or_create_collection(name="study_materials")
def add_to_knowledge_base(document_id: str, extracted_data: list, subject: str = "General"):
    """
    Index extracted PDF page chunks into the shared ChromaDB collection.

    extracted_data is a list of dicts: {'page': int, 'text': str, 'images': list[str]}
    Each chunk becomes one document with per-page metadata; image paths are
    stored as a comma-joined string because Chroma metadata values are scalars.
    """
    documents: list = []
    metadatas: list = []
    ids: list = []

    for position, page_chunk in enumerate(extracted_data):
        page_number = page_chunk['page']
        page_text = page_chunk['text']
        page_images = page_chunk['images']

        # ChromaDB requires non-empty document text; substitute a placeholder
        # for pages that contained only images.
        if not page_text.strip():
            page_text = f"[Image only page {page_number}]"

        documents.append(page_text)
        metadatas.append({
            "document_id": document_id,
            "page": page_number,
            "images": ",".join(page_images) if page_images else "",
            "subject": subject,
        })
        ids.append(f"{document_id}_page_{page_number}_{position}")

    # Nothing to index for an empty extraction — avoid calling add() with
    # empty lists.
    if documents:
        collection.add(documents=documents, metadatas=metadatas, ids=ids)
def query_knowledge_base(query: str, api_key: str, subject: str = "General"):
    """
    Retrieve the top-K relevant chunks for `query` (filtered by subject) and
    generate a summarized answer with the Gemini API.

    Returns a tuple (answer_text, image_paths). On retrieval miss or API
    failure the first element is a human-readable message instead of raising.
    """
    results = collection.query(
        query_texts=[query],
        n_results=4,
        where={"subject": subject}
    )
    # Chroma returns one result list per query text; we sent exactly one.
    if not results['documents'][0]:
        return "No relevant study material found. Please upload a PDF.", []

    context_str = ""
    retrieved_images = set()
    for i in range(len(results['documents'][0])):
        text = results['documents'][0][i]
        meta = results['metadatas'][0][i]
        context_str += f"--- Page {meta['page']} ---\n{text}\n\n"
        # Image paths were stored comma-joined (see add_to_knowledge_base);
        # split them back out, skipping empty entries.
        if meta.get('images'):
            images_list = meta['images'].split(",")
            for img in images_list:
                if img.strip():
                    retrieved_images.add(img.strip())

    prompt = f"""
You are an intelligent study assistant for a student.
Your goal is to summarize the following extracted text from the user's study materials based on their query, to decrease their cognitive workload.
Give a clear and brief matter without losing any important information from the source material.
Structure your response beautifully with markdown headings and bullet points.
If the text refers to diagrams, graphs, or tables, explicitly mention that the user should refer to the attached diagram.
Query: {query}
Materials:
{context_str}
"""

    # BUG FIX: the client was previously constructed inside the first try
    # block, so if genai.Client() itself raised, the fallback branch hit an
    # unbound `client` (NameError) instead of reporting the real error.
    # Construct it up front and report construction failures directly.
    try:
        client = genai.Client(api_key=api_key)
    except Exception as e:
        return (
            f"Error generating response from Gemini API: {e}. Check your API key.",
            list(retrieved_images),
        )

    try:
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=prompt
        )
        answer = response.text
    except Exception as e:
        # Fallback to an older model; if that also fails, surface the
        # original error rather than raising.
        try:
            response = client.models.generate_content(
                model='gemini-1.5-flash',
                contents=prompt
            )
            answer = response.text
        except Exception:
            answer = f"Error generating response from Gemini API: {e}. Check your API key."

    return answer, list(retrieved_images)
def generate_document_overview(extracted_data: list, api_key: str):
    """
    Build a textual + Mermaid.js overview of newly uploaded study material.

    Only the first 30 extracted chunks are included to keep the prompt small.
    Returns the raw model response text, or an error message string on failure.
    """
    # Concatenate up to 30 page texts, each followed by a blank line, to
    # bound prompt size.
    text_content = "".join(chunk['text'] + "\n\n" for chunk in extracted_data[:30])

    prompt = f"""
You are an intelligent study assistant. The user just uploaded a new study material.
Please provide:
1. A short textual overview of the important topics in a structured way (bullet points).
2. A visual overview using Mermaid.js syntax. CRITICAL: Keep it VERY small and crisp! Maximum depth of 3. Use very short phrases (2-4 words max) for nodes. Provide ONLY main headings as a visual tree (e.g. mindmap or graph TD).
Format your response exactly like this:
```markdown
[Your textual overview here]
```
```mermaid
graph TD
[Your tree structure of topics here]
```
Extracted Text:
{text_content}
"""

    try:
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=prompt
        )
        return response.text
    except Exception as e:
        return f"Could not generate overview. Error: {e}"