HTKUDTeam2 · TranThiKimHuynh · Jan 4, 2025 · Jan 12, 2025 · Jan 18, 2025 · Jan 24, 2025
diff --git a/.gitignore b/.gitignore
diff --git a/back-end/api/routes/chatbot.py b/back-end/api/routes/chatbot.py
@@ -8,23 +8,23 @@
 
 @router.post("/ask", response_model=ChatbotResponse)
 async def ask_question(request: ChatbotRequest, app_request: Request):
-    """
-    Endpoint xử lý câu hỏi từ người dùng.
-    - Truy xuất tài liệu từ vector database.
-    - Sinh câu trả lời dựa trên tài liệu.
-    """
-    question = request.question
+    current_question = request.currentQuestion
+    conversation = request.conversation
 
     try:
         # Truy cập vector database đã được load sẵn trong app.state
         chroma_db = app_request.app.state.chroma_db
 
-        # Sinh câu trả lời từ các tài liệu truy xuất
-        answer, links, titles = generate_answer(vector_db=chroma_db, question=question, top_k=5)
+        # Sinh câu trả lời từ câu hỏi và lấy ngữ cảnh từ lịch sử trò chuyện
+        answer, links, titles = generate_answer(
+            vector_db=chroma_db,
+            question=current_question,
+            conversation=conversation,
+            top_k=5
+        )
 
         # Trả về câu trả lời
-        return ChatbotResponse(question=question, answer=answer, links=links, titles=titles)
+        return ChatbotResponse(question=current_question, answer=answer, links=links, titles=titles)
 
     except Exception as e:
-        # Xử lý lỗi chung
-        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
diff --git a/back-end/handlers/model_handler.py b/back-end/handlers/model_handler.py
@@ -22,6 +22,28 @@
     openai_api_key=openai_api_key,
 )
 
+##################### Truy xuất câu hỏi #####################
+question_generated_prompt = """
+Role: You are an intellectual property law attorney named iLaw.
+
+Context: The user may provide a question related to intellectual property law, but the question might not be entirely clear as it could include words referring to previous discussions, such as 'it,' 'that,' or 'this.' 
+You will be provided with the history of Q&A between the user and the system to better understand the issues the user is concerned about.
+
+Instruction: Create a specific and fully meaningful question if the user is referring to issues from the history of previous Q&A discussions.
+
+Q&A History:  
+{conversation}
+
+Requirements:  
+- The question must be as specific as possible, providing full context if the user is referring to an issue from the Q&A history.  
+- If there is no reference to the Q&A history, return the original question provided by the user.  
+- Do not fabricate, assume, or ask me for clarification.
+
+Output Indicator: A single question in Vietnamese, nothing else.
+"""
+
+
+##################### RAG prompt #####################
 # Định nghĩa mẫu prompt cho hệ thống
 system_prompt = """
 You are an Intellectual Property Lawyer named iLaw. Your primary responsibility is to assist users with legal questions related to intellectual property law. 
@@ -30,32 +52,33 @@
 {context}
 
 ### Role and Guidelines:
-1. **Role**: You are acting as a professional lawyer specializing in Intellectual Property Law.
-2. **Language**: All responses must be in **Vietnamese**.
-3. **Audience**: Users may not have prior knowledge of legal terms, so your responses should be:
+1. *Role*: You are acting as a professional lawyer specializing in Intellectual Property Law.
+2. *Language*: All responses must be in *Vietnamese*.
+3. *Audience*: Users may not have prior knowledge of legal terms, so your responses should be:
    - Clear, detailed, and easy to understand.
    - Free of unnecessary jargon.
    - Supplemented with examples or analogies when necessary.
-4. **Fallback**: If no relevant documentation is found, respond with: **"Chúng tôi không tìm thấy thông tin liên quan."**
-5. **Prohibited**: Do not guess, assume, or provide advice that cannot be supported by laws or references.
+4. *Fallback*: If no relevant documentation is found, respond with: *"Chúng tôi không tìm thấy thông tin liên quan."*
+5. *References*: Official legal language, directly quoting legal provisions, suitable for research purposes or detailed reference in the field of law.
+6. *Prohibited*: Do not guess, assume, or provide advice that cannot be supported by laws or references.
 ---
 
 ### Instructions:
 When answering, follow this structured reasoning process:
-1. **Identify the user's question**: Determine the specific area of Intellectual Property Law being addressed.
-2. **Explain the concept in detail**: Provide a step-by-step explanation of relevant legal terms or laws.
-3. **Apply to user's scenario**: Illustrate how the laws or terms apply to the user's context.
-4. **Provide actionable advice**: Suggest next steps, documents, or authorities the user should contact.
-
-
+1. *Identify the user's question*: Determine the specific area of Intellectual Property Law being addressed.
+2. *Explain the concept in detail*: Provide a step-by-step explanation of relevant legal terms or laws.
+3. *Apply to user's scenario*: Illustrate how the laws or terms apply to the user's context.
+4. *Provide actionable advice*: Suggest next steps, documents, or authorities the user should contact.
 """
 
-
-
 prompt_template = ChatPromptTemplate.from_messages(
-    [("system", system_prompt), ("human", "Question: {question}")]
+    [
+        ("system", system_prompt),
+        ("human", "Question: {question}")
+    ]
 )
 
+
 # Khởi tạo chuỗi RAG với prompt_template
 rag_chain = (
     RunnableMap(
@@ -66,20 +89,15 @@
     | StrOutputParser()
 )
 
+# Kết hợp các retriever và mô hình reranker vào một ensemble retriever
 class EnhancedEnsembleRetriever:
     def __init__(self, retrievers, weights, reranker_model=None):
-        """
-        Initialize the ensemble retriever with multiple retrievers and optional reranker.
-        """
         self.retrievers = retrievers
         self.weights = weights
         self.reranker = CrossEncoder(reranker_model) if reranker_model else None
 
     def invoke(self, query):
-        """
-        Invoke the retrievers to get relevant documents and re-rank if necessary.
-        """
-        # 1. Lấy kết quả từ từng retriever
+        # Lấy kết quả từ từng retriever
         all_results = []
         for retriever, weight in zip(self.retrievers, self.weights):
             results = retriever.get_relevant_documents(query)
@@ -88,7 +106,7 @@ def invoke(self, query):
                 doc.metadata['score'] = doc.metadata.get('score', 1.0) * weight
             all_results.extend(results)
 
-        # 2. Loại bỏ trùng lặp (nếu cần) theo nội dung
+        # Loại bỏ trùng lặp 
         unique_results = []
         seen_content = set()
 
@@ -98,26 +116,23 @@ def invoke(self, query):
                 unique_results.append(doc)
                 seen_content.add(content)
 
-        # 3. Rerank nếu có mô hình reranker
+        # Rerank nếu có mô hình reranker
         if self.reranker:
             query_doc_pairs = [(query, doc.page_content) for doc in unique_results]
             scores = self.reranker.predict(query_doc_pairs)
             reranked_results = sorted(
                 zip(unique_results, scores),
-                key=lambda x: x[1],  # Sắp xếp theo điểm số
+                key=lambda x: x[1], 
                 reverse=True
             )
             return [doc for doc, score in reranked_results]
 
-        # 4. Nếu không rerank, trả về kết quả theo trọng số
+        # Nếu không rerank, trả về kết quả theo trọng số
         return sorted(unique_results, key=lambda x: x.metadata['score'], reverse=True)
 
 
 # Hàm tạo retriever từ Chroma và BM25
 def create_retriever(vector_db, query, k=4):
-    """
-    Tạo một retriever kết hợp giữa Chroma và BM25 cho một cơ sở dữ liệu vector.
-    """
     # Tạo BM25 retriever
     chroma_retriever = vector_db.as_retriever(search_type='similarity', search_kwargs={'k': k})
 
@@ -139,39 +154,46 @@ def create_retriever(vector_db, query, k=4):
     return ensemble_retriever
 
 
-# Hàm lấy tài liệu phù hợp từ retriever
+# Retrieve documents từ vector database
 def retrieve_documents(ensemble_retriever, query, top_k=4):
-    """
-    Lấy các tài liệu phù hợp từ ensemble retriever.
-    """
     docs = ensemble_retriever.invoke(query=query)
     return docs[:top_k]
 
 
-# Hàm định dạng tài liệu thành chuỗi
+# Định dạng documents thành chuỗi
 def format_docs(docs):
-    """
-    Định dạng các tài liệu thành chuỗi văn bản cho hệ thống RAG.
-    """
     formatted_docs =""
     for i, doc in enumerate(docs):
         formatted_docs += f"Document {i+1}:\n{doc.page_content}\n\n"
 
     return formatted_docs
 
 
-# Hàm gọi chuỗi RAG và sinh câu trả lời
-def generate_answer(vector_db, question, top_k=4):
-    """
-    Sinh câu trả lời cho câu hỏi dựa trên cơ sở dữ liệu vector và chuỗi RAG.
-    """
-    # Tạo retriever kết hợp
-    ensemble_retriever = create_retriever(vector_db, question, k=top_k)
+def generate_answer(vector_db, question, conversation=None, top_k=4):
+
 
-    # Lấy các tài liệu phù hợp
-    docs = retrieve_documents(ensemble_retriever, question, top_k=top_k)
+    if conversation:
+        formatted_conversasion = "\n".join(
+            [f"User's question: {qa['question']}\nAI response: {qa['answer']}" for qa in conversation]
+        )
+    else:
+        formatted_conversasion = "Chưa có cuộc trò chuyện nào trước đó."
 
-    # Lấy các thông tin từ metadata của tài liệu (link, title)
+    # Viết lại câu hỏi cho đầy đủ ngữ cảnh dựa trên lịch sử trò chuyện
+    question_context = question_generated_prompt.format(conversation=formatted_conversasion)
+    fully_context_question = rag_chain.invoke({"context": question_context, "question": question})
+    print("Câu hỏi cụ thể của người dùng: ", fully_context_question)
+
+    # Tạo retriever kết hợp và truy xuất những tài liệu liên quan
+    ensemble_retriever = create_retriever(vector_db, fully_context_question, k=top_k)
+    docs = retrieve_documents(ensemble_retriever, fully_context_question, top_k=top_k)
+    # Định dạng tài liệu thành chuỗi văn bản
+    formatted_docs = format_docs(docs)
+    # print("Tài liệu trả về: ", formatted_docs)
+    # Gọi chuỗi RAG với ngữ cảnh đầy đủ
+    output = rag_chain.invoke({"context": formatted_docs, "question": fully_context_question})
+
+    # Xử lý metadata của các tài liệu để lấy liên kết và tiêu đề
     seen_links = set()
     unique_links = []
     unique_titles = []
@@ -184,10 +206,4 @@ def generate_answer(vector_db, question, top_k=4):
             seen_links.add(link)
             unique_links.append(link)
             unique_titles.append(title)
-
-    # Định dạng các tài liệu thành chuỗi văn bản
-    formatted_docs = format_docs(docs)
-    print(formatted_docs)
-    # Gọi chuỗi RAG để tạo câu trả lời
-    output = rag_chain.invoke({"context": formatted_docs, "question": question})
-    return output, unique_links, unique_titles
+    return output, unique_links, unique_titles
diff --git a/back-end/main.py b/back-end/main.py
@@ -21,10 +21,6 @@
 # Tải vector database khi ứng dụng khởi động
 @app.on_event("startup")
 async def load_existing_chroma_db():
-    """
-    Hàm được gọi khi ứng dụng khởi động.
-    Tải vector database từ thư mục đã lưu và in ra một số nội dung mẫu.
-    """
     try:
         print("Loading Chroma database...")
         app.state.chroma_db = load_chroma_db("vector_db")
@@ -40,23 +36,12 @@ async def load_existing_chroma_db():
 # Gắn router cho chatbot
 app.include_router(chatbot_router, prefix="/chatbot", tags=["Chatbot"])
 
-# Xử lý lỗi khi ứng dụng dừng
-
 
 @app.on_event("shutdown")
 async def shutdown_event():
-    """
-    Hàm được gọi khi ứng dụng dừng.
-    Có thể dùng để giải phóng tài nguyên nếu cần.
-    """
     print("Shutting down application. Cleaning up resources.")
 
 # Endpoint kiểm tra trạng thái ứng dụng
-
-
 @app.get("/")
 async def health_check():
-    """
-    Endpoint để kiểm tra trạng thái ứng dụng.
-    """
     return {"status": "OK", "message": "Chatbot RAG API is running!"}
diff --git a/back-end/schemas/chatbot.py b/back-end/schemas/chatbot.py
@@ -1,12 +1,13 @@
 from pydantic import BaseModel
-
+from typing import List, Dict
 
 class ChatbotRequest(BaseModel):
-    question: str
+    conversation: List[Dict[str, str]]
+    currentQuestion: str
 
 
 class ChatbotResponse(BaseModel):
     question: str
     answer: str
     links: list[str]
-    titles: list[str]
+    titles: list[str]
diff --git a/evaluate_model/.gitignore b/evaluate_model/.gitignore
@@ -0,0 +1,2 @@
+.env
+vector_db/
diff --git a/data/Q&A-So-huu-tri-tue.csv → evaluate_model/Q&A-So-huu-tri-tue.csv b/data/Q&A-So-huu-tri-tue.csv → evaluate_model/Q&A-So-huu-tri-tue.csv