From 6e5b9b8656bc0de9f63895efb3745116926edb72 Mon Sep 17 00:00:00 2001
From: csf0326
Date: Fri, 1 Aug 2025 18:23:23 +0800
Subject: [PATCH] =?UTF-8?q?add=20agent=20doc=20=EF=BC=9Anloop=20rag?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/en/Cookbook/rag_nloop.md | 262 ++++++++++++++++++++++++++++++++++
 docs/mkdocs.template.yml      |   1 +
 docs/zh/Cookbook/rag_nloop.md | 262 ++++++++++++++++++++++++++++++++++
 3 files changed, 525 insertions(+)
 create mode 100644 docs/en/Cookbook/rag_nloop.md
 create mode 100644 docs/zh/Cookbook/rag_nloop.md

diff --git a/docs/en/Cookbook/rag_nloop.md b/docs/en/Cookbook/rag_nloop.md
new file mode 100644
index 000000000..539fbd52c
--- /dev/null
+++ b/docs/en/Cookbook/rag_nloop.md
@@ -0,0 +1,262 @@
+# Build a Question Answering application with chat history
+
+A key challenge in Q&A systems is handling conversational context. This tutorial shows how to add memory to a RAG application so it can handle follow-up questions that depend on chat history.
+
+We'll use the LazyLLM framework to build a conversational RAG system that can handle multi-turn conversations effectively.
+
+## Setup
+
+### Components
+
+We'll build our conversational RAG application using LazyLLM's built-in components:
+
+- **Document**: For loading and managing documents with embeddings
+- **Retriever**: For retrieving relevant documents based on similarity
+- **Reranker**: For reranking retrieved documents for better relevance
+- **OnlineChatModule**: For LLM-based question reformulation and answer generation
+- **ReactAgent**: For creating an agent that can iterate over multiple retrieval steps
+
+### Dependencies
+
+```python
+import os
+import tempfile
+from typing import List, Dict, Any
+import lazyllm
+from lazyllm import pipeline, parallel, bind, fc_register
+from lazyllm import Document, Retriever, Reranker, SentenceSplitter, OnlineEmbeddingModule
+```
+
+## Chains
+
+A key challenge for conversational RAG is that the latest user message is rarely self-contained: it can reference previous portions of the conversation, which makes it hard to understand, and hard to retrieve against, in isolation.
+
+For example, the second question below depends on context from the first:
+
+```
+Human: What is the standard method for task decomposition?
+AI: The standard method for task decomposition is Chain of Thought (CoT) prompting...
+Human: What are the common extensions of this method?
+```
+
+The second question "What are the common extensions of this method?" is ambiguous without the context of the first question. Our system needs to understand that "this method" refers to "Chain of Thought prompting".
+
+### Stateful management of chat history
+
+To handle this, we need to contextualize the question based on the chat history.
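+
+Besides the reformulation prompt shown below, the pipeline also relies on an answer-generation prompt, referred to as `rag_prompt` in the pipeline code but not defined in this walkthrough. A minimal sketch of what it might look like is given here; its wording is only an assumption, and the retrieved context and formatted history are passed in through the prompter's extra keys (`context_str` and `chat_history`):
+
+```python
+# Hypothetical answer-generation prompt; adjust the wording to your needs.
+rag_prompt = (
+    "You are an AI assistant for question-answering tasks. Answer the user's question "
+    "based on the given context and the chat history. If the context does not contain "
+    "the answer, say that you don't know. Keep the answer concise."
+)
+```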
+Let's build our conversational RAG system:
+
+```python
+class ConversationalRAGPipeline:
+    """Multi-turn conversational RAG pipeline based on LazyLLM components"""
+
+    def __init__(self, docs_path: str = None):
+        self.chat_history = []
+        self.docs_path = docs_path or self._create_sample_docs()
+        self.pipeline = None
+        self.init_pipeline()
+```
+
+First, we set up our documents and embeddings:
+
+```python
+def init_pipeline(self):
+    """Initialize RAG pipeline"""
+    # Create documents and embeddings
+    documents = Document(
+        dataset_path=self.docs_path,
+        embed=OnlineEmbeddingModule(
+            source="qwen",
+            type="embed",
+            embed_model_name="text-embedding-v1"
+        ),
+        manager=False
+    )
+    documents.create_node_group(
+        name="sentences",
+        transform=SentenceSplitter,
+        chunk_size=512,
+        chunk_overlap=100
+    )
+```
+
+Next, we create a contextualization prompt to reformulate questions based on chat history:
+
+```python
+contextualize_prompt = """You are an assistant that helps rewrite questions. Your task is to:
+1. Analyze the user's chat history
+2. Understand the latest question
+3. If the question depends on historical context, rewrite it as a standalone understandable form
+4. If the question is already standalone, return the original question
+5. Only return the rewritten question, do not add any explanations
+
+Example:
+History: "User: Tell me about John\nAssistant: John is an engineer..."
+Question: "What is his job?"
+Rewrite as: "What is John's job?"
+
+History: {chat_history}
+Question: {question}
+
+Rewritten question:"""
+
+self.contextualizer = lazyllm.OnlineChatModule(
+    source="deepseek",
+    timeout=30
+).prompt(lazyllm.ChatPrompter(contextualize_prompt))
+```
+
+Then we build our main RAG pipeline with parallel retrieval and reranking:
+
+```python
+with pipeline() as ppl:
+    # Parallel retrieval
+    with parallel().sum as ppl.prl:
+        ppl.prl.retriever1 = Retriever(
+            documents,
+            group_name="sentences",
+            similarity="cosine",
+            topk=3
+        )
+        ppl.prl.retriever2 = Retriever(
+            documents,
+            "CoarseChunk",
+            "bm25_chinese",
+            0.003,
+            topk=2
+        )
+
+    # Reranking
+    ppl.reranker = Reranker(
+        "ModuleReranker",
+        model=OnlineEmbeddingModule(type="rerank", source="qwen"),
+        topk=3,
+        output_format='content',
+        join=True
+    ) | bind(query=ppl.input)
+
+    # LLM generates answer
+    ppl.llm = lazyllm.OnlineChatModule(
+        source="deepseek",
+        stream=False,
+        timeout=60
+    ).prompt(lazyllm.ChatPrompter(rag_prompt, extra_keys=["context_str", "chat_history"]))
+
+self.pipeline = ppl
+```
+
+The chat method handles the conversation flow:
+
+```python
+def chat(self, question: str) -> str:
+    """Process single turn conversation"""
+    # 1. Reformulate question
+    contextualized_question = self.contextualize_question(question)
+
+    # 2. Build historical conversation text
+    history_text = self._format_chat_history(self.chat_history, max_turns=4)
+
+    # 3. Generate answer through RAG pipeline
+    response = self.pipeline(
+        contextualized_question,
+        chat_history=history_text
+    )
+
+    # 4. Update chat history
+    self.chat_history.append({"role": "user", "content": question})
+    self.chat_history.append({"role": "assistant", "content": response})
+
+    return response
+```
+
+Let's test our conversational RAG:
+
+```python
+rag = ConversationalRAGPipeline()
+
+# First question
+response1 = rag.chat("What is the standard method for task decomposition?")
+print(f"Response 1: {response1}")
+
+# Follow-up question that depends on context
+response2 = rag.chat("What are the common extensions of this method?")
+print(f"Response 2: {response2}")
+```
+
+Note that the question the model generates for the second turn incorporates the conversational context: the ambiguous "this method" is rewritten into an explicit query about extensions of Chain of Thought prompting.
+
+## Agents
+
+Agents leverage the reasoning capabilities of LLMs to make decisions during execution. Using agents lets you delegate more of the decisions about the retrieval process to the LLM. Although their behavior is less predictable than the above "chain", they are able to execute multiple retrieval steps in service of a query, or iterate on a single search.
+
+Below we assemble a minimal RAG agent using LazyLLM's ReactAgent:
+
+```python
+# Module-level cache so every tool call reuses the same pipeline
+rag_system = None
+
+@fc_register("tool")
+def conversational_rag_chat(question: str) -> str:
+    """
+    Multi-turn conversational RAG tool
+
+    Args:
+        question (str): User question
+
+    Returns:
+        str: RAG system's answer
+    """
+    global rag_system
+    if rag_system is None:
+        rag_system = ConversationalRAGPipeline()
+
+    return rag_system.chat(question)
+
+def create_rag_agent():
+    """Create RAG agent"""
+    agent = lazyllm.ReactAgent(
+        llm=lazyllm.OnlineChatModule(source="deepseek", timeout=60),
+        tools=["conversational_rag_chat"],
+        max_retries=3,
+        return_trace=False,
+        stream=False
+    )
+
+    return agent
+```
+
+The key difference from our earlier implementation is that the run no longer ends with a single generation step: after each tool invocation, the agent can either answer the question using the retrieved context or issue another tool call to gather more information.
+
+Let's test this out with a question that would typically require an iterative sequence of retrieval steps:
+
+```python
+agent = create_rag_agent()
+
+response = agent(
+    "What is the standard method for Task Decomposition? "
+    "Once you get the answer, look up common extensions of that method."
+)
+print(response)
+```
+
+Note that the agent:
+1. Generates a query to search for the standard method for task decomposition
+2. Upon receiving the answer, generates a second query to search for common extensions of that method
+3. Having received all necessary context, answers the question
+
+## Running the Application
+
+To start the web interface:
+
+```bash
+python agent_n_rag.py
+```
+
+Then visit `http://localhost:8849` to interact with the conversational RAG system through a web interface.
+
+## Next steps
+
+We've covered the steps to build a basic conversational Q&A application:
+
+* We used chains to build a predictable application that contextualizes questions based on chat history
+* We used agents to build an application that can iterate on a sequence of queries
+
+To explore different types of retrievers and retrieval strategies, visit LazyLLM's documentation on retrieval components.
+
+For more advanced agent architectures, check out LazyLLM's agent documentation and examples.
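+
+For reference, the walkthrough above calls a few helper methods of `ConversationalRAGPipeline` that are not shown. The sketch below is one possible implementation of them; it is hypothetical, so adapt the sample corpus, history formatting, and prompter call convention to your own setup:
+
+```python
+def _create_sample_docs(self) -> str:
+    """Write a tiny sample corpus to a temporary directory so the demo can run
+    without external data (hypothetical helper; replace with your own dataset)."""
+    docs_dir = os.path.join(tempfile.gettempdir(), "rag_nloop_sample_docs")
+    os.makedirs(docs_dir, exist_ok=True)
+    sample_text = (
+        "Task decomposition can be done by LLM prompting, such as Chain of Thought (CoT). "
+        "Common extensions include Tree of Thoughts, which explores multiple reasoning paths."
+    )
+    with open(os.path.join(docs_dir, "sample.txt"), "w", encoding="utf-8") as f:
+        f.write(sample_text)
+    return docs_dir
+
+def _format_chat_history(self, chat_history: List[Dict[str, Any]], max_turns: int = 4) -> str:
+    """Render the most recent turns as plain text for the prompts."""
+    recent = chat_history[-max_turns * 2:]  # each turn is one user and one assistant message
+    return "\n".join(f"{m['role']}: {m['content']}" for m in recent)
+
+def contextualize_question(self, question: str) -> str:
+    """Rewrite the question into a standalone form using the chat history."""
+    if not self.chat_history:
+        return question
+    history_text = self._format_chat_history(self.chat_history, max_turns=4)
+    # Assumes the contextualizer's prompter exposes chat_history and question as template keys.
+    return self.contextualizer({"chat_history": history_text, "question": question})
+```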
diff --git a/docs/mkdocs.template.yml b/docs/mkdocs.template.yml
index 806aeee06..3e4ceca32 100644
--- a/docs/mkdocs.template.yml
+++ b/docs/mkdocs.template.yml
@@ -15,6 +15,7 @@ nav:
     - Great Writer: Cookbook/great_writer.md
    - RAG: Cookbook/rag.md
    - Streaming: Cookbook/streaming.md
+    - RAG with Chat History: Cookbook/rag_nloop.md
   - Best Practice:
    - Flow: Best Practice/flow.md
    - Flowapp: Best Practice/flowapp.md
diff --git a/docs/zh/Cookbook/rag_nloop.md b/docs/zh/Cookbook/rag_nloop.md
new file mode 100644
index 000000000..170656c3a
--- /dev/null
+++ b/docs/zh/Cookbook/rag_nloop.md
@@ -0,0 +1,262 @@
+# 构建带聊天历史的问答应用
+
+问答系统的一个关键挑战是处理对话上下文。本教程展示如何为 RAG 应用添加记忆功能,使其能够处理依赖于聊天历史的后续问题。
+
+我们将使用 LazyLLM 框架构建一个能够有效处理多轮对话的会话式 RAG 系统。
+
+## 环境配置
+
+### 组件
+
+我们将使用 LazyLLM 的内置组件来构建会话式 RAG 应用:
+
+- **Document**:用于加载和管理带有嵌入的文档
+- **Retriever**:基于相似度检索相关文档
+- **Reranker**:对检索到的文档进行重新排序以提高相关性
+- **OnlineChatModule**:用于基于 LLM 的问题重构和答案生成
+- **ReactAgent**:用于创建可以进行多次检索迭代的智能体
+
+### 依赖
+
+```python
+import os
+import tempfile
+from typing import List, Dict, Any
+import lazyllm
+from lazyllm import pipeline, parallel, bind, fc_register
+from lazyllm import Document, Retriever, Reranker, SentenceSplitter, OnlineEmbeddingModule
+```
+
+## 链式处理
+
+会话式 RAG 的一个关键挑战在于,最新的用户消息往往不是自包含的:它可能引用之前对话的部分内容,这使得单独理解它、以及直接用它去检索都变得困难。
+
+例如,下面的第二个问题依赖于第一个问题的上下文:
+
+```
+Human: 任务分解的标准方法是什么?
+AI: 任务分解的标准方法是思维链(Chain of Thought,CoT)提示...
+Human: 这种方法有哪些常见的扩展?
+```
+
+第二个问题"这种方法有哪些常见的扩展?"如果没有第一个问题的上下文是模糊的。我们的系统需要理解"这种方法"指的是"思维链提示"。
+
+### 聊天历史的状态管理
+
+为了处理这种情况,我们需要基于聊天历史来理解问题上下文。让我们构建我们的会话式 RAG 系统:
+
+```python
+class ConversationalRAGPipeline:
+    """基于 LazyLLM 组件的多轮会话 RAG 管道"""
+
+    def __init__(self, docs_path: str = None):
+        self.chat_history = []
+        self.docs_path = docs_path or self._create_sample_docs()
+        self.pipeline = None
+        self.init_pipeline()
+```
+
+首先,我们设置文档和嵌入:
+
+```python
+def init_pipeline(self):
+    """初始化 RAG 管道"""
+    # 创建文档和嵌入
+    documents = Document(
+        dataset_path=self.docs_path,
+        embed=OnlineEmbeddingModule(
+            source="qwen",
+            type="embed",
+            embed_model_name="text-embedding-v1"
+        ),
+        manager=False
+    )
+    documents.create_node_group(
+        name="sentences",
+        transform=SentenceSplitter,
+        chunk_size=512,
+        chunk_overlap=100
+    )
+```
+
+接下来,我们创建一个上下文化提示,用于基于聊天历史重新表述问题:
+
+```python
+contextualize_prompt = """你是一个帮助重写问题的助手。你的任务是:
+1. 分析用户的聊天历史
+2. 理解最新的问题
+3. 如果问题依赖于历史上下文,将其重写为独立可理解的形式
+4. 如果问题已经是独立的,返回原始问题
+5. 只返回重写后的问题,不要添加任何解释
+
+示例:
+历史记录:"用户:告诉我关于小明的信息\n助手:小明是一名工程师..."
+问题:"他的工作是什么?"
+重写为:"小明的工作是什么?"
+
+历史记录:{chat_history}
+问题:{question}
+
+重写后的问题:"""
+
+self.contextualizer = lazyllm.OnlineChatModule(
+    source="deepseek",
+    timeout=30
+).prompt(lazyllm.ChatPrompter(contextualize_prompt))
+```
+
+然后我们构建具有并行检索和重排序的主要 RAG 管道:
+
+```python
+with pipeline() as ppl:
+    # Parallel retrieval
+    with parallel().sum as ppl.prl:
+        ppl.prl.retriever1 = Retriever(
+            documents,
+            group_name="sentences",
+            similarity="cosine",
+            topk=3
+        )
+        ppl.prl.retriever2 = Retriever(
+            documents,
+            "CoarseChunk",
+            "bm25_chinese",
+            0.003,
+            topk=2
+        )
+
+    # Reranking
+    ppl.reranker = Reranker(
+        "ModuleReranker",
+        model=OnlineEmbeddingModule(type="rerank", source="qwen"),
+        topk=3,
+        output_format='content',
+        join=True
+    ) | bind(query=ppl.input)
+
+    # LLM generates answer
+    ppl.llm = lazyllm.OnlineChatModule(
+        source="deepseek",
+        stream=False,
+        timeout=60
+    ).prompt(lazyllm.ChatPrompter(rag_prompt, extra_keys=["context_str", "chat_history"]))
+
+self.pipeline = ppl
+```
+
+聊天方法处理对话流程:
+
+```python
+def chat(self, question: str) -> str:
+    """处理单轮对话"""
+    # 1. 重构问题
+    contextualized_question = self.contextualize_question(question)
+
+    # 2. 构建历史对话文本
+    history_text = self._format_chat_history(self.chat_history, max_turns=4)
+
+    # 3. 通过 RAG 管道生成答案
+    response = self.pipeline(
+        contextualized_question,
+        chat_history=history_text
+    )
+
+    # 4. 更新聊天历史
+    self.chat_history.append({"role": "user", "content": question})
+    self.chat_history.append({"role": "assistant", "content": response})
+
+    return response
+```
+
+让我们测试我们的会话式 RAG:
+
+```python
+rag = ConversationalRAGPipeline()
+
+# 第一个问题
+response1 = rag.chat("任务分解的标准方法是什么?")
+print(f"回复 1: {response1}")
+
+# 依赖上下文的后续问题
+response2 = rag.chat("这种方法有哪些常见的扩展?")
+print(f"回复 2: {response2}")
+```
+
+注意,模型为第二轮生成的重写问题包含了对话上下文,将模糊的"这种方法"改写为关于"思维链提示"扩展的明确查询。
+
+## 智能体
+
+智能体利用 LLM 的推理能力在执行过程中做出决策。使用智能体可以把更多关于检索过程的决策权交给 LLM。虽然它们的行为比上述"链式"处理更难预测,但它们能够执行多个检索步骤来服务于一个查询,或者对单个搜索进行迭代。
+
+下面我们使用 LazyLLM 的 ReactAgent 组装一个最小的 RAG 智能体:
+
+```python
+# 模块级缓存,保证多次工具调用复用同一个管道
+rag_system = None
+
+@fc_register("tool")
+def conversational_rag_chat(question: str) -> str:
+    """
+    Multi-turn conversational RAG tool
+
+    Args:
+        question (str): User question
+
+    Returns:
+        str: RAG system's answer
+    """
+    global rag_system
+    if rag_system is None:
+        rag_system = ConversationalRAGPipeline()
+
+    return rag_system.chat(question)
+
+def create_rag_agent():
+    """Create RAG agent"""
+    agent = lazyllm.ReactAgent(
+        llm=lazyllm.OnlineChatModule(source="deepseek", timeout=60),
+        tools=["conversational_rag_chat"],
+        max_retries=3,
+        return_trace=False,
+        stream=False
+    )
+
+    return agent
+```
+
+与之前实现的主要区别在于,运行不再以单次生成步骤结束:每次工具调用之后,智能体既可以利用检索到的上下文回答问题,也可以发起另一次工具调用来收集更多信息。
+
+让我们用一个通常需要多步迭代检索的问题来测试:
+
+```python
+agent = create_rag_agent()
+
+response = agent(
+    "任务分解的标准方法是什么?"
+    "获得答案后,查找该方法的常见扩展。"
+)
+print(response)
+```
+
+注意智能体:
+1. 生成查询以搜索任务分解的标准方法
+2. 收到答案后,生成第二个查询以搜索其常见扩展
+3. 在收到所有必要的上下文后,回答问题
+
+## 运行应用
+
+启动 Web 界面:
+
+```bash
+python agent_n_rag.py
+```
+
+然后访问 `http://localhost:8849` 通过 Web 界面与会话式 RAG 系统进行交互。
+
+## 下一步
+
+我们已经介绍了构建基本会话式问答应用的步骤:
+
+* 我们使用链来构建一个可预测的应用,该应用基于聊天历史来理解问题上下文
+* 我们使用智能体来构建一个可以对查询序列进行迭代的应用
+
+要探索不同类型的检索器和检索策略,请访问 LazyLLM 的检索组件文档。
+
+要了解更高级的智能体架构,请查看 LazyLLM 的智能体文档和示例。
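+
+作为参考,上文提到的 `agent_n_rag.py` 入口脚本大致可以这样组织(仅为示意性写法,假设使用 `lazyllm.WebModule` 启动 Web 界面,端口与上文的 8849 保持一致,具体请以实际代码为准):
+
+```python
+if __name__ == "__main__":
+    # 示意:用 WebModule 将智能体包装为 Web 服务(假设性示例)
+    agent = create_rag_agent()
+    lazyllm.WebModule(agent, port=8849).start().wait()
+```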