
Commit 0900413

Add next questions suggestion to the user (#170)
--------- Co-authored-by: Marcus Schiesser <[email protected]>
1 parent 8dc6a2b commit 0900413

File tree: 13 files changed, +237 −21 lines


.changeset/tall-pans-bake.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Add suggestions for next questions.

templates/components/llamaindex/typescript/streaming/stream.ts

Lines changed: 20 additions & 3 deletions
@@ -5,34 +5,51 @@ import {
   trimStartOfStreamHelper,
   type AIStreamCallbacksAndOptions,
 } from "ai";
-import { EngineResponse } from "llamaindex";
+import { ChatMessage, EngineResponse } from "llamaindex";
+import { generateNextQuestions } from "./suggestion";
 
 export function LlamaIndexStream(
   response: AsyncIterable<EngineResponse>,
   data: StreamData,
+  chatHistory: ChatMessage[],
   opts?: {
     callbacks?: AIStreamCallbacksAndOptions;
   },
 ): ReadableStream<Uint8Array> {
-  return createParser(response, data)
+  return createParser(response, data, chatHistory)
     .pipeThrough(createCallbacksTransformer(opts?.callbacks))
     .pipeThrough(createStreamDataTransformer());
 }
 
-function createParser(res: AsyncIterable<EngineResponse>, data: StreamData) {
+function createParser(
+  res: AsyncIterable<EngineResponse>,
+  data: StreamData,
+  chatHistory: ChatMessage[],
+) {
   const it = res[Symbol.asyncIterator]();
   const trimStartOfStream = trimStartOfStreamHelper();
+  let llmTextResponse = "";
 
   return new ReadableStream<string>({
     async pull(controller): Promise<void> {
       const { value, done } = await it.next();
       if (done) {
         controller.close();
+        // LLM stream is done, generate the next questions with a new LLM call
+        chatHistory.push({ role: "assistant", content: llmTextResponse });
+        const questions: string[] = await generateNextQuestions(chatHistory);
+        if (questions.length > 0) {
+          data.appendMessageAnnotation({
+            type: "suggested_questions",
+            data: questions,
+          });
+        }
         data.close();
         return;
       }
       const text = trimStartOfStream(value.delta ?? "");
       if (text) {
+        llmTextResponse += text;
         controller.enqueue(text);
       }
     },
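
For reference, a minimal sketch of the annotation payload this change appends to the Vercel/AI stream. The SuggestedQuestionsAnnotation type name and the sample questions are illustrative assumptions; only the "suggested_questions" type tag and the string-array payload come from the diff above (and from the FastAPI changes below).

// Illustrative only: shape of the message annotation appended via
// data.appendMessageAnnotation(...) and mirrored by the FastAPI template.
type SuggestedQuestionsAnnotation = {
  type: "suggested_questions";
  data: string[]; // the questions returned by generateNextQuestions
};

const exampleAnnotation: SuggestedQuestionsAnnotation = {
  type: "suggested_questions",
  data: ["What file types can I upload?", "How do I switch the LLM provider?"],
};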
templates/components/llamaindex/typescript/streaming/suggestion.ts

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+import { ChatMessage, Settings } from "llamaindex";
+
+const NEXT_QUESTION_PROMPT_TEMPLATE = `You're a helpful assistant! Your task is to suggest the next question that the user might ask.
+Here is the conversation history
+---------------------
+$conversation
+---------------------
+Given the conversation history, please give me $number_of_questions questions that you might ask next!
+Your answer should be wrapped in triple backticks and follow this format:
+\`\`\`
+<question 1>
+<question 2>\`\`\`
+`;
+const N_QUESTIONS_TO_GENERATE = 3;
+
+export async function generateNextQuestions(
+  conversation: ChatMessage[],
+  numberOfQuestions: number = N_QUESTIONS_TO_GENERATE,
+) {
+  const llm = Settings.llm;
+
+  // Format the conversation as "<role>: <content>" lines
+  const conversationText = conversation
+    .map((message) => `${message.role}: ${message.content}`)
+    .join("\n");
+  const message = NEXT_QUESTION_PROMPT_TEMPLATE.replace(
+    "$conversation",
+    conversationText,
+  ).replace("$number_of_questions", numberOfQuestions.toString());
+
+  try {
+    const response = await llm.complete({ prompt: message });
+    const questions = extractQuestions(response.text);
+    return questions;
+  } catch (error) {
+    console.error("Error: ", error);
+    throw error;
+  }
+}
+
+// TODO: instead of parsing the LLM's result, we can use structured predict once LITS supports it
+function extractQuestions(text: string): string[] {
+  // Extract the text inside the triple backticks
+  const contentMatch = text.match(/```(.*?)```/s);
+  const content = contentMatch ? contentMatch[1] : "";
+
+  // Split the content by newlines to get each question
+  const questions = content
+    .split("\n")
+    .map((question) => question.trim())
+    .filter((question) => question !== "");
+
+  return questions;
+}
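
A hedged usage sketch for generateNextQuestions (not part of the commit); it assumes Settings.llm has already been configured by the template's app setup, and the history content is made up.

// Illustrative usage only; ChatMessage and generateNextQuestions are the
// real exports used in this diff, the conversation content is hypothetical.
import { ChatMessage } from "llamaindex";
import { generateNextQuestions } from "./suggestion";

async function demo() {
  const chatHistory: ChatMessage[] = [
    { role: "user", content: "What does create-llama scaffold?" },
    { role: "assistant", content: "A LlamaIndex-powered chat app." },
  ];
  // Returns up to N_QUESTIONS_TO_GENERATE (3) parsed follow-up questions,
  // or [] if the model's reply contains no triple-backtick block.
  const questions = await generateNextQuestions(chatHistory);
  console.log(questions);
}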

templates/types/streaming/express/src/controllers/chat.controller.ts

Lines changed: 5 additions & 1 deletion
@@ -67,7 +67,11 @@ export const chat = async (req: Request, res: Response) => {
     });
 
     // Return a stream, which can be consumed by the Vercel/AI client
-    const stream = LlamaIndexStream(response, vercelStreamData);
+    const stream = LlamaIndexStream(
+      response,
+      vercelStreamData,
+      messages as ChatMessage[],
+    );
 
     return streamToResponse(stream, res, {}, vercelStreamData);
   } catch (error) {

templates/types/streaming/fastapi/app/api/routers/chat.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ async def chat(
         response = await chat_engine.astream_chat(last_message_content, messages)
         process_response_nodes(response.source_nodes, background_tasks)
 
-        return VercelStreamResponse(request, event_handler, response)
+        return VercelStreamResponse(request, event_handler, response, data)
     except Exception as e:
         logger.exception("Error in chat engine", exc_info=True)
         raise HTTPException(

templates/types/streaming/fastapi/app/api/routers/models.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ class File(BaseModel):
     filetype: str
 
 
-class AnnotationData(BaseModel):
+class AnnotationFileData(BaseModel):
     files: List[File] = Field(
         default=[],
         description="List of files",
@@ -50,7 +50,7 @@ class Config:
 
 class Annotation(BaseModel):
     type: str
-    data: AnnotationData
+    data: AnnotationFileData | List[str]
 
     def to_content(self) -> str | None:
         if self.type == "document_file":

templates/types/streaming/fastapi/app/api/routers/vercel_response.py

Lines changed: 34 additions & 11 deletions
@@ -6,7 +6,8 @@
 from llama_index.core.chat_engine.types import StreamingAgentChatResponse
 
 from app.api.routers.events import EventCallbackHandler
-from app.api.routers.models import SourceNodes
+from app.api.routers.models import ChatData, Message, SourceNodes
+from app.api.services.suggestion import NextQuestionSuggestion
 
 
 class VercelStreamResponse(StreamingResponse):
@@ -17,15 +18,6 @@ class VercelStreamResponse(StreamingResponse):
     TEXT_PREFIX = "0:"
     DATA_PREFIX = "8:"
 
-    def __init__(
-        self,
-        request: Request,
-        event_handler: EventCallbackHandler,
-        response: StreamingAgentChatResponse,
-    ):
-        content = self.content_generator(request, event_handler, response)
-        super().__init__(content=content)
-
     @classmethod
     def convert_text(cls, token: str):
         # Escape newlines and double quotes to avoid breaking the stream
@@ -37,17 +29,48 @@ def convert_data(cls, data: dict):
         data_str = json.dumps(data)
         return f"{cls.DATA_PREFIX}[{data_str}]\n"
 
+    def __init__(
+        self,
+        request: Request,
+        event_handler: EventCallbackHandler,
+        response: StreamingAgentChatResponse,
+        chat_data: ChatData,
+    ):
+        content = VercelStreamResponse.content_generator(
+            request, event_handler, response, chat_data
+        )
+        super().__init__(content=content)
+
     @classmethod
     async def content_generator(
         cls,
         request: Request,
         event_handler: EventCallbackHandler,
         response: StreamingAgentChatResponse,
+        chat_data: ChatData,
    ):
         # Yield the text response
         async def _chat_response_generator():
+            final_response = ""
             async for token in response.async_response_gen():
-                yield cls.convert_text(token)
+                final_response += token
+                yield VercelStreamResponse.convert_text(token)
+
+            # Generate questions that the user might be interested in
+            conversation = chat_data.messages + [
+                Message(role="assistant", content=final_response)
+            ]
+            questions = await NextQuestionSuggestion.suggest_next_questions(
+                conversation
+            )
+            if len(questions) > 0:
+                yield VercelStreamResponse.convert_data(
+                    {
+                        "type": "suggested_questions",
+                        "data": questions,
+                    }
+                )
+
             # the text_generator is the leading stream, once it's finished, also finish the event stream
             event_handler.is_done = True

templates/types/streaming/fastapi/app/api/services/suggestion.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+from typing import List
+
+from app.api.routers.models import Message
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.settings import Settings
+from pydantic import BaseModel
+
+NEXT_QUESTIONS_SUGGESTION_PROMPT = PromptTemplate(
+    "You're a helpful assistant! Your task is to suggest the next question that the user might ask. "
+    "\nHere is the conversation history"
+    "\n---------------------\n{conversation}\n---------------------"
+    "\nGiven the conversation history, please give me {number_of_questions} questions that you might ask next!"
+)
+N_QUESTION_TO_GENERATE = 3
+
+
+class NextQuestions(BaseModel):
+    """A list of questions that the user might ask next"""
+
+    questions: List[str]
+
+
+class NextQuestionSuggestion:
+    @staticmethod
+    async def suggest_next_questions(
+        messages: List[Message],
+        number_of_questions: int = N_QUESTION_TO_GENERATE,
+    ) -> List[str]:
+        # Reduce the cost by only using the last two messages
+        last_user_message = None
+        last_assistant_message = None
+        for message in reversed(messages):
+            if message.role == "user":
+                last_user_message = f"User: {message.content}"
+            elif message.role == "assistant":
+                last_assistant_message = f"Assistant: {message.content}"
+            if last_user_message and last_assistant_message:
+                break
+        conversation: str = f"{last_user_message}\n{last_assistant_message}"
+
+        output: NextQuestions = await Settings.llm.astructured_predict(
+            NextQuestions,
+            prompt=NEXT_QUESTIONS_SUGGESTION_PROMPT,
+            conversation=conversation,
+            number_of_questions=number_of_questions,
+        )
+
+        return output.questions

templates/types/streaming/nextjs/app/api/chat/route.ts

Lines changed: 5 additions & 1 deletion
@@ -80,7 +80,11 @@ export async function POST(request: NextRequest) {
     });
 
     // Transform LlamaIndex stream to Vercel/AI format
-    const stream = LlamaIndexStream(response, vercelStreamData);
+    const stream = LlamaIndexStream(
+      response,
+      vercelStreamData,
+      messages as ChatMessage[],
+    );
 
     // Return a StreamingTextResponse, which can be consumed by the Vercel/AI client
     return new StreamingTextResponse(stream, {}, vercelStreamData);
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import { useState } from "react";
+import { ChatHandler, SuggestedQuestionsData } from "..";
+
+export function SuggestedQuestions({
+  questions,
+  append,
+}: {
+  questions: SuggestedQuestionsData;
+  append: Pick<ChatHandler, "append">["append"];
+}) {
+  const [showQuestions, setShowQuestions] = useState(questions.length > 0);
+
+  return (
+    showQuestions &&
+    append !== undefined && (
+      <div className="flex flex-col space-y-2">
+        {questions.map((question, index) => (
+          <a
+            key={index}
+            onClick={() => {
+              append({ role: "user", content: question });
+              setShowQuestions(false);
+            }}
+            className="text-sm italic hover:underline cursor-pointer"
+          >
+            {"->"} {question}
+          </a>
+        ))}
+      </div>
+    )
+  );
+}
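
A sketch of how a parent component might extract the questions this component renders; the getSuggestedQuestions helper is hypothetical (the wiring in the chat message components is not part of this excerpt), and only the "suggested_questions" annotation type and string-array payload are taken from the diff above.

// Hypothetical helper, for illustration only.
function getSuggestedQuestions(message: {
  annotations?: { type: string; data: unknown }[];
}): string[] {
  return (message.annotations ?? [])
    .filter((a) => a.type === "suggested_questions")
    .flatMap((a) => a.data as string[]);
}

// Possible usage in a message list component:
// <SuggestedQuestions questions={getSuggestedQuestions(lastMessage)} append={append} />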
