diff --git a/api/websocket_wiki.py b/api/websocket_wiki.py
index c8292996..d077a06a 100644
--- a/api/websocket_wiki.py
+++ b/api/websocket_wiki.py
@@ -403,7 +403,12 @@ async def handle_websocket_chat(websocket: WebSocket):
conversation_history += f"\n{turn.user_query.query_str}\n{turn.assistant_response.response_str}\n\n"
# Create the prompt with context
- prompt = f"/no_think {system_prompt}\n\n"
+ if request.provider == "ollama":
+ prompt = f"/think {system_prompt}\n\n"
+ else:
+ prompt = f"/no_think {system_prompt}\n\n"
if conversation_history:
prompt += f"\n{conversation_history}\n\n"
@@ -428,7 +433,7 @@ async def handle_websocket_chat(websocket: WebSocket):
model_config = get_model_config(request.provider, request.model)["model_kwargs"]
if request.provider == "ollama":
- prompt += " /no_think"
+ prompt += " /think"
model = OllamaClient()
model_kwargs = {
@@ -527,11 +532,21 @@ async def handle_websocket_chat(websocket: WebSocket):
# Get the response and handle it properly using the previously created api_kwargs
response = await model.acall(api_kwargs=api_kwargs, model_type=ModelType.LLM)
# Handle streaming response from Ollama
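+ # Track whether the stream is inside a <think>...</think> block so the
+ # model's reasoning tokens are not forwarded to the client (assumes the
+ # tags arrive as standalone chunks).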
+ think = False
async for chunk in response:
text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
if text and not text.startswith('model=') and not text.startswith('created_at='):
- text = text.replace('<think>', '').replace('</think>', '')
- await websocket.send_text(text)
+ if text == '<think>':
+ think = True
+ logger.info("think enabled")
+ elif text == '</think>':
+ think = False
+ # skip <think>...</think> content in output
+ if not think:
+ await websocket.send_text(text)
# Explicitly close the WebSocket connection after the response is complete
await websocket.close()
elif request.provider == "openrouter":
@@ -614,7 +629,10 @@ async def handle_websocket_chat(websocket: WebSocket):
logger.warning("Token limit exceeded, retrying without context")
try:
# Create a simplified prompt without context
- simplified_prompt = f"/no_think {system_prompt}\n\n"
+ if request.provider == "ollama":
+ simplified_prompt = f"/think {system_prompt}\n\n"
+ else:
+ simplified_prompt = f"/no_think {system_prompt}\n\n"
if conversation_history:
simplified_prompt += f"\n{conversation_history}\n\n"
@@ -626,7 +644,7 @@ async def handle_websocket_chat(websocket: WebSocket):
simplified_prompt += f"\n{query}\n\n\nAssistant: "
if request.provider == "ollama":
- simplified_prompt += " /no_think"
+ simplified_prompt += " /think"
# Create new api_kwargs with the simplified prompt
fallback_api_kwargs = model.convert_inputs_to_api_kwargs(
@@ -639,11 +657,17 @@ async def handle_websocket_chat(websocket: WebSocket):
fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM)
# Handle streaming fallback_response from Ollama
- async for chunk in fallback_response:
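+ # Apply the same <think>...</think> filtering as the primary path.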
+ think = False
+ async for chunk in fallback_response:
text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
if text and not text.startswith('model=') and not text.startswith('created_at='):
- text = text.replace('<think>', '').replace('</think>', '')
- await websocket.send_text(text)
+ if text == '<think>':
+ think = True
+ elif text == '</think>':
+ think = False
+ if not think:
+ await websocket.send_text(text)
elif request.provider == "openrouter":
try:
# Create new api_kwargs with the simplified prompt