From 530a2b6f170b1ce9324220950c6643db31b4cd57 Mon Sep 17 00:00:00 2001
From: chen zhang
Date: Sat, 19 Jul 2025 10:51:52 +0800
Subject: [PATCH] enable deep mode for local ollama

---
 api/websocket_wiki.py | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/api/websocket_wiki.py b/api/websocket_wiki.py
index c8292996..d077a06a 100644
--- a/api/websocket_wiki.py
+++ b/api/websocket_wiki.py
@@ -403,7 +403,10 @@ async def handle_websocket_chat(websocket: WebSocket):
                 conversation_history += f"\n{turn.user_query.query_str}\n{turn.assistant_response.response_str}\n\n"
 
         # Create the prompt with context
-        prompt = f"/no_think {system_prompt}\n\n"
+        if request.provider == "ollama":
+            prompt = f"/think {system_prompt}\n\n"
+        else:
+            prompt = f"/no_think {system_prompt}\n\n"
 
         if conversation_history:
             prompt += f"\n{conversation_history}\n\n"
@@ -428,7 +431,7 @@ async def handle_websocket_chat(websocket: WebSocket):
         model_config = get_model_config(request.provider, request.model)["model_kwargs"]
 
         if request.provider == "ollama":
-            prompt += " /no_think"
+            prompt += " /think"
 
             model = OllamaClient()
             model_kwargs = {
@@ -527,11 +530,18 @@ async def handle_websocket_chat(websocket: WebSocket):
                 # Get the response and handle it properly using the previously created api_kwargs
                 response = await model.acall(api_kwargs=api_kwargs, model_type=ModelType.LLM)
                 # Handle streaming response from Ollama
+                think = False
                 async for chunk in response:
                     text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
                     if text and not text.startswith('model=') and not text.startswith('created_at='):
-                        text = text.replace('<think>', '').replace('</think>', '')
-                        await websocket.send_text(text)
+                        if text == '<think>':
+                            think = True
+                            logger.info("think enabled")
+                        elif text == '</think>':
+                            think = False
+                        # skip <think> ... </think> in output
+                        elif not think:
+                            await websocket.send_text(text)
 
                 # Explicitly close the WebSocket connection after the response is complete
                 await websocket.close()
             elif request.provider == "openrouter":
@@ -614,7 +624,10 @@ async def handle_websocket_chat(websocket: WebSocket):
                 logger.warning("Token limit exceeded, retrying without context")
                 try:
                     # Create a simplified prompt without context
-                    simplified_prompt = f"/no_think {system_prompt}\n\n"
+                    if request.provider == "ollama":
+                        simplified_prompt = f"/think {system_prompt}\n\n"
+                    else:
+                        simplified_prompt = f"/no_think {system_prompt}\n\n"
 
                     if conversation_history:
                         simplified_prompt += f"\n{conversation_history}\n\n"
@@ -626,7 +639,7 @@ async def handle_websocket_chat(websocket: WebSocket):
                     simplified_prompt += f"\n{query}\n\n\nAssistant: "
 
                     if request.provider == "ollama":
-                        simplified_prompt += " /no_think"
+                        simplified_prompt += " /think"
 
                         # Create new api_kwargs with the simplified prompt
                         fallback_api_kwargs = model.convert_inputs_to_api_kwargs(
@@ -639,11 +652,16 @@ async def handle_websocket_chat(websocket: WebSocket):
                         fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM)
 
                         # Handle streaming fallback_response from Ollama
+                        think = False
                         async for chunk in fallback_response:
                             text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
                             if text and not text.startswith('model=') and not text.startswith('created_at='):
-                                text = text.replace('<think>', '').replace('</think>', '')
-                                await websocket.send_text(text)
+                                if text == '<think>':
+                                    think = True
+                                elif text == '</think>':
+                                    think = False
+                                elif not think:
+                                    await websocket.send_text(text)
                     elif request.provider == "openrouter":
                         try:
                             # Create new api_kwargs with the simplified prompt
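
For reference, a minimal self-contained sketch of the streaming filter this
patch introduces. It assumes, as the handler above does, that the Qwen-style
<think> / </think> markers each arrive from Ollama as whole chunks; a marker
split across two chunks would leak through. filter_think_tags and the fake
stream below are illustrative names, not part of the repository.

import asyncio
from typing import AsyncIterator


async def filter_think_tags(chunks: AsyncIterator[str]) -> AsyncIterator[str]:
    """Drop everything between <think> and </think> in a token stream.

    Assumes each marker arrives as its own chunk; the elif chain also
    suppresses the marker chunks themselves, mirroring the patched handler.
    """
    in_think = False
    async for text in chunks:
        if text == '<think>':
            in_think = True
        elif text == '</think>':
            in_think = False
        elif not in_think:
            yield text


async def _demo() -> None:
    # Hypothetical stand-in for the Ollama streaming response.
    async def fake_stream() -> AsyncIterator[str]:
        for token in ['<think>', 'planning...', '</think>', 'Hello', ', world']:
            yield token

    async for visible in filter_think_tags(fake_stream()):
        print(visible, end='')  # prints "Hello, world"


if __name__ == '__main__':
    asyncio.run(_demo())

Guarding the send with elif (rather than a separate if) is what keeps the
closing </think> chunk itself out of the client-visible output.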