@@ -403,7 +403,10 @@ async def handle_websocket_chat(websocket: WebSocket):
             conversation_history += f"<turn>\n<user>{turn.user_query.query_str}</user>\n<assistant>{turn.assistant_response.response_str}</assistant>\n</turn>\n"
 
     # Create the prompt with context
-    prompt = f"/no_think {system_prompt}\n\n"
+    if request.provider == "ollama":
+        prompt = f"/think {system_prompt}\n\n"
+    else:
+        prompt = f"/no_think {system_prompt}\n\n"
 
     if conversation_history:
         prompt += f"<conversation_history>\n{conversation_history}</conversation_history>\n\n"
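Note: the same provider check recurs in the token-limit fallback further down (the @@ -614 hunk). A minimal sketch of how the prefix choice could be factored out, assuming only what this diff shows; thinking_prefix is a hypothetical name, and /think and /no_think are the soft switches that Qwen3-style models served through Ollama recognize:

    # Hypothetical helper, not part of this diff. Assumes request.provider is a
    # plain string and the Ollama-served model honors Qwen3-style soft switches.
    def thinking_prefix(provider: str) -> str:
        # Ollama gets /think so the model emits a reasoning block (filtered out
        # of the stream later); all other providers keep /no_think.
        return "/think" if provider == "ollama" else "/no_think"

    prompt = f"{thinking_prefix(request.provider)} {system_prompt}\n\n"

Both this hunk and the fallback hunk would then stay in sync automatically.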
@@ -428,7 +431,7 @@ async def handle_websocket_chat(websocket: WebSocket):
     model_config = get_model_config(request.provider, request.model)["model_kwargs"]
 
     if request.provider == "ollama":
-        prompt += " /no_think"
+        prompt += " /think"
 
         model = OllamaClient()
        model_kwargs = {
@@ -527,11 +530,18 @@ async def handle_websocket_chat(websocket: WebSocket):
         # Get the response and handle it properly using the previously created api_kwargs
         response = await model.acall(api_kwargs=api_kwargs, model_type=ModelType.LLM)
         # Handle streaming response from Ollama
+        think = False
         async for chunk in response:
             text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
             if text and not text.startswith('model=') and not text.startswith('created_at='):
-                text = text.replace('<think>', '').replace('</think>', '')
-                await websocket.send_text(text)
+                if text == '<think>':
+                    think = True
+                    logger.info("think enabled")
+                elif text == '</think>':
+                    think = False
+                # skip <think>...</think> in output, including the tags themselves
+                elif not think:
+                    await websocket.send_text(text)
         # Explicitly close the WebSocket connection after the response is complete
         await websocket.close()
     elif request.provider == "openrouter":
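Note: the tag-skipping loop above is duplicated in the fallback hunk near the end of this diff. A sketch of the same filtering as a reusable async generator (strip_think is a hypothetical name). Like the diff, it assumes the Ollama stream yields '<think>' and '</think>' as standalone chunks; a model that glues a tag to adjacent text would slip past the equality checks:

    # Sketch only; mirrors the filtering logic in this diff, not an official API.
    async def strip_think(chunks):
        """Yield user-visible text chunks, dropping the <think>...</think> block."""
        think = False
        async for chunk in chunks:
            text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
            if not text or text.startswith('model=') or text.startswith('created_at='):
                continue
            if text == '<think>':
                think = True
            elif text == '</think>':
                think = False
            elif not think:
                yield text

    # Usage in either streaming branch:
    #     async for text in strip_think(response):
    #         await websocket.send_text(text)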
@@ -614,7 +624,10 @@ async def handle_websocket_chat(websocket: WebSocket):
         logger.warning("Token limit exceeded, retrying without context")
         try:
             # Create a simplified prompt without context
-            simplified_prompt = f"/no_think {system_prompt}\n\n"
+            if request.provider == "ollama":
+                simplified_prompt = f"/think {system_prompt}\n\n"
+            else:
+                simplified_prompt = f"/no_think {system_prompt}\n\n"
             if conversation_history:
                 simplified_prompt += f"<conversation_history>\n{conversation_history}</conversation_history>\n\n"
 
@@ -626,7 +639,7 @@ async def handle_websocket_chat(websocket: WebSocket):
             simplified_prompt += f"<query>\n{query}\n</query>\n\nAssistant: "
 
             if request.provider == "ollama":
-                simplified_prompt += " /no_think"
+                simplified_prompt += " /think"
 
                 # Create new api_kwargs with the simplified prompt
                 fallback_api_kwargs = model.convert_inputs_to_api_kwargs(
@@ -639,10 +652,15 @@ async def handle_websocket_chat(websocket: WebSocket):
                 fallback_response = await model.acall(api_kwargs=fallback_api_kwargs, model_type=ModelType.LLM)
 
                 # Handle streaming fallback_response from Ollama
+                think = False
                 async for chunk in fallback_response:
                     text = getattr(chunk, 'response', None) or getattr(chunk, 'text', None) or str(chunk)
                     if text and not text.startswith('model=') and not text.startswith('created_at='):
-                        text = text.replace('<think>', '').replace('</think>', '')
-                        await websocket.send_text(text)
+                        if text == '<think>':
+                            think = True
+                        elif text == '</think>':
+                            think = False
+                        elif not think:
+                            await websocket.send_text(text)
             elif request.provider == "openrouter":
                 try: