---
name: "hermes-vllm"

config_file: |
  backend: vllm
  context_size: 8192
  stopwords:
  - "<|im_end|>"
  - "<dummy32000>"
  - "<|eot_id|>"
  - "<|end_of_text|>"
  function:
    disable_no_action: true
    grammar:
      # Uncomment the line below to enable grammar matching on JSON results if the model
      # is breaking the output. This makes the model more accurate and keeps the JSON
      # output well-formed, but it also makes parallel_calls non-functional (a known bug).
      # mixed_mode: true
      disable: true
      parallel_calls: true
      expect_strings_after_json: true
    json_regex_match:
    - "(?s)<tool_call>(.*?)</tool_call>"
    - "(?s)<tool_call>(.*)"
    capture_llm_results:
    - "(?s)<scratchpad>(.*?)</scratchpad>"
    replace_llm_results:
    - key: "(?s)<scratchpad>(.*?)</scratchpad>"
      value: ""
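    # How the raw model output is parsed (a summary of the settings above):
    # json_regex_match extracts the JSON payload between <tool_call> tags (the second
    # pattern is a fallback for an unterminated closing tag), capture_llm_results
    # records the <scratchpad> reasoning, and replace_llm_results strips that
    # scratchpad block from the reply returned to the client.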
  template:
    use_tokenizer_template: true
    chat: |
      {{.Input -}}
      <|im_start|>assistant
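    # chat_message renders one ChatML turn: the role header, then any content,
    # with assistant function calls wrapped in <tool_call> tags and tool results
    # wrapped in <tool_response> tags, and every turn closed by <|im_end|>.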
    chat_message: |
      <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
      {{- if .FunctionCall }}
      <tool_call>
      {{- else if eq .RoleName "tool" }}
      <tool_response>
      {{- end }}
      {{- if .Content}}
      {{.Content }}
      {{- end }}
      {{- if .FunctionCall}}
      {{toJson .FunctionCall}}
      {{- end }}
      {{- if .FunctionCall }}
      </tool_call>
      {{- else if eq .RoleName "tool" }}
      </tool_response>
      {{- end }}<|im_end|>
    completion: |
      {{.Input}}
    function: |
      <|im_start|>system
      You are a function calling AI model.
      Here are the available tools:
      <tools>
      {{range .Functions}}
      {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
      {{end}}
      </tools>
      You should call the tools provided to you sequentially.
      Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions, as follows:
      <scratchpad>
      {step-by-step reasoning and plan in bullet points}
      </scratchpad>
      For each function call return a JSON object with function name and arguments within <tool_call> XML tags as follows:
      <tool_call>
      {"arguments": <args-dict>, "name": <function-name>}
      </tool_call><|im_end|>
      {{.Input -}}
      <|im_start|>assistant
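  # With the system prompt above, a well-behaved completion looks roughly like this
  # (the function name and arguments are purely illustrative):
  #   <scratchpad>
  #   - the user asked for the weather, so call get_weather with the city
  #   </scratchpad>
  #   <tool_call>
  #   {"arguments": {"city": "Berlin"}, "name": "get_weather"}
  #   </tool_call><|im_end|>
  # The json_regex_match patterns above then pull the JSON out of the <tool_call> tags.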
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit GPU memory utilization (the vLLM default is 0.9, i.e. 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from Hugging Face
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment to set the tensor parallel size, i.e. the number of GPUs the model's
  # tensors are split across. This lets large models run across multiple GPUs, but the
  # performance gains are limited: https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
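
# A quick way to exercise the function-calling setup once LocalAI has loaded this
# model (a sketch; assumes LocalAI listening on localhost:8080 and a hypothetical
# get_weather tool matching the illustrative output above):
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "hermes-vllm",
#           "messages": [{"role": "user", "content": "What is the weather in Berlin?"}],
#           "tools": [{"type": "function", "function": {
#             "name": "get_weather",
#             "description": "Get the current weather for a city",
#             "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}
#           }}]
#         }'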