models.json
[
{"model": "google/gemma-3-270m-it", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion"], "description": "Gemma 3 270M instruction-tuned model for on-device text completion."},
{"model": "google/functiongemma-270m-it", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools"], "description": "Gemma 3 270M fine-tuned for structured tool and function calling."},
{"model": "LiquidAI/LFM2.5-350M", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2.5 350M compact hybrid language model from Liquid AI designed for edge deployment."},
{"model": "LiquidAI/LFM2-700M", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2 700M hybrid language model from Liquid AI for on-device chat and embeddings."},
{"model": "Qwen/Qwen3-0.6B", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "Qwen3 0.6B compact model supporting thinking and non-thinking modes for on-device chat."},
{"model": "Qwen/Qwen3.5-0.8B", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "image-text-to-text", "tags": ["vision","completion","tools","embed"], "description": "Qwen3.5 0.8B hybrid vision-language model with DeltaNet for on-device multimodal inference."},
{"model": "google/gemma-3-1b-it", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion"], "description": "Gemma 3 1B instruction-tuned model for on-device text completion."},
{"model": "google/gemma-3n-E2B-it", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools"], "description": "Gemma 3n E2B instruction-tuned model with 2B effective parameters for on-device completion and tool use."},
{"model": "google/gemma-3n-E4B-it", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools"], "description": "Gemma 3n E4B instruction-tuned model with 4B effective parameters for on-device completion and tool use."},
{"model": "google/gemma-4-E2B-it", "int4": true, "int8": false, "fp16": false, "apple": true, "pipeline_tag": "image-text-to-text", "tags": ["vision","audio","completion","tools","apple-npu"], "description": "Gemma 4 E2B instruction-tuned multimodal model with vision and audio for on-device inference."},
{"model": "google/gemma-4-E4B-it", "int4": true, "int8": false, "fp16": false, "apple": true, "pipeline_tag": "image-text-to-text", "tags": ["vision","audio","completion","tools","apple-npu"], "description": "Gemma 4 E4B instruction-tuned multimodal model with vision and audio for on-device inference."},
{"model": "LiquidAI/LFM2.5-1.2B-Thinking", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2.5 1.2B reasoning model from Liquid AI with extended thinking for on-device deployment."},
{"model": "LiquidAI/LFM2.5-1.2B-Instruct", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2.5 1.2B instruction-tuned language model from Liquid AI designed for edge deployment."},
{"model": "Qwen/Qwen3-1.7B", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "Qwen3 1.7B model supporting thinking and non-thinking modes for on-device reasoning."},
{"model": "Qwen/Qwen3.5-2B", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "image-text-to-text", "tags": ["vision","completion","tools","embed"], "description": "Qwen3.5 2B hybrid vision-language model with DeltaNet for on-device multimodal inference."},
{"model": "tencent/Youtu-LLM-2B", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools", "embed"], "description": "Youtu-LLM 1.96B model from Tencent with Dense MLA attention, 128k context, and native agentic capabilities excelling at coding, STEM, and reasoning."},
{"model": "LiquidAI/LFM2-2.6B", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2 2.6B hybrid language model from Liquid AI for on-device chat and embeddings."},
{"model": "LiquidAI/LFM2-VL-450M", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "image-text-to-text", "tags": ["vision","text-embed","image-embed","apple-npu"], "description": "LFM2-VL 450M compact vision-language model from Liquid AI for on-device image understanding."},
{"model": "LiquidAI/LFM2.5-VL-450M", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "image-text-to-text", "tags": ["vision","text-embed","image-embed","apple-npu"], "description": "LFM2.5-VL 450M refreshed compact vision-language model from Liquid AI for image and text understanding."},
{"model": "LiquidAI/LFM2.5-VL-1.6B", "int4": true, "int8": false, "fp16": false, "apple": true, "pipeline_tag": "image-text-to-text", "tags": ["vision","text-embed","image-embed","apple-npu"], "description": "LFM2.5-VL 1.6B vision-language model from Liquid AI for image and text understanding."},
{"model": "UsefulSensors/moonshine-base", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed"], "description": "Moonshine Base 61M parameter English speech recognition model optimized for live transcription."},
{"model": "openai/whisper-tiny", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Whisper Tiny 39M parameter multilingual speech recognition model by OpenAI."},
{"model": "openai/whisper-base", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Whisper Base 74M parameter multilingual speech recognition model by OpenAI."},
{"model": "openai/whisper-small", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Whisper Small 244M parameter multilingual speech recognition model by OpenAI."},
{"model": "openai/whisper-medium", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Whisper Medium 769M parameter multilingual speech recognition model by OpenAI."},
{"model": "openai/whisper-large-v3", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Whisper Large v3 1.55B parameter multilingual speech recognition model by OpenAI with improved accuracy and 128 mel bins."},
{"model": "snakers4/silero-vad", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "voice-activity-detection", "tags": ["vad"], "description": "Silero VAD tiny voice activity detection model supporting over 100 languages."},
{"model": "nomic-ai/nomic-embed-text-v2-moe", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "feature-extraction", "tags": ["embed"], "description": "Nomic Embed Text v2 MoE 305M multilingual text embedding model using mixture-of-experts."},
{"model": "Qwen/Qwen3-Embedding-0.6B", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "feature-extraction", "tags": ["embed"], "description": "Qwen3 0.6B text embedding model supporting 100+ languages with 1024-dimensional vectors."},
{"model": "nvidia/parakeet-ctc-0.6b", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Parakeet CTC 0.6b is a speech recognition model optimized for on-device performance and live transcription."},
{"model": "nvidia/parakeet-ctc-1.1b", "int4": true, "int8": false, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Parakeet CTC 1.1b is a speech recognition model optimized for on-device performance and live transcription."},
{"model": "nvidia/parakeet-tdt-0.6b-v3", "int4": true, "int8": true, "fp16": false, "apple": true, "pipeline_tag": "automatic-speech-recognition", "tags": ["transcription","speech-embed","apple-npu"], "description": "Parakeet TDT 0.6b is a speech recognition model optimized for on-device performance and live transcription."},
{"model": "LiquidAI/LFM2-8B-A1B", "int4": true, "int8": false, "fp16": false, "apple": false, "pipeline_tag": "text-generation", "tags": ["completion","tools","embed"], "description": "LFM2 8B MoE model with 1.5B active parameters for high-quality on-device inference."},
{"model": "pyannote/segmentation-3.0", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "voice-activity-detection", "tags": ["diarization"], "description": "PyAnnote segmentation-3.0 speaker diarization model using SincNet frontend and BiLSTM layers."},
{"model": "pyannote/wespeaker-voxceleb-resnet34-LM", "int4": true, "int8": true, "fp16": false, "apple": false, "pipeline_tag": "speaker-recognition", "tags": ["speaker-embed"], "description": "WeSpeaker ResNet34-LM speaker embedding model producing 256-dimensional speaker representations."}
]
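
Each entry in this manifest pairs a Hugging Face model ID with quantization availability flags (`int4`, `int8`, `fp16`), an `apple` flag indicating an Apple NPU build, a `pipeline_tag`, capability `tags`, and a short `description`. A minimal sketch of filtering the manifest, assuming it is read from a local `models.json` file; the consuming code below is illustrative, not part of this repo:

```python
import json

# Load the model manifest (the path is an assumption; adjust as needed).
with open("models.json") as f:
    models = json.load(f)

# Example: list models that ship an int8 quantization and support tool
# calling. Field names ("int8", "tags", "model") match the manifest above.
for entry in models:
    if entry["int8"] and "tools" in entry["tags"]:
        print(entry["model"])
```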