Skip to content

Commit ebdcc90

Browse files
philipithomas, bhaktatejas922, propel-code-bot[bot], jeffchuber
authored
[ENH] Add Morph embedding functions (#5183)
Supersedes #5043 --------- Co-authored-by: bhaktatejas922 <[email protected]> Co-authored-by: propel-code-bot[bot] <203372662+propel-code-bot[bot]@users.noreply.github.com> Co-authored-by: Jeffrey Huber <[email protected]>
1 parent 8a0bb2e commit ebdcc90

File tree

21 files changed

+1025
-42
lines changed

21 files changed

+1025
-42
lines changed

chromadb/test/ef/test_ef.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def test_get_builtins_holds() -> None:
3737
"InstructorEmbeddingFunction",
3838
"JinaEmbeddingFunction",
3939
"MistralEmbeddingFunction",
40+
"MorphEmbeddingFunction",
4041
"ONNXMiniLM_L6_V2",
4142
"OllamaEmbeddingFunction",
4243
"OpenAIEmbeddingFunction",

chromadb/test/ef/test_morph_ef.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
import pytest
3+
import numpy as np
4+
from chromadb.utils.embedding_functions.morph_embedding_function import (
5+
MorphEmbeddingFunction,
6+
)
7+
8+
9+
def test_morph_embedding_function_with_api_key() -> None:
    """Embed two code snippets with a function built from the environment key.

    Skipped unless MORPH_API_KEY is present, because calling the function
    issues a real embeddings request.
    """
    if "MORPH_API_KEY" not in os.environ:
        pytest.skip("MORPH_API_KEY not set")

    morph_ef = MorphEmbeddingFunction(model_name="morph-embedding-v2")

    # Source code is the kind of input the Morph model is aimed at.
    snippets = [
        "def hello_world():\n print('Hello, World!')",
        "class Calculator:\n def add(self, a, b):\n return a + b",
    ]

    vectors = morph_ef(snippets)
    assert vectors is not None
    assert len(vectors) == 2
    # One non-empty numpy vector is expected per input snippet.
    for vector in vectors:
        assert isinstance(vector, np.ndarray)
    for vector in vectors:
        assert len(vector) > 0
29+
30+
31+
def test_morph_embedding_function_with_custom_parameters() -> None:
    """Embed one snippet with every constructor option spelled out explicitly.

    Skipped unless MORPH_API_KEY is present, because calling the function
    issues a real embeddings request.
    """
    if "MORPH_API_KEY" not in os.environ:
        pytest.skip("MORPH_API_KEY not set")

    explicit_options = {
        "model_name": "morph-embedding-v2",
        "api_base": "https://api.morphllm.com/v1",
        "encoding_format": "float",
        "api_key_env_var": "MORPH_API_KEY",
    }
    morph_ef = MorphEmbeddingFunction(**explicit_options)

    # A single, simple JavaScript function is enough to exercise the call.
    vectors = morph_ef(["function add(a, b) { return a + b; }"])

    assert vectors is not None
    assert len(vectors) == 1
    only_vector = vectors[0]
    assert isinstance(only_vector, np.ndarray)
    assert len(only_vector) > 0
51+
52+
53+
def test_morph_embedding_function_config_roundtrip(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Test that Morph embedding function configuration can be saved and restored.

    The constructor raises ValueError when the API-key environment variable is
    empty or unset, so a placeholder key is installed for the duration of the
    test. Only get_config/build_from_config are exercised here; the embedding
    call itself (the only place that talks to the embeddings endpoint) is never
    invoked, so the placeholder key is never sent anywhere.
    """
    # Skip (rather than fail) when the optional openai dependency is missing.
    pytest.importorskip("openai")
    monkeypatch.setenv("MORPH_API_KEY", "test-api-key")

    ef = MorphEmbeddingFunction(
        model_name="morph-embedding-v2",
        api_base="https://api.morphllm.com/v1",
        encoding_format="float",
        api_key_env_var="MORPH_API_KEY",
    )

    # Get configuration
    config = ef.get_config()

    # Verify configuration contains expected keys and values
    expected_config = {
        "model_name": "morph-embedding-v2",
        "api_base": "https://api.morphllm.com/v1",
        "encoding_format": "float",
        "api_key_env_var": "MORPH_API_KEY",
    }
    for key, expected_value in expected_config.items():
        assert key in config
        assert config[key] == expected_value

    # Test building from config
    new_ef = MorphEmbeddingFunction.build_from_config(config)
    new_config = new_ef.get_config()

    # Configurations should match
    assert config == new_config
88+
89+
90+
def test_morph_embedding_function_name() -> None:
    """The class must report ``"morph"`` as its embedding-function name."""
    reported_name = MorphEmbeddingFunction.name()
    assert reported_name == "morph"
93+
94+
95+
def test_morph_embedding_function_spaces(monkeypatch: pytest.MonkeyPatch) -> None:
    """Test that Morph embedding function supports expected spaces.

    The constructor raises ValueError when the API-key environment variable is
    empty or unset, so a placeholder key is installed for the duration of the
    test. The embedding call is never invoked, so the placeholder key is never
    sent anywhere.
    """
    # Skip (rather than fail) when the optional openai dependency is missing.
    pytest.importorskip("openai")
    monkeypatch.setenv("MORPH_API_KEY", "test-api-key")

    ef = MorphEmbeddingFunction(
        model_name="morph-embedding-v2",
        api_key_env_var="MORPH_API_KEY",
    )

    # Test default space
    assert ef.default_space() == "cosine"

    # Test supported spaces
    supported_spaces = ef.supported_spaces()
    for space in ("cosine", "l2", "ip"):
        assert space in supported_spaces
115+
116+
117+
def test_morph_embedding_function_validate_config() -> None:
    """Check that validate_config accepts a complete config and rejects a partial one."""
    complete = {
        "model_name": "morph-embedding-v2",
        "api_key_env_var": "MORPH_API_KEY",
    }
    # Deliberately omits api_key_env_var.
    partial = {"model_name": "morph-embedding-v2"}

    # A complete configuration must pass without raising.
    MorphEmbeddingFunction.validate_config(complete)

    # The partial configuration is expected to be rejected.
    with pytest.raises(Exception):
        MorphEmbeddingFunction.validate_config(partial)

chromadb/utils/embedding_functions/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
from chromadb.utils.embedding_functions.mistral_embedding_function import (
6565
MistralEmbeddingFunction,
6666
)
67+
from chromadb.utils.embedding_functions.morph_embedding_function import (
68+
MorphEmbeddingFunction,
69+
)
6770

6871
try:
6972
from chromadb.is_thin_client import is_thin_client
@@ -84,6 +87,7 @@
8487
"InstructorEmbeddingFunction",
8588
"JinaEmbeddingFunction",
8689
"MistralEmbeddingFunction",
90+
"MorphEmbeddingFunction",
8791
"VoyageAIEmbeddingFunction",
8892
"ONNXMiniLM_L6_V2",
8993
"OpenCLIPEmbeddingFunction",
@@ -145,6 +149,7 @@ def validate_config(config: Dict[str, Any]) -> None:
145149
"instructor": InstructorEmbeddingFunction,
146150
"jina": JinaEmbeddingFunction,
147151
"mistral": MistralEmbeddingFunction,
152+
"morph": MorphEmbeddingFunction,
148153
"voyageai": VoyageAIEmbeddingFunction,
149154
"onnx_mini_lm_l6_v2": ONNXMiniLM_L6_V2,
150155
"open_clip": OpenCLIPEmbeddingFunction,
@@ -233,6 +238,7 @@ def config_to_embedding_function(config: Dict[str, Any]) -> EmbeddingFunction:
233238
"InstructorEmbeddingFunction",
234239
"JinaEmbeddingFunction",
235240
"MistralEmbeddingFunction",
241+
"MorphEmbeddingFunction",
236242
"VoyageAIEmbeddingFunction",
237243
"ONNXMiniLM_L6_V2",
238244
"OpenCLIPEmbeddingFunction",
chromadb/utils/embedding_functions/morph_embedding_function.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
from chromadb.api.types import Embeddings, Documents, EmbeddingFunction, Space
2+
from typing import List, Dict, Any, Optional
3+
import os
4+
import numpy as np
5+
from chromadb.utils.embedding_functions.schemas import validate_config_schema
6+
import warnings
7+
8+
9+
class MorphEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embedding function that requests embeddings from the Morph API.

    Requests are sent through the OpenAI Python client, configured with the
    Morph base URL. Only the environment-variable name of the API key is part
    of the persisted configuration; the key itself is never stored in it.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "morph-embedding-v2",
        api_base: str = "https://api.morphllm.com/v1",
        encoding_format: str = "float",
        api_key_env_var: str = "MORPH_API_KEY",
    ) -> None:
        """
        Initialize the MorphEmbeddingFunction.

        Args:
            api_key (str, optional): The API key for the Morph API. If not provided,
                it will be read from the environment variable specified by api_key_env_var.
            model_name (str, optional): The name of the model to use for embeddings.
                Defaults to "morph-embedding-v2".
            api_base (str, optional): The base URL for the Morph API.
                Defaults to "https://api.morphllm.com/v1".
            encoding_format (str, optional): The format for embeddings (float or base64).
                Defaults to "float".
            api_key_env_var (str, optional): Environment variable name that contains your API key.
                Defaults to "MORPH_API_KEY".

        Raises:
            ValueError: If the openai package is not installed, or if no API key
                was passed and the environment variable is empty or unset.
        """
        try:
            import openai
        except ImportError:
            raise ValueError(
                "The openai python package is not installed. Please install it with `pip install openai`. "
                "Note: Morph uses the OpenAI client library for API communication."
            )

        if api_key is not None:
            # stacklevel=2 attributes the warning to the code constructing the
            # embedding function instead of to this module.
            warnings.warn(
                "Direct api_key configuration will not be persisted. "
                "Please use environment variables via api_key_env_var for persistent storage.",
                DeprecationWarning,
                stacklevel=2,
            )

        self.api_key_env_var = api_key_env_var
        # An explicitly passed key wins over the environment variable.
        self.api_key = api_key or os.getenv(api_key_env_var)
        if not self.api_key:
            raise ValueError(f"The {api_key_env_var} environment variable is not set.")

        self.model_name = model_name
        self.api_base = api_base
        self.encoding_format = encoding_format

        # Initialize the OpenAI client with Morph's base URL
        self.client = openai.OpenAI(
            api_key=self.api_key,
            base_url=self.api_base,
        )

    def __call__(self, input: Documents) -> Embeddings:
        """
        Generate embeddings for the given documents.

        Args:
            input: Documents to generate embeddings for.

        Returns:
            Embeddings for the documents, one float32 numpy array per item in
            the API response. An empty input returns an empty list without
            contacting the API.
        """
        # Handle empty input
        if not input:
            return []

        # Get embeddings from Morph API
        response = self.client.embeddings.create(
            model=self.model_name,
            input=input,
            encoding_format=self.encoding_format,
        )

        # Extract embeddings from response
        return [self._to_vector(item.embedding) for item in response.data]

    @staticmethod
    def _to_vector(embedding: Any) -> np.ndarray:
        """Convert one embedding payload from the response into a float32 array.

        A string payload is treated as a base64-encoded buffer of little-endian
        float32 values (assumes Morph follows the OpenAI base64 embedding
        layout - TODO confirm against the Morph API); anything else is handed
        to numpy directly as a sequence of numbers.
        """
        if isinstance(embedding, str):
            import base64

            raw = base64.b64decode(embedding)
            # astype copies, so the result is a writable native float32 array
            # rather than a read-only view over the decoded bytes.
            return np.frombuffer(raw, dtype="<f4").astype(np.float32)
        return np.array(embedding, dtype=np.float32)

    @staticmethod
    def name() -> str:
        """Return the name identifying this embedding function ("morph")."""
        return "morph"

    def default_space(self) -> Space:
        """Return the default distance space for this embedding function."""
        # Morph embeddings work best with cosine similarity
        return "cosine"

    def supported_spaces(self) -> List[Space]:
        """Return the distance spaces this embedding function supports."""
        return ["cosine", "l2", "ip"]

    @staticmethod
    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
        """Build a MorphEmbeddingFunction from a configuration dictionary.

        Args:
            config: Configuration as produced by get_config. "api_key_env_var"
                and "model_name" are required; "api_base" and "encoding_format"
                fall back to the constructor defaults when absent or None.

        Returns:
            A new MorphEmbeddingFunction built from the configuration.

        Raises:
            ValueError: If "api_key_env_var" or "model_name" is missing or None.
        """
        # Extract parameters from config
        api_key_env_var = config.get("api_key_env_var")
        model_name = config.get("model_name")
        api_base = config.get("api_base")
        encoding_format = config.get("encoding_format")

        # An explicit raise is used instead of `assert`, because assertions are
        # removed when Python runs with -O and the None values would then be
        # passed on to the constructor.
        if api_key_env_var is None or model_name is None:
            raise ValueError(
                "Morph embedding function config must include both "
                "'api_key_env_var' and 'model_name'."
            )

        # Create and return the embedding function
        return MorphEmbeddingFunction(
            api_key_env_var=api_key_env_var,
            model_name=model_name,
            api_base=api_base if api_base is not None else "https://api.morphllm.com/v1",
            encoding_format=encoding_format if encoding_format is not None else "float",
        )

    def get_config(self) -> Dict[str, Any]:
        """Return the persistable configuration (the API key itself is excluded)."""
        return {
            "api_key_env_var": self.api_key_env_var,
            "model_name": self.model_name,
            "api_base": self.api_base,
            "encoding_format": self.encoding_format,
        }

    def validate_config_update(
        self, old_config: Dict[str, Any], new_config: Dict[str, Any]
    ) -> None:
        """Reject configuration updates that try to set the model name.

        Args:
            old_config: The current configuration (not inspected).
            new_config: The proposed configuration changes.

        Raises:
            ValueError: If new_config contains a "model_name" key, regardless
                of its value.
        """
        if "model_name" in new_config:
            raise ValueError(
                "The model name cannot be changed after the embedding function has been initialized."
            )

    @staticmethod
    def validate_config(config: Dict[str, Any]) -> None:
        """
        Validate the configuration using the JSON schema.

        Args:
            config: Configuration to validate

        Raises:
            Whatever validate_config_schema raises when the configuration does
            not match the "morph" schema.
        """
        validate_config_schema(config, "morph")

clients/new-js/packages/ai-embeddings/all/package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,12 @@
4242
"@chroma-core/google-gemini": "workspace:^",
4343
"@chroma-core/huggingface-server": "workspace:^",
4444
"@chroma-core/jina": "workspace:^",
45+
"@chroma-core/mistral": "workspace:^",
46+
"@chroma-core/morph": "workspace:^",
4547
"@chroma-core/ollama": "workspace:^",
4648
"@chroma-core/openai": "workspace:^",
4749
"@chroma-core/together-ai": "workspace:^",
48-
"@chroma-core/voyageai": "workspace:^",
49-
"@chroma-core/mistral": "workspace:^"
50+
"@chroma-core/voyageai": "workspace:^"
5051
},
5152
"engines": {
5253
"node": ">=20"

clients/new-js/packages/ai-embeddings/all/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export * from "@chroma-core/google-gemini";
55
export * from "@chroma-core/huggingface-server";
66
export * from "@chroma-core/jina";
77
export * from "@chroma-core/mistral";
8+
export * from "@chroma-core/morph";
89
export * from "@chroma-core/ollama";
910
export * from "@chroma-core/openai";
1011
export * from "@chroma-core/together-ai";

clients/new-js/packages/ai-embeddings/common/src/schema-utils.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import voyageaiSchema from "../../../../../../schemas/embedding_functions/voyage
2222
import cloudflareWorkersAiSchema from "../../../../../../schemas/embedding_functions/cloudflare_workers_ai.json";
2323
import togetherAiSchema from "../../../../../../schemas/embedding_functions/together_ai.json";
2424
import mistralSchema from "../../../../../../schemas/embedding_functions/mistral.json";
25+
import morphSchema from "../../../../../../schemas/embedding_functions/morph.json";
2526
import Ajv from "ajv";
2627

2728
// Define a common interface for all schemas
@@ -68,6 +69,7 @@ const schemaMap = {
6869
"cloudflare-worker-ai": cloudflareWorkersAiSchema as Schema,
6970
"together-ai": togetherAiSchema as Schema,
7071
mistral: mistralSchema as Schema,
72+
morph: morphSchema as Schema,
7173
};
7274

7375
/**

0 commit comments

Comments
 (0)