-
Notifications
You must be signed in to change notification settings - Fork 1.9k
[ENH] Add Morph embedding functions #5183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
dcd60f0
morph embeddings
bhaktatejas922 9956a51
Merge remote-tracking branch 'upstream/main'
bhaktatejas922 5dabbfe
morph openai note
bhaktatejas922 98390a4
Update clients/new-js/packages/ai-embeddings/morph/src/index.ts
bhaktatejas922 d0a402e
config stuff
bhaktatejas922 40e993b
bump commit
jeffchuber c8efaeb
Rename "CHROMA_MORPH_API_KEY" to "MORPH_API_KEY"
philipithomas b0ea687
Rename "CHROMA_MORPH_API_KEY" to "MORPH_API_KEY"
philipithomas c2704f1
Update configuration for morph js package
philipithomas 96a7249
update lockfile
philipithomas d35b050
revert
philipithomas 43b662b
Refactor Morph embedding documentation
philipithomas 855501d
port docs updates to llm txt
philipithomas 81ec34f
Remove unused import in Morph embedding function test
philipithomas 472f8fe
Update tests for MorphEmbeddingFunction to assert numpy.ndarray inste…
philipithomas f52c5d7
docs update
philipithomas dabfd6c
Skip morph e2e tests if openai library is not installed
philipithomas File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| import os | ||
| import pytest | ||
| import numpy as np | ||
| from chromadb.utils.embedding_functions.morph_embedding_function import ( | ||
| MorphEmbeddingFunction, | ||
| ) | ||
|
|
||
|
|
||
def test_morph_embedding_function_with_api_key() -> None:
    """Embed two code snippets with the default settings.

    Skipped unless MORPH_API_KEY is present in the environment, because
    calling the embedding function issues a real embeddings request.
    """
    if "MORPH_API_KEY" not in os.environ:
        pytest.skip("MORPH_API_KEY not set")

    embedder = MorphEmbeddingFunction(model_name="morph-embedding-v2")

    # Code is the kind of input Morph embeddings are meant for.
    python_samples = [
        "def hello_world():\n print('Hello, World!')",
        "class Calculator:\n def add(self, a, b):\n return a + b",
    ]

    vectors = embedder(python_samples)

    assert vectors is not None
    assert len(vectors) == 2
    # One non-empty numpy vector is expected per input snippet.
    for vector in vectors:
        assert isinstance(vector, np.ndarray)
    for vector in vectors:
        assert len(vector) > 0
|
|
||
|
|
||
def test_morph_embedding_function_with_custom_parameters() -> None:
    """Embed a single snippet with every constructor option spelled out.

    Skipped unless MORPH_API_KEY is present in the environment, because
    calling the embedding function issues a real embeddings request.
    """
    if "MORPH_API_KEY" not in os.environ:
        pytest.skip("MORPH_API_KEY not set")

    explicit_options = {
        "model_name": "morph-embedding-v2",
        "api_base": "https://api.morphllm.com/v1",
        "encoding_format": "float",
        "api_key_env_var": "MORPH_API_KEY",
    }
    embedder = MorphEmbeddingFunction(**explicit_options)

    # A one-element batch: a small JavaScript function.
    js_sample = ["function add(a, b) { return a + b; }"]

    vectors = embedder(js_sample)

    assert vectors is not None
    assert len(vectors) == 1
    only_vector = vectors[0]
    assert isinstance(only_vector, np.ndarray)
    assert len(only_vector) > 0
|
|
||
|
|
||
def test_morph_embedding_function_config_roundtrip() -> None:
    """Test that Morph embedding function configuration can be saved and restored.

    No embedding request is made here, so the test does not need a real API
    key. It only needs the key environment variable to be non-empty, because
    the constructor raises ValueError otherwise.
    """
    # Skip (rather than fail) when the optional openai dependency is missing.
    pytest.importorskip("openai")

    # Provide a placeholder key for the duration of the test so it runs on
    # machines without MORPH_API_KEY; the previous value is restored on exit.
    with pytest.MonkeyPatch.context() as env:
        env.setenv("MORPH_API_KEY", "placeholder-key-for-offline-test")

        ef = MorphEmbeddingFunction(
            model_name="morph-embedding-v2",
            api_base="https://api.morphllm.com/v1",
            encoding_format="float",
            api_key_env_var="MORPH_API_KEY",
        )

        # Get configuration
        config = ef.get_config()

        # Verify configuration contains expected keys
        assert "model_name" in config
        assert "api_base" in config
        assert "encoding_format" in config
        assert "api_key_env_var" in config

        # Verify values
        assert config["model_name"] == "morph-embedding-v2"
        assert config["api_base"] == "https://api.morphllm.com/v1"
        assert config["encoding_format"] == "float"
        assert config["api_key_env_var"] == "MORPH_API_KEY"

        # Rebuilding also runs the constructor, so it must happen while the
        # placeholder key is still set.
        new_ef = MorphEmbeddingFunction.build_from_config(config)
        new_config = new_ef.get_config()

    # Configurations should match
    assert config == new_config
|
|
||
|
|
||
def test_morph_embedding_function_name() -> None:
    """Check the identifier reported by the static ``name()`` method."""
    reported_name = MorphEmbeddingFunction.name()
    assert reported_name == "morph"
|
|
||
|
|
||
def test_morph_embedding_function_spaces() -> None:
    """Test that Morph embedding function supports expected spaces.

    No embedding request is made here, so a placeholder API key is enough to
    satisfy the constructor, which raises ValueError when the key environment
    variable is unset or empty.
    """
    # Skip (rather than fail) when the optional openai dependency is missing.
    pytest.importorskip("openai")

    # Set a placeholder key only while constructing the function; the
    # previous environment value is restored when the context exits.
    with pytest.MonkeyPatch.context() as env:
        env.setenv("MORPH_API_KEY", "placeholder-key-for-offline-test")
        ef = MorphEmbeddingFunction(
            model_name="morph-embedding-v2",
            api_key_env_var="MORPH_API_KEY",
        )

    # Test default space
    assert ef.default_space() == "cosine"

    # Test supported spaces
    supported_spaces = ef.supported_spaces()
    assert "cosine" in supported_spaces
    assert "l2" in supported_spaces
    assert "ip" in supported_spaces
|
|
||
|
|
||
def test_morph_embedding_function_validate_config() -> None:
    """Check schema validation for a complete and an incomplete config."""
    model = "morph-embedding-v2"

    # Both fields present: validation must return without raising.
    complete = {"model_name": model, "api_key_env_var": "MORPH_API_KEY"}
    MorphEmbeddingFunction.validate_config(complete)

    # api_key_env_var deliberately left out: validation must raise.
    incomplete = {"model_name": model}
    with pytest.raises(Exception):
        MorphEmbeddingFunction.validate_config(incomplete)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
147 changes: 147 additions & 0 deletions
147
chromadb/utils/embedding_functions/morph_embedding_function.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,147 @@ | ||
| from chromadb.api.types import Embeddings, Documents, EmbeddingFunction, Space | ||
| from typing import List, Dict, Any, Optional | ||
| import os | ||
| import numpy as np | ||
| from chromadb.utils.embedding_functions.schemas import validate_config_schema | ||
| import warnings | ||
|
|
||
|
|
||
class MorphEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embedding function that requests embeddings from the Morph API.

    Requests are made through the ``openai`` client library, configured with
    ``api_base`` as its base URL. The API key itself is never included in the
    persisted configuration; only the name of the environment variable that
    holds it is.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "morph-embedding-v2",
        api_base: str = "https://api.morphllm.com/v1",
        encoding_format: str = "float",
        api_key_env_var: str = "MORPH_API_KEY",
    ):
        """
        Initialize the MorphEmbeddingFunction.

        Args:
            api_key (str, optional): The API key for the Morph API. If not provided,
                it will be read from the environment variable specified by api_key_env_var.
            model_name (str, optional): The name of the model to use for embeddings.
                Defaults to "morph-embedding-v2".
            api_base (str, optional): The base URL for the Morph API.
                Defaults to "https://api.morphllm.com/v1".
            encoding_format (str, optional): The format for embeddings (float or base64).
                Defaults to "float".
            api_key_env_var (str, optional): Environment variable name that contains your API key.
                Defaults to "MORPH_API_KEY".

        Raises:
            ValueError: If the openai package is not installed, or if no API key
                was passed and the environment variable is unset or empty.
        """
        try:
            import openai
        except ImportError as exc:
            # Chain the original error so the failing import stays visible.
            raise ValueError(
                "The openai python package is not installed. Please install it with `pip install openai`. "
                "Note: Morph uses the OpenAI client library for API communication."
            ) from exc

        if api_key is not None:
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn(
                "Direct api_key configuration will not be persisted. "
                "Please use environment variables via api_key_env_var for persistent storage.",
                DeprecationWarning,
                stacklevel=2,
            )

        self.api_key_env_var = api_key_env_var
        # An explicit key wins; otherwise fall back to the environment.
        self.api_key = api_key or os.getenv(api_key_env_var)
        if not self.api_key:
            raise ValueError(f"The {api_key_env_var} environment variable is not set.")

        self.model_name = model_name
        self.api_base = api_base
        self.encoding_format = encoding_format

        # Initialize the OpenAI client with Morph's base URL
        self.client = openai.OpenAI(
            api_key=self.api_key,
            base_url=self.api_base,
        )

    def __call__(self, input: Documents) -> Embeddings:
        """
        Generate embeddings for the given documents.

        Args:
            input: Documents to generate embeddings for.

        Returns:
            Embeddings for the documents: one float32 numpy array per entry in
            the API response, or an empty list when ``input`` is empty.
        """
        # Handle empty input without calling the API
        if not input:
            return []

        # Prepare embedding parameters
        embedding_params: Dict[str, Any] = {
            "model": self.model_name,
            "input": input,
            "encoding_format": self.encoding_format,
        }

        # Get embeddings from Morph API
        response = self.client.embeddings.create(**embedding_params)

        # NOTE(review): the conversion below assumes each `data.embedding` is a
        # sequence of numbers; confirm what the client returns when
        # encoding_format is "base64".
        return [np.array(data.embedding, dtype=np.float32) for data in response.data]

    @staticmethod
    def name() -> str:
        """Return the identifier of this embedding function ("morph")."""
        return "morph"

    def default_space(self) -> Space:
        """Return the default distance space for this embedding function."""
        # Morph embeddings work best with cosine similarity
        return "cosine"

    def supported_spaces(self) -> List[Space]:
        """Return the distance spaces this embedding function supports."""
        return ["cosine", "l2", "ip"]

    @staticmethod
    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
        """
        Build a MorphEmbeddingFunction from a configuration dictionary.

        Args:
            config: Configuration as produced by ``get_config``. The keys
                "api_key_env_var" and "model_name" are required; "api_base"
                and "encoding_format" fall back to the constructor defaults
                when missing or None.

        Returns:
            A new MorphEmbeddingFunction configured from ``config``.

        Raises:
            ValueError: If "api_key_env_var" or "model_name" is missing or None.
        """
        # Extract parameters from config
        api_key_env_var = config.get("api_key_env_var")
        model_name = config.get("model_name")
        api_base = config.get("api_base")
        encoding_format = config.get("encoding_format")

        # Raise explicitly instead of using `assert`: assertions are removed
        # under `python -O`, which would let None reach the constructor.
        if api_key_env_var is None or model_name is None:
            raise ValueError(
                "Morph embedding function config must contain both "
                "'api_key_env_var' and 'model_name'."
            )

        # Create and return the embedding function
        return MorphEmbeddingFunction(
            api_key_env_var=api_key_env_var,
            model_name=model_name,
            api_base=api_base if api_base is not None else "https://api.morphllm.com/v1",
            encoding_format=encoding_format if encoding_format is not None else "float",
        )

    def get_config(self) -> Dict[str, Any]:
        """
        Return the persistable configuration of this embedding function.

        The API key is not part of the result; only the name of the
        environment variable holding it is.
        """
        return {
            "api_key_env_var": self.api_key_env_var,
            "model_name": self.model_name,
            "api_base": self.api_base,
            "encoding_format": self.encoding_format,
        }

    def validate_config_update(
        self, old_config: Dict[str, Any], new_config: Dict[str, Any]
    ) -> None:
        """
        Reject configuration updates that would change the model name.

        Args:
            old_config: The configuration currently in effect.
            new_config: The proposed configuration (or subset of keys).

        Raises:
            ValueError: If ``new_config`` contains a "model_name" that differs
                from the one in ``old_config``.
        """
        # A model_name that merely repeats the current value is not a change.
        if "model_name" in new_config and new_config["model_name"] != old_config.get(
            "model_name"
        ):
            raise ValueError(
                "The model name cannot be changed after the embedding function has been initialized."
            )

    @staticmethod
    def validate_config(config: Dict[str, Any]) -> None:
        """
        Validate the configuration using the JSON schema.

        Args:
            config: Configuration to validate

        Raises:
            ValidationError: If the configuration does not match the schema
        """
        validate_config_schema(config, "morph")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.