Implements a sentence embedding retriever with a local cache backed by an embedding store.
- Embedding store abstraction class (see the sketch after this list)
- Supports a Jina client implementation of the embedding store
- Supports LFU and LRU cache eviction policies for a limited cache size; if no eviction policy is specified, none is applied
- Saves the cache to a parquet file
- Loads the cache from an existing parquet file
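The abstraction class keeps retrieval and caching in the base `EmbeddingStore`, while a subclass only supplies the model call. A minimal sketch of that contract, using a purely illustrative placeholder instead of a real model (the subclass interface matches the `TorchEmbeddingStore` example further down; the stub store itself is an assumption for demonstration only):

```python
from typing import List, Text

import numpy as np

from embestore.store.base import EmbeddingStore


class StubEmbeddingStore(EmbeddingStore):
    """Illustrative store that returns placeholder vectors instead of real model output."""

    def _retrieve_embeddings_from_model(self, sentences: List[Text]) -> np.ndarray:
        # Any mapping from sentences to an (n_sentences, dim) array will do; a real
        # subclass would call an embedding model here (see TorchEmbeddingStore below).
        return np.zeros((len(sentences), 8), dtype=np.float32)


store = StubEmbeddingStore()
embeddings = store.retrieve_embeddings(sentences=["hello world"])
```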
- Python 3.9
- Linux/MacOS
- Installation

```sh
pip install "embestore[jina]"
```

- To start up the Jina flow service with the default sentence transformer model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`

```sh
embestore serve start-jina
```

- Use another sentence transformer model from Hugging Face

```sh
# Take sentence-transformers/all-MiniLM-L6-v2 for example
export SENTENCE_TRANSFORMER=sentence-transformers/all-MiniLM-L6-v2
embestore serve start-jina
```

- Retrieve the embedding
```python
from embestore.store.jina import JinaEmbeddingStore

JINA_EMBEDDING_STORE_GRPC = "grpc://0.0.0.0:54321"

query_sentences = ["I want to listen the music.", "Music don't want to listen me."]

jina_embedding_store = JinaEmbeddingStore(embedding_grpc=JINA_EMBEDDING_STORE_GRPC)
embeddings = jina_embedding_store.retrieve_embeddings(sentences=query_sentences)

>>> embeddings
array([[ 2.26917475e-01,  8.17841291e-02,  2.35427842e-02,
        -3.02357599e-02,  1.15757119e-02, -8.42996314e-02,
         4.42815214e-01,  1.80795133e-01,  1.04702041e-01,
         ...
       ]])
```
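On the first call the sentences are embedded by the Jina flow; on a repeated call with the same sentences, the result is expected to come from the local cache instead. A minimal sketch under that assumption, reusing `jina_embedding_store` and `query_sentences` from the block above:

```python
# First call: embeddings are computed by the Jina flow and stored in the local cache.
first = jina_embedding_store.retrieve_embeddings(sentences=query_sentences)

# Repeated call with the same sentences: expected to be served from the local cache
# (assumption based on the package description, not a guarantee of the implementation).
second = jina_embedding_store.retrieve_embeddings(sentences=query_sentences)

assert (first == second).all()
```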
- Stop the docker container

```sh
embestore serve stop-jina
```

- Installation

```sh
pip install "embestore[sentence-transformers]"
```

- Serve the sentence embedding model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` in memory
```python
from embestore.store.torch import TorchEmbeddingStore

query_sentences = ["I want to listen the music.", "Music don't want to listen me."]

torch_embedding_store = TorchEmbeddingStore()
embeddings = torch_embedding_store.retrieve_embeddings(sentences=query_sentences)

>>> embeddings
array([[ 2.26917475e-01,  8.17841291e-02,  2.35427842e-02,
        -3.02357599e-02,  1.15757119e-02, -8.42996314e-02,
         4.42815214e-01,  1.80795133e-01,  1.04702041e-01,
         ...
       ]])
```
- Installation

```sh
pip install embestore
```

- Inherit from the `EmbeddingStore` abstraction class and implement `_retrieve_embeddings_from_model`

```python
from typing import List, Text

import numpy as np
from sentence_transformers import SentenceTransformer

from embestore.store.base import EmbeddingStore

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2").eval()


class TorchEmbeddingStore(EmbeddingStore):
    def _retrieve_embeddings_from_model(self, sentences: List[Text]) -> np.ndarray:
        return model.encode(sentences)
```

- Save the cache to a parquet file

```python
torch_embedding_store.save("cache.parquet")
```

- Load the cache from an existing parquet file

```python
torch_embedding_store = TorchEmbeddingStore("cache.parquet")
```
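A possible save/load round trip, assuming a store constructed with an existing parquet file reuses the cached embeddings instead of re-encoding the same sentences (this behavior is inferred from the package description):

```python
# Encode once and persist the cache to disk.
store = TorchEmbeddingStore()
store.retrieve_embeddings(sentences=["I want to listen the music."])
store.save("cache.parquet")

# A new store loaded from the parquet file is expected to answer the same
# sentence from the cache rather than calling the model again (assumption).
warm_store = TorchEmbeddingStore("cache.parquet")
cached = warm_store.retrieve_embeddings(sentences=["I want to listen the music."])
```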
- LRU eviction policy

```python
torch_embedding_store = TorchEmbeddingStore(max_size=100, eviction_policy="lru")
```

- LFU eviction policy

```python
torch_embedding_store = TorchEmbeddingStore(max_size=100, eviction_policy="lfu")
```

[TODO] Badges