Skip to content

Commit b364f64

Browse files
committed
Add Elasticsearch KNN
1 parent 49e0380 commit b364f64

File tree

4 files changed

+153
-100
lines changed

4 files changed

+153
-100
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Evaluated
2929
* [PUFFINN](https://github.com/puffinn/puffinn) ![https://img.shields.io/github/stars/puffinn/puffinn?style=social](https://img.shields.io/github/stars/puffinn/puffinn?style=social)
3030
* [N2](https://github.com/kakao/n2) ![https://img.shields.io/github/stars/kakao/n2?style=social](https://img.shields.io/github/stars/kakao/n2?style=social)
3131
* [ScaNN](https://github.com/google-research/google-research/tree/master/scann)
32+
* [Elasticsearch](https://github.com/elastic/elasticsearch) ![https://img.shields.io/github/stars/elastic/elasticsearch?style=social](https://img.shields.io/github/stars/elastic/elasticsearch?style=social): HNSW
3233
* [Elastiknn](https://github.com/alexklibisz/elastiknn) ![https://img.shields.io/github/stars/alexklibisz/elastiknn?style=social](https://img.shields.io/github/stars/alexklibisz/elastiknn?style=social)
3334
* [OpenSearch KNN](https://github.com/opensearch-project/k-NN) ![https://img.shields.io/github/stars/opensearch-project/k-NN?style=social](https://img.shields.io/github/stars/opensearch-project/k-NN?style=social)
3435
* [DiskANN](https://github.com/microsoft/diskann) ![https://img.shields.io/github/stars/microsoft/diskann?style=social](https://img.shields.io/github/stars/microsoft/diskann?style=social): Vamana, Vamana-PQ

algos.yaml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -824,11 +824,13 @@ float:
824824
elasticsearch:
825825
docker-tag: ann-benchmarks-elasticsearch
826826
module: ann_benchmarks.algorithms.elasticsearch
827-
constructor: ElasticsearchScriptScoreQuery
828-
base-args: [ "@metric", "@dimension" ]
827+
constructor: ElasticsearchKNN
828+
base-args: ["@metric", "@dimension"]
829829
run-groups:
830-
empty:
831-
args: []
830+
m-16-ef-100:
831+
arg-groups:
832+
- {"m": 16, "ef_construction": 100} # index_options
833+
query-args: [[10, 20, 40, 80, 160]] # num_candidates
832834
elastiknn-l2lsh:
833835
docker-tag: ann-benchmarks-elastiknn
834836
module: ann_benchmarks.algorithms.elastiknn
@@ -1143,11 +1145,13 @@ float:
11431145
elasticsearch:
11441146
docker-tag: ann-benchmarks-elasticsearch
11451147
module: ann_benchmarks.algorithms.elasticsearch
1146-
constructor: ElasticsearchScriptScoreQuery
1147-
base-args: [ "@metric", "@dimension" ]
1148+
constructor: ElasticsearchKNN
1149+
base-args: ["@metric", "@dimension"]
11481150
run-groups:
1149-
empty:
1150-
args: []
1151+
m-16-ef-100:
1152+
arg-groups:
1153+
- {"m": 16, "ef_construction": 100} # index_options
1154+
query-args: [[10, 20, 40, 80, 160]] # num_candidates
11511155
opensearchknn:
11521156
docker-tag: ann-benchmarks-opensearchknn
11531157
module: ann_benchmarks.algorithms.opensearchknn

ann_benchmarks/algorithms/elasticsearch.py

Lines changed: 88 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,120 @@
1-
"""
2-
ann-benchmarks interfaces for Elasticsearch.
3-
Note that this requires X-Pack, which is not included in the OSS version of Elasticsearch.
4-
"""
5-
import logging
61
from time import sleep
7-
from urllib.error import URLError
8-
from urllib.request import Request, urlopen
92

10-
from elasticsearch import Elasticsearch
3+
from elasticsearch import ConnectionError, Elasticsearch
114
from elasticsearch.helpers import bulk
125

136
from .base import BaseANN
147

15-
# Configure the elasticsearch logger.
16-
# By default, it writes an INFO statement for every request.
17-
logging.getLogger("elasticsearch").setLevel(logging.WARN)
188

19-
# Uncomment these lines if you want to see timing for every HTTP request and its duration.
20-
# logging.basicConfig(level=logging.INFO)
21-
# logging.getLogger("elasticsearch").setLevel(logging.INFO)
9+
class ElasticsearchKNN(BaseANN):
10+
"""Elasticsearch KNN search.
2211
23-
24-
def es_wait():
25-
print("Waiting for elasticsearch health endpoint...")
26-
req = Request("http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s")
27-
for i in range(30):
28-
try:
29-
res = urlopen(req)
30-
if res.getcode() == 200:
31-
print("Elasticsearch is ready")
32-
return
33-
except URLError:
34-
pass
35-
sleep(1)
36-
raise RuntimeError("Failed to connect to local elasticsearch")
37-
38-
39-
class ElasticsearchScriptScoreQuery(BaseANN):
40-
"""
41-
KNN using the Elasticsearch dense_vector datatype and script score functions.
42-
- Dense vector field type: https://www.elastic.co/guide/en/elasticsearch/reference/master/dense-vector.html
43-
- Dense vector queries: https://www.elastic.co/guide/en/elasticsearch/reference/master/query-dsl-script-score-query.html
12+
See https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html for more details.
4413
"""
4514

46-
def __init__(self, metric: str, dimension: int):
47-
self.name = f"elasticsearch-script-score-query_metric={metric}_dimension={dimension}"
15+
def __init__(self, metric: str, dimension: int, index_options: dict):
4816
self.metric = metric
4917
self.dimension = dimension
50-
self.index = f"es-ssq-{metric}-{dimension}"
51-
self.es = Elasticsearch(["http://localhost:9200"])
18+
self.index_options = index_options
19+
self.num_candidates = 100
20+
21+
index_options_str = "-".join(sorted(f"{k}-{v}" for k, v in self.index_options.items()))
22+
self.name = f"es-{metric}-{dimension}-{index_options_str}"
23+
self.similarity_metric = self._vector_similarity_metric(metric)
24+
25+
self.client = Elasticsearch(["http://localhost:9200"])
5226
self.batch_res = []
53-
if self.metric == "euclidean":
54-
self.script = '1 / (1 + l2norm(params.query_vec, "vec"))'
55-
elif self.metric == "angular":
56-
self.script = '1.0 + cosineSimilarity(params.query_vec, "vec")'
57-
else:
58-
raise NotImplementedError(f"Not implemented for metric {self.metric}")
59-
es_wait()
27+
self._wait_for_health_status()
28+
29+
def _vector_similarity_metric(self, metric: str):
30+
# `dot_product` is more efficient than `cosine`, but requires all vectors to be normalized
31+
# to unit length. We opt for adaptability, since some datasets might not be normalized.
32+
supported_metrics = {
33+
"angular": "cosine",
34+
"euclidean": "l2_norm",
35+
}
36+
if metric not in supported_metrics:
37+
raise NotImplementedError(f"{metric} is not implemented")
38+
return supported_metrics[metric]
39+
40+
def _wait_for_health_status(self, wait_seconds=30, status="yellow"):
41+
print("Waiting for Elasticsearch ...")
42+
for _ in range(wait_seconds):
43+
try:
44+
health = self.client.cluster.health(wait_for_status=status, request_timeout=1)
45+
print(f'Elasticsearch is ready: status={health["status"]}')
46+
return
47+
except ConnectionError:
48+
pass
49+
sleep(1)
50+
raise RuntimeError("Failed to connect to Elasticsearch")
6051

6152
def fit(self, X):
62-
body = dict(settings=dict(number_of_shards=1, number_of_replicas=0))
63-
mapping = dict(
64-
properties=dict(id=dict(type="keyword", store=True), vec=dict(type="dense_vector", dims=self.dimension))
65-
)
66-
self.es.indices.create(self.index, body=body)
67-
self.es.indices.put_mapping(mapping, self.index)
53+
settings = {
54+
"number_of_shards": 1,
55+
"number_of_replicas": 0,
56+
"refresh_interval": -1,
57+
}
58+
mappings = {
59+
"properties": {
60+
"id": {"type": "keyword", "store": True},
61+
"vec": {
62+
"type": "dense_vector",
63+
"element_type": "float",
64+
"dims": self.dimension,
65+
"index": True,
66+
"similarity": self.similarity_metric,
67+
"index_options": {
68+
"type": self.index_options.get("type", "hnsw"),
69+
"m": self.index_options["m"],
70+
"ef_construction": self.index_options["ef_construction"],
71+
},
72+
},
73+
},
74+
}
75+
self.client.indices.create(index=self.name, settings=settings, mappings=mappings)
6876

6977
def gen():
7078
for i, vec in enumerate(X):
71-
yield {"_op_type": "index", "_index": self.index, "vec": vec.tolist(), "id": str(i + 1)}
79+
yield {"_op_type": "index", "_index": self.name, "id": str(i), "vec": vec.tolist()}
7280

73-
(_, errors) = bulk(self.es, gen(), chunk_size=500, max_retries=9)
74-
assert len(errors) == 0, errors
81+
print("Indexing ...")
82+
(_, errors) = bulk(self.client, gen(), chunk_size=500, request_timeout=90)
83+
if len(errors) != 0:
84+
raise RuntimeError("Failed to index documents")
7585

76-
self.es.indices.refresh(self.index)
77-
self.es.indices.forcemerge(self.index, max_num_segments=1)
86+
print("Force merge index ...")
87+
self.client.indices.forcemerge(index=self.name, max_num_segments=1, request_timeout=900)
88+
89+
print("Refreshing index ...")
90+
self.client.indices.refresh(index=self.name, request_timeout=900)
91+
92+
def set_query_arguments(self, num_candidates):
93+
self.num_candidates = num_candidates
7894

7995
def query(self, q, n):
80-
body = dict(
81-
query=dict(
82-
script_score=dict(
83-
query=dict(match_all=dict()), script=dict(source=self.script, params=dict(query_vec=q.tolist()))
84-
)
85-
)
86-
)
87-
res = self.es.search(
88-
index=self.index,
96+
if n > self.num_candidates:
97+
raise ValueError("n must be smaller than num_candidates")
98+
99+
body = {
100+
"knn": {
101+
"field": "vec",
102+
"query_vector": q.tolist(),
103+
"k": n,
104+
"num_candidates": self.num_candidates,
105+
}
106+
}
107+
res = self.client.search(
108+
index=self.name,
89109
body=body,
90110
size=n,
91111
_source=False,
92112
docvalue_fields=["id"],
93113
stored_fields="_none_",
94114
filter_path=["hits.hits.fields.id"],
115+
request_timeout=10,
95116
)
96-
return [int(h["fields"]["id"][0]) - 1 for h in res["hits"]["hits"]]
117+
return [int(h["fields"]["id"][0]) for h in res["hits"]["hits"]]
97118

98119
def batch_query(self, X, n):
99120
self.batch_res = [self.query(q, n) for q in X]

install/Dockerfile.elasticsearch

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,72 @@
1-
FROM ann-benchmarks
1+
FROM ann-benchmarks AS builder
2+
ARG ELASTICSEARCH_VERSION=8.7.0
23

3-
WORKDIR /home/app
4+
ENV DEBIAN_FRONTEND noninteractive
5+
RUN apt-get install -y curl
6+
7+
# Download Elasticsearch to intermediate builder.
8+
WORKDIR /tmp
9+
RUN curl -OsS https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ELASTICSEARCH_VERSION}-linux-$(arch).tar.gz
10+
RUN curl -sS https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ELASTICSEARCH_VERSION}-linux-$(arch).tar.gz.sha512 | shasum -a 512 -c -
11+
12+
WORKDIR /usr/share/elasticsearch
13+
RUN tar -zxf /tmp/elasticsearch-${ELASTICSEARCH_VERSION}-linux-$(arch).tar.gz --strip-components=1
14+
15+
# Install Elasticsearch in final image:
16+
# - https://www.elastic.co/guide/en/elasticsearch/reference/current/targz.html
17+
# - https://www.elastic.co/guide/en/elasticsearch/reference/current/system-config.html
18+
FROM ann-benchmarks
19+
ARG ELASTICSEARCH_VERSION=8.7.0
420

5-
# Install elasticsearch.
621
ENV DEBIAN_FRONTEND noninteractive
7-
RUN apt install -y wget curl htop
8-
RUN wget --quiet https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-amd64.deb \
9-
&& dpkg -i elasticsearch-7.9.2-amd64.deb \
10-
&& rm elasticsearch-7.9.2-amd64.deb
22+
RUN apt-get install -y curl
23+
24+
WORKDIR /usr/share/elasticsearch
1125

12-
# Install python client.
13-
RUN python3 -m pip install --upgrade elasticsearch==7.9.1
26+
# Create elasticsearch user and user group.
27+
RUN groupadd -g 1000 elasticsearch
28+
RUN adduser --uid 1000 --gid 1000 --home /usr/share/elasticsearch elasticsearch
1429

15-
# Configure elasticsearch and JVM for single-node, single-core.
30+
COPY --from=builder --chown=elasticsearch:elasticsearch /usr/share/elasticsearch /usr/share/elasticsearch
31+
32+
RUN echo "vm.max_map_count=262144" >> /etc/sysctl.conf
33+
34+
# Back up the original configurations for potential future reference.
35+
RUN cp config/elasticsearch.yml config/elasticsearch.yml.bak
36+
RUN cp config/jvm.options config/jvm.options.bak
37+
38+
# Configure Elasticsearch for single-node, single-core.
1639
RUN echo '\
1740
discovery.type: single-node\n\
18-
network.host: 0.0.0.0\n\
19-
node.master: true\n\
20-
node.data: true\n\
41+
node.roles: [master, data]\n\
2142
node.processors: 1\n\
43+
path.data: /usr/share/elasticsearch/data\n\
44+
path.logs: /usr/share/elasticsearch/logs\n\
45+
bootstrap.memory_lock: true\n\
2246
thread_pool.write.size: 1\n\
2347
thread_pool.search.size: 1\n\
2448
thread_pool.search.queue_size: 1\n\
25-
path.data: /var/lib/elasticsearch\n\
26-
path.logs: /var/log/elasticsearch\n\
27-
' > /etc/elasticsearch/elasticsearch.yml
49+
xpack.security.enabled: false\n\
50+
' > config/elasticsearch.yml
2851

2952
RUN echo '\
3053
-Xms3G\n\
3154
-Xmx3G\n\
3255
-XX:+UseG1GC\n\
33-
-XX:G1ReservePercent=25\n\
34-
-XX:InitiatingHeapOccupancyPercent=30\n\
35-
-XX:+HeapDumpOnOutOfMemoryError\n\
36-
-XX:HeapDumpPath=/var/lib/elasticsearch\n\
37-
-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log\n\
38-
-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m' > /etc/elasticsearch/jvm.options
56+
-XX:HeapDumpPath=data\n\
57+
-XX:ErrorFile=/usr/share/elasticsearch/logs/hs_err_pid%p.log\n\
58+
-Xlog:gc*,gc+age=trace,safepoint:file=/usr/share/elasticsearch/logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m\n\
59+
' > config/jvm.options
3960

40-
# Make sure you can start the service.
41-
RUN service elasticsearch start && service elasticsearch stop
61+
RUN chown -R elasticsearch:elasticsearch /usr/share/elasticsearch
62+
63+
WORKDIR /home/app
64+
65+
RUN python3 -m pip install elasticsearch==${ELASTICSEARCH_VERSION}
4266

4367
# Custom entrypoint that also starts the Elasticsearch server.
44-
RUN echo 'service elasticsearch start && python3 -u run_algorithm.py "$@"' > entrypoint.sh
68+
RUN echo 'set -eux' >> entrypoint.sh
69+
RUN echo 'su - elasticsearch -c "nohup /usr/share/elasticsearch/bin/elasticsearch > nohup.out 2>&1 &"' >> entrypoint.sh
70+
RUN echo 'python3 -u run_algorithm.py "$@"' >> entrypoint.sh
71+
4572
ENTRYPOINT ["/bin/bash", "/home/app/entrypoint.sh"]

0 commit comments

Comments
 (0)