Skip to content

Commit 0247086

Browse files
authored
MSMARCO 10m Recall Operations (#1092)
* Added a 10M-document recall test for MSMARCO
1 parent 98a842a commit 0247086

File tree

3 files changed

+16
-2
lines changed

3 files changed

+16
-2
lines changed

msmarco-v2-vector/operations/default.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,12 @@
6969
"visit-percentage": {{p_search_ops[i][3]}},
7070
{%- endif -%}
7171
"oversample-rescore": {{p_search_ops[i][2]}},
72-
"include-in-reporting": false
72+
"include-in-reporting": false,
73+
{%- if initial_indexing_ingest_doc_count is defined and initial_indexing_ingest_doc_count == 10000000 -%}
74+
"recall-doc-set": "10m"
75+
{%- else -%}
76+
"recall-doc-set": "full"
77+
{%- endif -%}
7378
}
7479
{%- endfor %}
7580
{%- set p_hybrid_knn_ops = (hybrid_knn_ops | default([(10, 0), (10, 50), (100, 200), (100, 300)]) ) -%}
678 KB
Binary file not shown.

msmarco-v2-vector/track.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
QUERIES_FILENAME: str = "queries.json.bz2"
1818
QUERIES_RECALL_FILENAME: str = "queries-recall.json.bz2"
19+
QUERIES_RECALL_10M_FILENAME: str = "queries-recall-10m.json.bz2"
1920

2021

2122
def extract_vector_operations_count(knn_result):
@@ -147,6 +148,7 @@ def params(self):
147148
"num_candidates": self._params.get("num-candidates", 100),
148149
"visit_percentage": self._params.get("visit-percentage", -1),
149150
"oversample_rescore": self._params.get("oversample-rescore", -1),
151+
"recall_doc_set": self._params.get("recall-doc-set", -1),
150152
}
151153

152154

@@ -157,6 +159,7 @@ async def __call__(self, es, params):
157159
visit_percentage = params["visit_percentage"]
158160
index = params["index"]
159161
request_cache = params["cache"]
162+
recall_doc_set = params["recall_doc_set"]
160163

161164
cwd = os.path.dirname(__file__)
162165
qrels = read_qrels(os.path.join(cwd, "qrels.tsv"))
@@ -166,7 +169,13 @@ async def __call__(self, es, params):
166169
exact_total = 0
167170
min_recall = top_k
168171
nodes_visited = []
169-
with bz2.open(os.path.join(cwd, QUERIES_RECALL_FILENAME), "r") as queries_file:
172+
173+
if recall_doc_set == "10m":
174+
queries_recall = QUERIES_RECALL_10M_FILENAME
175+
else:
176+
queries_recall = QUERIES_RECALL_FILENAME
177+
178+
with bz2.open(os.path.join(cwd, queries_recall), "r") as queries_file:
170179
for line in queries_file:
171180
query = json.loads(line)
172181
query_id = query["query_id"]

0 commit comments

Comments
 (0)