Commit 9ac53b2

add software issue localization datasets
1 parent 10c4948 commit 9ac53b2

5 files changed: +171 −5 lines changed


mteb/evaluation/evaluators/RerankingEvaluator.py

Lines changed: 44 additions & 5 deletions
@@ -20,7 +20,7 @@
 class RerankingEvaluator(Evaluator):
     """This class evaluates a SentenceTransformer model for the task of re-ranking.
     Given a query and a list of documents, it computes the score [query, doc_i] for all possible
-    documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking.
+    documents and sorts them in decreasing order. Then, MRR@k, MAP, and Recall@k are computed to measure the quality of the ranking.
     :param samples: Must be a list and each element is of the form:
         - {'query': '', 'positive': [], 'negative': []}. Query is the search query, positive is a list of positive
         (relevant) documents, negative is a list of negative (irrelevant) documents.
@@ -143,6 +143,7 @@ def compute_metrics_individual(self, model: Encoder):
     def _encode_candidates(self, model: Encoder, batched: bool, all_query_embs=None):
         all_mrr_scores = []
         all_ap_scores = []
+        all_recall_scores = []
         all_conf_scores = []
         logger.info("Encoding candidates...")
         if batched:
@@ -151,16 +152,18 @@ def _encode_candidates(self, model: Encoder, batched: bool, all_query_embs=None)
                 all_query_embs=all_query_embs,
                 all_mrr_scores=all_mrr_scores,
                 all_ap_scores=all_ap_scores,
+                all_recall_scores=all_recall_scores,
                 all_conf_scores=all_conf_scores,
             )
         else:
             self._encode_candidates_individual(
                 model=model,
                 all_mrr_scores=all_mrr_scores,
                 all_ap_scores=all_ap_scores,
+                all_recall_scores=all_recall_scores,
                 all_conf_scores=all_conf_scores,
             )
-        scores = self._collect_results(all_mrr_scores, all_ap_scores, all_conf_scores)
+        scores = self._collect_results(all_mrr_scores, all_ap_scores, all_recall_scores, all_conf_scores)
         return scores
 
     def _encode_candidates_batched(
@@ -169,6 +172,7 @@ def _encode_candidates_batched(
         model: Encoder,
         all_mrr_scores,
         all_ap_scores,
+        all_recall_scores,
         all_conf_scores,
     ):
         all_docs = []
@@ -208,6 +212,7 @@ def _encode_candidates_batched(
                 is_relevant,
                 all_mrr_scores,
                 all_ap_scores,
+                all_recall_scores,
                 all_conf_scores,
                 model,
             )
@@ -217,6 +222,7 @@ def _encode_candidates_individual(
         model: Encoder,
         all_mrr_scores,
         all_ap_scores,
+        all_recall_scores,
         all_conf_scores,
     ):
         for instance in tqdm.tqdm(self.samples, desc="Samples"):
@@ -255,19 +261,22 @@ def _encode_candidates_individual(
                 is_relevant,
                 all_mrr_scores,
                 all_ap_scores,
+                all_recall_scores,
                 all_conf_scores,
                 model,
             )
 
-    def _collect_results(self, all_mrr_scores, all_ap_scores, all_conf_scores):
+    def _collect_results(self, all_mrr_scores, all_ap_scores, all_recall_scores, all_conf_scores):
         mean_ap = np.mean(all_ap_scores)
         mean_mrr = np.mean(all_mrr_scores)
+        mean_recall = np.mean(all_recall_scores)
 
         # Compute nAUCs
         naucs_map = self.nAUC_scores(all_conf_scores, all_ap_scores, "map")
         naucs_mrr = self.nAUC_scores(all_conf_scores, all_mrr_scores, "mrr")
+        naucs_recall = self.nAUC_scores(all_conf_scores, all_recall_scores, f"recall_at_{self.mrr_at_k}")
 
-        return {**{"map": mean_ap, "mrr": mean_mrr}, **naucs_map, **naucs_mrr}
+        return {**{"map": mean_ap, "mrr": mean_mrr, f"recall_at_{self.mrr_at_k}": mean_recall}, **naucs_map, **naucs_mrr, **naucs_recall}
 
     def _encode_candidates_miracl(
         self,
@@ -408,6 +417,7 @@ def _apply_sim_scores(
         is_relevant,
         all_mrr_scores,
         all_ap_scores,
+        all_recall_scores,
         all_conf_scores,
         model: Encoder,
     ):
@@ -417,6 +427,7 @@ def _apply_sim_scores(
 
         all_mrr_scores.append(scores["mrr"])
         all_ap_scores.append(scores["ap"])
+        all_recall_scores.append(scores["recall"])
         all_conf_scores.append(conf_scores)
 
     @staticmethod
@@ -483,11 +494,13 @@ def _compute_metrics_instance(
             scores:
                 - `mrr`: Mean Reciprocal Rank @ `self.mrr_at_k`
                 - `ap`: Average Precision
+                - `recall`: Recall @ `self.mrr_at_k`
         """
         pred_scores_argsort = torch.argsort(-sim_scores)  # Sort in decreasing order
         mrr = self.mrr_at_k_score(is_relevant, pred_scores_argsort, self.mrr_at_k)
         ap = self.ap_score(is_relevant, sim_scores.cpu().tolist())
-        return {"mrr": mrr, "ap": ap}
+        recall = self.recall_at_k_score(is_relevant, pred_scores_argsort, self.mrr_at_k)
+        return {"mrr": mrr, "ap": ap, "recall": recall}
 
     @staticmethod
     def conf_scores(sim_scores: torch.Tensor) -> dict[str, float]:
@@ -570,3 +583,29 @@ def ap_score(is_relevant, pred_scores):
         # ap = np.mean([np.mean(preds[: k + 1]) for k in range(len(preds)) if preds[k]])
         ap = average_precision_score(is_relevant, pred_scores)
         return ap
+
+    @staticmethod
+    def recall_at_k_score(
+        is_relevant: list[bool], pred_ranking: list[int], k: int
+    ) -> float:
+        """Computes Recall@k score
+
+        Args:
+            is_relevant: True if the document is relevant
+            pred_ranking: Indices of the documents sorted in decreasing order
+                of the similarity score
+            k: Top-k documents to consider
+
+        Returns:
+            The Recall@k score
+        """
+        total_relevant = sum(is_relevant)
+        if total_relevant == 0:
+            return 0.0
+
+        relevant_retrieved = 0
+        for rank, index in enumerate(pred_ranking[:k]):
+            if is_relevant[index]:
+                relevant_retrieved += 1
+
+        return relevant_retrieved / total_relevant
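The Recall@k added above is the fraction of all relevant documents that appear among the top-k ranked candidates. A minimal standalone sketch of the same computation on toy data (the helper name and the values are illustrative only, not part of the commit):

def recall_at_k(is_relevant: list[bool], pred_ranking: list[int], k: int) -> float:
    # Fraction of relevant documents that appear among the top-k ranked indices.
    total_relevant = sum(is_relevant)
    if total_relevant == 0:
        return 0.0
    hits = sum(1 for index in pred_ranking[:k] if is_relevant[index])
    return hits / total_relevant

# Toy example: 3 relevant documents, 2 of them ranked in the top 3 -> Recall@3 = 2/3.
is_relevant = [True, False, True, False, True]  # ground-truth relevance per document
pred_ranking = [2, 1, 0, 4, 3]                  # document indices sorted by decreasing score
print(recall_at_k(is_relevant, pred_ranking, k=3))  # 0.666...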

mteb/tasks/Reranking/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,9 @@
 from .eng.SciDocsReranking import *
 from .eng.StackOverflowDupQuestions import *
 from .eng.WebLINXCandidatesReranking import *
+from .eng.SWEbenchLiteReranking import *
+from .eng.SWEbenchVerifiedReranking import *
+from .eng.LocBenchReranking import *
 from .fra.AlloprofReranking import *
 from .fra.SyntecReranking import *
 from .jpn.JaCWIRReranking import *

mteb/tasks/Reranking/eng/LocBenchReranking.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskReranking import AbsTaskReranking
+
+
+class LocBenchReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="LocBenchRR",
+        description="Software Issue Localization.",
+        reference="https://arxiv.org/abs/2503.09089",
+        dataset={
+            "path": "tarsur909/mteb-loc-bench-reranking",
+            "revision": "6741d68550b3793b45b18b6aaf981d00f33971cb",
+        },
+        type="Reranking",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["train"],
+        eval_langs=["eng-Latn", "python-Code"],
+        main_score="recall_at_10",
+        date=("2025-03-12", "2025-03-12"),  # arxiv v1 submission date
+        domains=["Programming", "Written"],
+        task_subtypes=["Code retrieval"],
+        license="mit",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        prompt="Instruct: Given a github issue, identify the code that needs to be changed to fix the issue. Query: ",
+        bibtex_citation=r"""
+@misc{chen2025locagentgraphguidedllmagents,
+  title={LocAgent: Graph-Guided LLM Agents for Code Localization},
+  author={Zhaoling Chen and Xiangru Tang and Gangda Deng and Fang Wu and Jialong Wu and Zhiwei Jiang and Viktor Prasanna and Arman Cohan and Xingyao Wang},
+  year={2025},
+  eprint={2503.09089},
+  archivePrefix={arXiv},
+  primaryClass={cs.SE},
+  url={https://arxiv.org/abs/2503.09089},
+}
+""",
+    )

mteb/tasks/Reranking/eng/SWEbenchLiteReranking.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskReranking import AbsTaskReranking
+
+
+class SWEbenchLiteReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="SWEbenchLiteRR",
+        description="Software Issue Localization.",
+        reference="https://www.swebench.com/",
+        dataset={
+            "path": "tarsur909/mteb-swe-bench-lite-reranking",
+            "revision": "9020779825304b569312509a068219d1771bae7d",
+        },
+        type="Reranking",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["train"],
+        eval_langs=["eng-Latn", "python-Code"],
+        main_score="recall_at_10",
+        date=("2023-10-10", "2023-10-10"),
+        domains=["Programming", "Written"],
+        task_subtypes=["Code retrieval"],
+        license="mit",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        prompt="Instruct: Given a github issue, identify the code that needs to be changed to fix the issue. Query: ",
+        bibtex_citation=r"""
+@misc{jimenez2024swebenchlanguagemodelsresolve,
+  title={SWE-bench: Can Language Models Resolve Real-World GitHub Issues?},
+  author={Carlos E. Jimenez and John Yang and Alexander Wettig and Shunyu Yao and Kexin Pei and Ofir Press and Karthik Narasimhan},
+  year={2024},
+  eprint={2310.06770},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2310.06770},
+}
+""",
+    )

mteb/tasks/Reranking/eng/SWEbenchVerifiedReranking.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks.AbsTaskReranking import AbsTaskReranking
+
+
+class SWEbenchVerifiedReranking(AbsTaskReranking):
+    metadata = TaskMetadata(
+        name="SWEbenchVerifiedRR",
+        description="Software Issue Localization for SWE-bench Verified",
+        reference="https://openai.com/index/introducing-swe-bench-verified/",
+        dataset={
+            "path": "tarsur909/mteb-swe-bench-verified-reranking",
+            "revision": "796ae0b4b187e5c0533a12411dee0d8e34eaf0b5",
+        },
+        type="Reranking",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["train"],
+        eval_langs=["eng-Latn", "python-Code"],
+        main_score="recall_at_10",
+        date=("2024-08-13", "2024-08-13"),  # release date of SWE-bench Verified
+        domains=["Programming", "Written"],
+        task_subtypes=["Code retrieval"],
+        license="mit",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        prompt="Instruct: Given a github issue, identify the code that needs to be changed to fix the issue. Query: ",
+        bibtex_citation=r"""
+@misc{openai2024swebenchverified,
+  title={Introducing swe-bench verified},
+  author={OpenAI},
+  year={2024},
+  url={https://openai.com/index/introducing-swe-bench-verified/},
+}
+""",
+    )
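Once these task modules are importable (see the __init__.py change above), the new datasets should be runnable through the usual mteb entry points. A minimal sketch, assuming a SentenceTransformer-compatible encoder; the model name and output folder are placeholders, not part of the commit:

import mteb
from sentence_transformers import SentenceTransformer

# Placeholder encoder; any model mteb can wrap should work here.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Task names come from the TaskMetadata definitions above.
tasks = mteb.get_tasks(tasks=["SWEbenchLiteRR", "SWEbenchVerifiedRR", "LocBenchRR"])

evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results")  # main score per task: recall_at_10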
