
Commit 4ea811d

Author: chenxwh
Commit message: add parsed ref
Parent: 1162d60

File tree: 1 file changed (+104 −82)

src/fairseq2/recipes/lm/_online_finetune/_generative_judge.py

Lines changed: 104 additions & 82 deletions
@@ -1,32 +1,11 @@
-# POINTWISE_J1_PROMPT = """
-# You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
-
-# Think carefully about how to assess the quality of the response, and enclose your reasoning within <think> and </think> tags. Your reasoning should include your evaluation criteria, a clear understanding of what an ideal response would look like for this particular question, and a concrete example of such an ideal or reference answer if possible. Then compare the assistant's response to your ideal or reference answer, explaining how it aligns with or deviates from your expectations. Be specific and avoid vague or overly general judgments. Remain as objective as possible.
-
-# Finally, assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
-
-# Format your output like this:
-# <think> your_thinking_process </think>
-# <score> your_score </score>
-
-# Below are the user's question and the assistant's response:
-
-# [User Question]
-# {instruction}
-
-# [The Start of the Assistant's Answer]
-# {response}
-# [The End of the Assistant's Answer]
-# """
-
 POINTWISE_J1_PROMPT = """
-You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. Do not allow the length of the response to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
 
-Think carefully about how to assess the quality of the response and finally assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+Think carefully about how to assess the quality of the response and assign the assistant's response a score of 1 if the response is correct, and 0 if not. Enclose the score within <score> and </score> tags.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score> your_score </score>
+<score> 0 or 1 </score>
 
 Below are the user's question and the assistant's response:
 
@@ -38,14 +17,35 @@
 [The End of the Assistant's Answer]
 """
 
+
+# POINTWISE_J1_PROMPT = """
+# You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
+
+# Think carefully about how to assess the quality of the response and assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+
+# Format your output like this:
+# <think> your_thinking_process </think>
+# <score> your_score </score>
+
+# Below are the user's question and the assistant's response:
+
+# [User Question]
+# {instruction}
+
+# [The Start of the Assistant's Answer]
+# {response}
+# [The End of the Assistant's Answer]
+# """
+
+
 POINTWISE_J1_PROMPT_WITH_REF_ANSWER = """
 You are given a user question, a reference answer and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
 
-Think carefully about how to assess the quality of the response and finally assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+Think carefully about how to assess the quality of the response and assign the assistant's response a score of 1 if the response is correct, and 0 if not. Enclose the score within <score> and </score> tags.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score> your_score </score>
+<score> 0 or 1 </score>
 
 Below are the user's question, reference answer and the assistant's response:
 
@@ -60,14 +60,11 @@
 [The End of the Assistant's Answer]
 """
 
-# PAIRWISE_WITH_SCORES_J1_PROMPT = """
-# You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer.
-
-# First, provide your reasoning within <think> and </think> tags. This should include your evaluation criteria for a high-quality response, a detailed comparison of the two responses, and when helpful, a reference answer as part of your evaluation. Be explicit in your thought process, referencing your criteria and explaining how each response aligns with or deviates from them.
 
-# Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# PAIRWISE_WITH_SCORES_J1_PROMPT = """
+# You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-# Finally, assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+# Think carefully about how to assess the quality of the responses and assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 # Format your output like this:
 # <think> your_thinking_process </think>
@@ -87,14 +84,15 @@
 # [The End of Assistant B's Answer]
 # """
 
+
 PAIRWISE_WITH_SCORES_J1_PROMPT = """
 You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+Think carefully about how to assess the quality of the responses and assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+<score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
 Below are the user's question and the two responses:
 
@@ -110,63 +108,50 @@
 [The End of Assistant B's Answer]
 """
 
-KWISE_WITH_SCORES_J1_PROMPT = """
-You are given a user question and {k} responses from {k} AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
-
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_assistant_1> </score_assistant_1>, <score_assistant_2> </score_assistant_2> and so on.
-
-Format your output like this:
-<think> your_thinking_process </think>
-<score_assistant_1> your_score_1 </score_assistant_1>
-<score_assistant_2> your_score_2 </score_assistant_2>
-<score_assistant_3> your_score_3 </score_assistant_3>
-...
-
-Below are the user's question and the two responses:
-
-[User Question]
-{instruction}
-
-{responses}
-"""
+# PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
+# You are given a user question, two responses from two AI assistants, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-KWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
-You are given a user question and {k} responses from {k} AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_assistant_1> </score_assistant_1>, <score_assistant_2> </score_assistant_2> and so on.
+# Format your output like this:
+# <think> your_thinking_process </think>
+# <score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
 
-Format your output like this:
-<think> your_thinking_process </think>
-<score_assistant_1> your_score_1 </score_assistant_1>
-<score_assistant_2> your_score_2 </score_assistant_2>
-<score_assistant_3> your_score_3 </score_assistant_3>
-...
+# Below are the user's question, reference answer and the two responses:
 
-Below are the user's question and the two responses:
+# [User Question]
+# {instruction}
 
-[User Question]
-{instruction}
+# [Reference Answer]
+# {reference_answer}
 
-[Reference Answer]
-{reference_answer}
+# [The Start of Assistant A's Answer]
+# {response_A}
+# [The End of Assistant A's Answer]
 
-{responses}
-"""
+# [The Start of Assistant B's Answer]
+# {response_B}
+# [The End of Assistant B's Answer]
+# """
 
-# PAIRWISE_WITH_SCORES_J1_PROMPT = """
-# You are given a user question and two responses from two AI assistants. You are also given their thinking process. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Care any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
+# You are given a user question, two responses from two AI assistants, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-# Carefully analyze the assistants' thought process, assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+# Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Finally, assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 # Format your output like this:
 # <think> your_thinking_process </think>
-# <score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+# <score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
-# Below are the user's question and the two responses:
+
+# Below are the user's question, reference answer and the two responses:
 
 # [User Question]
 # {instruction}
 
+# [Reference Answer]
+# {reference_answer}
+
 # [The Start of Assistant A's Answer]
 # {response_A}
 # [The End of Assistant A's Answer]
@@ -176,30 +161,38 @@
 # [The End of Assistant B's Answer]
 # """
 
+
 PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
-You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+You are given a user question, two responses from two AI assistants, the parsed versions of those responses, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Note that the parsed versions of the responses are automatically extracted and may contain errors; therefore, you should primarily rely on the original responses for your judgement.
+Finally, assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+<score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
-Below are the user's question, reference answer and the two responses:
+Below are the user's question, the two responses, the parsed versions of the responses, and the reference answer:
 
 [User Question]
 {instruction}
 
-[Reference Answer]
-{reference_answer}
-
 [The Start of Assistant A's Answer]
 {response_A}
 [The End of Assistant A's Answer]
 
 [The Start of Assistant B's Answer]
 {response_B}
 [The End of Assistant B's Answer]
+
+[The Parsed Version of Assistant A's Answer]
+{parsed_response_A}
+
+[The Parsed Version of Assistant B's Answer]
+{parsed_response_B}
+
+[Reference Answer]
+{reference_answer}
 """
 
 import re
@@ -475,6 +468,22 @@ def config_kls(self):
 class J1PairwiseScoreExtractor(JudgmentExtractor):
     def __init__(self, tokenizer):
         self.tokenizer = tokenizer
+        try:
+            from math_verify import parse
+            from math_verify.parser import (
+                ExprExtractionConfig,
+                LatexExtractionConfig,
+                NormalizationConfig,
+            )
+        except ImportError:
+            raise ImportError(
+                "install math_verify from https://github.com/huggingface/Math-Verify"
+            )
+
+        self.student_extraction_config = (
+            LatexExtractionConfig(boxed_match_priority=0),
+        )
+        self.parse = parse
 
     @override
     def prompt(self, reference_answer):
@@ -483,6 +492,17 @@ def prompt(self, reference_answer):
             if reference_answer is None
             else PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER
         )
+
+    def get_preferred_index(self, lst):
+        """
+        math_verify's parse returns a list of parsed answers; we want the item at index 1, which is a string.
+        """
+        if len(lst) > 1:
+            return lst[1]
+        elif len(lst) == 1:
+            return lst[0]
+        else:
+            return "None"
 
     @override
     def format_prompt(
@@ -498,9 +518,11 @@ def format_prompt(
             if reference_answer is None
             else prompt_template.format(
                 instruction=prompt_text,
-                reference_answer=reference_answer,
                 response_A=rollout_A_text,
                 response_B=rollout_B_text,
+                parsed_response_A=self.get_preferred_index(self.parse(rollout_A_text, self.student_extraction_config)),
+                parsed_response_B=self.get_preferred_index(self.parse(rollout_B_text, self.student_extraction_config)),
+                reference_answer=reference_answer,
             )
         )
 
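For context on what the new parsing path does at runtime, here is a minimal standalone sketch of how math_verify's parse and the get_preferred_index helper above interact, assuming math_verify is installed; the rollout string and the values in the comments are hypothetical, for illustration only.

from math_verify import parse
from math_verify.parser import LatexExtractionConfig

# Same extraction config as in __init__ above: prefer \boxed{...} matches.
extraction_config = (LatexExtractionConfig(boxed_match_priority=0),)

def get_preferred_index(lst):
    # parse() returns a list of candidate parses; when two are present,
    # index 1 holds the string form of the extracted answer.
    if len(lst) > 1:
        return lst[1]
    elif len(lst) == 1:
        return lst[0]
    else:
        return "None"

rollout_text = "Adding the two parts gives \\boxed{42}."  # hypothetical rollout
parsed = parse(rollout_text, extraction_config)  # e.g. [Integer(42), '42']
print(get_preferred_index(parsed))  # '42'

The returned string is what format_prompt feeds into the {parsed_response_A} and {parsed_response_B} placeholders of PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER, while the prompt instructs the judge to rely primarily on the original responses in case the automatic extraction is wrong.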