
Commit 4ea811d

Author: chenxwh
Commit message: add parsed ref
Parent: 1162d60

File tree: 1 file changed (+104 −82)

src/fairseq2/recipes/lm/_online_finetune/_generative_judge.py

Lines changed: 104 additions & 82 deletions
@@ -1,32 +1,11 @@
-# POINTWISE_J1_PROMPT = """
-# You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
-
-# Think carefully about how to assess the quality of the response, and enclose your reasoning within <think> and </think> tags. Your reasoning should include your evaluation criteria, a clear understanding of what an ideal response would look like for this particular question, and a concrete example of such an ideal or reference answer if possible. Then compare the assistant's response to your ideal or reference answer, explaining how it aligns with or deviates from your expectations. Be specific and avoid vague or overly general judgments. Remain as objective as possible.
-
-# Finally, assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
-
-# Format your output like this:
-# <think> your_thinking_process </think>
-# <score> your_score </score>
-
-# Below are the user's question and the assistant's response:
-
-# [User Question]
-# {instruction}
-
-# [The Start of the Assistant's Answer]
-# {response}
-# [The End of the Assistant's Answer]
-# """
-
 POINTWISE_J1_PROMPT = """
-You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. Do not allow the length of the response to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
 
-Think carefully about how to assess the quality of the response and finally assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+Think carefully about how to assess the quality of the response and assign the assistant's response a score of 1 if the response is correct, and 0 if not. Enclose the score within <score> and </score> tags.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score> your_score </score>
+<score> 0 or 1 </score>
 
 Below are the user's question and the assistant's response:
 
@@ -38,14 +17,35 @@
 [The End of the Assistant's Answer]
 """
 
+
+# POINTWISE_J1_PROMPT = """
+# You are given a user question and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
+
+# Think carefully about how to assess the quality of the response and assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+
+# Format your output like this:
+# <think> your_thinking_process </think>
+# <score> your_score </score>
+
+# Below are the user's question and the assistant's response:
+
+# [User Question]
+# {instruction}
+
+# [The Start of the Assistant's Answer]
+# {response}
+# [The End of the Assistant's Answer]
+# """
+
+
 POINTWISE_J1_PROMPT_WITH_REF_ANSWER = """
 You are given a user question, a reference answer and a response from an AI assistant. Your task is to act as an impartial judge and evaluate how well the response fulfills the user's instructions. You will be shown multiple responses to the same prompt, but only one at a time. Evaluate each response independently.
 
-Think carefully about how to assess the quality of the response and finally assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision. A higher score should indicate a higher-quality response. Enclose the score within <score> and </score> tags.
+Think carefully about how to assess the quality of the response and assign the assistant's response a score of 1 if the response is correct, and 0 if not. Enclose the score within <score> and </score> tags.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score> your_score </score>
+<score> 0 or 1 </score>
 
 Below are the user's question, reference answer and the assistant's response:
 
@@ -60,14 +60,11 @@
 [The End of the Assistant's Answer]
 """
 
-# PAIRWISE_WITH_SCORES_J1_PROMPT = """
-# You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer.
-
-# First, provide your reasoning within <think> and </think> tags. This should include your evaluation criteria for a high-quality response, a detailed comparison of the two responses, and when helpful, a reference answer as part of your evaluation. Be explicit in your thought process, referencing your criteria and explaining how each response aligns with or deviates from them.
 
-# Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# PAIRWISE_WITH_SCORES_J1_PROMPT = """
+# You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-# Finally, assign the assistant's response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+# Think carefully about how to assess the quality of the responses and assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 # Format your output like this:
 # <think> your_thinking_process </think>
@@ -87,14 +84,15 @@
 # [The End of Assistant B's Answer]
 # """
 
+
 PAIRWISE_WITH_SCORES_J1_PROMPT = """
 You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+Think carefully about how to assess the quality of the responses and assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+<score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
 Below are the user's question and the two responses:
 
@@ -110,63 +108,50 @@
 [The End of Assistant B's Answer]
 """
 
-KWISE_WITH_SCORES_J1_PROMPT = """
-You are given a user question and {k} responses from {k} AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
-
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_assistant_1> </score_assistant_1>, <score_assistant_2> </score_assistant_2> and so on.
-
-Format your output like this:
-<think> your_thinking_process </think>
-<score_assistant_1> your_score_1 </score_assistant_1>
-<score_assistant_2> your_score_2 </score_assistant_2>
-<score_assistant_3> your_score_3 </score_assistant_3>
-...
-
-Below are the user's question and the two responses:
-
-[User Question]
-{instruction}
-
-{responses}
-"""
+# PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
+# You are given a user question, two responses from two AI assistants, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-KWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
-You are given a user question and {k} responses from {k} AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_assistant_1> </score_assistant_1>, <score_assistant_2> </score_assistant_2> and so on.
+# Format your output like this:
+# <think> your_thinking_process </think>
+# <score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
 
-Format your output like this:
-<think> your_thinking_process </think>
-<score_assistant_1> your_score_1 </score_assistant_1>
-<score_assistant_2> your_score_2 </score_assistant_2>
-<score_assistant_3> your_score_3 </score_assistant_3>
-...
+# Below are the user's question, reference answer and the two responses:
 
-Below are the user's question and the two responses:
+# [User Question]
+# {instruction}
 
-[User Question]
-{instruction}
+# [Reference Answer]
+# {reference_answer}
 
-[Reference Answer]
-{reference_answer}
+# [The Start of Assistant A's Answer]
+# {response_A}
+# [The End of Assistant A's Answer]
 
-{responses}
-"""
+# [The Start of Assistant B's Answer]
+# {response_B}
+# [The End of Assistant B's Answer]
+# """
 
-# PAIRWISE_WITH_SCORES_J1_PROMPT = """
-# You are given a user question and two responses from two AI assistants. You are also given their thinking process. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Care any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+# PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
+# You are given a user question, two responses from two AI assistants, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-# Carefully analyze the assistants' thought process, assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+# Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Finally, assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 # Format your output like this:
 # <think> your_thinking_process </think>
-# <score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+# <score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
-# Below are the user's question and the two responses:
+
+# Below are the user's question, reference answer and the two responses:
 
 # [User Question]
 # {instruction}
 
+# [Reference Answer]
+# {reference_answer}
+
 # [The Start of Assistant A's Answer]
 # {response_A}
 # [The End of Assistant A's Answer]
@@ -176,30 +161,38 @@
 # [The End of Assistant B's Answer]
 # """
 
+
 PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER = """
-You are given a user question and two responses from two AI assistants. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
+You are given a user question, two responses from two AI assistants, the parsed versions of those responses, and a reference answer. Your task is to act as an impartial judge and evaluate which response better follows the user's instructions and provides a higher-quality answer. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.
 
-Think carefully about how to assess the quality of the responses and finally, assign each response a score from 0 to 10, using either an integer or a decimal with up to 0.1 precision, with a higher score indicating a higher-quality response that better satisfies the criteria. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
+Think carefully about how to assess the quality of the responses and utilize the reference answer for your judgement. Note that the parsed versions of the responses are automatically extracted and may contain errors; therefore, you should primarily rely on the original responses for your judgement.
+Finally, assign each response a score of 1 if the response is correct, and 0 if not. Enclose the scores within the tags <score_A> </score_A>, and <score_B> </score_B>.
 
 Format your output like this:
 <think> your_thinking_process </think>
-<score_A> your_score_a </score_A> <score_B> your_score_b </score_B>
+<score_A> 0 or 1 </score_A> <score_B> 0 or 1 </score_B>
 
-Below are the user's question, reference answer and the two responses:
+Below are the user's question, the two responses, the parsed versions of the responses, and the reference answer:
 
 [User Question]
 {instruction}
 
-[Reference Answer]
-{reference_answer}
-
 [The Start of Assistant A's Answer]
 {response_A}
 [The End of Assistant A's Answer]
 
 [The Start of Assistant B's Answer]
 {response_B}
 [The End of Assistant B's Answer]
+
+[The Parsed Version of Assistant A's Answer]
+{parsed_response_A}
+
+[The Parsed Version of Assistant B's Answer]
+{parsed_response_B}
+
+[Reference Answer]
+{reference_answer}
 """
 
 import re
@@ -475,6 +468,22 @@ def config_kls(self):
 class J1PairwiseScoreExtractor(JudgmentExtractor):
     def __init__(self, tokenizer):
         self.tokenizer = tokenizer
+        try:
+            from math_verify import parse
+            from math_verify.parser import (
+                ExprExtractionConfig,
+                LatexExtractionConfig,
+                NormalizationConfig,
+            )
+        except ImportError:
+            raise ImportError(
+                "install math_verify from https://github.com/huggingface/Math-Verify"
+            )
+
+        self.student_extraction_config = (
+            LatexExtractionConfig(boxed_match_priority=0),
+        )
+        self.parse = parse
 
     @override
     def prompt(self, reference_answer):
@@ -483,6 +492,17 @@ def prompt(self, reference_answer):
             if reference_answer is None
             else PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER
         )
+
+    def get_preferred_index(self, lst):
+        """
+        math_verify's parse returns a list of parsed answers; we want the item at index 1, which is a string.
+        """
+        if len(lst) > 1:
+            return lst[1]
+        elif len(lst) == 1:
+            return lst[0]
+        else:
+            return "None"
 
     @override
     def format_prompt(
@@ -498,9 +518,11 @@ def format_prompt(
             if reference_answer is None
             else prompt_template.format(
                 instruction=prompt_text,
-                reference_answer=reference_answer,
                 response_A=rollout_A_text,
                 response_B=rollout_B_text,
+                parsed_response_A=self.get_preferred_index(self.parse(rollout_A_text, self.student_extraction_config)),
+                parsed_response_B=self.get_preferred_index(self.parse(rollout_B_text, self.student_extraction_config)),
+                reference_answer=reference_answer,
             )
         )
 
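For context on what the new parsing path does at runtime, here is a minimal standalone sketch of how math_verify's parse and the get_preferred_index helper above interact, assuming math_verify is installed; the rollout string and the values in the comments are hypothetical, for illustration only.

from math_verify import parse
from math_verify.parser import LatexExtractionConfig

# Same extraction config as in __init__ above: prefer \boxed{...} matches.
extraction_config = (LatexExtractionConfig(boxed_match_priority=0),)

def get_preferred_index(lst):
    # parse() returns a list of candidate parses; when two are present,
    # index 1 holds the string form of the extracted answer.
    if len(lst) > 1:
        return lst[1]
    elif len(lst) == 1:
        return lst[0]
    else:
        return "None"

rollout_text = "Adding the two parts gives \\boxed{42}."  # hypothetical rollout
parsed = parse(rollout_text, extraction_config)  # e.g. [Integer(42), '42']
print(get_preferred_index(parsed))  # '42'

The returned string is what format_prompt feeds into the {parsed_response_A} and {parsed_response_B} placeholders of PAIRWISE_WITH_SCORES_J1_PROMPT_WITH_REF_ANSWER, while the prompt instructs the judge to rely primarily on the original responses in case the automatic extraction is wrong.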