Merge pull request #6 from nathan-weinberg/skeleton

nathan-weinberg · web-flow · commit 6632002daeec · 2024-06-17T13:31:04.000-04:00
Initial skeleton for Evaluator classes and exceptions
diff --git a/README.md b/README.md
@@ -1,3 +1,8 @@
 # eval
 
+![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main)
+![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main)
+![Release](https://img.shields.io/github/v/release/instructlab/eval)
+![License](https://img.shields.io/github/license/instructlab/eval)
+
 Python library for Evaluation
diff --git a/src/instructlab/eval/evaluator.py b/src/instructlab/eval/evaluator.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class Evaluator:
+    """
+    Parent class for Evaluators
+
+    Attributes:
+        model_path   Path to the model to be evaluated
+    """
+
+    def __init__(self, model_path: str) -> None:
+        self.model_path = model_path  # stored as given; no validation happens here
diff --git a/src/instructlab/eval/exceptions.py b/src/instructlab/eval/exceptions.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+class EvalError(Exception):
+    """
+    Parent class for all instructlab-eval exceptions
+    """
+
+
+class ModelNotFoundError(EvalError):
+    """
+    Exception raised when model is not able to be found
+
+    Attributes:
+        message     error message to be printed on raise (also passed to Exception)
+        model       model that is being operated on (last "/"-separated part of path)
+        path        filepath of model location
+    """
+
+    def __init__(self, path: str) -> None:
+        self.path = path
+        self.model = path.rsplit("/")[-1]
+        self.message = f"Model {self.model} could not be found at {self.path}"
+        super().__init__(self.message)  # so str(exc) and tracebacks show it
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Massive Multitask Language Understanding (MMLU)
+
+    Attributes:
+        tasks        list of tasks for MMLU to test the model with
+        few_shots    number of examples
+        batch_size   number of GPUs
+    """
+
+    def __init__(
+        self, model_path: str, tasks: list[str], few_shots: int = 2, batch_size: int = 5
+    ) -> None:
+        super().__init__(model_path)
+        self.tasks = tasks
+        self.few_shots = few_shots
+        self.batch_size = batch_size  # NOTE(review): docs say "number of GPUs"; confirm
+
+    def run(self) -> tuple[float, dict[str, float]]:
+        """
+        Runs MMLU evaluation (skeleton: currently returns placeholder values only)
+
+        Returns:
+            overall_score       MMLU score for the overall model evaluation
+            individual_scores   Individual MMLU score for each task
+        """
+        individual_scores: dict[str, float] = {}
+        overall_score: float = 0.0
+        return overall_score, individual_scores
+
+
+class PR_MMLU_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR Massive Multitask Language Understanding (PR MMLU)
+
+    Attributes:
+        sdg_path    path where all the PR MMLU tasks are stored
+        task        group name that is shared by all the PR MMLU tasks
+        few_shots   number of examples
+        batch_size  number of GPUs
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        sdg_path: str,
+        task: str = "mmlu_pr",
+        few_shots: int = 2,
+        batch_size: int = 5,
+    ) -> None:
+        super().__init__(model_path)
+        self.sdg_path = sdg_path
+        self.task = task
+        self.few_shots = few_shots
+        self.batch_size = batch_size  # NOTE(review): docs say "number of GPUs"; confirm
+
+    def run(self) -> tuple[float, dict[str, float], list[tuple]]:
+        """
+        Runs PR MMLU evaluation (skeleton: currently returns placeholder values only)
+
+        Returns:
+            overall_score       PR MMLU score for the overall model evaluation
+            individual_scores   Individual PR MMLU scores for each task
+            qa_pairs            Question and answer pairs from the evaluation
+        """
+        individual_scores: dict[str, float] = {}
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, individual_scores, qa_pairs
diff --git a/src/instructlab/eval/mtbench.py b/src/instructlab/eval/mtbench.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Local
+from .evaluator import Evaluator
+
+
+class MT_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for Multi-turn Benchmark (MT-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+    """
+
+    def __init__(self, model_path: str, server_url: str) -> None:
+        super().__init__(model_path)
+        self.server_url = server_url
+
+    def run(self) -> tuple[float, list[tuple]]:
+        """
+        Runs MT-Bench evaluation (skeleton: currently returns placeholder values only)
+
+        Returns:
+            overall_score   MT-Bench score for the overall model evaluation
+            qa_pairs        Question and answer pairs from the evaluation
+        """
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, qa_pairs
+
+
+class PR_Bench_Evaluator(Evaluator):
+    """
+    Child class of an Evaluator for PR-Bench Benchmark (PR-Bench)
+
+    Attributes:
+        server_url  vLLM server endpoint
+        questions   questions to be asked
+    """
+
+    def __init__(self, model_path: str, server_url: str, questions: str) -> None:
+        super().__init__(model_path)
+        self.server_url = server_url
+        self.questions = questions
+
+    def run(self) -> tuple[float, list[tuple]]:
+        """
+        Runs PR-Bench evaluation (skeleton: currently returns placeholder values only)
+
+        Returns:
+            overall_score   PR-Bench score for the overall model evaluation
+            qa_pairs        Question and answer pairs from the evaluation
+        """
+        overall_score: float = 0.0
+        qa_pairs: list[tuple] = []
+        return overall_score, qa_pairs