@@ -6,6 +6,8 @@
 if TYPE_CHECKING:
     import pandas as pd

+import asyncio
+
 import tqdm

 import dspy
@@ -51,6 +53,7 @@ class EvaluationResult(Prediction):
     - score: A float value (e.g., 67.30) representing the overall performance
     - results: a list of (example, prediction, score) tuples for each example in devset
     """
+
     def __init__(self, score: float, results: list[Tuple["dspy.Example", "dspy.Example", Any]]):
        super().__init__(score=score, results=results)

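Note: `EvaluationResult` is a `dspy.Prediction` subclass, so the two fields documented above are plain attributes. A minimal usage sketch, assuming an already-configured program, metric, and devset (the names `program`, `my_metric`, and `devset` are illustrative placeholders, not part of this diff):

import dspy

# Hypothetical setup; `devset` and `my_metric` are placeholders.
evaluator = dspy.Evaluate(devset=devset, metric=my_metric, num_threads=8)
result = evaluator(program)

print(result.score)  # overall percentage, e.g. 67.30
for example, prediction, score in result.results:
    ...  # one (example, prediction, score) tuple per devset item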
@@ -126,71 +129,132 @@ def __call__(

         Returns:
             The evaluation results are returned as a dspy.EvaluationResult object containing the following attributes:
-
+
             - score: A float percentage score (e.g., 67.30) representing overall performance
-
+
             - results: a list of (example, prediction, score) tuples for each example in devset
         """
-        metric = metric if metric is not None else self.metric
-        devset = devset if devset is not None else self.devset
-        num_threads = num_threads if num_threads is not None else self.num_threads
-        display_progress = display_progress if display_progress is not None else self.display_progress
-        display_table = display_table if display_table is not None else self.display_table
-
+        metric, devset, num_threads, display_progress, display_table = self._resolve_call_args(
+            metric, devset, num_threads, display_progress, display_table
+        )
         if callback_metadata:
             logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
-
         tqdm.tqdm._instances.clear()
+        results = self._execute_with_multithreading(program, metric, devset, num_threads, disable_progress_bar=not display_progress)
+        return self._process_evaluate_result(devset, results, metric, display_table)

-        executor = ParallelExecutor(
-            num_threads=num_threads,
-            disable_progress_bar=not display_progress,
-            max_errors=(
-                self.max_errors
-                if self.max_errors is not None
-                else dspy.settings.max_errors
-            ),
-            provide_traceback=self.provide_traceback,
-            compare_results=True,
+    @with_callbacks
+    async def acall(
+        self,
+        program: "dspy.Module",
+        metric: Callable | None = None,
+        devset: List["dspy.Example"] | None = None,
+        num_threads: int | None = None,
+        display_progress: bool | None = None,
+        display_table: bool | int | None = None,
+        callback_metadata: dict[str, Any] | None = None,
+    ) -> EvaluationResult:
+        """Async version of `Evaluate.__call__`."""
+        metric, devset, num_threads, display_progress, display_table = self._resolve_call_args(
+            metric, devset, num_threads, display_progress, display_table
+        )
+        if callback_metadata:
+            logger.debug(f"Evaluate.acall is called with callback metadata: {callback_metadata}")
+        tqdm.tqdm._instances.clear()
+        results = await self._execute_with_event_loop(program, metric, devset, num_threads)
+        return self._process_evaluate_result(devset, results, metric, display_table)
+
+    def _resolve_call_args(self, metric, devset, num_threads, display_progress, display_table):
+        return (
+            metric or self.metric,
+            devset or self.devset,
+            num_threads or self.num_threads,
+            display_progress or self.display_progress,
+            display_table or self.display_table,
         )

-        def process_item(example):
-            prediction = program(**example.inputs())
-            score = metric(example, prediction)
-
-            # Increment assert and suggest failures to program's attributes
-            if hasattr(program, "_assert_failures"):
-                program._assert_failures += dspy.settings.get("assert_failures")
-            if hasattr(program, "_suggest_failures"):
-                program._suggest_failures += dspy.settings.get("suggest_failures")
-
-            return prediction, score
-
-        results = executor.execute(process_item, devset)
+    def _process_evaluate_result(self, devset, results, metric, display_table):
         assert len(devset) == len(results)
-
         results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results]
         results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results, strict=False)]
         ncorrect, ntotal = sum(score for *_, score in results), len(devset)
-
         logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)")
-
         if display_table:
             if importlib.util.find_spec("pandas") is not None:
-                # Rename the 'correct' column to the name of the metric object
                 metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__
-                # Construct a pandas DataFrame from the results
                 result_df = self._construct_result_table(results, metric_name)
-
                 self._display_result_table(result_df, display_table, metric_name)
             else:
                 logger.warning("Skipping table display since `pandas` is not installed.")
-
         return EvaluationResult(
             score=round(100 * ncorrect / ntotal, 2),
             results=results,
         )

+    def _execute_with_multithreading(
+        self,
+        program: "dspy.Module",
+        metric: Callable,
+        devset: List["dspy.Example"],
+        num_threads: int,
+        disable_progress_bar: bool,
+    ):
+        executor = ParallelExecutor(
+            num_threads=num_threads,
+            disable_progress_bar=disable_progress_bar,
+            max_errors=(self.max_errors or dspy.settings.max_errors),
+            provide_traceback=self.provide_traceback,
+            compare_results=True,
+        )
+
+        def process_item(example):
+            prediction = program(**example.inputs())
+            score = metric(example, prediction)
+            return prediction, score
+
+        return executor.execute(process_item, devset)
+
+    async def _execute_with_event_loop(
+        self,
+        program: "dspy.Module",
+        metric: Callable,
+        devset: List["dspy.Example"],
+        num_threads: int,
+    ):
+        queue = asyncio.Queue()
+        results = [None for _ in range(len(devset))]
+        for i, example in enumerate(devset):
+            await queue.put((i, example))
+
+        for _ in range(num_threads):
+            # Add a sentinel value to indicate that the worker should exit
+            await queue.put((-1, None))
+
+        # Create tqdm progress bar
+        pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True)
+
+        async def worker():
+            while True:
+                index, example = await queue.get()
+                if index == -1:
+                    break
+                prediction = await program.acall(**example.inputs())
+                score = metric(example, prediction)
+                results[index] = (prediction, score)
+
+                vals = [r[-1] for r in results if r is not None]
+                nresults = sum(vals)
+                ntotal = len(vals)
+                pct = round(100 * nresults / ntotal, 1) if ntotal else 0
+                pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({pct}%)")
+                pbar.update(1)
+                queue.task_done()
+
+        workers = [asyncio.create_task(worker()) for _ in range(num_threads)]
+        await asyncio.gather(*workers)
+        pbar.close()
+
+        return results

     def _construct_result_table(
         self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
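Taken together, the new async path fans work out to `num_threads` event-loop worker tasks that pull `(index, example)` pairs from a shared `asyncio.Queue` and write scores back by index, so result order matches the devset; one `(-1, None)` sentinel per worker retires the pool once the queue drains. A minimal sketch of driving the new `acall` entry point, assuming a program whose modules support async calls and a synchronous metric (`program`, `my_metric`, and `devset` are illustrative placeholders, not part of this diff):

import asyncio

import dspy

# Hypothetical driver for the async API added in this diff.
async def main():
    evaluator = dspy.Evaluate(devset=devset, metric=my_metric, num_threads=16)
    result = await evaluator.acall(program)  # async counterpart of evaluator(program)
    print(result.score)

asyncio.run(main())

One sentinel per worker is the usual way to shut down a fixed pool of queue consumers without cancelling a task mid-item; and since the workers are joined via `asyncio.gather` rather than `queue.join()`, the `queue.task_done()` calls are informational rather than load-bearing here.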