From de146067ea424c20726a95f2bd9f6fa9feebf93d Mon Sep 17 00:00:00 2001 From: klopsahlong Date: Sun, 19 Jan 2025 13:12:29 -0800 Subject: [PATCH 01/18] log full evals as trials in optuna --- dspy/teleprompt/mipro_optimizer_v2.py | 94 +++++++++++++++++++++------ dspy/teleprompt/utils.py | 21 +++--- 2 files changed, 84 insertions(+), 31 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index bd8eb48ad1..31b170b77d 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -6,6 +6,7 @@ import numpy as np import optuna +from optuna.distributions import CategoricalDistribution import dspy from dspy.evaluate.evaluate import Evaluate @@ -477,8 +478,19 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: - logger.info("Evaluating the default program...\n") - default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng) + + # Run optimization + optuna.logging.set_verbosity(optuna.logging.WARNING) + logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") + logger.info( + "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n" + ) + + adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 + logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") + + # logger.info("Evaluating the default program...\n") + default_score, baseline_results = eval_candidate_program(len(valset), valset, program, evaluate, self.rng, return_all_scores=True) logger.info(f"Default program score: {default_score}\n") trial_logs = {} @@ -500,11 +512,12 @@ def _optimize_prompt_parameters( # Define the objective function def objective(trial): - nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data + nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data, minibatch_full_eval_steps trial_num = trial.number + 1 if minibatch: - logger.info(f"== Minibatch Trial {trial_num} / {num_trials} ==") + adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 + logger.info(f"== Trial {trial_num} / {adjusted_num_trials} - Minibatch ==") else: logger.info(f"===== Trial {trial_num} / {num_trials} =====") @@ -514,7 +527,7 @@ def objective(trial): candidate_program = program.deepcopy() # Choose instructions and demos, insert them into the program - chosen_params = self._select_and_insert_instructions_and_demos( + chosen_params, raw_chosen_params = self._select_and_insert_instructions_and_demos( candidate_program, instruction_candidates, demo_candidates, @@ -554,6 +567,7 @@ def objective(trial): num_trials, trial_logs, trial_num, + minibatch_full_eval_steps, candidate_program, total_eval_calls, ) @@ -563,7 +577,7 @@ def objective(trial): ) categorical_key = ",".join(map(str, chosen_params)) param_score_dict[categorical_key].append( - (score, candidate_program), + (score, candidate_program, raw_chosen_params), ) # If minibatch, perform full evaluation at intervals @@ -573,6 +587,8 @@ def objective(trial): ): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, + num_trials, + minibatch_full_eval_steps, param_score_dict, fully_evaled_param_combos, evaluate, @@ -582,20 +598,28 @@ def objective(trial): score_data, best_score, best_program, + study, + instruction_candidates, + 
demo_candidates ) - + return score - # Run optimization - optuna.logging.set_verbosity(optuna.logging.WARNING) - logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") - logger.info( - "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n" - ) - sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True) study = optuna.create_study(direction="maximize", sampler=sampler) - study.optimize(objective, n_trials=num_trials) + + default_params = {f"{i}_predictor_instruction": 0 for i in range(len(program.predictors()))} + if demo_candidates: + default_params.update({f"{i}_predictor_demos": 0 for i in range(len(program.predictors()))}) + + # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on) + trial = optuna.trial.create_trial( + params=default_params, + distributions= self._get_param_distributions(program, instruction_candidates, demo_candidates), + value=default_score, + ) + study.add_trial(trial) + study.optimize(objective, n_trials=num_trials-1) # Attach logs to best program if best_program is not None and self.track_stats: @@ -624,6 +648,7 @@ def _log_minibatch_eval( num_trials, trial_logs, trial_num, + minibatch_full_eval_steps, candidate_program, total_eval_calls, ): @@ -633,6 +658,9 @@ def _log_minibatch_eval( trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() + + # Log adjusted num trials as total trial num + adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 logger.info( f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." 
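For context on the pattern the hunks above rely on, here is a minimal, standalone sketch of how a score computed outside an Optuna study can be injected as a completed trial so the TPE sampler can condition on it — the core idea behind "log full evals as trials in optuna". The parameter names, choice counts, and the baseline score below are made up for illustration; only the Optuna calls (CategoricalDistribution, optuna.trial.create_trial, study.add_trial, TPESampler) mirror what the diff uses.

import optuna
from optuna.distributions import CategoricalDistribution

# Hypothetical search space: one instruction index and one demo-set index for a single predictor.
distributions = {
    "0_predictor_instruction": CategoricalDistribution(list(range(3))),
    "0_predictor_demos": CategoricalDistribution(list(range(2))),
}

# Score obtained outside the study, e.g. a full evaluation of the default program (made-up value).
baseline_score = 0.42

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=9, multivariate=True),
)

# Register the known result as an already-completed trial so the sampler can learn from it.
baseline_trial = optuna.trial.create_trial(
    params={"0_predictor_instruction": 0, "0_predictor_demos": 0},
    distributions=distributions,
    value=baseline_score,
)
study.add_trial(baseline_trial)

# Trials proposed by a subsequent study.optimize(...) call now start from this baseline.

In the patch, the same mechanism is reused after each periodic full evaluation inside _perform_full_evaluation, so that full-eval scores (not only minibatch scores) also inform the sampler.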
@@ -642,7 +670,7 @@ def _log_minibatch_eval( logger.info(f"Full eval scores so far: {trajectory}") logger.info(f"Best full score so far: {best_score}") logger.info( - f'{"="*len(f"== Minibatch Trial {trial.number+1} / {num_trials} ==")}\n\n' + f'{"="*len(f"== Trial {trial.number+1} / {adjusted_num_trials} - Minibatch Evaluation ==")}\n\n' ) def _log_normal_eval( @@ -670,6 +698,7 @@ def _select_and_insert_instructions_and_demos( trial_num: int, ) -> List[str]: chosen_params = [] + raw_chosen_params = {} for i, predictor in enumerate(candidate_program.predictors()): # Select instruction @@ -683,7 +712,7 @@ def _select_and_insert_instructions_and_demos( set_signature(predictor, updated_signature) trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}") - + raw_chosen_params[f"{i}_predictor_instruction"] = instruction_idx # Select demos if available if demo_candidates: demos_idx = trial.suggest_categorical( @@ -692,12 +721,25 @@ def _select_and_insert_instructions_and_demos( predictor.demos = demo_candidates[i][demos_idx] trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}") + raw_chosen_params[f"{i}_predictor_demos"] = instruction_idx + + return chosen_params, raw_chosen_params - return chosen_params + def _get_param_distributions(self, program, instruction_candidates, demo_candidates): + param_distributions = {} + + for i in range(len(instruction_candidates)): + param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(range(len(instruction_candidates[i]))) + if demo_candidates: + param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(range(len(demo_candidates[i]))) + + return param_distributions def _perform_full_evaluation( self, trial_num: int, + num_trials: int, + minibatch_full_eval_steps: int, param_score_dict: Dict, fully_evaled_param_combos: Dict, evaluate: Evaluate, @@ -707,11 +749,15 @@ def _perform_full_evaluation( score_data, best_score: float, best_program: Any, + study: optuna.Study, + instruction_candidates: List, + demo_candidates: List, ): - logger.info(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") + adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 + logger.info(f"===== Trial {trial_num+1} /{adjusted_num_trials} - Full Evaluation =====") # Identify best program to evaluate fully - highest_mean_program, mean_score, combo_key = ( + highest_mean_program, mean_score, combo_key, params = ( get_program_with_highest_avg_score( param_score_dict, fully_evaled_param_combos ) @@ -724,6 +770,14 @@ def _perform_full_evaluation( ) score_data.append((full_eval_score, highest_mean_program, True)) + # Log full eval as a trial so that optuna can learn from the new results + trial = optuna.trial.create_trial( + params=params, + distributions= self._get_param_distributions(best_program, instruction_candidates, demo_candidates), + value=full_eval_score, + ) + study.add_trial(trial) + # Log full evaluation results fully_evaled_param_combos[combo_key] = { "program": highest_mean_program, diff --git a/dspy/teleprompt/utils.py b/dspy/teleprompt/utils.py index bdc6a529a1..96cc2d44cd 100644 --- a/dspy/teleprompt/utils.py +++ b/dspy/teleprompt/utils.py @@ -42,25 +42,23 @@ def create_minibatch(trainset, batch_size=50, rng=None): return minibatch -def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng=None): +def eval_candidate_program(batch_size, 
trainset, candidate_program, evaluate, rng=None, return_all_scores=False): """Evaluate a candidate program on the trainset, using the specified batch size.""" try: # Evaluate on the full trainset if batch_size >= len(trainset): - score = evaluate(candidate_program, devset=trainset) + return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores) # Or evaluate on a minibatch else: - score = evaluate( + return evaluate( candidate_program, devset=create_minibatch(trainset, batch_size, rng), + return_all_scores=return_all_scores ) except Exception as e: print(f"Exception occurred: {e}") - score = 0.0 # TODO: Handle this better, as -ve scores are possible - - return score - + return 0.0 # TODO: Handle this better, as -ve scores are possible def eval_candidate_program_with_pruning( trial, trial_logs, trainset, candidate_program, evaluate, trial_num, batch_size=100, @@ -114,22 +112,23 @@ def get_program_with_highest_avg_score(param_score_dict, fully_evaled_param_comb scores = np.array([v[0] for v in values]) mean = np.average(scores) program = values[0][1] - results.append((key, mean, program)) + params = values[0][2] + results.append((key, mean, program, params)) # Sort results by the mean sorted_results = sorted(results, key=lambda x: x[1], reverse=True) # Find the combination with the highest mean, skip fully evaluated ones for combination in sorted_results: - key, mean, program = combination + key, mean, program, params = combination if key in fully_evaled_param_combos: continue - return program, mean, key + return program, mean, key, params # If no valid program is found, we return the last valid one that we found - return program, mean, key + return program, mean, key, params def calculate_last_n_proposed_quality( From 41bca2f1798f2f14788a4008b9ce2cc470bbf02b Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 20 Jan 2025 14:45:04 -0700 Subject: [PATCH 02/18] fixing off by one error in trial counting --- dspy/teleprompt/mipro_optimizer_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 8378bbaf8a..4f363416f9 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -544,7 +544,7 @@ def objective(trial): # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) score = eval_candidate_program( - batch_size, valset, candidate_program, evaluate, self.rng + batch_size, valset, candidate_program, evaluate, self.rng ) total_eval_calls += batch_size @@ -581,9 +581,11 @@ def objective(trial): ) # If minibatch, perform full evaluation at intervals + adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials + if minibatch and ( (trial_num % minibatch_full_eval_steps == 0) - or (trial_num == num_trials) + or (trial_num == (adjusted_num_trials -1)) ): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, From 6896950386da7d0146619c9259c2a97bde94ba06 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 20 Jan 2025 15:03:13 -0700 Subject: [PATCH 03/18] cleaning up adjusted_trials variable a bit --- dspy/teleprompt/mipro_optimizer_v2.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 4f363416f9..f2c2483f3d 100644 --- 
a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -486,7 +486,8 @@ def _optimize_prompt_parameters( "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n" ) - adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 + # Compute the adjusted total trials that we will run (including full evals) + adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") # logger.info("Evaluating the default program...\n") @@ -516,7 +517,6 @@ def objective(trial): trial_num = trial.number + 1 if minibatch: - adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 logger.info(f"== Trial {trial_num} / {adjusted_num_trials} - Minibatch ==") else: logger.info(f"===== Trial {trial_num} / {num_trials} =====") @@ -564,10 +564,9 @@ def objective(trial): chosen_params, score_data, trial, - num_trials, + adjusted_num_trials, trial_logs, trial_num, - minibatch_full_eval_steps, candidate_program, total_eval_calls, ) @@ -580,17 +579,14 @@ def objective(trial): (score, candidate_program, raw_chosen_params), ) - # If minibatch, perform full evaluation at intervals - adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials - + # If minibatch, perform full evaluation at intervals (and at the very end) if minibatch and ( (trial_num % minibatch_full_eval_steps == 0) or (trial_num == (adjusted_num_trials -1)) ): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, - num_trials, - minibatch_full_eval_steps, + adjusted_num_trials, param_score_dict, fully_evaled_param_combos, evaluate, @@ -647,10 +643,9 @@ def _log_minibatch_eval( chosen_params, score_data, trial, - num_trials, + adjusted_num_trials, trial_logs, trial_num, - minibatch_full_eval_steps, candidate_program, total_eval_calls, ): @@ -661,9 +656,6 @@ def _log_minibatch_eval( trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() - # Log adjusted num trials as total trial num - adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 - logger.info( f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." 
) @@ -740,8 +732,7 @@ def _get_param_distributions(self, program, instruction_candidates, demo_candida def _perform_full_evaluation( self, trial_num: int, - num_trials: int, - minibatch_full_eval_steps: int, + adjusted_num_trials: int, param_score_dict: Dict, fully_evaled_param_combos: Dict, evaluate: Evaluate, @@ -755,7 +746,6 @@ def _perform_full_evaluation( instruction_candidates: List, demo_candidates: List, ): - adjusted_num_trials = num_trials + num_trials // minibatch_full_eval_steps + 1 logger.info(f"===== Trial {trial_num+1} /{adjusted_num_trials} - Full Evaluation =====") # Identify best program to evaluate fully From 0cea467959e42613fa2a65997a20db0d1e5cd53b Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 20 Jan 2025 15:05:11 -0700 Subject: [PATCH 04/18] minor cleanups --- dspy/teleprompt/mipro_optimizer_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index f2c2483f3d..82dfdbf81d 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -490,7 +490,6 @@ def _optimize_prompt_parameters( adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") - # logger.info("Evaluating the default program...\n") default_score, baseline_results = eval_candidate_program(len(valset), valset, program, evaluate, self.rng, return_all_scores=True) logger.info(f"Default program score: {default_score}\n") @@ -513,7 +512,7 @@ def _optimize_prompt_parameters( # Define the objective function def objective(trial): - nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data, minibatch_full_eval_steps + nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data trial_num = trial.number + 1 if minibatch: From d9a7f0b94eb827649858eeb4f2012cd85dbed0f0 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 24 Mar 2025 09:33:15 -0400 Subject: [PATCH 05/18] changes to json adapter to get kie to run --- dspy/adapters/json_adapter.py | 50 +++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/dspy/adapters/json_adapter.py b/dspy/adapters/json_adapter.py index a2fbcd5554..8c7ad5326b 100644 --- a/dspy/adapters/json_adapter.py +++ b/dspy/adapters/json_adapter.py @@ -6,6 +6,7 @@ import textwrap from copy import deepcopy from typing import Any, Dict, KeysView, Literal, NamedTuple +from typing import get_args, Union import json_repair import litellm @@ -35,25 +36,25 @@ def __call__(self, lm, lm_kwargs, signature, demos, inputs): inputs = self.format(signature, demos, inputs) inputs = dict(prompt=inputs) if isinstance(inputs, str) else dict(messages=inputs) - try: - provider = lm.model.split("/", 1)[0] or "openai" - params = litellm.get_supported_openai_params(model=lm.model, custom_llm_provider=provider) - if params and "response_format" in params: - try: - response_format = _get_structured_outputs_response_format(signature) - outputs = lm(**inputs, **lm_kwargs, response_format=response_format) - except Exception: - logger.debug( - "Failed to obtain response using signature-based structured outputs" - " response format: Falling back to default 'json_object' response format." 
- " Exception: {e}" - ) - outputs = lm(**inputs, **lm_kwargs, response_format={"type": "json_object"}) - else: - outputs = lm(**inputs, **lm_kwargs) - - except litellm.UnsupportedParamsError: - outputs = lm(**inputs, **lm_kwargs) + # try: + # provider = lm.model.split("/", 1)[0] or "openai" + # params = litellm.get_supported_openai_params(model=lm.model, custom_llm_provider=provider) + # if params and "response_format" in params: + # try: + # response_format = _get_structured_outputs_response_format(signature) + # outputs = lm(**inputs, **lm_kwargs, response_format=response_format) + # except Exception: + # logger.debug( + # "Failed to obtain response using signature-based structured outputs" + # " response format: Falling back to default 'json_object' response format." + # " Exception: {e}" + # ) + # outputs = lm(**inputs, **lm_kwargs, response_format={"type": "json_object"}) + # else: + # outputs = lm(**inputs, **lm_kwargs) + + # except litellm.UnsupportedParamsError: + outputs = lm(**inputs, **lm_kwargs) values = [] @@ -63,7 +64,7 @@ def __call__(self, lm, lm_kwargs, signature, demos, inputs): signature.output_fields.keys() ), f"Expected {signature.output_fields.keys()} but got {value.keys()}" values.append(value) - + return values def format(self, signature, demos, inputs): @@ -120,8 +121,12 @@ def format_fields(self, signature, values, role): def parse_value(value, annotation): - if annotation is str: - return str(value) + is_optional_str = ( + getattr(annotation, '__origin__', None) is Union and str in get_args(annotation) + ) + + if annotation is str or is_optional_str: + return str(value) if value is not None else None # Ensure string output, preserving None parsed_value = value @@ -135,7 +140,6 @@ def parse_value(value, annotation): parsed_value = ast.literal_eval(value) except (ValueError, SyntaxError): parsed_value = value - return TypeAdapter(annotation).validate_python(parsed_value) From a83c7a9ffc1a41abde1cb6cc3defe17b73cd377b Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Wed, 26 Mar 2025 13:07:39 -0400 Subject: [PATCH 06/18] wiip --- dspy/teleprompt/mipro_optimizer_v2.py | 93 +---- dspy/teleprompt/simba.py | 24 +- dspy/teleprompt/utils.py | 55 +++ dspy/utils/parallelizer.py | 496 ++++++++++++++++++-------- 4 files changed, 444 insertions(+), 224 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index e2c3035b00..b90915ea42 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -21,6 +21,7 @@ print_full_program, save_candidate_program, set_signature, + log_token_usage ) logger = logging.getLogger(__name__) @@ -456,10 +457,7 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: -<<<<<<< HEAD -======= ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 # Run optimization optuna.logging.set_verbosity(optuna.logging.WARNING) logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") @@ -471,21 +469,17 @@ def _optimize_prompt_parameters( adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") -<<<<<<< HEAD - default_score, baseline_results = eval_candidate_program(len(valset), valset, program, evaluate, self.rng, return_all_scores=True) -======= default_score, _ = eval_candidate_program( len(valset), valset, program, evaluate, self.rng, return_all_scores=True ) ->>>>>>> 
fd67cf09b82e4529459d8616bc481c3c33312208 logger.info(f"Default program score: {default_score}\n") trial_logs = {} trial_logs[-1] = {} - trial_logs[-1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) + # trial_logs[-1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) trial_logs[-1]["full_eval_score"] = default_score trial_logs[-1]["total_eval_calls_so_far"] = len(valset) - trial_logs[-1]["full_eval_program"] = program.deepcopy() + # trial_logs[-1]["full_eval_program"] = program.deepcopy() # Initialize optimization variables best_score = default_score @@ -527,13 +521,7 @@ def objective(trial): # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) -<<<<<<< HEAD - score = eval_candidate_program( - batch_size, valset, candidate_program, evaluate, self.rng - ) -======= score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng) ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 total_eval_calls += batch_size # Update best score and program @@ -581,14 +569,7 @@ def objective(trial): ) # If minibatch, perform full evaluation at intervals (and at the very end) -<<<<<<< HEAD - if minibatch and ( - (trial_num % minibatch_full_eval_steps == 0) - or (trial_num == (adjusted_num_trials -1)) - ): -======= if minibatch and ((trial_num % minibatch_full_eval_steps == 0) or (trial_num == (adjusted_num_trials - 1))): ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, adjusted_num_trials, @@ -603,13 +584,12 @@ def objective(trial): best_program, study, instruction_candidates, -<<<<<<< HEAD - demo_candidates -======= demo_candidates, ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 ) + # Log model token usage at the end of the trial + log_token_usage(trial_logs, trial_num, model_dict={"prompt_model": self.prompt_model, "teacher_model": self.teacher_settings["lm"], "task_model": self.task_model}) + return score sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True) @@ -622,19 +602,11 @@ def objective(trial): # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on) trial = optuna.trial.create_trial( params=default_params, -<<<<<<< HEAD - distributions= self._get_param_distributions(program, instruction_candidates, demo_candidates), - value=default_score, - ) - study.add_trial(trial) - study.optimize(objective, n_trials=num_trials-1) -======= distributions=self._get_param_distributions(program, instruction_candidates, demo_candidates), value=default_score, ) study.add_trial(trial) study.optimize(objective, n_trials=num_trials - 1) ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 # Attach logs to best program if best_program is not None and self.track_stats: @@ -670,22 +642,10 @@ def _log_minibatch_eval( candidate_program, total_eval_calls, ): - trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) + # trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls - trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() -<<<<<<< HEAD - - logger.info( - f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." 
- ) - logger.info(f"Minibatch scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if not s[2]]) +']'}") - trajectory = "[" + ", ".join([f"{s[0]}" for s in score_data if s[2]]) + "]" - logger.info(f"Full eval scores so far: {trajectory}") - logger.info(f"Best full score so far: {best_score}") - logger.info( - f'{"="*len(f"== Trial {trial.number+1} / {adjusted_num_trials} - Minibatch Evaluation ==")}\n\n' -======= + # trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.") minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]]) @@ -696,7 +656,6 @@ def _log_minibatch_eval( logger.info(f"Best full score so far: {best_score}") logger.info( f"{'=' * len(f'== Trial {trial.number + 1} / {adjusted_num_trials} - Minibatch Evaluation ==')}\n\n" ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 ) def _log_normal_eval( @@ -714,12 +673,12 @@ def _log_normal_eval( candidate_program, total_eval_calls, ): - trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( - candidate_program, self.log_dir, trial_num - ) + # trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( + # candidate_program, self.log_dir, trial_num + # ) trial_logs[trial_num]["full_eval_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls - trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() + # trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} with parameters {chosen_params}.") full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) @@ -764,13 +723,9 @@ def _get_param_distributions(self, program, instruction_candidates, demo_candida param_distributions = {} for i in range(len(instruction_candidates)): -<<<<<<< HEAD - param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(range(len(instruction_candidates[i]))) -======= param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution( range(len(instruction_candidates[i])) ) ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 if demo_candidates: param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(range(len(demo_candidates[i]))) @@ -793,21 +748,11 @@ def _perform_full_evaluation( instruction_candidates: List, demo_candidates: List, ): -<<<<<<< HEAD - logger.info(f"===== Trial {trial_num+1} /{adjusted_num_trials} - Full Evaluation =====") - - # Identify best program to evaluate fully - highest_mean_program, mean_score, combo_key, params = ( - get_program_with_highest_avg_score( - param_score_dict, fully_evaled_param_combos - ) -======= logger.info(f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation =====") # Identify best program to evaluate fully highest_mean_program, mean_score, combo_key, params = get_program_with_highest_avg_score( param_score_dict, fully_evaled_param_combos ->>>>>>> fd67cf09b82e4529459d8616bc481c3c33312208 ) logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...") full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng) @@ -836,13 +781,13 @@ def _perform_full_evaluation( } total_eval_calls += len(valset) trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls - trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( - program=highest_mean_program, - 
log_dir=self.log_dir, - trial_num=trial_num, - note="full_eval", - ) - trial_logs[trial_num]["full_eval_program"] = highest_mean_program + # trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( + # program=highest_mean_program, + # log_dir=self.log_dir, + # trial_num=trial_num, + # note="full_eval", + # ) + # trial_logs[trial_num]["full_eval_program"] = highest_mean_program trial_logs[trial_num]["full_eval_score"] = full_eval_score # Update best score and program if necessary diff --git a/dspy/teleprompt/simba.py b/dspy/teleprompt/simba.py index 9d41c563ab..d281c47db3 100644 --- a/dspy/teleprompt/simba.py +++ b/dspy/teleprompt/simba.py @@ -6,6 +6,7 @@ from typing import Callable from dspy.teleprompt.teleprompt import Teleprompter from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule +from dspy.teleprompt.utils import log_token_usage logger = logging.getLogger(__name__) @@ -47,7 +48,10 @@ def __init__( self.temperature_for_sampling = temperature_for_sampling self.temperature_for_candidates = temperature_for_candidates - self.strategies = [append_a_demo(demo_input_field_maxlen), append_a_rule] + if self.max_demos > 0: + self.strategies = [append_a_demo(demo_input_field_maxlen), append_a_rule] + else: + self.strategies = [append_a_rule] def compile(self, student: dspy.Module, *, trainset: list[dspy.Example], seed: int = 0): # Basic checks @@ -117,7 +121,10 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): # Parallel runner run_parallel = dspy.Parallel(access_examples=False, num_threads=self.num_threads) + trial_logs = {} for batch_idx in range(self.max_steps): + trial_logs[batch_idx] = {} + logger.info(f"Starting batch {batch_idx+1} of {self.max_steps}.") # STEP 1: Get next batch @@ -205,12 +212,14 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): name2predictor = {} num_demos_list = [] + max_demos_tmp = self.max_demos if self.max_demos > 0 else 3 + for name, predictor in system_candidate.named_predictors(): name2predictor[name] = predictor num_demos_list.append(len(predictor.demos)) num_demos = max(num_demos_list) if num_demos_list else 0 - num_demos_to_drop = max(rng_np.poisson(num_demos / self.max_demos), int(num_demos >= self.max_demos)) + num_demos_to_drop = max(rng_np.poisson(num_demos / max_demos_tmp), int(num_demos >= max_demos_tmp)) num_demos_to_drop = min(num_demos_to_drop, num_demos) demos_to_drop = [rng.randrange(num_demos) for _ in range(num_demos_to_drop)] @@ -268,7 +277,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): if candidate_scores: best_idx_among_candidates = candidate_scores.index(max(candidate_scores)) best_program = system_candidates[best_idx_among_candidates] - winning_programs.append(best_program) + winning_programs.append(best_program.deepcopy()) # STEP 8: Register all new candidate systems in our global pool for idx_cand, cand_sys in enumerate(system_candidates): @@ -276,6 +285,8 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): end = (idx_cand + 1) * self.bsize sys_scores = [outputs[i]["score"] for i in range(start, end)] register_new_program(cand_sys, sys_scores) + + log_token_usage(trial_logs, batch_idx, {"lm": dspy.settings.lm}) M = len(winning_programs) - 1 N = self.num_candidates + 1 @@ -286,7 +297,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): program_idxs = [round(i * M / (N - 1)) for i in range(N)] program_idxs = list(dict.fromkeys(program_idxs)) - 
candidate_programs = [winning_programs[i] for i in program_idxs] + candidate_programs = [winning_programs[i].deepcopy() for i in program_idxs] logger.info(f"VALIDATION: Evaluating {len(candidate_programs)} programs on the full trainset.") exec_pairs = [(wrap_program(sys, self.metric), ex) for sys in candidate_programs for ex in trainset] outputs = run_parallel(exec_pairs) @@ -298,7 +309,9 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): sys_scores = [outputs[i]["score"] for i in range(start, end)] avg_score = sum(sys_scores) / len(sys_scores) if sys_scores else 0.0 scores.append(avg_score) - + if idx_prog != 0: + trial_logs[idx_prog-1]["train_score"] = avg_score + best_idx = scores.index(max(scores)) if scores else 0 best_program = candidate_programs[best_idx] logger.info( @@ -309,5 +322,6 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): # FIXME: Attach all program candidates in decreasing average score to the best program. best_program.candidate_programs = candidate_programs best_program.winning_programs = winning_programs + best_program.trial_logs = trial_logs return best_program diff --git a/dspy/teleprompt/utils.py b/dspy/teleprompt/utils.py index a64caf61f1..8209e5ef07 100644 --- a/dspy/teleprompt/utils.py +++ b/dspy/teleprompt/utils.py @@ -6,6 +6,7 @@ import random import shutil import sys +from typing import Tuple import numpy as np @@ -135,6 +136,60 @@ def get_program_with_highest_avg_score(param_score_dict, fully_evaled_param_comb # If no valid program is found, we return the last valid one that we found return program, mean, key, params +def get_token_usage(model) -> Tuple[int, int]: + """ + Extract total input tokens and output tokens from a model's interaction history. + Returns (total_input_tokens, total_output_tokens). + """ + if not hasattr(model, "history"): + return 0, 0 + + input_tokens = [] + output_tokens = [] + for interaction in model.history: + usage = interaction.get("usage", {}) + _input_tokens = usage.get("prompt_tokens", 0) + _output_tokens = usage.get("completion_tokens", 0) + input_tokens.append(_input_tokens) + output_tokens.append(_output_tokens) + + total_input_tokens = np.sum(input_tokens) + total_output_tokens = np.sum(output_tokens) + + return total_input_tokens, total_output_tokens + +def extract_token_usage(model): + """Return (total_input_tokens, total_output_tokens) by summing usage in model.history.""" + if not model or not hasattr(model, "history"): + # If model is None or doesn't have a .history, return 0 usage. + return 0, 0 + + input_tokens = [] + output_tokens = [] + for interaction in model.history: + usage = interaction.get("usage", {}) + _input_tokens = usage.get("prompt_tokens", 0) + _output_tokens = usage.get("completion_tokens", 0) + input_tokens.append(_input_tokens) + output_tokens.append(_output_tokens) + return int(np.sum(input_tokens)), int(np.sum(output_tokens)) + +def log_token_usage(trial_logs, trial_num, model_dict): + """ + Extract total input and output tokens used by each model and log to trial_logs[trial_num]["token_usage"]. 
+ """ + + token_usage_dict = {} + + for model_name, model in model_dict.items(): + in_tokens, out_tokens = extract_token_usage(model) + token_usage_dict[model_name] = { + "total_input_tokens": in_tokens, + "total_output_tokens": out_tokens + } + + # Store token usage info in trial logs + trial_logs[trial_num]["token_usage"] = token_usage_dict def calculate_last_n_proposed_quality( base_program, trial_logs, evaluate, trainset, devset, n, diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 5541e380ae..40939ef2ba 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -4,10 +4,9 @@ import logging import threading import traceback -import time import contextlib - -from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED +from tqdm.contrib.logging import logging_redirect_tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError logger = logging.getLogger(__name__) @@ -20,196 +19,403 @@ def __init__( disable_progress_bar=False, provide_traceback=False, compare_results=False, - timeout=120, - straggler_limit=3, + timeout=None, # New timeout parameter in seconds ): - """ - Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1. - Handles also straggler timeouts. - """ - + """Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1.""" self.num_threads = num_threads - self.max_errors = max_errors self.disable_progress_bar = disable_progress_bar + self.max_errors = max_errors self.provide_traceback = provide_traceback self.compare_results = compare_results - self.timeout = timeout - self.straggler_limit = straggler_limit + self.timeout = timeout # Store timeout value self.error_count = 0 self.error_lock = threading.Lock() self.cancel_jobs = threading.Event() def execute(self, function, data): - tqdm.tqdm._instances.clear() - wrapped = self._wrap_function(function) - return self._execute_parallel(wrapped, data) + wrapped_function = self._wrap_function(function) + if self.num_threads == 1: + return self._execute_isolated_single_thread(wrapped_function, data) + else: + return self._execute_multi_thread(wrapped_function, data) - def _wrap_function(self, user_function): - def safe_func(item): + def _wrap_function(self, function): + # Wrap the function with error handling + def wrapped(item): if self.cancel_jobs.is_set(): return None try: - return user_function(item) + return function(item) except Exception as e: with self.error_lock: self.error_count += 1 - if self.error_count >= self.max_errors: - self.cancel_jobs.set() + current_error_count = self.error_count + if current_error_count >= self.max_errors: + self.cancel_jobs.set() + raise e if self.provide_traceback: - logger.error(f"Error for {item}: {e}\n{traceback.format_exc()}") + logger.error( + f"Error processing item {item}: {e}\nStack trace:\n{traceback.format_exc()}" + ) else: logger.error( - f"Error for {item}: {e}. " - "Set `provide_traceback=True` for traceback." + f"Error processing item {item}: {e}. Set `provide_traceback=True` to see the stack trace." 
) return None + return wrapped - return safe_func + def _execute_isolated_single_thread(self, function, data): + results = [] + pbar = tqdm.tqdm( + total=len(data), + dynamic_ncols=True, + disable=self.disable_progress_bar, + file=sys.stdout + ) - def _execute_parallel(self, function, data): - results = [None] * len(data) - job_cancelled = "cancelled" + from dspy.dsp.utils.settings import thread_local_overrides + original_overrides = thread_local_overrides.overrides - # We resubmit at most once per item. - start_time_map = {} - start_time_lock = threading.Lock() - resubmitted = set() + for item in data: + with logging_redirect_tqdm(): + if self.cancel_jobs.is_set(): + break - # This is the worker function each thread will run. - def worker(parent_overrides, submission_id, index, item): - if self.cancel_jobs.is_set(): - return index, job_cancelled - # Record actual start time - with start_time_lock: - start_time_map[submission_id] = time.time() + thread_local_overrides.overrides = original_overrides.copy() - # Apply parent's thread-local overrides - from dspy.dsp.utils.settings import thread_local_overrides + try: + result = function(item) + results.append(result) + finally: + thread_local_overrides.overrides = original_overrides - original = thread_local_overrides.overrides - thread_local_overrides.overrides = parent_overrides.copy() + if self.compare_results: + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in data if r is not None]), + ) + else: + self._update_progress(pbar, len(results), len(data)) - try: - return index, function(item) - finally: - thread_local_overrides.overrides = original + pbar.close() + + if self.cancel_jobs.is_set(): + logger.warning("Execution was cancelled due to errors.") + raise Exception("Execution was cancelled due to errors.") + + return results + + def _update_progress(self, pbar, nresults, ntotal): + if self.compare_results: + percentage = round(100 * nresults / ntotal, 1) if ntotal > 0 else 0 + pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({percentage}%)") + else: + pbar.set_description(f"Processed {nresults} / {ntotal} examples") + + pbar.update() + + def _execute_multi_thread(self, function, data): + results = [None] * len(data) # Pre-allocate results list to maintain order + job_cancelled = "cancelled" - # Handle Ctrl-C in the main thread @contextlib.contextmanager - def interrupt_manager(): + def interrupt_handler_manager(): + """Sets the cancel_jobs event when a SIGINT is received, only in the main thread.""" if threading.current_thread() is threading.main_thread(): - orig_handler = signal.getsignal(signal.SIGINT) + default_handler = signal.getsignal(signal.SIGINT) - def handler(sig, frame): + def interrupt_handler(sig, frame): self.cancel_jobs.set() - logger.warning("SIGINT received. Cancelling.") - orig_handler(sig, frame) + logger.warning("Received SIGINT. 
Cancelling execution.") + default_handler(sig, frame) - signal.signal(signal.SIGINT, handler) + signal.signal(signal.SIGINT, interrupt_handler) try: yield finally: - signal.signal(signal.SIGINT, orig_handler) + signal.signal(signal.SIGINT, default_handler) else: yield - executor = ThreadPoolExecutor(max_workers=self.num_threads) - try: - with interrupt_manager(): - from dspy.dsp.utils.settings import thread_local_overrides + def cancellable_function(parent_overrides, index_item): + index, item = index_item + if self.cancel_jobs.is_set(): + return index, job_cancelled - parent_overrides = thread_local_overrides.overrides.copy() + from dspy.dsp.utils.settings import thread_local_overrides + original_overrides = thread_local_overrides.overrides + thread_local_overrides.overrides = parent_overrides.copy() - futures_map = {} - futures_set = set() - submission_counter = 0 + try: + return index, function(item) + finally: + thread_local_overrides.overrides = original_overrides - for idx, item in enumerate(data): - f = executor.submit( - worker, parent_overrides, submission_counter, idx, item + with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): + from dspy.dsp.utils.settings import thread_local_overrides + parent_overrides = thread_local_overrides.overrides.copy() + + futures = {} + for pair in enumerate(data): + future = executor.submit(cancellable_function, parent_overrides, pair) + futures[future] = pair + + pbar = tqdm.tqdm( + total=len(data), + dynamic_ncols=True, + disable=self.disable_progress_bar, + file=sys.stdout + ) + + for future in as_completed(futures, timeout=self.timeout): # Handle timeouts + try: + index, result = future.result(timeout=self.timeout) + except TimeoutError: + print(f"Task at index {index} timed out.") + index = futures[future][0] + logger.warning(f"Task at index {index} timed out.") + results[index] = None # Store None for timed-out tasks + continue + + if result is job_cancelled: + continue + + results[index] = result + + if self.compare_results: + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in results if r is not None]), ) - futures_map[f] = (submission_counter, idx, item) - futures_set.add(f) - submission_counter += 1 - - pbar = tqdm.tqdm( - total=len(data), - dynamic_ncols=True, - disable=self.disable_progress_bar, - file=sys.stdout, - ) - - def all_done(): - return all(r is not None for r in results) - - while futures_set and not self.cancel_jobs.is_set(): - if all_done(): - break - done, not_done = wait( - futures_set, timeout=1, return_when=FIRST_COMPLETED + else: + self._update_progress( + pbar, + len([r for r in results if r is not None]), + len(data), ) - for f in done: - futures_set.remove(f) - try: - index, outcome = f.result() - except Exception: - pass - else: - if outcome != job_cancelled and results[index] is None: - results[index] = outcome - - # Update progress - if self.compare_results: - vals = [r[-1] for r in results if r is not None] - self._update_progress(pbar, sum(vals), len(vals)) - else: - self._update_progress( - pbar, - len([r for r in results if r is not None]), - len(data), - ) - - if all_done(): - break - - # Check stragglers if few remain - if 0 < self.timeout and len(not_done) <= self.straggler_limit: - now = time.time() - for f in list(not_done): - if f not in resubmitted: - sid, idx, item = futures_map[f] - with start_time_lock: - st = start_time_map.get(sid, None) - if st and (now - st) >= self.timeout: - resubmitted.add(f) - nf = 
executor.submit( - worker, - parent_overrides, - submission_counter, - idx, - item, - ) - futures_map[nf] = (submission_counter, idx, item) - futures_set.add(nf) - submission_counter += 1 - - pbar.close() - - finally: - # Avoid waiting on leftover tasks that no longer matter - executor.shutdown(wait=False) + + pbar.close() if self.cancel_jobs.is_set(): - logger.warning("Execution cancelled due to errors or interruption.") - raise Exception("Execution cancelled due to errors or interruption.") + logger.warning("Execution was cancelled due to errors.") + raise Exception("Execution was cancelled due to errors.") return results - - def _update_progress(self, pbar, nresults, ntotal): - if self.compare_results: - pct = round(100 * nresults / ntotal, 1) if ntotal else 0 - pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({pct}%)") - else: - pbar.set_description(f"Processed {nresults} / {ntotal} examples") - pbar.update() +# import sys +# import tqdm +# import signal +# import logging +# import threading +# import traceback +# import time +# import contextlib + +# from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED + +# logger = logging.getLogger(__name__) + + +# class ParallelExecutor: +# def __init__( +# self, +# num_threads, +# max_errors=5, +# disable_progress_bar=False, +# provide_traceback=False, +# compare_results=False, +# timeout=120, +# straggler_limit=3, +# ): +# """ +# Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1. +# Handles also straggler timeouts. +# """ + +# self.num_threads = num_threads +# self.max_errors = max_errors +# self.disable_progress_bar = disable_progress_bar +# self.provide_traceback = provide_traceback +# self.compare_results = compare_results +# self.timeout = timeout +# self.straggler_limit = straggler_limit + +# self.error_count = 0 +# self.error_lock = threading.Lock() +# self.cancel_jobs = threading.Event() + +# def execute(self, function, data): +# tqdm.tqdm._instances.clear() +# wrapped = self._wrap_function(function) +# return self._execute_parallel(wrapped, data) + +# def _wrap_function(self, user_function): +# def safe_func(item): +# if self.cancel_jobs.is_set(): +# return None +# try: +# return user_function(item) +# except Exception as e: +# with self.error_lock: +# self.error_count += 1 +# if self.error_count >= self.max_errors: +# self.cancel_jobs.set() +# if self.provide_traceback: +# logger.error(f"Error for {item}: {e}\n{traceback.format_exc()}") +# else: +# logger.error( +# f"Error for {item}: {e}. " +# "Set `provide_traceback=True` for traceback." +# ) +# return None + +# return safe_func + +# def _execute_parallel(self, function, data): +# results = [None] * len(data) +# job_cancelled = "cancelled" + +# # We resubmit at most once per item. +# start_time_map = {} +# start_time_lock = threading.Lock() +# resubmitted = set() + +# # This is the worker function each thread will run. 
+# def worker(parent_overrides, submission_id, index, item): +# if self.cancel_jobs.is_set(): +# return index, job_cancelled +# # Record actual start time +# with start_time_lock: +# start_time_map[submission_id] = time.time() + +# # Apply parent's thread-local overrides +# from dspy.dsp.utils.settings import thread_local_overrides + +# original = thread_local_overrides.overrides +# thread_local_overrides.overrides = parent_overrides.copy() + +# try: +# return index, function(item) +# finally: +# thread_local_overrides.overrides = original + +# # Handle Ctrl-C in the main thread +# @contextlib.contextmanager +# def interrupt_manager(): +# if threading.current_thread() is threading.main_thread(): +# orig_handler = signal.getsignal(signal.SIGINT) + +# def handler(sig, frame): +# self.cancel_jobs.set() +# logger.warning("SIGINT received. Cancelling.") +# orig_handler(sig, frame) + +# signal.signal(signal.SIGINT, handler) +# try: +# yield +# finally: +# signal.signal(signal.SIGINT, orig_handler) +# else: +# yield + +# executor = ThreadPoolExecutor(max_workers=self.num_threads) +# try: +# with interrupt_manager(): +# from dspy.dsp.utils.settings import thread_local_overrides + +# parent_overrides = thread_local_overrides.overrides.copy() + +# futures_map = {} +# futures_set = set() +# submission_counter = 0 + +# for idx, item in enumerate(data): +# f = executor.submit( +# worker, parent_overrides, submission_counter, idx, item +# ) +# futures_map[f] = (submission_counter, idx, item) +# futures_set.add(f) +# submission_counter += 1 + +# pbar = tqdm.tqdm( +# total=len(data), +# dynamic_ncols=True, +# disable=self.disable_progress_bar, +# file=sys.stdout, +# ) + +# def all_done(): +# return all(r is not None for r in results) + +# while futures_set and not self.cancel_jobs.is_set(): +# if all_done(): +# break +# done, not_done = wait( +# futures_set, timeout=1, return_when=FIRST_COMPLETED +# ) +# for f in done: +# futures_set.remove(f) +# try: +# index, outcome = f.result() +# except Exception: +# pass +# else: +# if outcome != job_cancelled and results[index] is None: +# results[index] = outcome + +# # Update progress +# if self.compare_results: +# vals = [r[-1] for r in results if r is not None] +# self._update_progress(pbar, sum(vals), len(vals)) +# else: +# self._update_progress( +# pbar, +# len([r for r in results if r is not None]), +# len(data), +# ) + +# if all_done(): +# break + +# # Check stragglers if few remain +# if 0 < self.timeout and len(not_done) <= self.straggler_limit: +# now = time.time() +# for f in list(not_done): +# if f not in resubmitted: +# sid, idx, item = futures_map[f] +# with start_time_lock: +# st = start_time_map.get(sid, None) +# if st and (now - st) >= self.timeout: +# resubmitted.add(f) +# nf = executor.submit( +# worker, +# parent_overrides, +# submission_counter, +# idx, +# item, +# ) +# futures_map[nf] = (submission_counter, idx, item) +# futures_set.add(nf) +# submission_counter += 1 + +# pbar.close() + +# finally: +# # Avoid waiting on leftover tasks that no longer matter +# executor.shutdown(wait=False) + +# if self.cancel_jobs.is_set(): +# logger.warning("Execution cancelled due to errors or interruption.") +# raise Exception("Execution cancelled due to errors or interruption.") + +# return results + +# def _update_progress(self, pbar, nresults, ntotal): +# if self.compare_results: +# pct = round(100 * nresults / ntotal, 1) if ntotal else 0 +# pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({pct}%)") +# else: +# 
pbar.set_description(f"Processed {nresults} / {ntotal} examples") +# pbar.update() From 29d61e500d6c2e8235a6cccab24c9ea976a77ebd Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Fri, 28 Mar 2025 16:49:07 -0400 Subject: [PATCH 07/18] wip --- dspy/teleprompt/mipro_optimizer_v2.py | 7 +- dspy/teleprompt/simba_utils.py | 4 +- dspy/utils/parallelizer.py | 115 +++++++++++++++++++++++--- 3 files changed, 110 insertions(+), 16 deletions(-) diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index b90915ea42..401ca77ebb 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -33,7 +33,7 @@ AUTO_RUN_SETTINGS = { "light": {"num_trials": 7, "val_size": 100}, - "medium": {"num_trials": 25, "val_size": 300}, + "medium": {"num_trials": 25, "val_size": 500}, "heavy": {"num_trials": 50, "val_size": 1000}, } @@ -54,7 +54,7 @@ def __init__( teacher_settings: Dict = {}, max_bootstrapped_demos: int = 4, max_labeled_demos: int = 16, - auto: Optional[Literal["light", "medium", "heavy"]] = None, + auto: Optional[Literal["light", "medium", "heavy"]] = "None", num_candidates: int = 10, num_threads: int = 6, max_errors: int = 10, @@ -588,7 +588,8 @@ def objective(trial): ) # Log model token usage at the end of the trial - log_token_usage(trial_logs, trial_num, model_dict={"prompt_model": self.prompt_model, "teacher_model": self.teacher_settings["lm"], "task_model": self.task_model}) + teacher_model = None if "lm" not in self.teacher_settings else self.teacher_settings["lm"] + log_token_usage(trial_logs, trial_num, model_dict={"prompt_model": self.prompt_model, "teacher_model": teacher_model, "task_model": self.task_model}) return score diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py index 0aea7b5d33..08868fc53b 100644 --- a/dspy/teleprompt/simba_utils.py +++ b/dspy/teleprompt/simba_utils.py @@ -15,7 +15,9 @@ def prepare_models_for_resampling(program: dspy.Module, n: int): lm = program.get_lm() or dspy.settings.lm temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / n) for i in range(n)] temps = list(dict.fromkeys(temps))[:n] - return [lm.copy(temperature=t) for t in temps] + lm_list = [lm.copy(temperature=t) for t in temps] + # Add history back from the original LM so that we can do token estimates correctly + return [lm.copy(history=lm.history) for lm in lm_list] def wrap_program(program: dspy.Module, metric: Callable): diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 40939ef2ba..bb22f7ccb5 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -19,7 +19,7 @@ def __init__( disable_progress_bar=False, provide_traceback=False, compare_results=False, - timeout=None, # New timeout parameter in seconds + timeout=30, # New timeout parameter in seconds ): """Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1.""" self.num_threads = num_threads @@ -115,7 +115,7 @@ def _update_progress(self, pbar, nresults, ntotal): pbar.set_description(f"Processed {nresults} / {ntotal} examples") pbar.update() - + def _execute_multi_thread(self, function, data): results = [None] * len(data) # Pre-allocate results list to maintain order job_cancelled = "cancelled" @@ -157,10 +157,10 @@ def cancellable_function(parent_overrides, index_item): from dspy.dsp.utils.settings import thread_local_overrides parent_overrides = thread_local_overrides.overrides.copy() - futures = {} - for pair in enumerate(data): - future = 
executor.submit(cancellable_function, parent_overrides, pair) - futures[future] = pair + futures = { + executor.submit(cancellable_function, parent_overrides, pair): pair[0] + for pair in enumerate(data) + } pbar = tqdm.tqdm( total=len(data), @@ -169,17 +169,18 @@ def cancellable_function(parent_overrides, index_item): file=sys.stdout ) - for future in as_completed(futures, timeout=self.timeout): # Handle timeouts + for future in as_completed(futures): # no global timeout here + index = futures[future] try: index, result = future.result(timeout=self.timeout) except TimeoutError: - print(f"Task at index {index} timed out.") - index = futures[future][0] logger.warning(f"Task at index {index} timed out.") - results[index] = None # Store None for timed-out tasks - continue + result = None + except Exception as e: + logger.warning(f"Task at index {index} failed with exception: {e}") + result = None - if result is job_cancelled: + if result == job_cancelled: continue results[index] = result @@ -204,6 +205,96 @@ def cancellable_function(parent_overrides, index_item): raise Exception("Execution was cancelled due to errors.") return results + + + # def _execute_multi_thread(self, function, data): + # results = [None] * len(data) # Pre-allocate results list to maintain order + # job_cancelled = "cancelled" + + # @contextlib.contextmanager + # def interrupt_handler_manager(): + # """Sets the cancel_jobs event when a SIGINT is received, only in the main thread.""" + # if threading.current_thread() is threading.main_thread(): + # default_handler = signal.getsignal(signal.SIGINT) + + # def interrupt_handler(sig, frame): + # self.cancel_jobs.set() + # logger.warning("Received SIGINT. Cancelling execution.") + # default_handler(sig, frame) + + # signal.signal(signal.SIGINT, interrupt_handler) + # try: + # yield + # finally: + # signal.signal(signal.SIGINT, default_handler) + # else: + # yield + + # def cancellable_function(parent_overrides, index_item): + # index, item = index_item + # if self.cancel_jobs.is_set(): + # return index, job_cancelled + + # from dspy.dsp.utils.settings import thread_local_overrides + # original_overrides = thread_local_overrides.overrides + # thread_local_overrides.overrides = parent_overrides.copy() + + # try: + # return index, function(item) + # finally: + # thread_local_overrides.overrides = original_overrides + + # with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): + # from dspy.dsp.utils.settings import thread_local_overrides + # parent_overrides = thread_local_overrides.overrides.copy() + + # futures = {} + # for pair in enumerate(data): + # future = executor.submit(cancellable_function, parent_overrides, pair) + # futures[future] = pair + + # pbar = tqdm.tqdm( + # total=len(data), + # dynamic_ncols=True, + # disable=self.disable_progress_bar, + # file=sys.stdout + # ) + + # for future in as_completed(futures, timeout=self.timeout): # Handle timeouts + # try: + # index, result = future.result(timeout=self.timeout) + # except TimeoutError: + # print(f"Task at index {index} timed out.") + # index = futures[future][0] + # logger.warning(f"Task at index {index} timed out.") + # results[index] = None # Store None for timed-out tasks + # continue + + # if result is job_cancelled: + # continue + + # results[index] = result + + # if self.compare_results: + # self._update_progress( + # pbar, + # sum([r[-1] for r in results if r is not None]), + # len([r for r in results if r is not None]), + # ) + # else: + # self._update_progress( + # 
pbar, + # len([r for r in results if r is not None]), + # len(data), + # ) + + # pbar.close() + + # if self.cancel_jobs.is_set(): + # logger.warning("Execution was cancelled due to errors.") + # raise Exception("Execution was cancelled due to errors.") + + # return results # import sys # import tqdm # import signal From 7df5495050fb8027971323a3995c85564e9305d6 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Thu, 3 Apr 2025 10:05:24 -0400 Subject: [PATCH 08/18] wip --- dspy/teleprompt/__init__.py | 2 + dspy/teleprompt/mipro_optimizer_v2.py | 5 +- dspy/teleprompt/simba.py | 7 + dspy/teleprompt/simba_utils.py | 94 ++++- dspy/utils/parallelizer.py | 477 ++------------------------ 5 files changed, 131 insertions(+), 454 deletions(-) diff --git a/dspy/teleprompt/__init__.py b/dspy/teleprompt/__init__.py index 3168cd1c44..2fefec2f21 100644 --- a/dspy/teleprompt/__init__.py +++ b/dspy/teleprompt/__init__.py @@ -6,6 +6,7 @@ from dspy.teleprompt.ensemble import Ensemble from dspy.teleprompt.knn_fewshot import KNNFewShot from dspy.teleprompt.simba import SIMBA +from dspy.teleprompt.simba_fast import SIMBAFast from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2 from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch @@ -29,4 +30,5 @@ "LabeledFewShot", "InferRules", "SIMBA", + "SIMBAFast", ] diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 5d424cb8f8..741efe875e 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -34,7 +34,7 @@ AUTO_RUN_SETTINGS = { "light": {"num_trials": 7, "val_size": 100}, "medium": {"num_trials": 25, "val_size": 500}, - "heavy": {"num_trials": 50, "val_size": 1000}, + "heavy": {"num_trials": 40, "val_size": 1000}, } # ANSI escape codes for colors @@ -221,7 +221,8 @@ def _set_hyperparams_from_run_mode( num_trials = auto_settings["num_trials"] valset = create_minibatch(valset, batch_size=auto_settings["val_size"], rng=self.rng) minibatch = len(valset) > MIN_MINIBATCH_SIZE - self.num_candidates = int(np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars]))) + # self.num_candidates = int(np.round(np.min([num_trials * num_vars, (1 * num_trials) / num_vars]))) + self.num_candidates = int(np.round(np.min([num_trials * num_vars, (num_trials) / num_vars]))) return num_trials, valset, minibatch diff --git a/dspy/teleprompt/simba.py b/dspy/teleprompt/simba.py index 021fe0edfb..a8b5ccec47 100644 --- a/dspy/teleprompt/simba.py +++ b/dspy/teleprompt/simba.py @@ -157,12 +157,17 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): # Use the special wrap that includes the 'example' in the output wrapped_candidate_system = wrap_program(candidate_system, self.metric) exec_pairs.append((wrapped_candidate_system, example)) + + # TODO: check to see if token count from all these models is accounted for in lm.history + # Could add their history to main dspy.settings.lm.history # STEP 2: Execute logger.info(f"Sampling program trajectories on {self.bsize} examples x {self.num_candidates} samples.") outputs = run_parallel(exec_pairs) assert len(outputs) == len(exec_pairs) == self.bsize * self.num_candidates + dspy.settings.lm.history.extend([entry for model in models for entry in model.history]) + # STEP 3: Sort the training buckets by (max-to-min gap, max score, and max-to-avg gap). 
buckets = [] largest_max_to_avg_gap = float("-inf") @@ -285,6 +290,8 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): end = (idx_cand + 1) * self.bsize sys_scores = [outputs[i]["score"] for i in range(start, end)] register_new_program(cand_sys, sys_scores) + + log_token_usage(trial_logs, batch_idx, {"lm": dspy.settings.lm}) M = len(winning_programs) - 1 N = self.num_candidates + 1 diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py index 08868fc53b..3a215986cd 100644 --- a/dspy/teleprompt/simba_utils.py +++ b/dspy/teleprompt/simba_utils.py @@ -15,45 +15,110 @@ def prepare_models_for_resampling(program: dspy.Module, n: int): lm = program.get_lm() or dspy.settings.lm temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / n) for i in range(n)] temps = list(dict.fromkeys(temps))[:n] - lm_list = [lm.copy(temperature=t) for t in temps] - # Add history back from the original LM so that we can do token estimates correctly - return [lm.copy(history=lm.history) for lm in lm_list] - + return [lm.copy(temperature=t) for t in temps] def wrap_program(program: dspy.Module, metric: Callable): def wrapped_program(example): with dspy.context(trace=[]): - prediction, trace, score = None, None, 0.0 + prediction, trace = None, None try: prediction = program(**example.inputs()) except Exception as e: print(e) trace = dspy.settings.trace.copy() + output = None + score = 0.0 + output_metadata = {} + try: - score = metric(example, prediction) + output = metric(example, prediction) + if isinstance(output, (int, float)): + score = output + elif isinstance(output, dspy.Prediction): + score = output.score + # Just extract fields from _store, excluding 'score' + output_metadata = { + k: v for k, v in output._store.items() if k != "score" + } except Exception as e: print(e) - # Include the `example` in the output for subsequent usage in buckets/strategies. return { "prediction": prediction, "trace": trace, "score": score, - "example": example + "example": example, + "output_metadata": output_metadata } return wrapped_program +class SummarizeBucket(dspy.Signature): + """ + Given sets of LLM trajectories on examples in a task, generate an updated summary of the common failure modes across examples. + + The goal is that we can generate a summary that will make it easy for us to re-write the prompt or specifically specify rules that will help the system improve on this task. + + You'll be given an existing summary of example failure modes that has been generated by looking at previous sets of examples. Please update this summary with your additional insights (if any). + + Please make sure to include specific examples from the input that you think are particularly illustrative of the failure modes you're summarizing. If these are also included in the existing summary, please make sure that this information is not dropped in the updated summary. 
+ """ + current_summary = dspy.InputField(desc="The current summary of common failure modes across examples") + # inconsistencies_summary = dspy.InputField(desc="The current summary of potential inconsistencies in the evaluation / areas of ambiguity in the evaluation") + examples = dspy.InputField(desc="The examples to analyze and use to update the summaries with additional insights") + updated_summary = dspy.OutputField(desc="The updated summary of common failure modes across examples") + # updated_inconsistencies_summary = dspy.OutputField(desc="The updated summary of potential inconsistencies in the evaluation / areas of ambiguity in the evaluation") + +def format_examples(buckets): + formatted_examples = "" + for i, bucket in enumerate(buckets): + formatted_examples += f"Example {i+1}:\n" + good, bad = bucket[0], bucket[-1] + example = good["example"] + program_inputs={**example.inputs()} + formatted_examples += f"Inputs: {ujson.dumps(program_inputs, indent=2)}\n" + formatted_examples += f"Bad Prediction: {ujson.dumps(bad['prediction'], indent=2)}\n" + formatted_examples += f"Bad Score: {bad['score']}\n" + formatted_examples += f"Bad Metadata: {ujson.dumps(bad['output_metadata'], indent=2)}\n" + formatted_examples += f"Good Prediction: {ujson.dumps(good['prediction'], indent=2)}\n" + formatted_examples += f"Good Score: {good['score']}\n" + formatted_examples += f"Good Metadata: {ujson.dumps(good['output_metadata'], indent=2)}\n" + formatted_examples += "\n\n" + return formatted_examples + +def summarize_batch(buckets, summary_window=3, max_examples=12): + # TODO: note that we could use a longer context window here + # TODO: we should autocalc the summary window based on what will fit in our model + + summary = "N/A" + inconsistencies_summary = "N/A" + + breakpoint() + + bucket_subset = buckets[:max_examples] + for i in range(0, len(bucket_subset), summary_window): + formatted_examples = format_examples(buckets[i:i+summary_window]) + summarize_bucket_output = dspy.ChainOfThought(SummarizeBucket)(current_summary=summary, examples=formatted_examples) + summary = summarize_bucket_output.updated_summary + # inconsistencies = summarize_bucket_output.updated_inconsistencies_summary + logger.info(f"Summary for bucket {i}: {summary}") + # logger.info(f"Inconsistencies for bucket {i}: {inconsistencies}") def append_a_demo(demo_input_field_maxlen): def append_a_demo_(bucket, system, **kwargs): predictor2name, name2predictor = kwargs["predictor2name"], kwargs["name2predictor"] + batch_10p_score, batch_90p_score = kwargs["batch_10p_score"], kwargs["batch_90p_score"] - trace = bucket[0]["trace"] + good = bucket[0] + trace = good["trace"] name2demo = {} + if good["score"] <= batch_10p_score: + logger.info(f"Skipping appending a demo as good score {good['score']} is at or below the 10th percentile.") + return False + for step in trace: predictor, _inputs, _outputs = step @@ -64,7 +129,6 @@ def append_a_demo_(bucket, system, **kwargs): demo = dspy.Example(augmented=True, **_inputs, **_outputs) name = predictor2name[id(predictor)] name2demo[name] = demo # keep the last demo for each predictor - for name, demo in name2demo.items(): predictor = name2predictor[name] predictor.demos.append(demo) @@ -83,9 +147,9 @@ def append_a_rule(bucket, system, **kwargs): good, bad = bucket[0], bucket[-1] example = good["example"] - if good["score"] < batch_10p_score or bad["score"] > batch_90p_score: - logger.info(f"Skipping rule generation as good score {good['score']} is below the 10th percentile " - f"*or* bad score 
{bad['score']} is above the 90th percentile.") + if good["score"] <= batch_10p_score or bad["score"] >= batch_90p_score: + logger.info(f"Skipping rule generation as good score {good['score']} is at or below the 10th percentile " + f"*or* bad score {bad['score']} is at or above the 90th percentile.") return False if good["score"] <= bad["score"]: @@ -118,6 +182,8 @@ def append_a_rule(bucket, system, **kwargs): worse_program_outputs=dict(bad["prediction"] or {}), worse_reward_value=bad["score"], better_reward_value=good["score"], + worse_reward_info=bad["output_metadata"], + better_reward_info=good["output_metadata"], module_names=module_names, ) @@ -157,11 +223,13 @@ class OfferFeedback(dspy.Signature): ) worse_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") worse_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + worse_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") better_program_trajectory: str = InputField( desc="The trajectory of the program's execution, showing each module's I/O" ) better_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") better_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + better_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") module_names: list[str] = InputField(desc="The names of the modules in the program, for which we seek advice") discussion: str = OutputField(desc="Discussing blame of where each module went wrong, if it did") module_advice: dict[str, str] = OutputField( diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 9fed80e9c8..1607db0477 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -21,7 +21,8 @@ def __init__( disable_progress_bar=False, provide_traceback=False, compare_results=False, - timeout=30, # New timeout parameter in seconds + timeout=120, + straggler_limit=3, ): """ Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1. 
@@ -29,64 +30,57 @@ def __init__( """ self.num_threads = num_threads - self.disable_progress_bar = disable_progress_bar self.max_errors = max_errors + self.disable_progress_bar = disable_progress_bar self.provide_traceback = provide_traceback self.compare_results = compare_results - self.timeout = timeout # Store timeout value + self.timeout = timeout + self.straggler_limit = straggler_limit self.error_count = 0 self.error_lock = threading.Lock() self.cancel_jobs = threading.Event() def execute(self, function, data): - wrapped_function = self._wrap_function(function) - if self.num_threads == 1: - return self._execute_isolated_single_thread(wrapped_function, data) - else: - return self._execute_multi_thread(wrapped_function, data) + tqdm.tqdm._instances.clear() + wrapped = self._wrap_function(function) + return self._execute_parallel(wrapped, data) - def _wrap_function(self, function): - # Wrap the function with error handling - def wrapped(item): + def _wrap_function(self, user_function): + def safe_func(item): if self.cancel_jobs.is_set(): return None try: - return function(item) + return user_function(item) except Exception as e: with self.error_lock: self.error_count += 1 - current_error_count = self.error_count - if current_error_count >= self.max_errors: - self.cancel_jobs.set() - raise e + if self.error_count >= self.max_errors: + self.cancel_jobs.set() if self.provide_traceback: - logger.error( - f"Error processing item {item}: {e}\nStack trace:\n{traceback.format_exc()}" - ) + logger.error(f"Error for {item}: {e}\n{traceback.format_exc()}") else: logger.error(f"Error for {item}: {e}. " "Set `provide_traceback=True` for traceback.") return None - return wrapped - def _execute_isolated_single_thread(self, function, data): - results = [] - pbar = tqdm.tqdm( - total=len(data), - dynamic_ncols=True, - disable=self.disable_progress_bar, - file=sys.stdout - ) + return safe_func - from dspy.dsp.utils.settings import thread_local_overrides - original_overrides = thread_local_overrides.overrides + def _execute_parallel(self, function, data): + results = [None] * len(data) + job_cancelled = "cancelled" - for item in data: - with logging_redirect_tqdm(): - if self.cancel_jobs.is_set(): - break + # We resubmit at most once per item. + start_time_map = {} + start_time_lock = threading.Lock() + resubmitted = set() - thread_local_overrides.overrides = original_overrides.copy() + # This is the worker function each thread will run. 
+ def worker(parent_overrides, submission_id, index, item): + if self.cancel_jobs.is_set(): + return index, job_cancelled + # Record actual start time + with start_time_lock: + start_time_map[submission_id] = time.time() # Apply parent's thread-local overrides from dspy.dsp.utils.settings import thread_local_overrides @@ -115,10 +109,11 @@ def handler(sig, frame): signal.signal(signal.SIGINT, handler) try: - result = function(item) - results.append(result) + yield finally: - thread_local_overrides.overrides = original_overrides + signal.signal(signal.SIGINT, orig_handler) + else: + yield executor = ThreadPoolExecutor(max_workers=self.num_threads) try: @@ -203,411 +198,15 @@ def all_done(): executor.shutdown(wait=False) if self.cancel_jobs.is_set(): - logger.warning("Execution was cancelled due to errors.") - raise Exception("Execution was cancelled due to errors.") + logger.warning("Execution cancelled due to errors or interruption.") + raise Exception("Execution cancelled due to errors or interruption.") return results def _update_progress(self, pbar, nresults, ntotal): if self.compare_results: - percentage = round(100 * nresults / ntotal, 1) if ntotal > 0 else 0 - pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({percentage}%)") + pct = round(100 * nresults / ntotal, 1) if ntotal else 0 + pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({pct}%)") else: pbar.set_description(f"Processed {nresults} / {ntotal} examples") - - pbar.update() - - def _execute_multi_thread(self, function, data): - results = [None] * len(data) # Pre-allocate results list to maintain order - job_cancelled = "cancelled" - - @contextlib.contextmanager - def interrupt_handler_manager(): - """Sets the cancel_jobs event when a SIGINT is received, only in the main thread.""" - if threading.current_thread() is threading.main_thread(): - default_handler = signal.getsignal(signal.SIGINT) - - def interrupt_handler(sig, frame): - self.cancel_jobs.set() - logger.warning("Received SIGINT. 
Cancelling execution.") - default_handler(sig, frame) - - signal.signal(signal.SIGINT, interrupt_handler) - try: - yield - finally: - signal.signal(signal.SIGINT, default_handler) - else: - yield - - def cancellable_function(parent_overrides, index_item): - index, item = index_item - if self.cancel_jobs.is_set(): - return index, job_cancelled - - from dspy.dsp.utils.settings import thread_local_overrides - original_overrides = thread_local_overrides.overrides - thread_local_overrides.overrides = parent_overrides.copy() - - try: - return index, function(item) - finally: - thread_local_overrides.overrides = original_overrides - - with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): - from dspy.dsp.utils.settings import thread_local_overrides - parent_overrides = thread_local_overrides.overrides.copy() - - futures = { - executor.submit(cancellable_function, parent_overrides, pair): pair[0] - for pair in enumerate(data) - } - - pbar = tqdm.tqdm( - total=len(data), - dynamic_ncols=True, - disable=self.disable_progress_bar, - file=sys.stdout - ) - - for future in as_completed(futures): # no global timeout here - index = futures[future] - try: - index, result = future.result(timeout=self.timeout) - except TimeoutError: - logger.warning(f"Task at index {index} timed out.") - result = None - except Exception as e: - logger.warning(f"Task at index {index} failed with exception: {e}") - result = None - - if result == job_cancelled: - continue - - results[index] = result - - if self.compare_results: - self._update_progress( - pbar, - sum([r[-1] for r in results if r is not None]), - len([r for r in results if r is not None]), - ) - else: - self._update_progress( - pbar, - len([r for r in results if r is not None]), - len(data), - ) - - pbar.close() - - if self.cancel_jobs.is_set(): - logger.warning("Execution was cancelled due to errors.") - raise Exception("Execution was cancelled due to errors.") - - return results - - - # def _execute_multi_thread(self, function, data): - # results = [None] * len(data) # Pre-allocate results list to maintain order - # job_cancelled = "cancelled" - - # @contextlib.contextmanager - # def interrupt_handler_manager(): - # """Sets the cancel_jobs event when a SIGINT is received, only in the main thread.""" - # if threading.current_thread() is threading.main_thread(): - # default_handler = signal.getsignal(signal.SIGINT) - - # def interrupt_handler(sig, frame): - # self.cancel_jobs.set() - # logger.warning("Received SIGINT. 
Cancelling execution.") - # default_handler(sig, frame) - - # signal.signal(signal.SIGINT, interrupt_handler) - # try: - # yield - # finally: - # signal.signal(signal.SIGINT, default_handler) - # else: - # yield - - # def cancellable_function(parent_overrides, index_item): - # index, item = index_item - # if self.cancel_jobs.is_set(): - # return index, job_cancelled - - # from dspy.dsp.utils.settings import thread_local_overrides - # original_overrides = thread_local_overrides.overrides - # thread_local_overrides.overrides = parent_overrides.copy() - - # try: - # return index, function(item) - # finally: - # thread_local_overrides.overrides = original_overrides - - # with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): - # from dspy.dsp.utils.settings import thread_local_overrides - # parent_overrides = thread_local_overrides.overrides.copy() - - # futures = {} - # for pair in enumerate(data): - # future = executor.submit(cancellable_function, parent_overrides, pair) - # futures[future] = pair - - # pbar = tqdm.tqdm( - # total=len(data), - # dynamic_ncols=True, - # disable=self.disable_progress_bar, - # file=sys.stdout - # ) - - # for future in as_completed(futures, timeout=self.timeout): # Handle timeouts - # try: - # index, result = future.result(timeout=self.timeout) - # except TimeoutError: - # print(f"Task at index {index} timed out.") - # index = futures[future][0] - # logger.warning(f"Task at index {index} timed out.") - # results[index] = None # Store None for timed-out tasks - # continue - - # if result is job_cancelled: - # continue - - # results[index] = result - - # if self.compare_results: - # self._update_progress( - # pbar, - # sum([r[-1] for r in results if r is not None]), - # len([r for r in results if r is not None]), - # ) - # else: - # self._update_progress( - # pbar, - # len([r for r in results if r is not None]), - # len(data), - # ) - - # pbar.close() - - # if self.cancel_jobs.is_set(): - # logger.warning("Execution was cancelled due to errors.") - # raise Exception("Execution was cancelled due to errors.") - - # return results -# import sys -# import tqdm -# import signal -# import logging -# import threading -# import traceback -# import time -# import contextlib - -# from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED - -# logger = logging.getLogger(__name__) - - -# class ParallelExecutor: -# def __init__( -# self, -# num_threads, -# max_errors=5, -# disable_progress_bar=False, -# provide_traceback=False, -# compare_results=False, -# timeout=120, -# straggler_limit=3, -# ): -# """ -# Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1. -# Handles also straggler timeouts. 
-# """ - -# self.num_threads = num_threads -# self.max_errors = max_errors -# self.disable_progress_bar = disable_progress_bar -# self.provide_traceback = provide_traceback -# self.compare_results = compare_results -# self.timeout = timeout -# self.straggler_limit = straggler_limit - -# self.error_count = 0 -# self.error_lock = threading.Lock() -# self.cancel_jobs = threading.Event() - -# def execute(self, function, data): -# tqdm.tqdm._instances.clear() -# wrapped = self._wrap_function(function) -# return self._execute_parallel(wrapped, data) - -# def _wrap_function(self, user_function): -# def safe_func(item): -# if self.cancel_jobs.is_set(): -# return None -# try: -# return user_function(item) -# except Exception as e: -# with self.error_lock: -# self.error_count += 1 -# if self.error_count >= self.max_errors: -# self.cancel_jobs.set() -# if self.provide_traceback: -# logger.error(f"Error for {item}: {e}\n{traceback.format_exc()}") -# else: -# logger.error( -# f"Error for {item}: {e}. " -# "Set `provide_traceback=True` for traceback." -# ) -# return None - -# return safe_func - -# def _execute_parallel(self, function, data): -# results = [None] * len(data) -# job_cancelled = "cancelled" - -# # We resubmit at most once per item. -# start_time_map = {} -# start_time_lock = threading.Lock() -# resubmitted = set() - -# # This is the worker function each thread will run. -# def worker(parent_overrides, submission_id, index, item): -# if self.cancel_jobs.is_set(): -# return index, job_cancelled -# # Record actual start time -# with start_time_lock: -# start_time_map[submission_id] = time.time() - -# # Apply parent's thread-local overrides -# from dspy.dsp.utils.settings import thread_local_overrides - -# original = thread_local_overrides.overrides -# thread_local_overrides.overrides = parent_overrides.copy() - -# try: -# return index, function(item) -# finally: -# thread_local_overrides.overrides = original - -# # Handle Ctrl-C in the main thread -# @contextlib.contextmanager -# def interrupt_manager(): -# if threading.current_thread() is threading.main_thread(): -# orig_handler = signal.getsignal(signal.SIGINT) - -# def handler(sig, frame): -# self.cancel_jobs.set() -# logger.warning("SIGINT received. 
Cancelling.") -# orig_handler(sig, frame) - -# signal.signal(signal.SIGINT, handler) -# try: -# yield -# finally: -# signal.signal(signal.SIGINT, orig_handler) -# else: -# yield - -# executor = ThreadPoolExecutor(max_workers=self.num_threads) -# try: -# with interrupt_manager(): -# from dspy.dsp.utils.settings import thread_local_overrides - -# parent_overrides = thread_local_overrides.overrides.copy() - -# futures_map = {} -# futures_set = set() -# submission_counter = 0 - -# for idx, item in enumerate(data): -# f = executor.submit( -# worker, parent_overrides, submission_counter, idx, item -# ) -# futures_map[f] = (submission_counter, idx, item) -# futures_set.add(f) -# submission_counter += 1 - -# pbar = tqdm.tqdm( -# total=len(data), -# dynamic_ncols=True, -# disable=self.disable_progress_bar, -# file=sys.stdout, -# ) - -# def all_done(): -# return all(r is not None for r in results) - -# while futures_set and not self.cancel_jobs.is_set(): -# if all_done(): -# break -# done, not_done = wait( -# futures_set, timeout=1, return_when=FIRST_COMPLETED -# ) -# for f in done: -# futures_set.remove(f) -# try: -# index, outcome = f.result() -# except Exception: -# pass -# else: -# if outcome != job_cancelled and results[index] is None: -# results[index] = outcome - -# # Update progress -# if self.compare_results: -# vals = [r[-1] for r in results if r is not None] -# self._update_progress(pbar, sum(vals), len(vals)) -# else: -# self._update_progress( -# pbar, -# len([r for r in results if r is not None]), -# len(data), -# ) - -# if all_done(): -# break - -# # Check stragglers if few remain -# if 0 < self.timeout and len(not_done) <= self.straggler_limit: -# now = time.time() -# for f in list(not_done): -# if f not in resubmitted: -# sid, idx, item = futures_map[f] -# with start_time_lock: -# st = start_time_map.get(sid, None) -# if st and (now - st) >= self.timeout: -# resubmitted.add(f) -# nf = executor.submit( -# worker, -# parent_overrides, -# submission_counter, -# idx, -# item, -# ) -# futures_map[nf] = (submission_counter, idx, item) -# futures_set.add(nf) -# submission_counter += 1 - -# pbar.close() - -# finally: -# # Avoid waiting on leftover tasks that no longer matter -# executor.shutdown(wait=False) - -# if self.cancel_jobs.is_set(): -# logger.warning("Execution cancelled due to errors or interruption.") -# raise Exception("Execution cancelled due to errors or interruption.") - -# return results - -# def _update_progress(self, pbar, nresults, ntotal): -# if self.compare_results: -# pct = round(100 * nresults / ntotal, 1) if ntotal else 0 -# pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({pct}%)") -# else: -# pbar.set_description(f"Processed {nresults} / {ntotal} examples") -# pbar.update() + pbar.update() \ No newline at end of file From 45d0b773b28aa54668f138a32127c173d14309b4 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Wed, 16 Apr 2025 14:28:42 -0400 Subject: [PATCH 09/18] wip --- dspy/teleprompt/__init__.py | 2 + dspy/teleprompt/simba.py | 58 +++-- dspy/teleprompt/simba_fast.py | 385 ++++++++++++++++++++++++++++++++++ 3 files changed, 414 insertions(+), 31 deletions(-) create mode 100644 dspy/teleprompt/simba_fast.py diff --git a/dspy/teleprompt/__init__.py b/dspy/teleprompt/__init__.py index 2fefec2f21..947bf55de1 100644 --- a/dspy/teleprompt/__init__.py +++ b/dspy/teleprompt/__init__.py @@ -7,6 +7,7 @@ from dspy.teleprompt.knn_fewshot import KNNFewShot from dspy.teleprompt.simba import SIMBA from dspy.teleprompt.simba_fast import SIMBAFast +from 
dspy.teleprompt.simba_bayesian import SIMBABayesian from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2 from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch @@ -31,4 +32,5 @@ "InferRules", "SIMBA", "SIMBAFast", + "SIMBABayesian", ] diff --git a/dspy/teleprompt/simba.py b/dspy/teleprompt/simba.py index a8b5ccec47..1452558667 100644 --- a/dspy/teleprompt/simba.py +++ b/dspy/teleprompt/simba.py @@ -118,6 +118,14 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): rng.shuffle(data_indices) instance_idx = 0 + M = self.max_steps - 1 + N = self.num_candidates + 1 + program_idxs = [0] * N if M < 1 else [round(i * M / (N - 1)) for i in range(N)] + program_idxs = list(dict.fromkeys(program_idxs)) + + final_candidate_programs = [] + final_candidate_scores = [] + # Parallel runner run_parallel = dspy.Parallel(access_examples=False, num_threads=self.num_threads) @@ -291,42 +299,30 @@ def register_new_program(prog: dspy.Module, score_list: list[float]): sys_scores = [outputs[i]["score"] for i in range(start, end)] register_new_program(cand_sys, sys_scores) + # STEP 8: If it's time for a full evaluation, evaluate the winning program on the full trainset + if batch_idx in program_idxs: + logger.info(f"Batch {batch_idx+1}: Evaluating winning program on full trainset.") + exec_pairs = [(wrap_program(best_program, self.metric), ex) for ex in trainset] + full_outputs = run_parallel(exec_pairs) + scores = [o["score"] for o in full_outputs] + avg_score = sum(scores) / len(scores) + trial_logs[batch_idx]["train_score"] = avg_score + + final_candidate_programs.append(best_program.deepcopy()) + final_candidate_scores.append(avg_score) + log_token_usage(trial_logs, batch_idx, {"lm": dspy.settings.lm}) - M = len(winning_programs) - 1 - N = self.num_candidates + 1 - if M < 1: - # Only one or zero winning programs - program_idxs = [0] * N - else: - program_idxs = [round(i * M / (N - 1)) for i in range(N)] - program_idxs = list(dict.fromkeys(program_idxs)) - - candidate_programs = [winning_programs[i].deepcopy() for i in program_idxs] - logger.info(f"VALIDATION: Evaluating {len(candidate_programs)} programs on the full trainset.") - exec_pairs = [(wrap_program(sys, self.metric), ex) for sys in candidate_programs for ex in trainset] - outputs = run_parallel(exec_pairs) - - scores = [] - for idx_prog, prog in enumerate(candidate_programs): - start = idx_prog * len(trainset) - end = (idx_prog + 1) * len(trainset) - sys_scores = [outputs[i]["score"] for i in range(start, end)] - avg_score = sum(sys_scores) / len(sys_scores) if sys_scores else 0.0 - scores.append(avg_score) - if idx_prog != 0: - trial_logs[idx_prog-1]["train_score"] = avg_score - - best_idx = scores.index(max(scores)) if scores else 0 - best_program = candidate_programs[best_idx] + best_idx = np.argmax(final_candidate_scores) if final_candidate_scores else 0 + # best_idx = scores.index(max(final_candidate_scores)) if final_candidate_scores else 0 + best_program = final_candidate_programs[best_idx] logger.info( - f"Final trainset scores: {scores}, Best: {max(scores) if scores else 'N/A'} " - f"(at index {best_idx if scores else 'N/A'})\n\n\n" + f"Final trainset scores: {final_candidate_scores}, Best: {max(final_candidate_scores) if final_candidate_scores else 'N/A'} " + f"(at index {best_idx if final_candidate_scores else 'N/A'})\n\n\n" ) - # FIXME: Attach all program candidates in decreasing average score to the best program. 
- best_program.candidate_programs = candidate_programs + best_program.candidate_programs = final_candidate_programs best_program.winning_programs = winning_programs best_program.trial_logs = trial_logs - return best_program + return best_program \ No newline at end of file diff --git a/dspy/teleprompt/simba_fast.py b/dspy/teleprompt/simba_fast.py new file mode 100644 index 0000000000..441775735c --- /dev/null +++ b/dspy/teleprompt/simba_fast.py @@ -0,0 +1,385 @@ +import dspy +import random +import logging + +import numpy as np +from typing import Callable +from dspy.teleprompt.teleprompt import Teleprompter +from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule, summarize_batch +from dspy.teleprompt.utils import log_token_usage + +logger = logging.getLogger(__name__) + + +# Stochastic Introspective Mini-Batch Ascent +class SIMBAFast(Teleprompter): + def __init__( + self, + *, + metric: Callable, + bsize=32, + num_candidates=6, + max_steps=8, + max_demos=4, + demo_input_field_maxlen=100_000, + num_threads=16, + temperature_for_sampling=0.2, + temperature_for_candidates=0.2, + ): + """ + :param metric: A function (Example, prediction_dict) -> float + :param bsize: mini-batch size + :param num_candidates: how many new candidate programs to produce per iteration + :param max_steps: how many optimization steps to run + :param max_demos: how many demos we allow a predictor to hold before we must drop some + :param demo_input_field_maxlen: how many characters of an input field to keep when building a new demo + :param num_threads: how many threads for run_parallel + :param temperature_for_sampling: temperature used for picking programs for the trajectory-sampling step + :param temperature_for_candidates: temperature used for picking the source program for building new candidates + """ + self.metric = metric + self.bsize = bsize + self.num_candidates = num_candidates + self.max_steps = max_steps + self.max_demos = max_demos + self.demo_input_field_maxlen = demo_input_field_maxlen + self.num_threads = num_threads + + self.temperature_for_sampling = temperature_for_sampling + self.temperature_for_candidates = temperature_for_candidates + + if self.max_demos > 0: + self.strategies = [append_a_demo(demo_input_field_maxlen), append_a_rule] + else: + self.strategies = [append_a_rule] + + def compile(self, student: dspy.Module, *, trainset: list[dspy.Example], seed: int = 0): + # Basic checks + assert len(trainset) >= self.bsize, f"Trainset too small: {len(trainset)} < {self.bsize}" + + # Initialize RNG + rng = random.Random(seed) + rng_np = np.random.default_rng(seed) + + programs = [] + program_scores = {} + program_batch_idx = {} + next_program_idx = 0 + batch_idx_to_baseline_scores = {} + + # Helper functions + def calc_average_score(prog_idx: int) -> float: + scores = program_scores.get(prog_idx, []) + if not scores: + return 0.0 + return sum(scores) / len(scores) + + def calc_average_adjusted_score(prog_idx: int) -> float: + prog_scores = program_scores.get(prog_idx, []) + baseline_scores = batch_idx_to_baseline_scores.get(program_batch_idx[prog_idx], []) + + # If either list is empty or not the same length, return 0 or handle how you prefer + if not prog_scores or not baseline_scores: + return 0.0 + if len(prog_scores) != len(baseline_scores): + # You can decide how you want to handle mismatch + return 0.0 + + # Elementwise subtraction + adjusted_scores = [p - b for p, b in zip(prog_scores, baseline_scores)] + return sum(adjusted_scores) 
/ len(adjusted_scores) + + def adjusted_top_k_plus_baseline(k: int) -> list[int]: + # Sort all programs by descending average score + scored_programs = sorted(programs, key=lambda p: calc_average_adjusted_score(p.simba_idx), reverse=True) + top_k = [p.simba_idx for p in scored_programs[:k]] + # Ensure baseline=0 is in there: + if 0 not in top_k and len(top_k) > 0: + top_k[-1] = 0 + return list(dict.fromkeys(top_k)) + + def top_k_plus_baseline(k: int) -> list[int]: + # Sort all programs by descending average score + scored_programs = sorted(programs, key=lambda p: calc_average_score(p.simba_idx), reverse=True) + top_k = [p.simba_idx for p in scored_programs[:k]] + # Ensure baseline=0 is in there: + if 0 not in top_k and len(top_k) > 0: + top_k[-1] = 0 + return list(dict.fromkeys(top_k)) + + def softmax_sample(rng_obj: random.Random, program_idxs: list[int], temperature: float) -> int: + if not program_idxs: + raise ValueError("No programs available for softmax sampling.") + + # Unnormalized weights + scores = [calc_average_score(idx) for idx in program_idxs] + exps = [np.exp(s / temperature) for s in scores] + sum_exps = sum(exps) + if sum_exps <= 0: + # Fallback: uniform if all exps are zero + return rng_obj.choice(program_idxs) + + # Weighted random choice + probs = [val / sum_exps for val in exps] + return rng_obj.choices(program_idxs, weights=probs, k=1)[0] + + def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: int): + nonlocal next_program_idx + next_program_idx += 1 + new_idx = next_program_idx + prog.simba_idx = new_idx + programs.append(prog) + program_scores[new_idx] = score_list + program_batch_idx[new_idx] = batch_idx + + # Initialize the baseline program: index=0 + student = student.deepcopy() + student.simba_idx = 0 + programs.append(student) + program_scores[0] = [] + program_batch_idx[0] = 0 + + winning_programs = [(0,student)] + + # Data shuffling + data_indices = list(range(len(trainset))) + rng.shuffle(data_indices) + instance_idx = 0 + + # Parallel runner + run_parallel = dspy.Parallel(access_examples=False, num_threads=self.num_threads) + + trial_logs = {} + + # Initialize for hybrid execution reuse + last_batch_outputs = None + + predictor2name = {} + + M = self.max_steps - 1 + N = self.num_candidates + 1 + program_idxs = [0] * N if M < 1 else [round(i * M / (N - 1)) for i in range(N)] + program_idxs = list(dict.fromkeys(program_idxs)) + + final_candidate_programs = [] + final_candidate_scores = [] + validated_program_outputs = {} # {prog_idx: {example_idx: output_dict}} + + # Compute baseline student score on the full trainset + logger.info(f"Evaluating student program on full trainset.") + exec_pairs = [(wrap_program(student, self.metric), ex) for ex in trainset] + full_outputs = run_parallel(exec_pairs) + baseline_scores = [o["score"] for o in full_outputs] + + for batch_idx in range(self.max_steps): + trial_logs[batch_idx+1] = {} + + logger.info(f"Starting batch {batch_idx+1} of {self.max_steps}.") + + # STEP 1: Get next batch + if instance_idx + self.bsize > len(trainset): + rng.shuffle(data_indices) + instance_idx = 0 + + batch_indices = data_indices[instance_idx : instance_idx + self.bsize] + batch = [trainset[i] for i in batch_indices] + instance_idx += self.bsize + + # Compute student baseline on batch + batch_idx_to_baseline_scores[batch_idx] = [score for i, score in enumerate(baseline_scores) if i in batch_indices] + + # STEP 2 (or hybrid): Collect execution results for bucket building + models = 
prepare_models_for_resampling(programs[0], self.num_candidates) + top_programs = top_k_plus_baseline(self.num_candidates) + + exec_pairs = [] + + if batch_idx == 0: + # First round — use full trajectory sampling + for model in models: + for example in batch: + chosen_prog_idx = softmax_sample(rng, top_programs, self.temperature_for_sampling) + candidate_system = programs[chosen_prog_idx].deepcopy() + candidate_system.set_lm(model) + + for name, predictor in candidate_system.named_predictors(): + predictor2name[id(predictor)] = name + + wrapped_candidate_system = wrap_program(candidate_system, self.metric) + exec_pairs.append((wrapped_candidate_system, example)) + + logger.info(f"Sampling program trajectories on {self.bsize} examples x {self.num_candidates} samples.") + outputs = run_parallel(exec_pairs) + else: + outputs = last_batch_outputs.copy() if last_batch_outputs else [] + for prog_idx, prog_cache in validated_program_outputs.items(): + for i in batch_indices: + if i in prog_cache: + outputs.append(prog_cache[i]) + + dspy.settings.lm.history.extend([entry for model in models for entry in model.history]) + + # STEP 3: Sort the training buckets by (max-to-min gap, max score, and max-to-avg gap). + buckets = [] + largest_max_to_avg_gap = float("-inf") + batch_10th_percentile_score = np.percentile([float(o["score"]) for o in outputs], 10) + batch_90th_percentile_score = np.percentile([float(o["score"]) for o in outputs], 90) + + # We'll chunk `outputs` by example index, each chunk has length = num_candidates + for idx, example in enumerate(batch): + # gather all results for this example + bucket = [outputs[i] for i in range(idx, len(outputs), self.bsize)] + bucket.sort(key=lambda x: x["score"], reverse=True) + + max_score = float(bucket[0]["score"]) + min_score = float(bucket[-1]["score"]) + avg_score = sum(x["score"] for x in bucket) / len(bucket) + max_to_min_gap = max_score - min_score + max_to_avg_gap = max_score - avg_score + if max_to_avg_gap > largest_max_to_avg_gap: + largest_max_to_avg_gap = max_to_avg_gap + + buckets.append((bucket, (max_to_min_gap, max_score, max_to_avg_gap))) + + # sort the buckets + buckets.sort(key=lambda x: x[1], reverse=True) + # TODO: if all buckets mave a max_to_min gap of 0 and max score <1.0, then we should do more trajectory sampling + + # Baseline for the batch is just the average of all runs + all_scores_in_this_batch = [o["score"] for o in outputs] + baseline_score = sum(all_scores_in_this_batch) / len(all_scores_in_this_batch) + logger.info(f"Batch {batch_idx+1}: Baseline mini-batch score: {baseline_score}\n") + + # summarize_batch([bucket[0] for bucket in buckets]) + # STEP 4: Build new candidate programs by applying a strategy to some top buckets. + system_candidates = [] + for bucket_idx, (bucket, bucket_stats) in enumerate(buckets): + max_to_min_gap, max_score, max_to_avg_gap = bucket_stats + logger.info( + f"Batch {batch_idx+1}: Processing bucket #{bucket_idx+1}, with max score {max_score}, " + f"max-to-min gap {max_to_min_gap}, and max-to-avg gap {max_to_avg_gap}." 
+ ) + + # pick source program + src_prog_idx = softmax_sample( + rng, top_k_plus_baseline(self.num_candidates), self.temperature_for_candidates + ) + system_candidate = programs[src_prog_idx].deepcopy() + + # Drop some demos from each predictor + name2predictor = {} + num_demos_list = [] + + max_demos_tmp = self.max_demos if self.max_demos > 0 else 3 + + for name, predictor in system_candidate.named_predictors(): + name2predictor[name] = predictor + num_demos_list.append(len(predictor.demos)) + + num_demos = max(num_demos_list) if num_demos_list else 0 + num_demos_to_drop = max(rng_np.poisson(num_demos / max_demos_tmp), int(num_demos >= max_demos_tmp)) + num_demos_to_drop = min(num_demos_to_drop, num_demos) + demos_to_drop = [rng.randrange(num_demos) for _ in range(num_demos_to_drop)] + + for name, predictor in name2predictor.items(): + predictor.demos = [demo for idxd, demo in enumerate(predictor.demos) if idxd not in demos_to_drop] + + # Pick a strategy + strategy = rng.choice(self.strategies) + logger.info( + f"Batch {batch_idx+1}: Invoking strategy: {strategy.__name__}" + + (f", having dropped {num_demos_to_drop} demos per predictor" if num_demos_to_drop else "") + ) + + for name, predictor in system_candidate.named_predictors(): + predictor2name[id(predictor)] = name + + try: + strategy( + bucket, + system_candidate, + predictor2name=predictor2name, + name2predictor=name2predictor, + batch_10p_score=batch_10th_percentile_score, + batch_90p_score=batch_90th_percentile_score, + ) + except Exception as e: + logger.error(f"Strategy failed with error: {e}") + continue + + system_candidates.append(system_candidate) + logger.info("\n") + + if len(system_candidates) >= self.num_candidates + 1: + break + + # STEP 5: Evaluate these new system_candidates on the same mini-batch + logger.info(f"Batch {batch_idx+1}: Evaluating {len(system_candidates)} programs on {self.bsize} examples.") + + exec_pairs = [(wrap_program(sys, self.metric), ex) for sys in system_candidates for ex in batch] + outputs = run_parallel(exec_pairs) + assert len(outputs) == len(exec_pairs) == len(system_candidates) * self.bsize + + # STEP 6: Compute average mini-batch scores for each new candidate + candidate_scores = [] + for idx_cand, cand_sys in enumerate(system_candidates): + start = idx_cand * self.bsize + end = (idx_cand + 1) * self.bsize + sys_scores = [outputs[i]["score"] for i in range(start, end)] + avg_sys_score = sum(sys_scores) / len(sys_scores) + candidate_scores.append(avg_sys_score) + + logger.info( + f"Scores after {batch_idx+1} batches: {candidate_scores}, " + f"Best: {max(candidate_scores) if candidate_scores else 'N/A'}\n" + ) + + trial_logs[batch_idx+1]["batch_scores"] = candidate_scores + + # STEP 7: Select the best among these new ones for "winning" record + if candidate_scores: + best_idx_among_candidates = candidate_scores.index(max(candidate_scores)) + best_program = system_candidates[best_idx_among_candidates] + winning_programs.append((batch_idx+1, best_program.deepcopy())) + + # STEP 8: If it's time for a full evaluation, evaluate the winning program on the full trainset + if batch_idx in program_idxs: + logger.info(f"Batch {batch_idx+1}: Evaluating winning program on full trainset.") + exec_pairs = [(wrap_program(best_program, self.metric), ex) for ex in trainset] + full_outputs = run_parallel(exec_pairs) + scores = [o["score"] for o in full_outputs] + avg_score = sum(scores) / len(scores) + trial_logs[batch_idx + 1]["train_score"] = avg_score + + 
final_candidate_programs.append(best_program.deepcopy()) + final_candidate_scores.append(avg_score) + + prog_cache = {i: out for i, out in enumerate(full_outputs)} + validated_program_outputs[best_program.simba_idx] = prog_cache + + # STEP 9: Register all new candidate systems in our global pool + for idx_cand, cand_sys in enumerate(system_candidates): + start = idx_cand * self.bsize + end = (idx_cand + 1) * self.bsize + sys_scores = [outputs[i]["score"] for i in range(start, end)] + register_new_program(cand_sys, sys_scores, batch_idx) + + # Save for hybrid bucket building next round + last_batch_outputs = outputs.copy() + + log_token_usage(trial_logs, batch_idx+1, {"lm": dspy.settings.lm}) + + + best_idx = np.argmax(final_candidate_scores) if final_candidate_scores else 0 + # best_idx = scores.index(max(final_candidate_scores)) if final_candidate_scores else 0 + best_program = final_candidate_programs[best_idx] + logger.info( + f"Final trainset scores: {final_candidate_scores}, Best: {max(final_candidate_scores) if final_candidate_scores else 'N/A'} " + f"(at index {best_idx if final_candidate_scores else 'N/A'})\n\n\n" + ) + # FIXME: Attach all program candidates in decreasing average score to the best program. + best_program.candidate_programs = final_candidate_programs + best_program.winning_programs = winning_programs + best_program.trial_logs = trial_logs + + return best_program From f795a3d05926475e4a1d66e84e391bf1baa11d37 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Fri, 2 May 2025 14:13:30 -0400 Subject: [PATCH 10/18] wip --- dspy/teleprompt/simba_fast.py | 16 +++++++++++----- dspy/utils/parallelizer.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dspy/teleprompt/simba_fast.py b/dspy/teleprompt/simba_fast.py index 441775735c..341fb12d45 100644 --- a/dspy/teleprompt/simba_fast.py +++ b/dspy/teleprompt/simba_fast.py @@ -147,6 +147,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: instance_idx = 0 # Parallel runner + print("Creating parallel runner with num_threads: ", self.num_threads) run_parallel = dspy.Parallel(access_examples=False, num_threads=self.num_threads) trial_logs = {} @@ -161,16 +162,20 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: program_idxs = [0] * N if M < 1 else [round(i * M / (N - 1)) for i in range(N)] program_idxs = list(dict.fromkeys(program_idxs)) - final_candidate_programs = [] - final_candidate_scores = [] - validated_program_outputs = {} # {prog_idx: {example_idx: output_dict}} - # Compute baseline student score on the full trainset logger.info(f"Evaluating student program on full trainset.") exec_pairs = [(wrap_program(student, self.metric), ex) for ex in trainset] full_outputs = run_parallel(exec_pairs) baseline_scores = [o["score"] for o in full_outputs] + # Compute average score for the baseline program + avg_baseline_score = sum(baseline_scores) / len(baseline_scores) + logger.info(f"Baseline program (index 0) score: {avg_baseline_score}\n") + + final_candidate_programs = [student] + final_candidate_scores = [avg_baseline_score] + validated_program_outputs = {} # {prog_idx: {example_idx: output_dict}} + for batch_idx in range(self.max_steps): trial_logs[batch_idx+1] = {} @@ -310,7 +315,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: system_candidates.append(system_candidate) logger.info("\n") - if len(system_candidates) >= self.num_candidates + 1: + if len(system_candidates) >= self.num_candidates: break # 
STEP 5: Evaluate these new system_candidates on the same mini-batch @@ -349,6 +354,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: full_outputs = run_parallel(exec_pairs) scores = [o["score"] for o in full_outputs] avg_score = sum(scores) / len(scores) + print(f"Batch {batch_idx+1}: Full trainset score: {avg_score}") trial_logs[batch_idx + 1]["train_score"] = avg_score final_candidate_programs.append(best_program.deepcopy()) diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 1607db0477..1452c122a4 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -114,7 +114,7 @@ def handler(sig, frame): signal.signal(signal.SIGINT, orig_handler) else: yield - + print(f"num_threads: {self.num_threads}") executor = ThreadPoolExecutor(max_workers=self.num_threads) try: with interrupt_manager(): From 7ded758c83a005a8140ade739d0730488b12ffd7 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Sun, 25 May 2025 11:09:58 -0400 Subject: [PATCH 11/18] relaxing openai version --- dspy/utils/parallelizer.py | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 1452c122a4..1607db0477 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -114,7 +114,7 @@ def handler(sig, frame): signal.signal(signal.SIGINT, orig_handler) else: yield - print(f"num_threads: {self.num_threads}") + executor = ThreadPoolExecutor(max_workers=self.num_threads) try: with interrupt_manager(): diff --git a/pyproject.toml b/pyproject.toml index 4fb32fd719..09c9ba472c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ dependencies = [ "backoff>=2.2", "joblib~=1.3", - "openai>=0.28.1,<=1.61.0", + "openai>=0.28.1", "pandas>=2.1.1", "regex>=2023.10.3", "ujson>=5.8.0", @@ -103,7 +103,7 @@ python = ">=3.9,<3.13" pydantic = "^2.0" backoff = "^2.2" joblib = "^1.3" -openai = ">=0.28.1,<=1.61.0" +openai = ">=0.28.1" pandas = "^2.1.1" regex = "^2023.10.3" ujson = "^5.8.0" From f1a89a72f86a8191aa6c157c7043db5f48c55eae Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Sun, 25 May 2025 11:21:42 -0400 Subject: [PATCH 12/18] removing unused imports --- dspy/teleprompt/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dspy/teleprompt/__init__.py b/dspy/teleprompt/__init__.py index 947bf55de1..2fefec2f21 100644 --- a/dspy/teleprompt/__init__.py +++ b/dspy/teleprompt/__init__.py @@ -7,7 +7,6 @@ from dspy.teleprompt.knn_fewshot import KNNFewShot from dspy.teleprompt.simba import SIMBA from dspy.teleprompt.simba_fast import SIMBAFast -from dspy.teleprompt.simba_bayesian import SIMBABayesian from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2 from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch @@ -32,5 +31,4 @@ "InferRules", "SIMBA", "SIMBAFast", - "SIMBABayesian", ] From 8f0d71d477bd14a83efd13a724ed6a79f5b25a65 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Sun, 25 May 2025 11:33:13 -0400 Subject: [PATCH 13/18] updates to simba fast to allow for teacher / prompt opt models --- dspy/teleprompt/simba_fast.py | 15 +++-- dspy/teleprompt/simba_utils.py | 116 ++++++++++++++------------------- 2 files changed, 58 insertions(+), 73 deletions(-) diff --git a/dspy/teleprompt/simba_fast.py b/dspy/teleprompt/simba_fast.py index 341fb12d45..425411da3e 100644 --- a/dspy/teleprompt/simba_fast.py +++ b/dspy/teleprompt/simba_fast.py @@ -3,9 +3,9 @@ import logging import numpy as np -from 
typing import Callable +from typing import Callable, Optional, Any, Dict from dspy.teleprompt.teleprompt import Teleprompter -from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule, summarize_batch +from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule from dspy.teleprompt.utils import log_token_usage logger = logging.getLogger(__name__) @@ -21,6 +21,8 @@ def __init__( num_candidates=6, max_steps=8, max_demos=4, + prompt_model: Optional[Any] = None, + teacher_settings: Optional[Dict] = None, demo_input_field_maxlen=100_000, num_threads=16, temperature_for_sampling=0.2, @@ -42,6 +44,8 @@ def __init__( self.num_candidates = num_candidates self.max_steps = max_steps self.max_demos = max_demos + self.prompt_model = prompt_model if prompt_model else dspy.settings.lm + self.teacher_settings = teacher_settings if teacher_settings else {} self.demo_input_field_maxlen = demo_input_field_maxlen self.num_threads = num_threads @@ -147,7 +151,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: instance_idx = 0 # Parallel runner - print("Creating parallel runner with num_threads: ", self.num_threads) + logger.info(f"Creating parallel runner with num_threads: {self.num_threads}") run_parallel = dspy.Parallel(access_examples=False, num_threads=self.num_threads) trial_logs = {} @@ -194,7 +198,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: batch_idx_to_baseline_scores[batch_idx] = [score for i, score in enumerate(baseline_scores) if i in batch_indices] # STEP 2 (or hybrid): Collect execution results for bucket building - models = prepare_models_for_resampling(programs[0], self.num_candidates) + models = prepare_models_for_resampling(programs[0], self.num_candidates, self.teacher_settings) top_programs = top_k_plus_baseline(self.num_candidates) exec_pairs = [] @@ -307,6 +311,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: name2predictor=name2predictor, batch_10p_score=batch_10th_percentile_score, batch_90p_score=batch_90th_percentile_score, + prompt_model=self.prompt_model, ) except Exception as e: logger.error(f"Strategy failed with error: {e}") @@ -354,7 +359,7 @@ def register_new_program(prog: dspy.Module, score_list: list[float], batch_idx: full_outputs = run_parallel(exec_pairs) scores = [o["score"] for o in full_outputs] avg_score = sum(scores) / len(scores) - print(f"Batch {batch_idx+1}: Full trainset score: {avg_score}") + logger.info(f"Batch {batch_idx+1}: Full trainset score: {avg_score}") trial_logs[batch_idx + 1]["train_score"] = avg_score final_candidate_programs.append(best_program.deepcopy()) diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py index 3a215986cd..5d609a628b 100644 --- a/dspy/teleprompt/simba_utils.py +++ b/dspy/teleprompt/simba_utils.py @@ -3,28 +3,49 @@ import inspect import logging import textwrap +import re -from dspy.adapters.chat_adapter import enumerate_fields +from dspy.adapters.utils import get_field_description_string from dspy.signatures import InputField, OutputField -from typing import Callable +from typing import Callable, Optional, Dict, Any logger = logging.getLogger(__name__) +def prepare_models_for_resampling(program: dspy.Module, n: int, teacher_settings: Optional[Dict] = None): + + models = [] + if teacher_settings: + with dspy.settings.context(trace=[], **teacher_settings): + lm = dspy.settings.lm + 
models.append(lm) -def prepare_models_for_resampling(program: dspy.Module, n: int): lm = program.get_lm() or dspy.settings.lm - temps = [lm.kwargs["temperature"]] + [0.5 + i * (0.5 / n) for i in range(n)] - temps = list(dict.fromkeys(temps))[:n] - return [lm.copy(temperature=t) for t in temps] + + # Check to see if our model is a reasoning model, which means temp must stay as 1.0 + model_family = lm.model.split("/")[-1].lower() if "/" in lm.model else lm.model.lower() + model_pattern = re.match(r"^o([13])(?:-mini)?", model_family) + + if model_pattern: # Vary the seed + start_seed = 0 if "seed" not in lm.kwargs else lm.kwargs["seed"] + seeds = [start_seed + 1 + i for i in range(n-len(models))] + seeds = list(dict.fromkeys(seeds))[:(n-len(models))] + models.extend([lm.copy(seed=seed) for seed in seeds]) + else: # Vary the temperature + start_temp = 0 if "temperature" not in lm.kwargs else lm.kwargs["temperature"] + temps = [start_temp + 0.5 + i * (0.5 / n) for i in range(n-len(models))] + temps = list(dict.fromkeys(temps))[:(n-len(models))] + models.extend([lm.copy(temperature=t) for t in temps]) + + return models def wrap_program(program: dspy.Module, metric: Callable): def wrapped_program(example): with dspy.context(trace=[]): - prediction, trace = None, None + prediction, trace, score = None, None, 0.0 try: prediction = program(**example.inputs()) except Exception as e: - print(e) + logger.info(e) trace = dspy.settings.trace.copy() output = None @@ -36,13 +57,15 @@ def wrapped_program(example): if isinstance(output, (int, float)): score = output elif isinstance(output, dspy.Prediction): + if not hasattr(output, 'score'): + raise ValueError("dspy.Prediction must contain a 'score' attribute") score = output.score # Just extract fields from _store, excluding 'score' output_metadata = { k: v for k, v in output._store.items() if k != "score" } except Exception as e: - print(e) + logger.info(e) return { "prediction": prediction, @@ -54,67 +77,20 @@ def wrapped_program(example): return wrapped_program - -class SummarizeBucket(dspy.Signature): - """ - Given sets of LLM trajectories on examples in a task, generate an updated summary of the common failure modes across examples. - - The goal is that we can generate a summary that will make it easy for us to re-write the prompt or specifically specify rules that will help the system improve on this task. - - You'll be given an existing summary of example failure modes that has been generated by looking at previous sets of examples. Please update this summary with your additional insights (if any). - - Please make sure to include specific examples from the input that you think are particularly illustrative of the failure modes you're summarizing. If these are also included in the existing summary, please make sure that this information is not dropped in the updated summary. 
- """ - current_summary = dspy.InputField(desc="The current summary of common failure modes across examples") - # inconsistencies_summary = dspy.InputField(desc="The current summary of potential inconsistencies in the evaluation / areas of ambiguity in the evaluation") - examples = dspy.InputField(desc="The examples to analyze and use to update the summaries with additional insights") - updated_summary = dspy.OutputField(desc="The updated summary of common failure modes across examples") - # updated_inconsistencies_summary = dspy.OutputField(desc="The updated summary of potential inconsistencies in the evaluation / areas of ambiguity in the evaluation") - -def format_examples(buckets): - formatted_examples = "" - for i, bucket in enumerate(buckets): - formatted_examples += f"Example {i+1}:\n" - good, bad = bucket[0], bucket[-1] - example = good["example"] - program_inputs={**example.inputs()} - formatted_examples += f"Inputs: {ujson.dumps(program_inputs, indent=2)}\n" - formatted_examples += f"Bad Prediction: {ujson.dumps(bad['prediction'], indent=2)}\n" - formatted_examples += f"Bad Score: {bad['score']}\n" - formatted_examples += f"Bad Metadata: {ujson.dumps(bad['output_metadata'], indent=2)}\n" - formatted_examples += f"Good Prediction: {ujson.dumps(good['prediction'], indent=2)}\n" - formatted_examples += f"Good Score: {good['score']}\n" - formatted_examples += f"Good Metadata: {ujson.dumps(good['output_metadata'], indent=2)}\n" - formatted_examples += "\n\n" - return formatted_examples - -def summarize_batch(buckets, summary_window=3, max_examples=12): - # TODO: note that we could use a longer context window here - # TODO: we should autocalc the summary window based on what will fit in our model - - summary = "N/A" - inconsistencies_summary = "N/A" - - breakpoint() - - bucket_subset = buckets[:max_examples] - for i in range(0, len(bucket_subset), summary_window): - formatted_examples = format_examples(buckets[i:i+summary_window]) - summarize_bucket_output = dspy.ChainOfThought(SummarizeBucket)(current_summary=summary, examples=formatted_examples) - summary = summarize_bucket_output.updated_summary - # inconsistencies = summarize_bucket_output.updated_inconsistencies_summary - logger.info(f"Summary for bucket {i}: {summary}") - # logger.info(f"Inconsistencies for bucket {i}: {inconsistencies}") - def append_a_demo(demo_input_field_maxlen): def append_a_demo_(bucket, system, **kwargs): predictor2name, name2predictor = kwargs["predictor2name"], kwargs["name2predictor"] - batch_10p_score, batch_90p_score = kwargs["batch_10p_score"], kwargs["batch_90p_score"] + batch_10p_score = kwargs["batch_10p_score"] + logger.info(f"Appending a demo with max length {demo_input_field_maxlen}") + good = bucket[0] trace = good["trace"] name2demo = {} + # if good["score"] < batch_10p_score: + # logger.info(f"Skipping appending a demo as good score {good['score']} is below the 10th percentile.") + # return False if good["score"] <= batch_10p_score: logger.info(f"Skipping appending a demo as good score {good['score']} is at or below the 10th percentile.") return False @@ -133,15 +109,17 @@ def append_a_demo_(bucket, system, **kwargs): predictor = name2predictor[name] predictor.demos.append(demo) - logger.info(f"Added {len(name2demo)} demos (one each) across all predictors.") + logger.info(f"Added {len(name2demo)} demos (one each) across all predictors. 
Each predictor now has {len(predictor.demos)} demos total.") return True return append_a_demo_ def append_a_rule(bucket, system, **kwargs): + # Read in kwargs predictor2name = kwargs["predictor2name"] batch_10p_score, batch_90p_score = kwargs["batch_10p_score"], kwargs["batch_90p_score"] + prompt_model = kwargs["prompt_model"] or dspy.settings.lm module_names = [name for name, _ in system.named_predictors()] good, bad = bucket[0], bucket[-1] @@ -189,7 +167,10 @@ def append_a_rule(bucket, system, **kwargs): kwargs = {k: v if isinstance(v, str) else ujson.dumps(recursive_mask(v), indent=2) for k, v in kwargs.items()} - advice = dspy.Predict(OfferFeedback)(**kwargs).module_advice + + with dspy.settings.context(trace=[], lm=prompt_model): + advice_program = dspy.Predict(OfferFeedback) + advice = advice_program(**kwargs).module_advice for name, predictor in system.named_predictors(): if name in advice: @@ -239,7 +220,6 @@ class OfferFeedback(dspy.Signature): "like the successful trajectory rather than the lower-scoring trajectory." ) - def inspect_modules(program): separator = "-" * 80 output = [separator] @@ -251,9 +231,9 @@ def inspect_modules(program): output.append(f"Module {name}") output.append("\n\tInput Fields:") - output.append(("\n" + "\t" * 2).join([""] + enumerate_fields(signature.input_fields).splitlines())) + output.append(("\n" + "\t" * 2).join([""] + get_field_description_string(signature.input_fields).splitlines())) output.append("\tOutput Fields:") - output.append(("\n" + "\t" * 2).join([""] + enumerate_fields(signature.output_fields).splitlines())) + output.append(("\n" + "\t" * 2).join([""] + get_field_description_string(signature.output_fields).splitlines())) output.append(f"\tOriginal Instructions: {instructions}") output.append(separator) @@ -279,4 +259,4 @@ def recursive_mask(o): return tuple(recursive_mask(v) for v in o) # Otherwise, replace it with a placeholder string (or use repr(o)). 
     else:
-        return f"<non-serializable: {type(o).__name__}>"
+        return f"<non-serializable: {type(o).__name__}>"
\ No newline at end of file

From 08324eebeaa9441527615ef4d8c5c9adfae68ce6 Mon Sep 17 00:00:00 2001
From: Krista Opsahl-Ong
Date: Sun, 25 May 2025 11:39:22 -0400
Subject: [PATCH 14/18] changed import in simba_utils for compatibility

---
 dspy/teleprompt/simba_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py
index 5d609a628b..32438164bb 100644
--- a/dspy/teleprompt/simba_utils.py
+++ b/dspy/teleprompt/simba_utils.py
@@ -5,7 +5,7 @@
 import textwrap
 import re
 
-from dspy.adapters.utils import get_field_description_string
+from dspy.adapters.chat_adapter import enumerate_fields
 from dspy.signatures import InputField, OutputField
 from typing import Callable, Optional, Dict, Any
 
@@ -231,9 +231,9 @@ def inspect_modules(program):
 
         output.append(f"Module {name}")
         output.append("\n\tInput Fields:")
-        output.append(("\n" + "\t" * 2).join([""] + get_field_description_string(signature.input_fields).splitlines()))
+        output.append(("\n" + "\t" * 2).join([""] + enumerate_fields(signature.input_fields).splitlines()))
         output.append("\tOutput Fields:")
-        output.append(("\n" + "\t" * 2).join([""] + get_field_description_string(signature.output_fields).splitlines()))
+        output.append(("\n" + "\t" * 2).join([""] + enumerate_fields(signature.output_fields).splitlines()))
         output.append(f"\tOriginal Instructions: {instructions}")
         output.append(separator)
 

From 194e62aa780ae4e330a1ce7b915d4846a52c39bf Mon Sep 17 00:00:00 2001
From: Krista Opsahl-Ong
Date: Sun, 25 May 2025 11:56:19 -0400
Subject: [PATCH 15/18] reverting mipro changes

---
 dspy/teleprompt/mipro_optimizer_v2.py | 146 +++++++++++++++-----------
 1 file changed, 84 insertions(+), 62 deletions(-)

diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py
index 741efe875e..711b227f50 100644
--- a/dspy/teleprompt/mipro_optimizer_v2.py
+++ b/dspy/teleprompt/mipro_optimizer_v2.py
@@ -3,11 +3,13 @@
 import textwrap
 from collections import defaultdict
 from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+import select
+import sys
+import time
 
 import numpy as np
 import optuna
 from optuna.distributions import CategoricalDistribution
-
 import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.propose import GroundedProposer
@@ -21,7 +23,6 @@
     print_full_program,
     save_candidate_program,
     set_signature,
-    log_token_usage
 )
 
 logger = logging.getLogger(__name__)
@@ -32,9 +33,9 @@
 MIN_MINIBATCH_SIZE = 50
 
 AUTO_RUN_SETTINGS = {
-    "light": {"num_trials": 7, "val_size": 100},
-    "medium": {"num_trials": 25, "val_size": 500},
-    "heavy": {"num_trials": 40, "val_size": 1000},
+    "light": {"n": 6, "val_size": 100},
+    "medium": {"n": 12, "val_size": 300},
+    "heavy": {"n": 18, "val_size": 1000},
 }
 
 # ANSI escape codes for colors
@@ -54,9 +55,9 @@ def __init__(
         teacher_settings: Dict = {},
         max_bootstrapped_demos: int = 4,
         max_labeled_demos: int = 4,
-        auto: Optional[Literal["light", "medium", "heavy"]] = "medium",
-        num_candidates: int = 10,
-        num_threads: int = 6,
+        auto: Optional[Literal["light", "medium", "heavy"]] = "light",
+        num_candidates: Optional[int] = None,
+        num_threads: Optional[int] = None,
         max_errors: int = 10,
         seed: int = 9,
         init_temperature: float = 0.5,
@@ -70,7 +71,8 @@ def __init__(
         if auto not in allowed_modes:
             raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.")
         self.auto = auto
-
+        self.num_fewshot_candidates = num_candidates
+        self.num_instruct_candidates = num_candidates
         self.num_candidates = num_candidates
         self.metric = metric
         self.init_temperature = init_temperature
@@ -97,7 +99,7 @@ def compile(
         trainset: List,
         teacher: Any = None,
         valset: Optional[List] = None,
-        num_trials: int = 30,
+        num_trials: Optional[int] = None,
         max_bootstrapped_demos: Optional[int] = None,
         max_labeled_demos: Optional[int] = None,
         seed: Optional[int] = None,
@@ -110,8 +112,23 @@ def compile(
         tip_aware_proposer: bool = True,
         fewshot_aware_proposer: bool = True,
         requires_permission_to_run: bool = True,
-        provide_traceback: bool = False,
+        provide_traceback: Optional[bool] = None,
    ) -> Any:
+
+        zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0)
+
+        # If auto is None, and num_trials is not provided (but num_candidates is), raise an error that suggests a good num_trials value
+        if self.auto is None and (self.num_candidates is not None and num_trials is None):
+            raise ValueError(f"If auto is None, num_trials must also be provided. Given num_candidates={self.num_candidates}, we'd recommend setting num_trials to ~{self._set_num_trials_from_num_candidates(student, zeroshot_opt, self.num_candidates)}.")
+
+        # If auto is None, and num_candidates or num_trials is None, raise an error
+        if self.auto is None and (self.num_candidates is None or num_trials is None):
+            raise ValueError("If auto is None, num_candidates must also be provided.")
+
+        # If auto is provided, and either num_candidates or num_trials is not None, raise an error
+        if self.auto is not None and (self.num_candidates is not None or num_trials is not None):
+            raise ValueError("If auto is not None, num_candidates and num_trials cannot be set, since they would be overridden by the auto settings. 
Please either set auto to None, or do not specify num_candidates and num_trials.") + # Set random seeds seed = seed or self.seed self._set_random_seeds(seed) @@ -126,7 +143,6 @@ def compile( trainset, valset = self._set_and_validate_datasets(trainset, valset) # Set hyperparameters based on run mode (if set) - zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) num_trials, valset, minibatch = self._set_hyperparams_from_run_mode( student, num_trials, minibatch, zeroshot_opt, valset ) @@ -202,6 +218,15 @@ def _set_random_seeds(self, seed): self.rng = random.Random(seed) np.random.seed(seed) + def _set_num_trials_from_num_candidates(self, program, zeroshot_opt, num_candidates): + num_vars = len(program.predictors()) + if not zeroshot_opt: + num_vars *= 2 # Account for few-shot examples + instruction variables + # Trials = MAX(c*M*log(N), c=2, 3/2*N) + num_trials = int(max(2 * num_vars * np.log2(num_candidates), 1.5 * num_candidates)) + + return num_trials + def _set_hyperparams_from_run_mode( self, program: Any, @@ -213,16 +238,18 @@ def _set_hyperparams_from_run_mode( if self.auto is None: return num_trials, valset, minibatch - num_vars = len(program.predictors()) - if not zeroshot_opt: - num_vars *= 2 # Account for few-shot examples + instruction variables - auto_settings = AUTO_RUN_SETTINGS[self.auto] - num_trials = auto_settings["num_trials"] + valset = create_minibatch(valset, batch_size=auto_settings["val_size"], rng=self.rng) minibatch = len(valset) > MIN_MINIBATCH_SIZE - # self.num_candidates = int(np.round(np.min([num_trials * num_vars, (1 * num_trials) / num_vars]))) - self.num_candidates = int(np.round(np.min([num_trials * num_vars, (num_trials) / num_vars]))) + + # Set num instruct candidates to 1/2 of N if optimizing with few-shot examples, otherwise set to N + # This is because we've found that it's generally better to spend optimization budget on few-shot examples + # When they are allowed. 
+ self.num_instruct_candidates = auto_settings["n"] if zeroshot_opt else int(auto_settings["n"] * 0.5) + self.num_fewshot_candidates = auto_settings["n"] + + num_trials = self._set_num_trials_from_num_candidates(program, zeroshot_opt, auto_settings["n"]) return num_trials, valset, minibatch @@ -248,7 +275,8 @@ def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: Lis f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:" f"\nnum_trials: {num_trials}" f"\nminibatch: {minibatch}" - f"\nnum_candidates: {self.num_candidates}" + f"\nnum_fewshot_candidates: {self.num_fewshot_candidates}" + f"\nnum_instruct_candidates: {self.num_instruct_candidates}" f"\nvalset size: {len(valset)}\n" ) @@ -267,12 +295,12 @@ def _estimate_lm_calls( # Estimate prompt model calls estimated_prompt_model_calls = ( 10 # Data summarizer calls - + self.num_candidates * num_predictors # Candidate generation + + self.num_instruct_candidates * num_predictors # Candidate generation + (num_predictors + 1 if program_aware_proposer else 0) # Program-aware proposer ) prompt_model_line = ( f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + " - f"{BLUE}{BOLD}{self.num_candidates}{ENDC}{YELLOW} * " + f"{BLUE}{BOLD}{self.num_instruct_candidates}{ENDC}{YELLOW} * " f"{BLUE}{BOLD}{num_predictors}{ENDC}{YELLOW} lm calls in program " f"+ ({BLUE}{BOLD}{num_predictors + 1}{ENDC}{YELLOW}) lm calls in program-aware proposer " f"= {BLUE}{BOLD}{estimated_prompt_model_calls}{ENDC}{YELLOW} prompt model calls{ENDC}" @@ -344,6 +372,7 @@ def _get_user_confirmation( user_confirmation_message = textwrap.dedent( f"""\ To proceed with the execution of this program, please confirm by typing {BLUE}'y'{ENDC} for yes or {BLUE}'n'{ENDC} for no. + If no input is received within 20 seconds, the program will proceed automatically. If you would like to bypass this confirmation step in future executions, set the {YELLOW}`requires_permission_to_run`{ENDC} flag to {YELLOW}`False`{ENDC} when calling compile. @@ -351,10 +380,18 @@ def _get_user_confirmation( """ ) - user_input = ( - input(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ").strip().lower() - ) - return user_input == "y" + print(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ", end='', flush=True) + + # Wait for input with timeout + start_time = time.time() + while time.time() - start_time < 20: + if select.select([sys.stdin], [], [], 0.1)[0]: + user_input = sys.stdin.readline().strip().lower() + return user_input == "y" + time.sleep(0.1) + + print("\nNo input received within 20 seconds. 
Proceeding with execution...") + return True def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, teacher: Any) -> Optional[List]: logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") @@ -365,14 +402,14 @@ def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, t else: logger.info("These will be used for informing instruction proposal.\n") - logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...") + logger.info(f"Bootstrapping N={self.num_fewshot_candidates} sets of demonstrations...") zeroshot = self.max_bootstrapped_demos == 0 and self.max_labeled_demos == 0 try: demo_candidates = create_n_fewshot_demo_sets( student=program, - num_candidate_sets=self.num_candidates, + num_candidate_sets=self.num_fewshot_candidates, trainset=trainset, max_labeled_demos=(LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_labeled_demos), max_bootstrapped_demos=( @@ -426,12 +463,12 @@ def _propose_instructions( rng=self.rng, ) - logger.info("\nProposing instructions...\n") + logger.info(f"\nProposing N={self.num_instruct_candidates} instructions...\n") instruction_candidates = proposer.propose_instructions_for_program( trainset=trainset, program=program, demo_candidates=demo_candidates, - N=self.num_candidates, + N=self.num_instruct_candidates, T=self.init_temperature, trial_logs={}, ) @@ -458,7 +495,6 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: - # Run optimization optuna.logging.set_verbosity(optuna.logging.WARNING) logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") @@ -468,7 +504,7 @@ def _optimize_prompt_parameters( # Compute the adjusted total trials that we will run (including full evals) run_additional_full_eval_at_end = 1 if num_trials % minibatch_full_eval_steps != 0 else 0 - adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1 + run_additional_full_eval_at_end) if minibatch else num_trials + adjusted_num_trials = int((num_trials + num_trials // minibatch_full_eval_steps + 1 + run_additional_full_eval_at_end) if minibatch else num_trials) logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") default_score, _ = eval_candidate_program( @@ -477,12 +513,11 @@ def _optimize_prompt_parameters( logger.info(f"Default program score: {default_score}\n") trial_logs = {} - trial_logs[1] = {} - # trial_logs[1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) + trial_logs[1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) trial_logs[1]["full_eval_score"] = default_score trial_logs[1]["total_eval_calls_so_far"] = len(valset) - # trial_logs[1]["full_eval_program"] = program.deepcopy() + trial_logs[1]["full_eval_program"] = program.deepcopy() # Initialize optimization variables best_score = default_score @@ -589,11 +624,7 @@ def objective(trial): instruction_candidates, demo_candidates, ) - - # Log model token usage at the end of the trial - teacher_model = None if "lm" not in self.teacher_settings else self.teacher_settings["lm"] - log_token_usage(trial_logs, trial_num, model_dict={"prompt_model": self.prompt_model, "teacher_model": teacher_model, "task_model": self.task_model}) - + return score sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True) @@ -646,10 +677,10 @@ def _log_minibatch_eval( candidate_program, total_eval_calls, ): - # trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, 
self.log_dir, trial_num) + trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls - # trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() + trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.") minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]]) @@ -677,12 +708,12 @@ def _log_normal_eval( candidate_program, total_eval_calls, ): - # trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( - # candidate_program, self.log_dir, trial_num - # ) + trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( + candidate_program, self.log_dir, trial_num + ) trial_logs[trial_num]["full_eval_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls - # trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() + trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} with parameters {chosen_params}.") full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) @@ -770,30 +801,21 @@ def _perform_full_evaluation( ) study.add_trial(trial) - # Log full eval as a trial so that optuna can learn from the new results - trial = optuna.trial.create_trial( - params=params, - distributions= self._get_param_distributions(best_program, instruction_candidates, demo_candidates), - value=full_eval_score, - ) - study.add_trial(trial) - # Log full evaluation results fully_evaled_param_combos[combo_key] = { "program": highest_mean_program, "score": full_eval_score, } total_eval_calls += len(valset) - trial_logs[trial_num + 1] = {} trial_logs[trial_num + 1]["total_eval_calls_so_far"] = total_eval_calls - # trial_logs[trial_num + 1]["full_eval_program_path"] = save_candidate_program( - # program=highest_mean_program, - # log_dir=self.log_dir, - # trial_num=trial_num + 1, - # note="full_eval", - # ) - # trial_logs[trial_num + 1]["full_eval_program"] = highest_mean_program + trial_logs[trial_num + 1]["full_eval_program_path"] = save_candidate_program( + program=highest_mean_program, + log_dir=self.log_dir, + trial_num=trial_num + 1, + note="full_eval", + ) + trial_logs[trial_num + 1]["full_eval_program"] = highest_mean_program trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score # Update best score and program if necessary From e671f958dca720eba531bbcade65ad670c0e3894 Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Mon, 26 May 2025 23:00:14 -0400 Subject: [PATCH 16/18] fixing up parse_value in utils --- dspy/adapters/json_adapter.py | 18 ------------------ dspy/adapters/utils.py | 27 ++++++++++++++++++--------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/dspy/adapters/json_adapter.py b/dspy/adapters/json_adapter.py index 5fcfb8b2e1..96597f49fb 100644 --- a/dspy/adapters/json_adapter.py +++ b/dspy/adapters/json_adapter.py @@ -44,24 +44,6 @@ def __call__( inputs = self.format(signature, demos, inputs) inputs = dict(prompt=inputs) if isinstance(inputs, str) else dict(messages=inputs) - # try: - # provider = lm.model.split("/", 1)[0] or "openai" - # params = litellm.get_supported_openai_params(model=lm.model, custom_llm_provider=provider) - # if params and "response_format" in params: - # try: - # response_format = 
_get_structured_outputs_response_format(signature) - # outputs = lm(**inputs, **lm_kwargs, response_format=response_format) - # except Exception as e: - # logger.debug( - # f"Failed to obtain response using signature-based structured outputs" - # f" response format: Falling back to default 'json_object' response format." - # f" Exception: {e}" - # ) - # outputs = lm(**inputs, **lm_kwargs, response_format={"type": "json_object"}) - # else: - # outputs = lm(**inputs, **lm_kwargs) - - # except litellm.UnsupportedParamsError: outputs = lm(**inputs, **lm_kwargs) values = [] diff --git a/dspy/adapters/utils.py b/dspy/adapters/utils.py index 511720cc63..a9b0851685 100644 --- a/dspy/adapters/utils.py +++ b/dspy/adapters/utils.py @@ -87,24 +87,33 @@ def find_enum_member(enum, identifier): def parse_value(value, annotation): + # Handle Optional[T] (i.e., Union[T, None]) and validate Union assumptions + if get_origin(annotation) is Union: + args = get_args(annotation) + non_none_args = [arg for arg in args if arg is not type(None)] - is_optional_str = ( - getattr(annotation, '__origin__', None) is Union and str in get_args(annotation) - ) + if len(non_none_args) == 1: + annotation = non_none_args[0] + else: + raise TypeError( + f"Unsupported Union type: {annotation}. " + f"Expected Optional[T] (i.e., Union[T, None]), but got Union with multiple concrete types: {non_none_args}" + ) - if annotation is str or is_optional_str: - return str(value) if value is not None else None # Ensure string output, preserving None - - # if annotation is str: - # return str(value) + # Handle str + if annotation is str: + return str(value) if value is not None else None + # Handle Enums if isinstance(annotation, enum.EnumMeta): return find_enum_member(annotation, value) + # Validate if input is already the right type if not isinstance(value, str): return TypeAdapter(annotation).validate_python(value) - candidate = json_repair.loads(value) # json_repair.loads returns "" on failure. 
+ # Try to parse string value + candidate = json_repair.loads(value) if candidate == "" and value != "": try: candidate = ast.literal_eval(value) From 0ac18957886ac108084fb10401cdd9ad0fa8127d Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Tue, 27 May 2025 09:49:37 -0400 Subject: [PATCH 17/18] explicitly updating utils.py to handle None values --- dspy/adapters/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/dspy/adapters/utils.py b/dspy/adapters/utils.py index a9b0851685..fb848ee475 100644 --- a/dspy/adapters/utils.py +++ b/dspy/adapters/utils.py @@ -87,6 +87,8 @@ def find_enum_member(enum, identifier): def parse_value(value, annotation): + origin_annotation = annotation + # Handle Optional[T] (i.e., Union[T, None]) and validate Union assumptions if get_origin(annotation) is Union: args = get_args(annotation) @@ -100,9 +102,16 @@ def parse_value(value, annotation): f"Expected Optional[T] (i.e., Union[T, None]), but got Union with multiple concrete types: {non_none_args}" ) + # Explicitly return None if the value is None and the annotation allowed it + if value is None: + if get_origin(origin_annotation) is Union and type(None) in get_args(origin_annotation): + return None + else: + raise TypeError(f"Received None for non-optional annotation: {annotation}") + # Handle str if annotation is str: - return str(value) if value is not None else None + return str(value) # Handle Enums if isinstance(annotation, enum.EnumMeta): From 5992830b7881ed27b3271a29de20431f55b0378f Mon Sep 17 00:00:00 2001 From: Krista Opsahl-Ong Date: Fri, 11 Jul 2025 15:43:42 -0400 Subject: [PATCH 18/18] adding update fields ability --- dspy/teleprompt/simba_fast.py | 5 +- dspy/teleprompt/simba_utils.py | 142 +++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/dspy/teleprompt/simba_fast.py b/dspy/teleprompt/simba_fast.py index 425411da3e..56451b7c20 100644 --- a/dspy/teleprompt/simba_fast.py +++ b/dspy/teleprompt/simba_fast.py @@ -5,7 +5,7 @@ import numpy as np from typing import Callable, Optional, Any, Dict from dspy.teleprompt.teleprompt import Teleprompter -from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule +from dspy.teleprompt.simba_utils import prepare_models_for_resampling, wrap_program, append_a_demo, append_a_rule, update_fields from dspy.teleprompt.utils import log_token_usage logger = logging.getLogger(__name__) @@ -53,7 +53,8 @@ def __init__( self.temperature_for_candidates = temperature_for_candidates if self.max_demos > 0: - self.strategies = [append_a_demo(demo_input_field_maxlen), append_a_rule] + # self.strategies = [append_a_demo(demo_input_field_maxlen), append_a_rule] + self.strategies = [update_fields] else: self.strategies = [append_a_rule] diff --git a/dspy/teleprompt/simba_utils.py b/dspy/teleprompt/simba_utils.py index 32438164bb..b36f12a33f 100644 --- a/dspy/teleprompt/simba_utils.py +++ b/dspy/teleprompt/simba_utils.py @@ -114,6 +114,24 @@ def append_a_demo_(bucket, system, **kwargs): return append_a_demo_ +def get_good_and_bad_examples(bucket): + """Get good and bad examples from bucket + """ + good, bad = bucket[0], bucket[-1] + return good["example"], bad["example"] + +def get_good_and_bad_trajectories(good_example, bad_example, predictor2name): + """Get good and bad trajectories from examples + """ + good_trajectory = [ + dict(module_name=predictor2name[id(p)], inputs=i, outputs=dict(o)) + for p, i, o in good_example["trace"] + ] + 
bad_trajectory = [ + dict(module_name=predictor2name[id(p)], inputs=i, outputs=dict(o)) + for p, i, o in bad_example["trace"] + ] + return good_trajectory, bad_trajectory def append_a_rule(bucket, system, **kwargs): # Read in kwargs @@ -180,6 +198,130 @@ def append_a_rule(bucket, system, **kwargs): return True +def update_fields(bucket, system, **kwargs): + predictor2name = kwargs["predictor2name"] + batch_10p_score, batch_90p_score = kwargs["batch_10p_score"], kwargs["batch_90p_score"] + prompt_model = kwargs["prompt_model"] or dspy.settings.lm + + module_names = [name for name, _ in system.named_predictors()] + good, bad = bucket[0], bucket[-1] + example = good["example"] + + if good["score"] <= batch_10p_score or bad["score"] >= batch_90p_score: + logger.info(f"Skipping rule generation as good score {good['score']} is at or below the 10th percentile " + f"*or* bad score {bad['score']} is at or above the 90th percentile.") + return False + + if good["score"] <= bad["score"]: + if good["score"] > batch_90p_score: + bad["trace"] = [] + bad["score"] = "N/A" + bad["prediction"] = {"N/A": "Prediction not available"} + else: + good["trace"] = [] + good["score"] = "N/A" + good["prediction"] = {"N/A": "Prediction not available"} + + better_trajectory = [ + dict(module_name=predictor2name[id(p)], inputs=i, outputs=dict(o)) + for p, i, o in good["trace"] + ] + worse_trajectory = [ + dict(module_name=predictor2name[id(p)], inputs=i, outputs=dict(o)) + for p, i, o in bad["trace"] + ] + + # Get the current fields + current_fields = {} + for name, predictor in system.named_predictors(): + current_fields[name] = {} + for field_name, field in predictor.signature.input_fields.items(): + current_fields[name][field_name] = {} + current_fields[name][field_name]["name"] = field.json_schema_extra["prefix"] + current_fields[name][field_name]["desc"] = field.json_schema_extra["desc"] + for field_name, field in predictor.signature.output_fields.items(): + current_fields[name][field_name] = {} + current_fields[name][field_name]["name"] = field.json_schema_extra["prefix"] + current_fields[name][field_name]["desc"] = field.json_schema_extra["desc"] + + kwargs = dict( + program_code=inspect.getsource(system.__class__), + modules_defn=inspect_modules(system), + program_inputs={**example.inputs()}, + oracle_metadata={**example.labels()}, + current_fields=current_fields, + better_program_trajectory=better_trajectory, + better_program_outputs=dict(good["prediction"]), + worse_program_trajectory=worse_trajectory, + worse_program_outputs=dict(bad["prediction"] or {}), + worse_reward_value=bad["score"], + better_reward_value=good["score"], + worse_reward_info=bad["output_metadata"], + better_reward_info=good["output_metadata"], + ) + + kwargs = {k: v if isinstance(v, str) else ujson.dumps(recursive_mask(v), indent=2) + for k, v in kwargs.items()} + + # Get new prefixes and descriptions for each field + with dspy.settings.context(trace=[], lm=prompt_model): + update_fields_program = dspy.Predict(UpdateFields) + updated_fields = update_fields_program(**kwargs).updated_fields + + # Set the prefix and description of the fields + for name, predictor in system.named_predictors(): + if name in updated_fields: + for field_name, field in predictor.signature.input_fields.items(): + if field_name in updated_fields[name]: + if "name" in updated_fields[name][field_name]: + field.json_schema_extra["prefix"] = updated_fields[name][field_name]["name"] + if "desc" in updated_fields[name][field_name]: + field.json_schema_extra["desc"] = 
updated_fields[name][field_name]["desc"] + for field_name, field in predictor.signature.output_fields.items(): + if field_name in updated_fields[name]: + if "name" in updated_fields[name][field_name]: + field.json_schema_extra["prefix"] = updated_fields[name][field_name]["name"] + if "desc" in updated_fields[name][field_name]: + field.json_schema_extra["desc"] = updated_fields[name][field_name]["desc"] + + prompt_model.inspect_history(n=1) + print(f"Current fields: {current_fields}") + print(f"Updated fields: {updated_fields}") + + return True + +class UpdateFields(dspy.Signature): + """ + You will be given two trajectories of an LLM-driven program's execution: one that is successful and one that is not. + You will also be provided with the current fields of the program, which are being used to describe the desired inputs and outputs of the program to the LLM. + Your goal is to update the fields of the program to be more accurate and informative to ensure that the program + is able to learn from the successful trajectory and avoid the mistakes of the unsuccessful trajectory. You can update both the name and the description of the fields. + + These fields are important because they are used to provide the LLM with a description of the inputs it will receive, + and the outputs it will produce. + + """ + program_code: str = InputField(desc="The code of the program that we are analyzing") + modules_defn: str = InputField(desc="The definition of each module in the program, including its I/O") + program_inputs: str = InputField(desc="The inputs to the program that we are analyzing") + oracle_metadata: str = InputField(desc="Any (hidden) metadata about the training set instance we're analyzing") + worse_program_trajectory: str = InputField( + desc="The trajectory of the program's execution, showing each module's I/O" + ) + worse_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") + worse_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + worse_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") + better_program_trajectory: str = InputField( + desc="The trajectory of the program's execution, showing each module's I/O" + ) + better_program_outputs: str = InputField(desc="The outputs of the program that we are analyzing") + better_reward_value: float = InputField(desc="The reward value assigned to the program's outputs") + better_reward_info: str = InputField(desc="Additional information that might be helpful to understanding the assigned reward value.") + current_fields: dict[str, dict[str, dict[str, str]]] = InputField(desc="A dictionary of current field names and descriptions for the program.") + discussion: str = OutputField(desc="Discussing blame of where each module went wrong, if it did.") + field_discussion: str = OutputField(desc="Discussing the changes to the fields that should be made for each model in the program.") + updated_fields: dict[str, dict[str, dict[str, str]]] = OutputField(desc="A dictionary of new field names and descriptions for each module in the program. These will be used to update the fields of the program to better clarify expected inputs & outputs to the LLM.") + class OfferFeedback(dspy.Signature): """ You will be given two trajectories of an LLM-driven program's execution. Your goal is to help the program's modules