wip: new results and optimize agent

vsoch · vsoch · commit 8560e71e1880 · 2025-08-22T11:44:13.000-07:00
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
diff --git a/fractale/agent/kubernetes/__init__.py b/fractale/agent/kubernetes/__init__.py
@@ -1,2 +1,3 @@
 from .job import KubernetesJobAgent
-assert KubernetesJobAgent
+
+assert KubernetesJobAgent
diff --git a/fractale/agent/kubernetes/base.py b/fractale/agent/kubernetes/base.py
@@ -1,4 +1,9 @@
 import argparse
+import json
+import subprocess
+
+from rich import print
+from rich.panel import Panel
 from rich.syntax import Syntax
 
 import fractale.agent.logger as logger
@@ -45,7 +50,6 @@ def print_result(self, job_crd):
             highlighted_syntax, title="Final Kubernetes Job", border_style="green", expand=True
         )
 
-
     def save_log(self, full_logs):
         """
         Save logs to metadata
@@ -64,4 +68,58 @@ def save_job_manifest(self, job):
                 self.metadata["assets"][self.result_type] = []
             self.metadata["assets"][self.result_type].append(
                 {"item": job, "attempt": self.attempts}
-            )
+            )
+
+    def cluster_resources(self):
+        """
+        Get cluster resources - count of nodes and resources.
+        I was thinking of caching this, but clusters can change,
+        and it's easy (and inexpensive) enough to query that we repeat.
+
+        Returns a dict with "total_nodes" (number of nodes) and "node_specs"
+        (a list of {"cpu", "memory", "arch", "count"} entries, one per unique
+        allocatable cpu / memory / architecture combination). Returns None
+        if the cluster has no nodes, or if the query or parsing fails.
+        """
+        print("[yellow]Querying Kubernetes cluster for node resources...[/yellow]")
+        try:
+            # Execute the kubectl command
+            result = subprocess.run(
+                ["kubectl", "get", "nodes", "-o", "json"],
+                capture_output=True,
+                text=True,
+                check=True,
+                timeout=30,
+            )
+
+            # Parse the JSON output
+            nodes = json.loads(result.stdout).get("items", [])
+            if not nodes:
+                print("[red]Error: No nodes found in the cluster.[/red]")
+                return None
+
+            # Keep a listing (with count) of node specs
+            # The key is the cpu, memory, and arch, and the value the node count
+            counts = {}
+            for node in nodes:
+                status = node["status"]
+                spec = (
+                    status["allocatable"]["cpu"],
+                    status["allocatable"]["memory"],
+                    status["nodeInfo"]["architecture"],
+                )
+                counts[spec] = counts.get(spec, 0) + 1
+
+        except Exception as e:
+            print(
+                "[bold red]Error executing kubectl command. Do you have access to the cluster?[/bold red]"
+            )
+            # Only subprocess errors (CalledProcessError, TimeoutExpired) carry
+            # a stderr attribute. A missing kubectl binary, invalid JSON, or an
+            # unexpected node layout does not, so fall back to the exception.
+            stderr = getattr(e, "stderr", None)
+            print(f"Stderr: {stderr}" if stderr is not None else f"Error: {e}")
+            return None
+
+        # Ensure we expand the resources
+        node_specs = [
+            {"cpu": cpu, "memory": memory, "arch": arch, "count": count}
+            for (cpu, memory, arch), count in counts.items()
+        ]
+        cluster_info = {"total_nodes": len(nodes), "node_specs": node_specs}
+
+        print("[green]✅ Successfully retrieved cluster information.[/green]")
+        return cluster_info
diff --git a/fractale/agent/kubernetes/job/agent.py b/fractale/agent/kubernetes/job/agent.py
@@ -1,7 +1,5 @@
-import argparse
 import json
 import os
-import re
 import shutil
 import subprocess
 import sys
@@ -11,17 +9,16 @@
 
 import yaml
 from rich import print
-from rich.syntax import Syntax
 
-import fractale.agent.kubernetes.objects as objects
-from fractale.agent.kubernetes.base import KubernetesAgent
 import fractale.agent.kubernetes.job.prompts as prompts
+import fractale.agent.kubernetes.objects as objects
 import fractale.agent.logger as logger
 import fractale.utils as utils
-from fractale.agent.base import GeminiAgent
 from fractale.agent.context import get_context
 from fractale.agent.decorators import timed
 from fractale.agent.errors import DebugAgent
+from fractale.agent.kubernetes.base import KubernetesAgent
+from fractale.agent.optimize import OptimizationAgent
 
 
 class KubernetesJobAgent(KubernetesAgent):
@@ -33,6 +30,13 @@ class KubernetesJobAgent(KubernetesAgent):
     description = "Kubernetes Job agent"
     result_type = "kubernetes-job-manifest"
 
+    def __init__(self, *args, **kwargs):
+        """
+        Add the optimization agent, even if we don't need it.
+
+        All positional and keyword arguments are forwarded unchanged to the
+        parent constructor. A new OptimizationAgent is then created and
+        stored on self.optimize_agent.
+        """
+        super().__init__(*args, **kwargs)
+        self.optimize_agent = OptimizationAgent()
+
     def get_prompt(self, context):
         """
         Get the prompt for the LLM. We expose this so the manager can take it
@@ -139,7 +143,6 @@ def deploy(self, context):
         Deploy the Kubernetes Job.
         """
         job_crd = context.result
-        cleanup = context.get("cleanup", True)
 
         # Not sure if this can happen, assume it can
         if not job_crd:
@@ -320,21 +323,56 @@ def deploy(self, context):
         # But did it succeed?
         if final_status.get("succeeded", 0) > 0:
             print("\n[green]✅ Job final status is Succeeded.[/green]")
+
+            # if we want to optimize, we continue to run until we are instructed not to.
+            if context.get("optimize") is not None:
+
+                # TODO move into own function?
+                # We should provide the cluster resources to the agent
+                resources = self.cluster_resources()
+
+                # The agent calling the optimize agent decides what metadata to present.
+                # This is how this agent will work for cloud vs. bare metal
+                context.requires = prompts.get_optimize_prompt(context, resources)
+                context = self.optimize_agent.run(context, full_logs)
+
+                # Go through spec and update fields that match.
+                decision = context.optimize_result['decision']
+                print(f"\n[green]✅ Optimization agent decided to {decision}.[/green]")
+                if decision == "RETRY":
+                    context.result = self.update_job_crd(context.optimize_result, job_crd)
+                    print(context.result)
+                    return self.deploy(context)
+                
+                # Agent has decided to return - no more optimize.
+                return 0, full_logs
+
         else:
             print("\n[red]❌ Job final status is Failed.[/red]")
             diagnostics = self.get_diagnostics(job, pod)
             job.delete()
             # We already have the logs, so we can pass them directly.
             return 1, prompts.failure_message % diagnostics
 
-        if cleanup and os.path.exists(deploy_dir):
+        if context.get('cleanup') is True and os.path.exists(deploy_dir):
             print(f"[dim]Cleaning up temporary deploy directory: {deploy_dir}[/dim]")
             job.delete()
             shutil.rmtree(deploy_dir, ignore_errors=True)
 
         # Save full logs for the step
         return 0, full_logs
 
+    def update_job_crd(self, updates, job_crd):
+        """
+        Update the job crd with a set of controlled fields.
+
+        The "decision" and "reason" keys are dropped from the updates, and
+        the remaining fields are sent to the LLM together with the current
+        manifest. The caller's updates dict is not modified.
+
+        Returns the yaml code block extracted from the LLM response.
+        """
+        # Filter on a copy so the caller's dict (context.optimize_result) is left intact
+        fields = {k: v for k, v in updates.items() if k not in ("decision", "reason")}
+
+        # The update prompt takes the updates first, and the manifest second
+        prompt = prompts.update_prompt % (json.dumps(fields), job_crd)
+        result = self.ask_gemini(prompt)
+        return self.get_code_block(result, "yaml")
+
     def save_job_manifest(self, job):
         """
         Save job manifest to metadata
diff --git a/fractale/agent/kubernetes/job/prompts.py b/fractale/agent/kubernetes/job/prompts.py
@@ -1,5 +1,6 @@
 import fractale.agent.defaults as defaults
 from fractale.agent.prompts import prompt_wrapper
+import json
 
 # Requirements are separate to give to error helper agent
 # This should explicitly state what the agent is capable of doing.
@@ -29,6 +30,37 @@
 %s
 """
 
+# Prompt asking the LLM to apply a set of updates to a Job manifest.
+# It has two %s placeholders, in this order: (1) the updates, (2) the manifest.
+update_prompt = """You are a Kubernetes Job update agent. Your job is to take a spec of updates for a Job Manifest and apply them.
+You are NOT allowed to make other changes to the manifest. Ignore the 'decision' field and if you think appropriate, add context from "reason" as comments.
+Here are the updates:
+
+%s
+
+And here is the Job manifest to apply them to:
+%s
+Return ONLY the YAML with no other text or commentary.
+"""
+
+def get_optimize_prompt(context, resources):
+    """
+    Build the optimization prompt for a Kubernetes Job.
+
+    The prompt is filled with the optimize instruction, the environment,
+    the cluster resources (serialized as json) and the current manifest
+    from the context. If the context has a dockerfile, it is appended.
+    """
+    template = """
+    Your task is to optimize the running of a Kubernetes Job: %s in %s. You are allowed to request anywhere in the range of available resources, including count and type. Here are the available resources:
+    %s
+    Here is the current job manifest:
+    ```yaml
+    %s
+    ```
+    Please return ONLY a json structure to be loaded that includes a limited set of fields (with keys corresponding to the names that are organized the same as a Kubernetes Job, e.g., spec -> template -spec.
+    The result should be provided as json. The fields should map 1:1 into a pod spec serialzied as json. 
+    Do not make requests that lead to Guaranteed pods. DO NOT CHANGE PROBLEM SIZE PARAMETERS OR COMMAND. You can change args. Remember that
+    to get a full node resources you often have to ask for slightly less than what is available.
+    """
+    values = (context.optimize, context.environment, json.dumps(resources), context.result)
+    text = template % values
+
+    # The Dockerfile is optional context
+    dockerfile = context.get("dockerfile")
+    if not dockerfile:
+        return text
+    return text + f" Here is the Dockerfile that helped to generate the application.\n {dockerfile}\n"
 
 def get_regenerate_prompt(context):
     """
diff --git a/fractale/agent/optimize/__init__.py b/fractale/agent/optimize/__init__.py
@@ -0,0 +1 @@
+from .agent import OptimizationAgent
diff --git a/fractale/agent/optimize/agent.py b/fractale/agent/optimize/agent.py
@@ -0,0 +1,101 @@
+import argparse
+import textwrap
+import json
+import os
+from rich import print
+
+import google.generativeai as genai
+import sys
+import fractale.agent.build.prompts as prompts
+import fractale.agent.logger as logger
+from fractale.agent.base import GeminiAgent
+from fractale.agent.context import get_context
+import fractale.agent.optimize.prompts as prompts
+import fractale.agent.defaults as defaults
+
+# The result parser holds a ResultAgent
+from fractale.agent.results import ResultParser
+
+class OptimizationAgent(GeminiAgent):
+    """
+    Optimization Agent
+
+    The optimization agent receives a figure of merit request
+    and a need to minimize or maximize. It will return to a job
+    running agent (akin to Kubernetes Job) that has explicit
+    instructions to only change a subset of the execution arguments.
+    This means that the optimization agent can be general to take
+    some application and parameters attempted before. The optimization
+    agent should return a new parameter set to the build agent.
+    """
+
+    name = "optimization"
+    description = "optimization agent agent"
+    state_variables = ["optimize"]
+
+    # How many times we ask Gemini (per run) for a response that parses
+    # as json and includes a "decision" before giving up.
+    max_decision_requests = 3
+
+    def init(self):
+        """
+        Configure the Gemini client, then create the model, chat session,
+        and the state that tracks figures of merit across attempts.
+
+        Exits the process if GEMINI_API_KEY is not set in the environment.
+        """
+        # Check for the API key first, so we fail before creating the model
+        try:
+            genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+        except KeyError:
+            sys.exit("ERROR: GEMINI_API_KEY environment variable not set.")
+
+        self.model = genai.GenerativeModel(defaults.gemini_model)
+        self.chat = self.model.start_chat()
+
+        self.foms = []
+        # We could just take the listing of FOMs, but I am not sure
+        # if there are cases where that might not reflect attempts.
+        self.attempts = 0
+        self.parser = ResultParser()
+
+    def _add_arguments(self, subparser):
+        """
+        Add arguments for the plugin to show up in argparse
+        """
+        agent = subparser.add_parser(
+            self.name,
+            formatter_class=argparse.RawTextHelpFormatter,
+            description=self.description,
+        )
+        agent.add_argument(
+            "optimize",
+            help="Optimization instruction (include application, environment, algorithm, etc.).",
+        )
+        return agent
+
+    def _parse_decision(self, content):
+        """
+        Parse a Gemini response into a dict that includes a "decision".
+
+        Returns None if the json code block cannot be loaded, is not a
+        json object, or is missing the "decision" field.
+        """
+        try:
+            result = json.loads(self.get_code_block(content, "json"))
+        except (TypeError, ValueError):
+            # json.JSONDecodeError is a ValueError
+            return None
+        if not isinstance(result, dict) or "decision" not in result:
+            return None
+        return result
+
+    def run(self, context, log):
+        """
+        Run the optimization agent.
+
+        Figures of merit are parsed from the log once and recorded as a
+        single attempt. Gemini is then asked for updates, and asked again
+        (up to max_decision_requests times in total) if the response cannot
+        be parsed or has no "decision". The parsed response is stored on
+        context.optimize_result and the context is returned.
+
+        Raises ValueError if no usable response is received.
+        """
+        # We don't do attempts because we have no condition for success.
+        context = get_context(context)
+
+        prompt = context.get("requires")
+        # If requirements not specified, we require the "optimize" context
+        if not prompt:
+            prompt = prompts.get_optimize_prompt(context)
+
+        # Parser requires is the FOM and optimize directive.
+        # This returns a list of foms. We only do this once per run, so asking
+        # Gemini again below does not duplicate FOMs or inflate the attempts.
+        foms = self.parser.parse(context.optimize, log)
+        self.foms += foms
+        self.attempts += 1
+
+        # This adds supplementary detail about how to optimize - "keep going until it's good"
+        prompt += prompts.supplement_optimize_prompt % (self.attempts, json.dumps(self.foms))
+
+        # TODO: if this agent stores memory we don't need to include dockerfile after the first...
+        print("Sending optimization prompt to Gemini...")
+        print(textwrap.indent(prompt[0:500], "> ", predicate=lambda _: True))
+
+        # Get the updates. We assume that optimization updates for resources
+        # need to come back and be parsed into json.
+        for request in range(1, self.max_decision_requests + 1):
+            content = self.ask_gemini(prompt, with_history=True)
+            print("Received optimization from Gemini...")
+            logger.custom(content, title="[green]Result Parser[/green]", border_style="green")
+            result = self._parse_decision(content)
+            if result is not None:
+                # We can't be sure of the format or how to update, so return to job agent
+                context.optimize_result = result
+                return context
+            print(
+                f"[yellow]Response {request} of {self.max_decision_requests} "
+                "was not json with a decision field.[/yellow]"
+            )
+
+        raise ValueError(
+            f"No optimization decision received after {self.max_decision_requests} requests."
+        )
diff --git a/fractale/agent/optimize/prompts.py b/fractale/agent/optimize/prompts.py
@@ -0,0 +1,26 @@
+import fractale.agent.defaults as defaults
+from fractale.agent.prompts import prompt_wrapper
+
+# TODO should this be allowed to return to a different agent?
+# General constraints for the optimization agent.
+common_instructions = """
+- You can make changes to the application execution only.
+- You are not allowed to request changes to any configuration beyond the application execution command.
+"""
+
+# Base prompt for the optimization agent. It has one %s placeholder that
+# get_optimize_prompt fills with context.requires. Note that it is declared
+# as an f-string but has no replacement fields.
+optimize_prompt = f"""You are an optimization agent. Your job is to receive application commands and environments, and make a suggestion for how to improve a metric of interest.
+Here are your instructions:
+
+%s 
+
+- The response should ONLY contain parameters for resources cpu, memory, nodes, and environment variables, formatting as a JSON string that can be parsed.
+"""
+
+# This is added to details from a job manager optimization prompt about the decision that should come back.
+# It has two %s placeholders, in this order: (1) the number of attempts, (2) the figures of merit for those attempts.
+supplement_optimize_prompt = """You also need to decide if the job is worth retrying again. You have made %s attempts and here are the figure of merits as described for those attempts:
+%s
+Please include in your response a "decision" field that is RETRY or STOP. You should keep retrying until you determine the application run is optimized. If you like, you can add a "reason" field that briefly summarizes the decision.
+"""
+
+# These are currently required, but don't necessarily have to be...
+def get_optimize_prompt(context):
+    """
+    Fill the base optimization prompt with the requirements on the context.
+    """
+    instructions = context.requires
+    return optimize_prompt % instructions
diff --git a/fractale/agent/results/__init__.py b/fractale/agent/results/__init__.py
@@ -0,0 +1,2 @@
+from .agent import ResultAgent, ResultParser
+assert ResultAgent, ResultParser
diff --git a/fractale/agent/results/agent.py b/fractale/agent/results/agent.py
diff --git a/fractale/agent/results/prompts.py b/fractale/agent/results/prompts.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .agent import ResultAgent, ResultParser`
	`2`	`+assert ResultAgent, ResultParser`