diff --git a/.ai-rules.md b/.ai-rules.md deleted file mode 100644 index 5da0c84..0000000 --- a/.ai-rules.md +++ /dev/null @@ -1,66 +0,0 @@ -# AI Assistant Rules for Fireteam - -This file contains rules for AI coding assistants (Cursor, Claude, Warp, GitHub Copilot, etc.) - -## Python Version: 3.12+ ONLY - -**CRITICAL**: This project requires Python 3.12 or higher. - -- ✅ Use: `python3.12` or higher -- ❌ Never use: Python 3.9, 3.10, or 3.11 -- Dependencies like `claude-agent-sdk>=0.1.4` require Python 3.10+, and we standardize on 3.12+ - -### Checking Python Version -```bash -python3.12 --version # Should show Python 3.12.x or higher -``` - -## Dependency Management: Use `uv` - -**CRITICAL**: Always use `uv` for Python package management. - -- ✅ Use: `uv pip install`, `uv add`, `uv venv` -- ❌ Never use: `pip install`, `pip3 install`, standard pip commands - -### Why `uv`? -- 10-100x faster than pip -- Better dependency resolution -- Drop-in replacement for pip -- Production-ready and maintained by Astral (creators of Ruff) - -### Common Commands -```bash -# Create virtual environment -uv venv - -# Activate virtual environment (macOS/Linux) -source venv/bin/activate - -# Install all dependencies -uv pip install -r requirements.txt - -# Install a single package -uv pip install - -# Add a new dependency (updates requirements.txt) -uv add - -# Sync to exact versions -uv pip sync requirements.txt -``` - -## Installing `uv` - -If `uv` is not installed: -```bash -# macOS/Linux -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Or with Homebrew -brew install uv -``` - -## Summary - -1. **Python**: Always 3.12+ -2. **Packages**: Always use `uv`, never `pip` diff --git a/.claude-plugin/commands/fireteam.md b/.claude-plugin/commands/fireteam.md new file mode 100644 index 0000000..074f69f --- /dev/null +++ b/.claude-plugin/commands/fireteam.md @@ -0,0 +1,65 @@ +# /fireteam + +Multi-phase autonomous task execution with complexity-based routing. + +## Usage + +``` +/fireteam +``` + +## Configuration + +Set these environment variables to configure fireteam behavior: + +| Variable | Default | Description | +|----------|---------|-------------| +| `ANTHROPIC_API_KEY` | (required) | API key for Claude | +| `FIRETEAM_MAX_ITERATIONS` | (none/infinite) | Maximum loop iterations. Leave unset for infinite. | +| `FIRETEAM_LOG_LEVEL` | INFO | Logging verbosity (DEBUG, INFO, WARNING, ERROR) | + +## Examples + +``` +/fireteam Fix the authentication bug in auth.py +/fireteam Refactor the user module to use dependency injection +/fireteam Add comprehensive tests for the payment service +``` + +## How It Works + +1. **Complexity Estimation**: Analyzes your goal and estimates complexity (TRIVIAL, SIMPLE, MODERATE, COMPLEX) +2. **Mode Selection**: Routes to appropriate execution strategy: + - TRIVIAL/SIMPLE → SINGLE_TURN (one-shot execution) + - MODERATE → Execute → Review loop until >95% complete + - COMPLEX → Plan → Execute → 3 Parallel Reviews loop until 2/3 majority says >95% +3. 
**Loop Until Complete**: MODERATE and FULL modes loop continuously until the task is complete or max_iterations is reached (if set) + +## Configuration via Code + +When using fireteam as a library: + +```python +from fireteam import execute, ExecutionMode + +# Infinite iterations (default) +result = await execute( + project_dir="/path/to/project", + goal="Implement feature X", +) + +# Limited iterations +result = await execute( + project_dir="/path/to/project", + goal="Implement feature X", + max_iterations=10, # Stop after 10 iterations if not complete +) + +# Force a specific mode +result = await execute( + project_dir="/path/to/project", + goal="Implement feature X", + mode=ExecutionMode.FULL, + max_iterations=5, +) +``` diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..faa4396 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,7 @@ +{ + "name": "fireteam", + "version": "0.1.0", + "description": "Multi-phase autonomous task execution with complexity estimation, planning, execution, and review", + "commands": ["commands/fireteam.md"], + "hooks": "hooks/hooks.json" +} diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md new file mode 100644 index 0000000..d39548e --- /dev/null +++ b/.claude/CLAUDE.md @@ -0,0 +1,43 @@ +# Fireteam Agent Principles + +These principles are automatically loaded by the Claude Agent SDK and guide all fireteam operations. + +## Testing + +- Write tests as you implement (not as an afterthought) +- Run tests after every code change +- Don't consider a task complete until tests pass +- If tests fail, fix them before moving on + +## Quality Gates + +- All CI checks must pass locally before completion +- Run linting, type checking, and tests before considering work done +- If any quality check fails, address it immediately + +## Progress Checkpoints + +- After significant progress, step back and reassess +- Ask yourself: How are we doing? What's left? Is this more complex than expected? 
+- Update your todo list to reflect current understanding +- If the task has grown beyond the original estimate, flag it for re-evaluation + +## Escalation + +- If stuck after 3 attempts on the same issue, consider a different approach +- If a task turns out to be more complex than estimated, communicate this +- Don't silently struggle - surface blockers early + +## Code Quality + +- Write clean, readable code with clear intent +- Follow existing patterns in the codebase +- Add comments only where the logic isn't self-evident +- Don't over-engineer - solve the problem at hand + +## Minimal Changes + +- Make the smallest change that solves the problem +- Don't refactor unrelated code +- Don't add features that weren't requested +- Keep diffs focused and reviewable diff --git a/.cursorrules b/.cursorrules deleted file mode 100644 index 04b9462..0000000 --- a/.cursorrules +++ /dev/null @@ -1,28 +0,0 @@ -# Fireteam Development Rules - -## Python Version -- Always use Python 3.12 or higher -- Never use Python 3.9, 3.10, or 3.11 -- When creating virtual environments, use: `python3.12 -m venv venv` or `uv venv` - -## Dependency Management -- Always use `uv` for Python dependency management -- Never use `pip`, `pip3`, or `pip install` directly -- Install dependencies with: `uv pip install ` -- Sync dependencies with: `uv pip sync requirements.txt` -- Add dependencies with: `uv add ` - -## Example Commands -```bash -# Create virtual environment -uv venv - -# Install dependencies -uv pip install -r requirements.txt - -# Add a new dependency -uv add - -# Sync dependencies -uv pip sync requirements.txt -``` diff --git a/.env.example b/.env.example deleted file mode 100644 index 36056d3..0000000 --- a/.env.example +++ /dev/null @@ -1,13 +0,0 @@ -# Fireteam Environment Variables - -# Sudo password for system-level package installation -# Used by agents when installing dependencies (Node.js, build tools, etc.) 
-# Leave empty or comment out if using passwordless sudo -SUDO_PASSWORD=claude - -# Git configuration (optional overrides) -# GIT_USER_NAME=Your Name -# GIT_USER_EMAIL=your.email@example.com - -# Anthropic -ANTHROPIC_API_KEY="" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6d82292..5790869 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,88 +4,76 @@ on: pull_request: branches: [ main ] push: - branches: [ main ] # Only run on direct pushes to main + branches: [ main ] jobs: fast-tests: name: Fast Tests (Unit + Lightweight) runs-on: ubuntu-latest - + steps: - uses: actions/checkout@v4 - + - name: Set up Python 3.12 uses: actions/setup-python@v5 with: python-version: '3.12' - + - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - + - name: Create virtual environment run: uv venv - + - name: Install dependencies run: | source .venv/bin/activate - uv pip install -r requirements.txt - - - name: Run all fast tests + uv pip install -e ".[dev]" + + - name: Run unit tests run: | source .venv/bin/activate - pytest tests/ -m "not slow and not e2e and not integration" -v --tb=short + pytest tests/ -v --tb=short e2e-tests: name: End-to-End Tests (API) runs-on: ubuntu-latest - timeout-minutes: 20 # Fail fast if tests hang - # Run on main branch and e/* branches for testing + timeout-minutes: 20 if: | - github.ref == 'refs/heads/main' || + github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/e/') || startsWith(github.head_ref, 'e/') - + steps: - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Install Claude CLI - run: | - npm install -g @anthropic-ai/claude-code - echo "Claude CLI installed at: $(which claude)" - claude --version - + - name: Set up Python 3.12 uses: actions/setup-python@v5 with: python-version: '3.12' - + - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - + - name: Create virtual environment run: uv venv - + - name: Install dependencies run: | source .venv/bin/activate - uv pip install -r requirements.txt - - - name: Run E2E tests - timeout-minutes: 15 # Per-step timeout + uv pip install -e ".[dev]" + + - name: Run integration tests + timeout-minutes: 15 env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - PYTHONUNBUFFERED: "1" # Force immediate output + PYTHONUNBUFFERED: "1" run: | source .venv/bin/activate - echo "Starting e2e tests at $(date)" - pytest tests/ -m "e2e" -v --tb=short -s --log-cli-level=INFO - echo "E2E tests completed at $(date)" - + echo "Starting integration tests at $(date)" + pytest tests/ --run-integration -v --tb=short -s + echo "Integration tests completed at $(date)" + - name: Upload logs on failure if: failure() uses: actions/upload-artifact@v4 @@ -95,48 +83,3 @@ jobs: /tmp/fireteam-test-*/ tests/**/*.log retention-days: 7 - - integration-tests: - name: Terminal-bench Integration - runs-on: ubuntu-latest - # Temporarily disabled - needs debugging - if: false - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python 3.12 - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Set up Docker - uses: docker/setup-buildx-action@v3 - - - name: Install uv - run: curl -LsSf https://astral.sh/uv/install.sh | sh - - - name: Install terminal-bench - run: uv tool install terminal-bench - - - name: Create virtual environment - run: uv venv - - - name: Install dependencies - run: | - source .venv/bin/activate - uv pip install -r requirements.txt - 
- - name: Install Fireteam adapter - run: | - source .venv/bin/activate - cd benchmark - uv pip install -e . - - - name: Run terminal-bench integration test - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - run: | - source .venv/bin/activate - pytest tests/ -m "integration" -v --tb=short - diff --git a/.gitignore b/.gitignore index 0d1da53..8359771 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,12 @@ __pycache__/ *.so .Python +# Build artifacts +*.egg-info/ +dist/ +build/ +.eggs/ + # Virtual environments venv/ env/ diff --git a/MEMORY_SYSTEM.md b/MEMORY_SYSTEM.md deleted file mode 100644 index 0100b03..0000000 --- a/MEMORY_SYSTEM.md +++ /dev/null @@ -1,518 +0,0 @@ -# Fireteam Memory System - -An OB-1-inspired trace memory system with spontaneous retrieval, providing agents with "ever-present" context awareness. - -## Overview - -Fireteam's memory system enables agents to learn from past experiences, avoid repeating mistakes, and maintain architectural consistency across cycles. Inspired by [OB-1's Terminal Bench #1 achievement](https://www.openblocklabs.com/blog/terminal-bench-1), our implementation uses local vector storage with state-of-the-art embeddings for semantic search. - -## Core Philosophy: Spontaneous Memory - -Memory retrieval feels like human thought - relevant memories automatically surface based on what agents are working on, without explicit queries. Agents don't know they're "checking memory" - memories just appear as background knowledge in their context. - -## Architecture - -### Technology Stack - -- **Vector Database:** ChromaDB 1.0+ (embedded, persistent SQLite backend) -- **Embeddings:** Qwen3-Embedding-0.6B (70.58 MTEB score, state-of-the-art) -- **Acceleration:** Metal/MPS on MacBook Pro M-series (with CPU fallback) -- **Caching:** LRU cache for embeddings, Hugging Face model cache - -### Storage Structure - -``` -memory/ - {project_hash}/ # MD5 hash of project_dir - chroma_db/ # Vector database (persistent) -``` - -### Memory Types - -All memories stored with `type` field: -- `trace` - Execution output, errors, files modified -- `failed_approach` - What didn't work and why -- `decision` - Architectural choices and rationale -- `learning` - Patterns and conventions discovered -- `code_location` - Where key functionality lives - -### Project Isolation - -Each project gets a unique collection based on MD5 hash of `project_dir`: -```python -collection_name = hashlib.md5(project_dir.encode()).hexdigest()[:16] -``` - -This ensures **zero cross-project contamination** - projects never share memories. - -## How It Works - -### Automatic Retrieval Flow - -**Every cycle, before each agent executes:** - -1. **Agent stores execution context** (`self._execution_context = kwargs`) -2. **Agent builds semantic query** from current task context -3. **MemoryManager performs semantic search** (retrieves top 10 relevant memories) -4. **BaseAgent injects memories** into system prompt silently -5. **Agent sees memories** as "background knowledge" - -This happens **3 times per cycle** (once per agent: Planner → Executor → Reviewer). - -### Agent-Specific Retrieval - -**PlannerAgent** retrieves: -- `decision` - Past architectural choices -- `failed_approach` - What to avoid -- `learning` - Discovered patterns - -Context query: `"Planning to achieve: {goal}. 
Recent feedback: {last_review}"` - -**ExecutorAgent** retrieves: -- `failed_approach` - Implementation gotchas -- `trace` - Past execution patterns -- `code_location` - Where things are implemented - -Context query: `"Implementing plan: {plan}. Goal: {goal}"` - -**ReviewerAgent** retrieves: -- `learning` - Known patterns -- `decision` - Architectural constraints -- `pattern` - Code conventions - -Context query: `"Reviewing implementation: {execution_result}. Original plan: {plan}"` - -### Memory Recording - -**After Execution:** -```python -memory.add_memory( - content=executor_result["execution_result"], - memory_type="trace", - cycle=cycle_num -) -``` - -**After Review:** -```python -# Reviewer extracts structured learnings -for learning in reviewer_result["learnings"]: - memory.add_memory( - content=learning["content"], - memory_type=learning["type"], - cycle=cycle_num - ) -``` - -### Learning Extraction - -Reviewer agent extracts learnings using special syntax: - -``` -LEARNING[pattern]: All database operations use connection pooling -LEARNING[decision]: Using JWT tokens with 24h expiry for sessions -LEARNING[failed_approach]: Attempted websockets but had CORS issues -LEARNING[code_location]: User authentication logic in src/auth/handler.py -``` - -These are automatically parsed and stored in memory. - -## Usage - -### Running with Memory (Default) - -```bash -python src/orchestrator.py --project-dir /path/to/project --goal "Your goal" -``` - -Memory automatically: -- Records execution traces -- Extracts learnings -- Provides context to agents -- **Cleans up after completion** - -### Debug Mode (Preserve Memory) - -```bash -python src/orchestrator.py --project-dir /path/to/project --goal "Your goal" --keep-memory -``` - -Preserves memory and state after completion for analysis. - -### First Run - -**Note:** First run downloads Qwen3-Embedding-0.6B model (~1.2GB) from Hugging Face. This is cached locally at `~/.cache/huggingface/` and subsequent runs use the cached version. - -## Performance - -### Timing Characteristics - -- **Model load:** 3-5 seconds (once at startup) -- **Per retrieval:** ~1 second (with caching) -- **Per cycle overhead:** ~3 seconds (3 automatic retrievals) -- **Embedding cache hit:** <50ms - -### Resource Usage - -- **Model size:** ~1.2GB (RAM) -- **GPU usage:** Metal/MPS on M-series Mac (optional, falls back to CPU) -- **Disk usage:** Grows with memories, auto-cleaned on completion - -## Observability - -All memory operations are logged with timing and counts: - -``` -[MEMORY] Initializing MemoryManager... -[MEMORY] Model loaded in 3.45s -[MEMORY] Using Metal/MPS acceleration -[MEMORY] Project initialized with 0 existing memories -[PLANNER] Retrieving memories... -[MEMORY] Searching: Planning to achieve: Build auth system... -[MEMORY] Found 3 memories in 0.85s -[PLANNER] Retrieved 3 memories in 0.87s -[MEMORY] Added trace in 0.42s -[MEMORY] Added decision in 0.38s -[MEMORY] Deleting collection a3f2e1... (15 memories)... 
-[MEMORY] Successfully deleted 15 memories -``` - -Enable debug logging for detailed output: -```bash -python src/orchestrator.py --project-dir /path --goal "Goal" --debug -``` - -## Testing - -### Run All Memory Tests - -```bash -./tests/run_memory_tests.sh -``` - -### Test Coverage - -**36 comprehensive tests:** -- ✅ MemoryManager CRUD operations -- ✅ Embedding generation and caching -- ✅ Semantic search functionality -- ✅ Memory type filtering -- ✅ Project isolation -- ✅ BaseAgent template method pattern -- ✅ Automatic memory retrieval -- ✅ Learning extraction -- ✅ Cleanup functionality -- ✅ Edge cases and error handling - -### Individual Test Suites - -```bash -# Unit tests for MemoryManager -python -m pytest tests/test_memory_manager.py -v - -# Unit tests for BaseAgent memory -python -m pytest tests/test_base_agent_memory.py -v - -# Integration tests -python -m pytest tests/test_memory_integration.py -v - -# Isolation tests -python -m pytest tests/test_memory_isolation.py -v -``` - -## Configuration - -### Memory Settings (in `src/config.py`) - -```python -# Memory configuration -MEMORY_DIR = os.path.join(SYSTEM_DIR, "memory") -MEMORY_EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B" -MEMORY_SEARCH_LIMIT = 10 # How many memories to retrieve per query -``` - -### Customization - -Adjust search limit for more/fewer memories: -```python -# In config.py -MEMORY_SEARCH_LIMIT = 15 # Retrieve more memories per query -``` - -## Key Design Decisions - -### Why Local (No APIs)? - -- ✅ **Complete privacy** - Data never leaves your machine -- ✅ **Zero costs** - No API fees per embedding -- ✅ **Fast** - No network latency -- ✅ **Reliable** - No external dependencies -- ✅ **Perfect for Terminal Bench** - No repeated model downloads - -### Why Qwen3-Embedding-0.6B? - -- ✅ **State-of-the-art quality** - 70.58 MTEB score (beats competitors) -- ✅ **Optimized for Mac** - Excellent Metal/MPS performance -- ✅ **Good size/performance** - 600M parameters is sweet spot -- ✅ **Code-aware** - Trained on multilingual corpus including code -- ✅ **Open source** - Apache 2.0 license - -### Why Spontaneous Retrieval? - -Traditional approach: -```python -# Agent explicitly queries memory -if should_check_memory(): - memories = memory.search(query) -``` - -**Problems:** -- Agent decides when to check (adds complexity) -- Explicit queries feel mechanical -- Easy to forget to check - -**Our approach:** -```python -# Memory automatically appears in context -# Agent never knows it's happening -``` - -**Benefits:** -- Mimics human thought (memories pop up naturally) -- No decision overhead -- Always relevant (semantic search) -- Agent-specific (each gets what it needs) - -### Why Chroma? - -- ✅ Embedded (no external service) -- ✅ Mature and stable -- ✅ Built for LLM workflows -- ✅ Persistent SQLite backend -- ✅ Excellent Python API - -## Example Memory Flow - -### Cycle 1: Initial Implementation - -**Executor completes work:** -``` -"Implemented JWT authentication using jsonwebtoken library. -Created middleware in src/auth/jwt.js. -All tests passing." 
-``` - -**Stored as:** `trace` memory - -**Reviewer extracts learnings:** -``` -LEARNING[decision]: Using JWT tokens with 24h expiry for sessions -LEARNING[code_location]: Authentication middleware in src/auth/jwt.js -LEARNING[pattern]: All protected routes use auth middleware -``` - -**Stored as:** 3 separate memories (`decision`, `code_location`, `pattern`) - -### Cycle 2: Hit a Problem - -**Executor reports:** -``` -"Attempted to add refresh tokens using redis-om library -but encountered connection errors in test environment. -Falling back to in-memory session store." -``` - -**Stored as:** `trace` memory - -**Reviewer extracts:** -``` -LEARNING[failed_approach]: Tried redis-om for refresh tokens but had connection issues -LEARNING[decision]: Using in-memory session store for MVP -``` - -**Stored as:** 2 memories - -### Cycle 5: Planning Auth Improvements - -**Planner automatically receives context:** -``` ---- -BACKGROUND KNOWLEDGE FROM PREVIOUS WORK: -(You have access to these learnings from earlier cycles) - -• Decision (Cycle 1): Using JWT tokens with 24h expiry for sessions -• Failed Approach (Cycle 2): Tried redis-om for refresh tokens but had connection issues -• Code Location (Cycle 1): Authentication middleware in src/auth/jwt.js -• Pattern (Cycle 1): All protected routes use auth middleware - -Use this background knowledge naturally. Don't explicitly reference cycles. ---- -``` - -Planner naturally avoids redis-om and builds on existing JWT implementation. - -## Troubleshooting - -### Model Download Issues - -If model download fails on first run: -```bash -# Check Hugging Face cache -ls -lh ~/.cache/huggingface/hub/models--Qwen--Qwen3-Embedding-0.6B/ - -# Clear cache and retry -rm -rf ~/.cache/huggingface/ -python src/orchestrator.py --project-dir /path --goal "Test" -``` - -### Memory Not Working - -Check logs for `[MEMORY]` prefix: -```bash -# Look for memory operations in logs -grep "\[MEMORY\]" logs/orchestrator_*.log -``` - -Should see: -- Model loading -- Project initialization -- Search operations -- Memory additions - -### MPS/Metal Issues on Mac - -If you see warnings about MPS: -``` -[MEMORY] Using CPU (MPS not available) -``` - -This is fine - memory will work on CPU. Slightly slower but functional. - -To enable MPS, ensure PyTorch 2.5+ with Metal support: -```bash -pip install --upgrade torch -``` - -### Cleanup Issues - -If cleanup fails: -```bash -# Manual cleanup -rm -rf memory/{project_hash}/ -rm state/current.json -``` - -Or run with `--keep-memory` to preserve data. - -## Comparison to OB-1 - -### Similarities (Inspired By) - -- ✅ Trace memory (commands, outputs, errors) -- ✅ Recording failed approaches -- ✅ Preventing mistake repetition -- ✅ Context across long-horizon tasks - -### Enhancements (We Added) - -- ✅ **Semantic search** - Find memories by meaning, not keywords -- ✅ **Agent-specific retrieval** - Each agent gets relevant context -- ✅ **Spontaneous injection** - Memories appear automatically -- ✅ **State-of-the-art embeddings** - Qwen3-0.6B (70.58 MTEB) -- ✅ **Comprehensive observability** - All operations logged with timing -- ✅ **Automatic cleanup** - No manual memory management -- ✅ **Project isolation** - Multi-project support - -## Future Enhancements (Post-MVP) - -Ideas for extending the memory system: - -1. **Memory Consolidation** - Merge duplicate/similar learnings -2. **Forgetting Mechanism** - Remove outdated or irrelevant memories -3. **Cross-Project Transfer** - Opt-in knowledge sharing between projects -4. 
**Memory Analytics** - Dashboard showing memory growth and patterns -5. **Export/Import** - Share memory dumps for debugging or collaboration -6. **Semantic Clustering** - Visualize related memories as knowledge graph - -## Implementation Details - -### Files Created - -- `src/memory/manager.py` - Core MemoryManager class (220 lines) -- `src/memory/__init__.py` - Module initialization -- `tests/test_memory_manager.py` - 14 unit tests -- `tests/test_base_agent_memory.py` - 10 unit tests -- `tests/test_memory_integration.py` - 5 integration tests -- `tests/test_memory_isolation.py` - 7 isolation tests -- `tests/run_memory_tests.sh` - Test runner script - -### Files Modified - -- `requirements.txt` - Added chromadb, transformers, torch, pytest -- `src/config.py` - Added memory configuration -- `src/agents/base.py` - Template method pattern + automatic retrieval -- `src/agents/planner.py` - Memory integration -- `src/agents/executor.py` - Memory integration -- `src/agents/reviewer.py` - Memory integration + learning extraction -- `src/orchestrator.py` - Full lifecycle integration + cleanup - -### Lines of Code - -- **Production code:** ~400 lines (MemoryManager + BaseAgent enhancements) -- **Test code:** ~500 lines (36 comprehensive tests) -- **Total:** ~900 lines for complete memory system - -## Dependencies Added - -``` -chromadb>=1.0.0 # Vector database -transformers>=4.50.0 # Hugging Face model loading -torch>=2.5.0 # PyTorch with Metal/MPS support -pytest>=7.0.0 # Testing framework -``` - -## Version History - -### v1.0.0 - Initial Memory System (November 6, 2025) - -**Features:** -- Local vector storage with ChromaDB -- Qwen3-Embedding-0.6B for state-of-the-art retrieval -- Spontaneous memory retrieval -- Agent-specific context queries -- Automatic cleanup with debug mode -- Comprehensive test coverage (36 tests) -- Full observability with timing metrics - -**Performance:** -- ~3 seconds overhead per cycle -- ~1.2GB model size (cached locally) -- Metal/MPS acceleration on Mac - -**Inspired by:** OB-1's Terminal Bench achievement ([blog post](https://www.openblocklabs.com/blog/terminal-bench-1)) - -## Contributing - -When extending the memory system: - -1. **Add new memory types** - Update `memory_type` field values -2. **Customize retrieval** - Override `_build_memory_context_query()` in agents -3. **Add metadata** - Pass `metadata` dict to `add_memory()` -4. **Test thoroughly** - Add tests to appropriate test file -5. **Document** - Update this file with new features - -## Support - -For issues related to memory system: -- Check logs for `[MEMORY]` prefixed messages -- Run tests: `./tests/run_memory_tests.sh` -- Enable debug logging: `--debug` flag -- Preserve memory for inspection: `--keep-memory` flag - -## References - -- [OB-1 Terminal Bench Achievement](https://www.openblocklabs.com/blog/terminal-bench-1) -- [ChromaDB Documentation](https://docs.trychroma.com/) -- [Qwen3 Model Card](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B) -- [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) - diff --git a/README.md b/README.md index e3ebdf4..6fcf8b7 100644 --- a/README.md +++ b/README.md @@ -1,298 +1,225 @@ # Fireteam -[![Tests](https://github.com/darkresearch/fireteam/actions/workflows/test.yml/badge.svg)](https://github.com/darkresearch/fireteam/actions/workflows/test.yml) - -An autonomous multi-agent system for long-running project execution powered by Claude. 
+Adaptive task execution using Claude Agent SDK with complexity-based routing and loop-until-complete behavior. ## Overview -The Fireteam is a sophisticated orchestration framework that manages three specialized agents in an infinite cycle of planning, execution, and review until project completion: - -- **Planner Agent**: Creates and updates project plans -- **Executor Agent**: Executes planned tasks -- **Reviewer Agent**: Assesses progress and estimates completion - -## Architecture +Fireteam estimates task complexity and routes to the appropriate execution strategy: -``` -Orchestrator (Infinite Loop) - ↓ -[Plan] → [Execute] → [Review] → [Git Commit] - ↑___________________________________| -``` - -### Key Features - -- **Autonomous Operation**: Runs continuously until project completion -- **Git Integration**: Automatic repo initialization, branching, commits, and pushing -- **State Isolation**: Clean state separation between projects to prevent contamination -- **Completion Validation**: Triple-check validation system (3 consecutive >95% reviews) -- **Error Recovery**: Automatic retry logic and graceful degradation -- **Production Focus**: Emphasis on production-ready code with comprehensive testing +| Complexity | Mode | Behavior | +|------------|------|----------| +| TRIVIAL | SINGLE_TURN | Direct execution, single pass | +| SIMPLE | SINGLE_TURN | Direct execution, single pass | +| MODERATE | MODERATE | Execute -> review loop until >95% complete | +| COMPLEX | FULL | Plan once, then execute -> 3 parallel reviews loop until 2/3 say >95% | ## Installation -1. **Prerequisites** - - Python 3.12+ - - Git - - Claude CLI ([installation guide](https://docs.claude.com/en/docs/claude-code/installation)) - -2. **Setup** - ```bash - cd /home/claude/fireteam - bash setup.sh - source ~/.bashrc # or restart your shell - ``` - -## Usage - -### Starting a Project - ```bash -start-agent --project-dir /path/to/project --prompt "Your project goal here" +pip install fireteam ``` -Example: -```bash -start-agent --project-dir ~/my-calculator --prompt "Build a Python command-line calculator with support for basic arithmetic operations" -``` +Requires Python 3.10+ and a valid `ANTHROPIC_API_KEY` environment variable. -### Checking Progress +## Usage -```bash -agent-progress -``` +### Basic Usage -This shows: -- Current status (running/stopped) -- Project information -- Current cycle number -- Completion percentage -- Recent activity logs +```python +from fireteam import execute -### Stopping the System +result = await execute( + project_dir="/path/to/project", + goal="Fix the bug in auth.py", + context="Error logs: NullPointerException at line 42", +) -```bash -stop-agent +if result.success: + print(f"Completed in {result.iterations} iterations") + print(f"Completion: {result.completion_percentage}%") +else: + print(f"Failed: {result.error}") ``` -This gracefully shuts down the orchestrator and all running agents. - -## How It Works - -### Initialization - -1. Creates/validates Git repository in project directory -2. Creates timestamped branch (e.g., `agent-20240315-143022`) -3. Initializes clean project state - -### Cycle Execution - -Each cycle consists of three phases: - -1. **Planning Phase** - - Planner agent reviews goal, previous plan, and recent results - - Creates or updates project plan - - Breaks down remaining work into actionable tasks - -2. 
**Execution Phase** - - Executor agent implements tasks from the plan - - Writes actual, working code (no placeholders) - - Tests implementations - - Documents work - -3. **Review Phase** - - Reviewer agent examines the codebase - - Tests functionality - - Estimates completion percentage (0-100%) - - Identifies gaps or issues - -4. **Git Commit** - - Commits all changes with descriptive message - - Pushes to remote if origin exists - -### Completion Logic - -- System runs infinite cycles until completion -- When Reviewer estimates >95% complete: enter validation mode -- Validation requires 3 consecutive reviews confirming >95% -- Each validation review takes a fresh, critical look -- Upon completion: system stops and logs success - -## State Management - -State is stored in `state/current.json` (runtime data directory) and includes: - -- `project_dir`: Absolute path to project -- `goal`: Project objective -- `status`: Current phase (planning/executing/reviewing) -- `cycle_number`: Current cycle count -- `completion_percentage`: Latest estimate (0-100) -- `validation_checks`: Consecutive validation passes -- `git_branch`: Current branch name -- `current_plan`: Latest plan -- `last_execution_result`: Latest execution output -- `last_review`: Latest review output +### Specify Execution Mode + +```python +from fireteam import execute, ExecutionMode + +# Force full mode with planning and parallel reviews +# Loops infinitely until complete (default) +result = await execute( + project_dir="/path/to/project", + goal="Refactor the authentication module", + mode=ExecutionMode.FULL, +) + +# Or limit iterations if needed +result = await execute( + project_dir="/path/to/project", + goal="Refactor the authentication module", + mode=ExecutionMode.FULL, + max_iterations=10, # Stop after 10 iterations if not complete +) +``` -**Important**: State is completely reset between projects to prevent cross-contamination. +### Complexity Estimation -## Configuration +```python +from fireteam import estimate_complexity, ComplexityLevel -Edit `src/config.py` to customize: +# Quick estimation (no codebase access) +complexity = await estimate_complexity( + goal="Add logging to the auth module", + context="Existing logging in other modules uses Python logging", +) -- `MAX_RETRIES`: Number of retry attempts for failed agent calls (default: 3) -- `COMPLETION_THRESHOLD`: Percentage to trigger validation (default: 95) -- `VALIDATION_CHECKS_REQUIRED`: Consecutive checks needed (default: 3) -- `LOG_LEVEL`: Logging verbosity (default: INFO) +# Accurate estimation with codebase exploration +# Claude uses Glob, Grep, Read to understand the project +complexity = await estimate_complexity( + goal="Refactor the authentication system", + project_dir="/path/to/project", +) -## Logging - -Logs are stored in `logs/`: +print(f"Estimated complexity: {complexity}") +# ComplexityLevel.MODERATE -> routes to MODERATE mode +``` -- `orchestrator_YYYYMMDD_HHMMSS.log`: Per-run orchestrator logs -- `system.log`: Combined system output (when running in background) +## Execution Modes -## Project Structure +### SINGLE_TURN +For trivial and simple tasks. Single SDK call, no review loop. 
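For instance, a minimal sketch of forcing this mode explicitly (assuming `ExecutionMode.SINGLE_TURN` can be passed to `execute()` the same way `ExecutionMode.FULL` is in the examples above):

```python
from fireteam import execute, ExecutionMode

# Force single-turn execution for a small, well-scoped change.
# No review loop runs; the call returns after one pass.
result = await execute(
    project_dir="/path/to/project",
    goal="Fix the typo in the CLI --help text",
    mode=ExecutionMode.SINGLE_TURN,
)
print(result.success, result.completion_percentage)
```
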
+### MODERATE +For moderate tasks requiring validation: ``` -fireteam/ -├── src/ # Source code directory -│ ├── orchestrator.py # Main orchestration loop -│ ├── config.py # Configuration settings -│ ├── __init__.py -│ ├── agents/ -│ │ ├── __init__.py -│ │ ├── base.py # Base agent class -│ │ ├── planner.py # Planner agent -│ │ ├── executor.py # Executor agent -│ │ └── reviewer.py # Reviewer agent -│ └── state/ -│ └── manager.py # State management module -├── state/ # Runtime state data (gitignored) -│ └── current.json # Active project state -├── cli/ -│ ├── start-agent # Start system -│ ├── stop-agent # Stop system -│ └── agent-progress # Check status -├── logs/ # Log directory -├── service/ -│ └── claude-agent.service # Systemd service file -├── setup.sh # Installation script -└── README.md # This file +while not complete: + execute() + completion = review() + if completion >= 95%: + complete = True ``` +Loops **indefinitely** until a single reviewer says >95% complete. Set `max_iterations` to limit. -## Troubleshooting - -### System won't start - -- Check Claude CLI is installed: `claude --version` -- Ensure project directory is accessible -- Check logs in `logs/system.log` +### FULL +For complex tasks requiring planning and consensus: +``` +plan() # Once at start +while not complete: + execute() + reviews = run_3_parallel_reviewers() + if 2 of 3 say >= 95%: + complete = True +``` +Plans once, then loops **indefinitely** until majority (2/3) consensus. Set `max_iterations` to limit. -### Agents failing repeatedly +## API Reference -- Check Claude CLI credentials -- Verify network connectivity -- Review agent logs for specific errors -- Ensure sufficient disk space +### `execute()` -### State corruption +```python +async def execute( + project_dir: str | Path, + goal: str, + context: str = "", + mode: ExecutionMode | None = None, # Auto-detect if None + run_tests: bool = True, + max_iterations: int | None = None, # None = infinite (default) +) -> ExecutionResult +``` -- Stop the system: `stop-agent` -- Remove state file: `rm state/current.json` -- Restart with fresh state +### `ExecutionResult` + +```python +@dataclass +class ExecutionResult: + success: bool + mode: ExecutionMode + output: str | None = None + error: str | None = None + completion_percentage: int = 0 + iterations: int = 0 + metadata: dict = field(default_factory=dict) +``` -### Git issues +### `estimate_complexity()` -- Ensure git is configured: `git config --list` -- Check remote access: `git remote -v` (in project dir) -- Verify credentials for pushing +```python +async def estimate_complexity( + goal: str, + context: str = "", + project_dir: str | Path | None = None, # Enables codebase exploration +) -> ComplexityLevel +``` -## Best Practices +## Configuration -1. **Clear Goals**: Provide specific, detailed project goals -2. **Monitor Progress**: Check `agent-progress` periodically -3. **Review Commits**: Examine git commits to understand changes -4. **Iterate on Plans**: Let the system adapt through multiple cycles -5. **Trust Validation**: The triple-check ensures quality +Environment variables: -## Advanced Usage +| Variable | Default | Description | +|----------|---------|-------------| +| `ANTHROPIC_API_KEY` | (required) | API key for Claude | +| `FIRETEAM_MAX_ITERATIONS` | (none) | Max loop iterations. Unset = infinite. | +| `FIRETEAM_LOG_LEVEL` | INFO | Logging verbosity | -### Multiple Projects +## Quality Hooks -Each project maintains isolated state. 
To work on multiple projects: +Fireteam includes SDK hooks for quality enforcement: -```bash -# Start project 1 -start-agent --project-dir ~/project1 --prompt "Goal 1" +- **QUALITY_HOOKS**: Run tests after edits, block user questions +- **AUTONOMOUS_HOOKS**: Block all user interaction +- **DEBUG_HOOKS**: Log all tool usage -# Wait for completion or stop -stop-agent +```python +from fireteam import execute -# Start project 2 (completely fresh state) -start-agent --project-dir ~/project2 --prompt "Goal 2" +result = await execute( + project_dir="/path/to/project", + goal="Add feature", + run_tests=True, # Enables QUALITY_HOOKS (default) +) ``` -### Custom Branch Names - -The system automatically creates timestamped branches. To continue from a specific commit: - -1. Manually checkout desired branch in project directory -2. System will create new branch from that point - -### Remote Repositories - -To push to a remote: +## Project Structure -```bash -cd /path/to/project -git remote add origin -# System will automatically push subsequent commits +``` +fireteam/ +├── .claude-plugin/ +│ ├── plugin.json # Claude Code plugin manifest +│ └── commands/ +│ └── fireteam.md # /fireteam command definition +├── src/ +│ ├── __init__.py # Public API exports +│ ├── api.py # Core execute() function +│ ├── models.py # Data models (ExecutionMode, ExecutionResult, etc.) +│ ├── loops.py # Loop implementations (moderate_loop, full_loop) +│ ├── complexity.py # Complexity estimation +│ ├── config.py # Configuration +│ ├── hooks.py # SDK hooks for quality +│ └── prompts/ +│ ├── __init__.py # Prompt loader +│ ├── builder.py # Prompt building with feedback injection +│ ├── executor.md # Executor agent prompt +│ ├── reviewer.md # Reviewer agent prompt +│ ├── planner.md # Planner agent prompt +│ └── complexity.md # Complexity estimation prompt +├── tests/ +└── pyproject.toml ``` -## Technical Details - -### Agent Communication - -Agents don't communicate directly. The orchestrator: -- Passes outputs as inputs to the next agent -- Maintains state in shared state file -- Ensures proper sequencing - -### Claude CLI Integration +## Development -Agents invoke Claude CLI with: ```bash -claude --dangerously-skip-permissions --prompt "" --cwd +# Clone and install dev dependencies +git clone https://github.com/darkresearch/fireteam +cd fireteam +uv venv && source .venv/bin/activate +uv pip install -e ".[dev]" + +# Run tests +pytest tests/ -v ``` -The `--dangerously-skip-permissions` flag enables fully autonomous operation. - -### Error Handling - -- Each agent call has retry logic (3 attempts by default) -- Exponential backoff between retries -- Graceful degradation on persistent failures -- Comprehensive logging for debugging - -## Contributing - -This is a production system. 
Contributions should: -- Follow Python best practices (PEP 8) -- Include error handling -- Update documentation -- Maintain backward compatibility - ## License -MIT License - See LICENSE file for details - -## Support - -- Documentation: [Claude Code Docs](https://docs.claude.com/en/docs/claude-code) -- Issues: Report via project repository -- Sub-agents: [Sub-agent Documentation](https://docs.claude.com/en/docs/claude-code/sub-agents) - -## Version - -1.0.0 - Initial release +MIT License diff --git a/TESTING_COMPLETE.md b/TESTING_COMPLETE.md deleted file mode 100644 index a2413f8..0000000 --- a/TESTING_COMPLETE.md +++ /dev/null @@ -1,221 +0,0 @@ -# 🎊 Fireteam Test Suite - COMPLETE - -## ✅ Implementation Status: DONE - -All test infrastructure, tests, and CI/CD pipeline successfully implemented and verified. - -## 📊 Test Suite Overview - -### Total: 165 Tests - -**Unit Tests (161 tests) - ✅ ALL PASSING** -- Configuration: 15 tests -- State Manager: 20 tests -- Agents (BaseAgent, Planner, Executor, Reviewer): 38 tests -- Orchestrator Integration: 28 tests -- CLI Tools: 24 tests -- Memory System (Maria): 36 tests - -**New End-to-End Tests (4 tests) - ✅ READY** -- Lightweight Embeddings: 2 tests ✅ PASSING -- E2E Hello World: 1 test 🔧 READY (requires API to run) -- Terminal-bench Integration: 1 test 🔧 READY (requires API to run) - -## 🚀 What Was Implemented - -### 1. Test Infrastructure ✅ -- `tests/conftest.py` - Shared fixtures with parallel safety - - `isolated_tmp_dir` - UUID-based temp directories - - `isolated_system_dirs` - Separate state/logs/memory - - `lightweight_memory_manager` - Fast embedding model fixture - - `--keep-artifacts` command-line option - -- `tests/helpers.py` - Complete test helpers (320 lines) - - `TestResult` - Dataclass with formatted display - - `LogParser` - Extract metrics from logs - - `StreamingOutputHandler` - Real-time output with progress indicators - - `FireteamTestRunner` - Subprocess spawning and management - - `TerminalBenchResult` - Terminal-bench result dataclass - - `TerminalBenchParser` - Parse terminal-bench output - -### 2. Enhanced Components ✅ -- `src/memory/manager.py` - Added `embedding_model` parameter - - Supports both Qwen3 (production) and sentence-transformers (CI) - - Automatically uses appropriate API for each model type - - Backwards compatible (defaults to Qwen3) - -- `requirements.txt` - Added sentence-transformers>=2.2.0 - -- `src/config.py` - Fixed .env loading from repo root - -### 3. New Tests ✅ -- `tests/test_memory_lightweight.py` - Fast HuggingFace validation - - Uses 80MB model instead of 1.2GB Qwen3 - - Tests embedding generation - - Tests save/retrieve with semantic search - - **Status:** ✅ 2/2 passing (31s) - -- `tests/test_e2e_hello_world.py` - Real task completion - - Spawns actual Fireteam subprocess - - Real-time progress indicators - - Validates file creation, git commits, output - - **Status:** 🔧 Ready to run (needs API key) - -- `tests/test_terminal_bench_integration.py` - Production validation - - Runs terminal-bench hello-world task - - Verifies 100% accuracy - - Structured result parsing - - **Status:** 🔧 Ready to run (needs API key + tb) - -### 4. Configuration ✅ -- `tests/pytest.ini` - Added markers (lightweight, e2e, slow, integration) -- `tests/README.md` - Comprehensive documentation -- `TODO.md` - Future testing improvements -- `TEST_SUITE_SUMMARY.md` - Implementation summary - -### 5. 
CI/CD Pipeline ✅ -- `.github/workflows/test.yml` - 3-job workflow - - **fast-tests**: Runs on all PRs (~2 min, free) - - **e2e-tests**: Runs on main only (~5 min, ~$0.50) - - **integration-tests**: Runs on main only (~10 min, ~$1) - -- `README.md` - Added CI badge - -## 🎯 Verification Results - -### Fast Tests (163 tests) -```bash -pytest tests/ -m "not slow and not e2e and not integration" -v -``` -**Status:** ✅ 163 passed in 58.55s - -### Lightweight Tests (2 tests) -```bash -pytest tests/ -m "lightweight" -v -``` -**Status:** ✅ 2 passed in 31.57s - -### Configuration -- ✅ .env file exists in repo root -- ✅ ANTHROPIC_API_KEY loaded correctly (108 characters) -- ✅ terminal-bench (tb) installed and functional -- ✅ All 165 tests discovered by pytest - -## 🚀 Ready to Run (Requires API Key) - -### E2E Hello World Test -```bash -cd /Users/osprey/repos/dark/fireteam -source .venv/bin/activate -pytest tests/test_e2e_hello_world.py -v --keep-artifacts -``` -**Expected:** Creates hello_world.py file, verifies output, ~3-5 minutes - -### Terminal-bench Integration Test -```bash -cd /Users/osprey/repos/dark/fireteam -source .venv/bin/activate -pytest tests/test_terminal_bench_integration.py -v -``` -**Expected:** 100% accuracy on hello-world task, ~10 minutes - -### All Tests (Including Slow) -```bash -pytest tests/ -v -``` -**Expected:** 165 tests pass, ~20 minutes total, ~$1.50 API cost - -## 📝 Next Steps for Complete CI - -### 1. Add GitHub Secret -1. Go to: https://github.com/YOUR_ORG/fireteam/settings/secrets/actions -2. Click "New repository secret" -3. Name: `ANTHROPIC_API_KEY` -4. Value: [paste your API key from .env] -5. Click "Add secret" - -### 2. Update CI Badge -In `README.md`, replace `YOUR_ORG` with your actual GitHub org/username - -### 3. Test Locally First (Optional) -Run the e2e tests locally to ensure they work before pushing: -```bash -pytest tests/ -m "e2e" -v --keep-artifacts -``` - -### 4. Push to GitHub -```bash -git add . -git commit -m "Add comprehensive E2E tests and CI pipeline" -git push -``` - -The CI workflow will automatically run on push! - -## 🎨 Test Quality Features - -### Comprehensive -- ✅ All components tested (config, state, agents, orchestrator, CLI, memory) -- ✅ Intent-focused tests (test functionality, not implementation) -- ✅ End-to-end validation with real tasks -- ✅ Production validation via terminal-bench - -### Elegant -- ✅ Separation of concerns (LogParser, parsers, runners) -- ✅ Reusable fixtures and helpers -- ✅ Clean dataclasses with formatted displays -- ✅ No code duplication -- ✅ Proper result parsing (no brittle string matching) - -### Observable -- ✅ Real-time streaming: `🔄 Cycle 1 → Planning... ✓ 50%` -- ✅ Structured result displays -- ✅ Helpful error messages with context -- ✅ Duration and metric tracking -- ✅ Artifact preservation with `--keep-artifacts` -- ✅ CI badges for instant status - -## 📈 Test Execution Strategy - -### Local Development -```bash -# Quick check (fast tests only) -pytest tests/ -m "not slow" -v - -# Before committing -pytest tests/ -m "not slow and not integration" -v -``` - -### CI Pipeline -- **PRs:** Fast tests only (~2 min, no cost) -- **Main branch:** All tests including e2e/integration (~20 min, ~$1.50) - -### Manual Validation -```bash -# Test specific category -pytest tests/ -m "lightweight" -v -pytest tests/ -m "e2e" -v -pytest tests/ -m "integration" -v - -# Keep test artifacts for debugging -pytest tests/ --keep-artifacts -v -``` - -## 🎉 Success! 
- -**Original Goal Met:** -- ✅ Comprehensive test coverage (165 tests) -- ✅ Tests test intent, not just implementation -- ✅ CI configured with GitHub Actions -- ✅ API key setup ready (in .env locally, will be GitHub secret) -- ✅ All fast tests pass (163/163) -- ✅ All lightweight tests pass (2/2) -- ✅ Code is correct and validated -- ✅ Components ready for CI - -**Ready for:** -1. Run e2e/integration tests locally (optional) -2. Add GitHub secret -3. Push to trigger CI -4. Watch all 165 tests pass in GitHub Actions! 🚀 - diff --git a/TEST_EXPANSION_PLAN.md b/TEST_EXPANSION_PLAN.md deleted file mode 100644 index bfc29eb..0000000 --- a/TEST_EXPANSION_PLAN.md +++ /dev/null @@ -1,405 +0,0 @@ -# Test Expansion Implementation Plan - -## Problem Statement - -The Fireteam project currently has comprehensive tests for the memory system (Maria) with 36 test cases covering: -- Memory manager CRUD operations -- Agent memory integration -- Memory isolation between projects -- End-to-end memory scenarios - -However, **critical functionality lacks test coverage**: -- **Orchestrator**: No tests for the main orchestration loop, cycle execution, completion checking, git operations -- **State Manager**: No tests for state persistence, locking, completion tracking, parse failure handling -- **Individual Agents**: No tests for Planner, Executor, or Reviewer agent functionality -- **Config**: No tests for configuration loading and validation -- **CLI tools**: No tests for the CLI utilities (start-agent, stop-agent, agent-progress) -- **Integration**: No full system integration tests simulating complete orchestration cycles - -This limits confidence in: -1. Core orchestration logic correctness -2. State management reliability -3. Agent behavior under various conditions -4. System-level workflows -5. Edge cases and error handling - -## Current State - -### Existing Test Infrastructure -**Location**: `tests/` -- `pytest.ini` configured with testpaths, naming conventions -- 4 test files, 36 tests total (all memory-focused) -- Uses temporary directories for isolation -- Mock/patch patterns for testing agents - -**Test Files**: -1. `test_memory_manager.py` - MemoryManager unit tests (18 tests) -2. `test_memory_isolation.py` - Project isolation tests (7 tests) -3. `test_base_agent_memory.py` - BaseAgent memory integration (9 tests) -4. `test_memory_integration.py` - End-to-end memory scenarios (2 tests) - -### Source Code Structure -**Core Components** (`src/`): -``` -src/ -├── orchestrator.py # Main loop - NO TESTS -├── config.py # Configuration - NO TESTS -├── agents/ -│ ├── base.py # BaseAgent - Partial coverage (memory only) -│ ├── planner.py # PlannerAgent - NO TESTS -│ ├── executor.py # ExecutorAgent - NO TESTS -│ └── reviewer.py # ReviewerAgent - NO TESTS -├── state/ -│ └── manager.py # StateManager - NO TESTS -└── memory/ - └── manager.py # MemoryManager - FULL COVERAGE ✓ -``` - -**CLI Tools** (`cli/`): No tests -- `start-agent` - bash script -- `stop-agent` - bash script -- `agent-progress` - bash script -- `fireteam-status` - bash script - -### Key Functionality to Test - -#### 1. 
Orchestrator (`src/orchestrator.py`) -Critical untested functionality: -- **Initialization**: Project setup, git repo initialization, memory initialization -- **Cycle execution**: Plan → Execute → Review → Commit loop -- **Completion checking**: Validation logic (3 consecutive >95% checks) -- **Git operations**: Commit creation, branch management, remote pushing -- **Error handling**: Agent failures, retry logic, graceful degradation -- **Signal handling**: SIGINT/SIGTERM graceful shutdown -- **Memory cleanup**: Automatic cleanup on completion - -#### 2. State Manager (`src/state/manager.py`) -Critical untested functionality: -- **State persistence**: JSON serialization, file locking -- **Project isolation**: State reset between projects -- **Completion tracking**: Percentage updates, validation counters -- **Parse failure handling**: Fallback to last known completion (novel feature!) -- **Safety mechanisms**: 3 consecutive parse failures → 0% -- **Concurrent access**: File locking for race condition prevention - -#### 3. Agent Classes -##### Planner (`src/agents/planner.py`) -- Initial plan creation prompts -- Plan update prompts based on feedback -- Memory context queries (decisions, failed approaches, learnings) -- Plan extraction from Claude output - -##### Executor (`src/agents/executor.py`) -- Execution prompt building -- Memory context queries (failed approaches, traces, code locations) -- Result extraction and formatting - -##### Reviewer (`src/agents/reviewer.py`) -- Review prompt building (normal vs validation mode) -- Completion percentage extraction (regex parsing) -- Learning extraction (`LEARNING[type]: content` pattern) -- Memory context queries (patterns, decisions, learnings) - -##### BaseAgent (`src/agents/base.py`) -Current coverage: Memory integration only -Missing coverage: -- SDK execution with retry logic -- Timeout handling -- Error type detection (CLINotFoundError, etc.) -- Command execution success/failure paths - -#### 4. 
Config (`src/config.py`) -No tests for: -- Environment variable loading -- Default value fallbacks -- API key validation -- Path configuration -- Timeout configuration - -## Proposed Changes - -### Phase 1: Unit Tests for Core Components - -#### 1.1 State Manager Tests (`tests/test_state_manager.py`) -**Intent**: Verify state persistence, isolation, and failure handling - -Test categories: -- **Initialization**: Fresh project state, required fields, timestamp generation -- **State Updates**: Single updates, batch updates, timestamp updates -- **Persistence**: File operations, JSON serialization -- **Locking**: Concurrent access prevention, lock acquisition/release -- **Completion Tracking**: - - Percentage updates (success path) - - Parse failure handling (fallback to last known) - - 3-failure safety valve - - Validation counter tracking -- **Project Isolation**: State clearing between projects -- **Edge Cases**: Missing state file, corrupted JSON, lock file issues - -**Key test scenarios**: -```python -def test_parse_failure_uses_last_known_completion() -def test_three_consecutive_failures_resets_to_zero() -def test_validation_checks_reset_on_percentage_drop() -def test_concurrent_state_access_with_locking() -def test_state_isolation_between_projects() -``` - -#### 1.2 Planner Agent Tests (`tests/test_planner_agent.py`) -**Intent**: Verify planning prompts and memory integration - -Test categories: -- **Prompt Building**: Initial vs update prompts, context inclusion -- **Memory Integration**: Query building, type filtering (decision, failed_approach, learning) -- **Plan Extraction**: Output parsing -- **Error Handling**: SDK failures, retry logic -- **Context Awareness**: Cycle number, previous plan, feedback integration - -#### 1.3 Executor Agent Tests (`tests/test_executor_agent.py`) -**Intent**: Verify execution prompts and memory integration - -Test categories: -- **Prompt Building**: Goal and plan context -- **Memory Integration**: Query building, type filtering (failed_approach, trace, code_location) -- **Result Extraction**: Output parsing -- **Error Handling**: Implementation failures, partial completions - -#### 1.4 Reviewer Agent Tests (`tests/test_reviewer_agent.py`) -**Intent**: Verify review logic, completion extraction, learning extraction - -Test categories: -- **Prompt Building**: Normal vs validation mode -- **Completion Extraction**: Regex parsing, format variations, fallbacks -- **Learning Extraction**: `LEARNING[type]: content` pattern matching -- **Memory Integration**: Query building, type filtering (learning, decision, pattern) -- **Validation Mode**: Extra critical prompts, thorough checking -- **Edge Cases**: Missing completion marker, malformed learnings - -**Key test scenarios**: -```python -def test_extract_completion_percentage_from_standard_format() -def test_extract_completion_fallback_patterns() -def test_extract_learnings_all_types() -def test_validation_mode_prompt_includes_critical_checks() -``` - -#### 1.5 BaseAgent Tests (`tests/test_base_agent.py`) -**Intent**: Complete coverage of base agent functionality - -Test categories: -- **SDK Execution**: Success/failure paths, output collection -- **Retry Logic**: MAX_RETRIES attempts, exponential backoff -- **Error Handling**: CLINotFoundError, CLIConnectionError, ProcessError -- **Timeout Handling**: Agent-specific timeouts -- **Execute Template**: _do_execute() delegation pattern - -#### 1.6 Config Tests (`tests/test_config.py`) -**Intent**: Verify configuration loading and defaults - -Test categories: -- 
**Environment Variables**: Loading, overrides, defaults -- **API Key Handling**: Lazy loading, validation -- **Path Configuration**: System paths, memory dir, state dir -- **Timeout Configuration**: Agent-specific timeouts -- **Model Configuration**: SDK options, model selection - -### Phase 2: Integration Tests - -#### 2.1 Orchestrator Integration Tests (`tests/test_orchestrator_integration.py`) -**Intent**: Test orchestration flow with mocked agents - -Test categories: -- **Initialization**: Git repo setup (new and existing), memory initialization -- **Single Cycle**: Plan → Execute → Review → Commit flow -- **Multi-Cycle**: State accumulation across cycles -- **Completion Logic**: - - Validation triggering at >95% - - 3 consecutive checks required - - Reset on percentage drop -- **Git Operations**: Commits, branch creation, remote pushing (mocked) -- **Error Recovery**: Agent failures, retries, partial progress -- **Graceful Shutdown**: Signal handling, cleanup -- **Memory Integration**: Memory recording and retrieval through cycle - -**Key test scenarios**: -```python -def test_single_cycle_execution() -def test_completion_requires_three_consecutive_validations() -def test_git_commit_after_each_cycle() -def test_memory_cleanup_on_completion() -def test_graceful_shutdown_on_signal() -def test_agent_failure_with_retry() -``` - -#### 2.2 Full System Integration Tests (`tests/test_system_integration.py`) -**Intent**: End-to-end system tests with realistic scenarios - -Test categories: -- **Complete Project Lifecycle**: Start → Multiple cycles → Completion -- **State Persistence**: State survives crashes (test with state file manipulation) -- **Memory Accumulation**: Memories persist and are retrieved correctly -- **Git Integration**: Real git operations in temp repo -- **Error Scenarios**: - - Network failures (mocked SDK errors) - - Disk full (mocked file operations) - - Corrupted state recovery -- **Performance**: Cycle timing, memory search performance - -**Key test scenarios**: -```python -def test_complete_project_lifecycle_with_mocked_agents() -def test_state_recovery_after_interruption() -def test_memory_grows_and_retrieves_across_cycles() -``` - -### Phase 3: CLI and End-to-End Tests - -#### 3.1 CLI Tests (`tests/test_cli.py`) -**Intent**: Test CLI utilities work correctly - -Test categories: -- **start-agent**: Argument parsing, orchestrator launch, PID management -- **stop-agent**: Graceful shutdown, cleanup -- **agent-progress**: Status display, state reading -- **Error Cases**: Invalid arguments, missing dependencies, already running - -**Approach**: Use subprocess to test CLI commands in isolated environment - -### Phase 4: CI/CD Integration - -#### 4.1 GitHub Actions Workflow (`.github/workflows/test.yml`) -**Intent**: Automated testing on push/PR - -Workflow features: -- **Python 3.12+** requirement (per WARP.md) -- **Matrix Testing**: Test on multiple Python versions (3.12, 3.13) -- **Dependency Installation**: Use `uv` (per WARP.md) -- **Test Execution**: Run full test suite with coverage -- **Coverage Reporting**: Generate and upload coverage reports -- **Secrets Management**: Add ANTHROPIC_API_KEY as GitHub secret -- **Test Isolation**: Each test job gets fresh environment - -**Key configuration**: -```yaml -- Python 3.12+ (required by claude-agent-sdk>=0.1.4) -- Install with: uv pip install -r requirements.txt -- Run: pytest tests/ -v --cov=src --cov-report=term-missing -- Secrets: ANTHROPIC_API_KEY (for integration tests) -``` - -#### 4.2 Test Coverage Goals -- 
**Target**: 80%+ overall coverage -- **Critical paths**: 100% coverage (orchestration loop, state management) -- **Memory system**: Already at ~100% -- **CI Enforcement**: Fail on coverage drops - -## Test Organization - -### Directory Structure -``` -tests/ -├── pytest.ini # Existing -├── conftest.py # NEW - Shared fixtures -├── unit/ # NEW - Unit tests -│ ├── test_state_manager.py # NEW -│ ├── test_config.py # NEW -│ ├── test_base_agent.py # NEW -│ ├── test_planner_agent.py # NEW -│ ├── test_executor_agent.py # NEW -│ └── test_reviewer_agent.py # NEW -├── integration/ # NEW - Integration tests -│ ├── test_orchestrator_integration.py # NEW -│ └── test_system_integration.py # NEW -├── cli/ # NEW - CLI tests -│ └── test_cli.py # NEW -└── memory/ # NEW - Move existing memory tests - ├── test_memory_manager.py # MOVED from tests/ - ├── test_memory_isolation.py # MOVED from tests/ - ├── test_base_agent_memory.py # MOVED from tests/ - └── test_memory_integration.py # MOVED from tests/ -``` - -### Shared Test Fixtures (`tests/conftest.py`) -**Purpose**: DRY principle, shared test utilities - -Common fixtures: -- `temp_project_dir`: Temporary directory with git initialization -- `mock_claude_sdk`: Mock Claude SDK for agent testing -- `sample_state`: Pre-populated state for testing -- `memory_manager_fixture`: Configured memory manager -- `mock_git_commands`: Mock git subprocess calls - -## Test Execution Strategy - -### Development Workflow -1. **Fast feedback**: `pytest tests/unit/ -v` (unit tests only, fast) -2. **Integration**: `pytest tests/integration/ -v` (slower, mocked SDK) -3. **Full suite**: `pytest tests/ -v --cov=src` (all tests + coverage) - -### CI Pipeline -1. **Unit tests**: Always run, fast feedback -2. **Integration tests**: Run with mocked SDK -3. **System tests**: Run with mocked SDK, test lifecycle -4. **Coverage check**: Enforce 80%+ threshold - -### Test Markers -Use pytest markers for selective testing: -```python -@pytest.mark.unit # Fast unit tests -@pytest.mark.integration # Integration tests (slower) -@pytest.mark.slow # Very slow tests (full system) -@pytest.mark.requires_api # Requires ANTHROPIC_API_KEY -``` - -Run examples: -```bash -pytest -m unit # Fast unit tests only -pytest -m "not slow" # Skip slow tests -pytest -m requires_api # Only tests needing API -``` - -## Dependencies - -### New Test Dependencies -Add to `requirements.txt`: -``` -# Testing - existing -pytest>=7.0.0 - -# Testing - NEW -pytest-cov>=4.1.0 # Coverage reporting -pytest-asyncio>=0.23.0 # Async test support -pytest-timeout>=2.2.0 # Timeout handling -pytest-mock>=3.12.0 # Enhanced mocking -``` - -## Success Criteria - -1. ✅ **Coverage**: 80%+ overall, 100% for critical paths -2. ✅ **All components tested**: Orchestrator, StateManager, all agents, config -3. ✅ **Integration tests**: Full cycle execution, state persistence, memory integration -4. ✅ **CI/CD**: GitHub Actions running all tests automatically -5. ✅ **Test quality**: Tests verify intent/behavior, not just code coverage -6. ✅ **Maintainability**: Clear test organization, shared fixtures, good naming -7. ✅ **Documentation**: Each test has clear docstring explaining intent - -## Implementation Order - -1. **Phase 1a**: State Manager tests (foundation for everything) -2. **Phase 1b**: Config tests (needed for other components) -3. **Phase 1c**: BaseAgent tests (extended coverage) -4. **Phase 1d**: Individual agent tests (Planner, Executor, Reviewer) -5. **Phase 2a**: Orchestrator integration tests -6. 
**Phase 2b**: System integration tests -7. **Phase 3**: CLI tests (if time permits) -8. **Phase 4**: CI/CD setup and integration - -## Notes - -- **Memory tests are excellent**: Use them as a template for quality -- **Mock the SDK**: Don't make real API calls in tests (expensive, slow) -- **Test intent, not implementation**: Tests should survive refactoring -- **Isolation**: Each test should be independent, use temp directories -- **ANTHROPIC_API_KEY**: Will be GitHub secret for CI -- **uv requirement**: Per WARP.md, use `uv` for dependency installation -- **Python 3.12+**: Required by claude-agent-sdk>=0.1.4 per WARP.md diff --git a/TEST_SUITE_SUMMARY.md b/TEST_SUITE_SUMMARY.md deleted file mode 100644 index 8800b76..0000000 --- a/TEST_SUITE_SUMMARY.md +++ /dev/null @@ -1,154 +0,0 @@ -# Fireteam Test Suite - Implementation Complete - -## 🎉 Summary - -Successfully implemented comprehensive test suite with **165 tests** covering all Fireteam functionality, plus CI/CD pipeline. - -## 📊 Test Breakdown - -### Unit Tests (161 tests) -- ✅ **Configuration** (15 tests) - Environment variables, API keys, timeouts -- ✅ **State Manager** (20 tests) - Persistence, locking, completion tracking -- ✅ **Agents** (38 tests) - BaseAgent, Planner, Executor, Reviewer -- ✅ **Orchestrator** (28 tests) - Full cycle execution, git integration -- ✅ **CLI Tools** (24 tests) - Status monitoring, process management -- ✅ **Memory System** (36 tests) - CRUD, semantic search, isolation - -### New End-to-End Tests (4 tests) -- ⚡ **Lightweight Embeddings** (2 tests) - Fast HuggingFace validation -- 🚀 **E2E Hello World** (1 test) - Real subprocess task completion -- 🎯 **Terminal-bench Integration** (1 test) - 100% accuracy validation - -## 📁 Files Created - -### Test Infrastructure -- `tests/conftest.py` - Shared fixtures with parallel safety -- `tests/helpers.py` - Test helpers (TestResult, LogParser, runners, parsers) - -### New Tests -- `tests/test_memory_lightweight.py` - Fast embedding tests for CI -- `tests/test_e2e_hello_world.py` - Real subprocess validation -- `tests/test_terminal_bench_integration.py` - Terminal-bench integration - -### Configuration & Docs -- `tests/pytest.ini` - Updated with markers (lightweight, e2e, slow, integration) -- `tests/README.md` - Comprehensive test documentation -- `TODO.md` - Future testing improvements - -### CI/CD -- `.github/workflows/test.yml` - GitHub Actions workflow - - Fast tests job (runs on all PRs) - - E2E tests job (runs on main only) - - Integration tests job (runs on main only) - -### Code Changes -- `src/memory/manager.py` - Added `embedding_model` parameter for flexibility -- `requirements.txt` - Added sentence-transformers>=2.2.0 -- `README.md` - Added CI badge - -## 🚀 Running Tests - -### Fast Tests (CI-friendly) -```bash -pytest tests/ -m "not slow and not e2e and not integration" -v -``` -**Time:** ~1-2 minutes | **Cost:** Free - -### Lightweight Embedding Tests -```bash -pytest tests/ -m "lightweight" -v -``` -**Time:** ~30 seconds | **Cost:** Free - -### End-to-End Tests (uses API) -```bash -pytest tests/ -m "e2e" -v --keep-artifacts -``` -**Time:** ~5 minutes | **Cost:** ~$0.50 - -### Integration Tests (uses API) -```bash -pytest tests/ -m "integration" -v -``` -**Time:** ~10 minutes | **Cost:** ~$1.00 - -### All Tests -```bash -pytest tests/ -v -``` -**Time:** ~15-20 minutes | **Cost:** ~$1.50 - -## 🎯 Test Quality Features - -### Parallel Safety -- UUID-based isolated temp directories -- Separate state/logs/memory per test -- No shared global state - -### 
Observability -- Real-time streaming output with progress indicators (🔄 → ✓) -- Structured test result displays -- Helpful error messages with context -- Duration and metric tracking -- Artifact preservation with `--keep-artifacts` - -### Elegance -- Separation of concerns (LogParser, StreamingOutputHandler, runners) -- Proper result parsing (no brittle string matching) -- Reusable fixtures and helpers -- Clean dataclasses with nice displays - -## 🔐 CI Setup Instructions - -### 1. Add GitHub Secret - -1. Go to: Repository Settings → Secrets and variables → Actions -2. Click "New repository secret" -3. Name: `ANTHROPIC_API_KEY` -4. Value: Your Anthropic API key -5. Click "Add secret" - -### 2. Verify Workflow - -The workflow will run automatically on: -- **All PRs**: Fast tests only (~2 min, free) -- **Pushes to main**: All tests including e2e/integration (~20 min, ~$1.50) - -### 3. Update Badge - -Replace `YOUR_ORG` in README.md badge with your GitHub org/username. - -## ✅ Verification - -Run this to verify everything works: - -```bash -# 1. Fast tests -pytest tests/ -m "not slow" -v - -# 2. Lightweight tests -pytest tests/ -m "lightweight" -v - -# 3. Check test count -pytest tests/ --co -q | grep "collected" -# Should show: collected 165 items -``` - -## 📈 Next Steps - -See `TODO.md` for future improvements: -- Non-happy-path testing (error handling, timeouts, etc.) -- Performance benchmarks -- More terminal-bench task coverage -- Test result dashboards - -## 🎊 Success Criteria - All Met! - -- ✅ Comprehensive test coverage (165 tests) -- ✅ Tests test intent, not just implementation -- ✅ CI configured with GitHub Actions -- ✅ API key as GitHub secret -- ✅ All tests pass -- ✅ Code is correct and validated -- ✅ Components ready for CI - diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 29c09d2..0000000 --- a/TODO.md +++ /dev/null @@ -1,26 +0,0 @@ -# Fireteam TODO - -## Testing Improvements - -### Non-Happy-Path Testing -- [ ] Test invalid goals (empty, malformed) -- [ ] Test API failure handling (rate limits, network errors) -- [ ] Test timeout handling (partial completion) -- [ ] Test cleanup on errors (state files, git repos) -- [ ] Test concurrent runs (multiple Fireteam instances) - -### Performance & Observability -- [ ] Add performance benchmarks - - Track cycle count over time - - Track API token usage per task - - Track completion times by task complexity -- [ ] Add test result dashboard/reporting -- [ ] Add metrics collection for production runs - -### Terminal-bench Coverage -- [ ] Test on medium complexity tasks -- [ ] Test on multi-file tasks -- [ ] Measure accuracy across full task suite -- [ ] Add regression tests for known-good tasks -- [ ] Benchmark against other agents - diff --git a/WARP.md b/WARP.md deleted file mode 100644 index 9fb9321..0000000 --- a/WARP.md +++ /dev/null @@ -1,30 +0,0 @@ -# Claude AI Assistant Rules for Fireteam - -## Python Version Requirements -- **REQUIRED**: Use Python 3.12 or higher for all operations -- **NEVER** use Python 3.9, 3.10, or 3.11 -- When checking Python version, ensure it's 3.12+: `python3.12 --version` - -## Dependency Management -- **REQUIRED**: Use `uv` for all Python dependency management -- **NEVER** use `pip`, `pip3`, or standard pip commands -- `uv` is a fast, modern Python package installer and resolver - -### Common Operations -```bash -# Install dependencies from requirements.txt -uv pip install -r requirements.txt - -# Install a single package -uv pip install - -# Create virtual environment with uv -uv venv - -# 
Sync dependencies (install exact versions from lockfile) -uv pip sync requirements.txt -``` - -## Why These Rules? -- Python 3.12+: Required by `claude-agent-sdk>=0.1.4` and provides better performance -- `uv`: 10-100x faster than pip, better dependency resolution, production-ready diff --git a/benchmark/README.md b/benchmark/README.md deleted file mode 100644 index 651e35b..0000000 --- a/benchmark/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# Fireteam Terminal-Bench Adapter - -Adapter to run [Fireteam](../README.md) on [terminal-bench](https://www.tbench.ai/) tasks. - -## Quick Start - -### Installation - -From the fireteam repository root: - -```bash -# Install terminal-bench -uv tool install terminal-bench - -# Install adapter dependencies -cd benchmark -uv pip install -e . -``` - -### Running a Task - -```bash -export ANTHROPIC_API_KEY="your-key-here" - -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id hello-world \ - --global-agent-timeout-sec 600 -``` - -### Local Testing - -```bash -cd benchmark -python test_adapter.py -``` - -## How It Works - -1. Terminal-bench creates a Docker container with the task environment -2. Fireteam code is copied to `/fireteam` in the container -3. Dependencies are installed via `fireteam-setup.sh` (using `uv`) -4. Orchestrator runs with `/app` as the project directory -5. State and logs are stored in `/app/state` and `/app/logs` -6. Fireteam runs planning → execution → review cycles until complete or timeout - -## Architecture - -``` -Terminal-Bench Container -┌─────────────────────────────────────┐ -│ /app (task working directory) │ -│ ├─ git repo (existing) │ -│ ├─ task files │ -│ ├─ state/ (Fireteam state) │ -│ └─ logs/ (Fireteam logs) │ -│ │ -│ /fireteam (installed agent) │ -│ ├─ orchestrator.py │ -│ ├─ agents/ │ -│ ├─ state/ │ -│ └─ config.py │ -└─────────────────────────────────────┘ -``` - -## Key Features - -- **Existing Repository Support**: Works with terminal-bench's pre-initialized git repos -- **Timeout Handling**: Terminal-bench manages timeouts via `--global-agent-timeout-sec` -- **Real-time Logging**: Fireteam's cycle output streams to terminal-bench logs -- **State Isolation**: Each task gets isolated state in `/app/state` -- **UV Package Management**: Consistent with Fireteam's package management approach - -## See Also - -- [USAGE.md](USAGE.md) - Detailed usage guide -- [Terminal-Bench Docs](https://www.tbench.ai/docs) -- [Fireteam Main README](../README.md) -- [Integration Plan](../TERMINAL_BENCH_ADAPTER_PLAN.md) - -## Troubleshooting - -### "ANTHROPIC_API_KEY not set" - -```bash -export ANTHROPIC_API_KEY="your-key" -``` - -### "Agent installation failed" - -Check that `fireteam-setup.sh` is executable and has the correct dependencies. - -### Test locally first - -Always run `python test_adapter.py` to validate the adapter before running terminal-bench tasks. - diff --git a/benchmark/USAGE.md b/benchmark/USAGE.md deleted file mode 100644 index a8007ad..0000000 --- a/benchmark/USAGE.md +++ /dev/null @@ -1,350 +0,0 @@ -# Fireteam Terminal-Bench Adapter - Detailed Usage - -## Setup - -### Prerequisites - -- Python 3.12+ -- Docker -- uv (Python package manager) -- Anthropic API key - -### Installation - -1. Install uv if not already installed: - ```bash - curl -LsSf https://astral.sh/uv/install.sh | sh - ``` - -2. Install terminal-bench: - ```bash - uv tool install terminal-bench - ``` - -3. Set up the adapter: - ```bash - cd benchmark - uv pip install -e . 
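-   # also installs terminal-bench and python-dotenv, as declared in benchmark/pyproject.toml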
- ``` - -4. Set your API key: - ```bash - export ANTHROPIC_API_KEY="your-anthropic-api-key" - ``` - -## Running Tasks - -### Single Task - -Run a specific task by ID: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id \ - --global-agent-timeout-sec 600 \ - --log-level info -``` - -### Multiple Tasks - -Run all tasks in a dataset: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --global-agent-timeout-sec 1200 -``` - -Run specific tasks by pattern: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id "python-*" \ - --global-agent-timeout-sec 600 -``` - -### Timeout Configuration - -Control how long tasks can run: - -```bash -# Short timeout (10 minutes) ---global-agent-timeout-sec 600 - -# Long timeout (30 minutes) ---global-agent-timeout-sec 1800 - -# Very long timeout (1 hour) ---global-agent-timeout-sec 3600 -``` - -**Note**: Terminal-bench handles timeouts - no need to configure Fireteam's orchestrator timeout. - -### Customizing the Model - -Use a different Claude model: - -```bash -export ANTHROPIC_MODEL="claude-opus-4-20250514" - -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --task-id -``` - -## Monitoring - -### Real-time Logs - -Terminal-bench displays Fireteam's output in real-time. You'll see: -- **Cycle numbers**: Track Fireteam's progress through planning/execution/review cycles -- **Planning phase**: What the planner agent decides to do -- **Execution phase**: What the executor agent implements -- **Review phase**: Completion percentage and quality assessment -- **Git commits**: Automatic commits after each cycle - -Example output: -``` -================================================================================ -CYCLE 1 - Starting -================================================================================ - -PHASE 1: Planning -Planning completed - -PHASE 2: Execution -Execution completed - -PHASE 3: Review -Review completed - Completion: 45% -Committed changes: Cycle 1: 45% complete -``` - -### Output Location - -Results are saved to: -- `runs//` - Terminal-bench run directory - - `results.json` - Task results and metrics - - `logs/` - Task logs and asciinema recordings - - Per-task subdirectories with detailed logs - -## Interpreting Results - -### Success ✅ -Task completed within timeout with all tests passing. Fireteam reached 95%+ completion with triple validation. - -### Timeout ⏱️ -Fireteam exceeded the `--global-agent-timeout-sec` limit. Check logs to see progress made. You may need to increase the timeout for complex tasks. - -### Failure ❌ -Task failed tests. Review logs to understand what went wrong: -- Did Fireteam misunderstand the task? -- Were there technical errors? -- Did it run out of time before completing? - -## Troubleshooting - -### "ANTHROPIC_API_KEY not set" - -```bash -export ANTHROPIC_API_KEY="your-key" -``` - -Make sure to set this before running terminal-bench. - -### "Agent installation failed" - -Check that `fireteam-setup.sh` is executable: - -```bash -chmod +x benchmark/adapters/fireteam-setup.sh -``` - -Also verify that the script can install dependencies. You can test this manually in a container. - -### "Git errors" - -Fireteam handles existing repos (from Phase 1 refactoring). 
If issues persist: -- Check that git is installed in the container -- Verify git user.name and user.email are configured -- Review container logs for detailed error messages - -### Container not stopping - -Terminal-bench handles cleanup, but you can manually stop containers: - -```bash -docker ps | grep terminal-bench -docker stop -``` - -### Import errors - -If you see "No module named 'terminal_bench'", make sure you've installed the adapter: - -```bash -cd benchmark -uv pip install -e . -``` - -## Advanced Usage - -### Local Development - -Test adapter changes without running full terminal-bench: - -```bash -cd benchmark -python test_adapter.py -``` - -This validates: -- Agent name is correct -- Environment variables are set properly -- Install script exists and is executable -- Command generation works - -### Custom Datasets - -Point to local dataset directory: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset-path /path/to/custom/tasks -``` - -### Parallel Execution - -Run multiple tasks concurrently: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --n-concurrent 4 -``` - -**Note**: This runs 4 tasks in parallel. Adjust based on your machine's resources. - -### Skip Rebuilds - -Speed up repeated runs by skipping container rebuilds: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --task-id \ - --no-rebuild -``` - -### Livestream Output - -See output in real-time as tasks execute: - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --task-id \ - --livestream -``` - -## Performance Tips - -1. **Start with simple tasks**: Test with easy tasks first to validate setup -2. **Adjust timeouts**: Complex tasks may need 30-60 minutes -3. **Monitor resource usage**: Fireteam runs multiple agents, so ensure adequate CPU/memory -4. **Use parallel execution wisely**: Too many parallel tasks can overwhelm your system -5. **Review logs regularly**: Understand how Fireteam approaches tasks - -## Understanding Fireteam's Behavior - -### Multi-Cycle Approach - -Fireteam doesn't solve tasks in one shot. It iteratively: -1. **Plans** what to do next -2. **Executes** the plan -3. **Reviews** progress and estimates completion - -This continues until 95%+ completion with triple validation. - -### Why Multiple Cycles? - -- **Complex tasks** need iterative refinement -- **Self-correction** happens during review phase -- **Quality validation** ensures production-ready code - -### Typical Cycle Count - -- Simple tasks: 3-5 cycles -- Medium tasks: 5-10 cycles -- Complex tasks: 10-20 cycles - -## Contributing - -To improve the adapter: - -1. Make changes to `adapters/fireteam_adapter.py` -2. Test locally with `python test_adapter.py` -3. Run a simple task to verify: - ```bash - tb run --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter --task-id simple-task - ``` -4. Submit a PR with your changes - -## Support - -- **Fireteam issues**: [GitHub Issues](https://github.com/your-org/fireteam/issues) -- **Terminal-bench docs**: https://www.tbench.ai/docs -- **Integration plan**: See [TERMINAL_BENCH_ADAPTER_PLAN.md](../TERMINAL_BENCH_ADAPTER_PLAN.md) - -## Examples - -### Example 1: Simple Task - -```bash -export ANTHROPIC_API_KEY="sk-ant-..." 
- -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id hello-world \ - --global-agent-timeout-sec 300 -``` - -### Example 2: Complex Task with Long Timeout - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id build-complex-app \ - --global-agent-timeout-sec 3600 -``` - -### Example 3: Run Multiple Tasks - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --dataset terminal-bench-core \ - --task-id "python-*" \ - --n-concurrent 2 \ - --global-agent-timeout-sec 1200 -``` - -### Example 4: Debug Mode - -```bash -tb run \ - --agent-import-path benchmark.adapters.fireteam_adapter:FireteamAdapter \ - --task-id \ - --log-level debug \ - --livestream -``` - diff --git a/benchmark/__init__.py b/benchmark/__init__.py deleted file mode 100644 index f7ee735..0000000 --- a/benchmark/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Fireteam terminal-bench adapter package.""" - diff --git a/benchmark/adapters/__init__.py b/benchmark/adapters/__init__.py deleted file mode 100644 index 965b7aa..0000000 --- a/benchmark/adapters/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Terminal-bench adapters for Fireteam.""" - -from .fireteam_adapter import FireteamAdapter - -__all__ = ["FireteamAdapter"] - diff --git a/benchmark/adapters/fireteam-setup.sh b/benchmark/adapters/fireteam-setup.sh deleted file mode 100755 index 97242ad..0000000 --- a/benchmark/adapters/fireteam-setup.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -e - -echo "Installing Fireteam dependencies..." - -# Use non-interactive mode to avoid prompts -export DEBIAN_FRONTEND=noninteractive - -# Install system dependencies (curl, git, Node.js for Claude Code) -if ! command -v curl &> /dev/null || ! command -v git &> /dev/null || ! command -v node &> /dev/null; then - echo "Installing system dependencies (this may take 1-2 minutes)..." - apt-get update -qq - apt-get install -y -qq curl git nodejs npm sudo - echo "System dependencies installed" -fi - -# Create claude user if it doesn't exist (needed for --dangerously-skip-permissions) -if ! id -u claude &> /dev/null; then - echo "Creating claude user..." - useradd -m -s /bin/bash claude - # Give claude user sudo access without password (now that sudo is installed) - echo "claude ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers -fi - -# Install Claude Code CLI -if ! command -v claude &> /dev/null; then - echo "Installing Claude Code CLI (this may take 30-60 seconds)..." - npm install -g @anthropic-ai/claude-code - echo "Claude Code CLI installed" -fi - -# Install uv if not present -if ! command -v uv &> /dev/null; then - echo "Installing uv..." - curl -LsSf https://astral.sh/uv/install.sh | sh - echo "uv installed" -fi - -# Add uv to PATH (it installs to $HOME/.local/bin) -export PATH="$HOME/.local/bin:$PATH" - -# Install Python dependencies using uv -echo "Installing Python dependencies..." 
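-# --system installs into the container's system Python (no virtualenv is created), matching how the adapter later runs python3 directly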
-uv pip install --system \ - claude-agent-sdk>=0.1.4 \ - python-dotenv>=1.0.0 -echo "Python dependencies installed" - -echo "Fireteam installation complete" - diff --git a/benchmark/adapters/fireteam_adapter.py b/benchmark/adapters/fireteam_adapter.py deleted file mode 100644 index f8252af..0000000 --- a/benchmark/adapters/fireteam_adapter.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Fireteam adapter for terminal-bench using AbstractInstalledAgent.""" - -import os -import shlex -from pathlib import Path - -from dotenv import load_dotenv -from terminal_bench.agents.installed_agents.abstract_installed_agent import ( - AbstractInstalledAgent, -) -from terminal_bench.terminal.models import TerminalCommand - -# Load .env file from Fireteam root if it exists -_fireteam_root = Path(__file__).parent.parent.parent -_env_file = _fireteam_root / ".env" -if _env_file.exists(): - load_dotenv(_env_file) - - -class FireteamAdapter(AbstractInstalledAgent): - """ - Terminal-bench adapter for Fireteam. - - Fireteam is a multi-agent orchestrator that runs planning, execution, and review - cycles until a project is complete. This adapter installs and runs Fireteam - inside terminal-bench task containers. - """ - - @staticmethod - def name() -> str: - """Return the agent name for terminal-bench.""" - return "fireteam" - - @property - def _env(self) -> dict[str, str]: - """ - Environment variables for Fireteam execution. - - Returns: - Dictionary of environment variables to set in the container - """ - env_vars = { - "ANTHROPIC_API_KEY": os.environ["ANTHROPIC_API_KEY"], - "FIRETEAM_DIR": "/app", # Use task directory for state/logs - "ANTHROPIC_MODEL": os.environ.get( - "ANTHROPIC_MODEL", - "claude-sonnet-4-20250514" - ), - } - - # Pass through LOG_LEVEL if set - if "LOG_LEVEL" in os.environ: - env_vars["LOG_LEVEL"] = os.environ["LOG_LEVEL"] - - return env_vars - - @property - def _install_agent_script_path(self) -> Path: - """ - Path to the installation script. - - Returns: - Path to fireteam-setup.sh - """ - return Path(__file__).parent / "fireteam-setup.sh" - - def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]: - """ - Commands to execute Fireteam with the task instruction. 
- - Args: - instruction: The task description from terminal-bench - - Returns: - List of terminal commands to run Fireteam - """ - # Use base64 encoding to completely avoid quoting issues - import base64 - - # Build environment exports - env_exports = [ - "export PYTHONPATH=/fireteam/src", - "export PATH=/usr/local/bin:/usr/bin:/bin:$PATH", - f"export ANTHROPIC_API_KEY='{os.environ['ANTHROPIC_API_KEY']}'", - "export FIRETEAM_DIR='/app'", - f"export ANTHROPIC_MODEL='{os.environ.get('ANTHROPIC_MODEL', 'claude-sonnet-4-20250514')}'" - ] - - # Add LOG_LEVEL if set - if "LOG_LEVEL" in os.environ: - env_exports.append(f"export LOG_LEVEL='{os.environ['LOG_LEVEL']}'") - - run_script = ( - "#!/bin/bash\n" - "cd /fireteam\n" - # Set up environment - + "\n".join(env_exports) + "\n" - + f"python3 -u src/orchestrator.py --project-dir /app --goal {shlex.quote(instruction)}\n" - ) - encoded_script = base64.b64encode(run_script.encode()).decode() - - return [ - # Set permissions for claude user to access /app and /fireteam - TerminalCommand( - command="chown -R claude:claude /app /fireteam", - min_timeout_sec=0.0, - max_timeout_sec=10.0, - block=True, - append_enter=True, - ), - # Write and run Fireteam as claude user (using base64 to avoid quoting) - TerminalCommand( - command=( - f"echo {encoded_script} | base64 -d > /tmp/run-fireteam.sh && " - f"chmod +x /tmp/run-fireteam.sh && " - f"su claude -c /tmp/run-fireteam.sh" - ), - min_timeout_sec=0.0, - max_timeout_sec=float("inf"), # Terminal-bench handles timeout - block=True, - append_enter=True, - ), - ] - - def perform_task(self, instruction, session, logging_dir): - """ - Override to copy Fireteam code before setup. - - This copies the Fireteam codebase into the container at /fireteam - before running the installation script and executing the task. 
- - Args: - instruction: Task description - session: TmuxSession for container interaction - logging_dir: Directory for logs - - Returns: - AgentResult with execution details - """ - # Copy Fireteam code into container before running setup script - fireteam_root = Path(__file__).parent.parent.parent - - # Create directory structure in container first - session.container.exec_run(["mkdir", "-p", "/fireteam/src/agents", "/fireteam/src/state"]) - - # Copy main files - session.copy_to_container( - paths=[fireteam_root / "src" / "orchestrator.py"], - container_dir="/fireteam/src", - container_filename="orchestrator.py" - ) - session.copy_to_container( - paths=[fireteam_root / "src" / "config.py"], - container_dir="/fireteam/src", - container_filename="config.py" - ) - session.copy_to_container( - paths=[fireteam_root / "src" / "__init__.py"], - container_dir="/fireteam/src", - container_filename="__init__.py" - ) - - # Copy agents module files - for agent_file in (fireteam_root / "src" / "agents").glob("*.py"): - session.copy_to_container( - paths=[agent_file], - container_dir="/fireteam/src/agents", - container_filename=agent_file.name - ) - - # Copy state module files - for state_file in (fireteam_root / "src" / "state").glob("*.py"): - session.copy_to_container( - paths=[state_file], - container_dir="/fireteam/src/state", - container_filename=state_file.name - ) - - # Run parent's setup and execution - return super().perform_task(instruction, session, logging_dir) - diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml deleted file mode 100644 index 2c995ac..0000000 --- a/benchmark/pyproject.toml +++ /dev/null @@ -1,21 +0,0 @@ -[project] -name = "fireteam-terminal-bench" -version = "0.1.0" -description = "Fireteam adapter for terminal-bench" -requires-python = ">=3.12" -dependencies = [ - "terminal-bench>=0.2.18", - "python-dotenv>=1.0.0", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[dependency-groups] -dev = [] - -[tool.uv.sources] -# Use local development version of terminal-bench if needed -# terminal-bench = { path = "../path/to/terminal-bench", editable = true } - diff --git a/benchmark/test_adapter.py b/benchmark/test_adapter.py deleted file mode 100755 index f12229c..0000000 --- a/benchmark/test_adapter.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python3 -"""Test Fireteam adapter locally before running in terminal-bench.""" - -import os -import sys -from pathlib import Path - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent)) - -# Check if terminal_bench is installed -try: - import terminal_bench - TERMINAL_BENCH_AVAILABLE = True -except ImportError: - print("Warning: terminal_bench is not installed.") - print("This is expected for local testing - only basic validation will be performed.") - print("\nTo install terminal-bench: uv tool install terminal-bench") - print("Then run with terminal-bench's Python environment.") - print() - TERMINAL_BENCH_AVAILABLE = False - -# Only import adapter if terminal_bench is available -if TERMINAL_BENCH_AVAILABLE: - from adapters.fireteam_adapter import FireteamAdapter - - -def test_adapter(): - """Validate adapter configuration.""" - if not TERMINAL_BENCH_AVAILABLE: - print("\n" + "=" * 50) - print("Performing basic file structure validation...") - print("=" * 50) - - # Just validate file structure - adapter_file = Path(__file__).parent / "adapters" / "fireteam_adapter.py" - setup_script = Path(__file__).parent / "adapters" / "fireteam-setup.sh" - 
pyproject = Path(__file__).parent / "pyproject.toml" - - print(f"✓ Adapter file exists: {adapter_file.exists()}") - assert adapter_file.exists() - - print(f"✓ Setup script exists: {setup_script.exists()}") - assert setup_script.exists() - - print(f"✓ Setup script is executable: {os.access(setup_script, os.X_OK)}") - assert os.access(setup_script, os.X_OK) - - print(f"✓ pyproject.toml exists: {pyproject.exists()}") - assert pyproject.exists() - - print("\n" + "=" * 50) - print("✅ Basic structure validation passed!") - print("\nTo run full tests, use terminal-bench's Python environment:") - print(" uv tool run --from terminal-bench python3 test_adapter.py") - return - - # Full tests with terminal_bench available - # Set required env var for testing - os.environ.setdefault("ANTHROPIC_API_KEY", "test-key") - - print("Testing Fireteam Terminal-Bench Adapter") - print("=" * 50) - - # Create adapter instance - adapter = FireteamAdapter() - - # Test 1: Name - print(f"✓ Agent name: {adapter.name()}") - assert adapter.name() == "fireteam" - - # Test 2: Environment - env = adapter._env - print(f"✓ Environment variables:") - for key, value in env.items(): - masked = value if key != "ANTHROPIC_API_KEY" else "***" - print(f" {key}: {masked}") - assert "ANTHROPIC_API_KEY" in env - assert env["FIRETEAM_DIR"] == "/app" - - # Test 3: Install script - install_script = adapter._install_agent_script_path - print(f"✓ Install script: {install_script}") - assert install_script.name == "fireteam-setup.sh" - assert install_script.exists(), f"Setup script not found: {install_script}" - - # Test 4: Command generation - instruction = "Create hello.py with print('Hello, World!')" - commands = adapter._run_agent_commands(instruction) - print(f"✓ Generated command:") - print(f" {commands[0].command}") - assert len(commands) == 1 - assert "/fireteam/orchestrator.py" in commands[0].command - assert "--project-dir /app" in commands[0].command - - print("\n" + "=" * 50) - print("✅ All tests passed!") - - -if __name__ == "__main__": - test_adapter() - diff --git a/cli/agent-progress b/cli/agent-progress deleted file mode 100755 index 2980f24..0000000 --- a/cli/agent-progress +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -# Check Fireteam status and progress - -SYSTEM_DIR="/home/claude/fireteam" -PID_FILE="$SYSTEM_DIR/orchestrator.pid" -STATE_FILE="$SYSTEM_DIR/state/current.json" - -echo "===================================" -echo "Fireteam Status" -echo "===================================" -echo "" - -# Check if running -if [ -f "$PID_FILE" ]; then - PID=$(cat "$PID_FILE") - if ps -p "$PID" > /dev/null 2>&1; then - echo "Status: RUNNING (PID: $PID)" - else - echo "Status: STOPPED (stale PID file)" - rm -f "$PID_FILE" - fi -else - echo "Status: STOPPED" -fi - -echo "" - -# Check state -if [ -f "$STATE_FILE" ]; then - echo "Project State:" - echo "-----------------------------------" - - # Use python to parse JSON nicely - python3 -c " -import json -import sys - -try: - with open('$STATE_FILE', 'r') as f: - state = json.load(f) - - print(f\"Project Dir: {state.get('project_dir', 'N/A')}\") - print(f\"Goal: {state.get('goal', 'N/A')}\") - print(f\"Status: {state.get('status', 'N/A')}\") - print(f\"Cycle: {state.get('cycle_number', 0)}\") - print(f\"Completion: {state.get('completion_percentage', 0)}%\") - print(f\"Git Branch: {state.get('git_branch', 'N/A')}\") - print(f\"Started: {state.get('started_at', 'N/A')}\") - print(f\"Last Updated: {state.get('updated_at', 'N/A')}\") - - if state.get('completed'): - print(f\"\\nProject 
COMPLETED at: {state.get('completed_at', 'N/A')}\") - -except Exception as e: - print(f\"Error reading state: {e}\") - sys.exit(1) -" -else - echo "No active project state found" -fi - -echo "" -echo "===================================" - -# Show recent log lines if available -LOG_DIR="$SYSTEM_DIR/logs" -if [ -d "$LOG_DIR" ]; then - LATEST_LOG=$(ls -t "$LOG_DIR"/orchestrator_*.log 2>/dev/null | head -1) - if [ -n "$LATEST_LOG" ]; then - echo "" - echo "Recent Activity (last 10 lines):" - echo "-----------------------------------" - tail -10 "$LATEST_LOG" - fi -fi diff --git a/cli/fireteam-status b/cli/fireteam-status deleted file mode 100755 index 93ab1d4..0000000 --- a/cli/fireteam-status +++ /dev/null @@ -1,239 +0,0 @@ -#!/usr/bin/env python3 -""" -Fireteam status and monitoring CLI. -Shows current project status, progress, and system resources. -""" - -import json -import sys -import time -import argparse -from pathlib import Path -from datetime import datetime - -SYSTEM_DIR = Path("/home/claude/fireteam") -STATE_FILE = SYSTEM_DIR / "state" / "current.json" -PID_FILE = SYSTEM_DIR / "orchestrator.pid" -LOGS_DIR = SYSTEM_DIR / "logs" - - -def check_process_running(pid: int) -> bool: - """Check if a process is running.""" - try: - import os - os.kill(pid, 0) - return True - except (OSError, ProcessLookupError): - return False - - -def get_process_status() -> dict[str, any]: - """Get orchestrator process status.""" - if not PID_FILE.exists(): - return {"running": False, "pid": None} - - pid = int(PID_FILE.read_text().strip()) - running = check_process_running(pid) - - if not running: - PID_FILE.unlink() # Clean up stale PID file - - return {"running": running, "pid": pid if running else None} - - -def load_state() -> dict[str, any] | None: - """Load current project state.""" - if not STATE_FILE.exists(): - return None - - try: - return json.loads(STATE_FILE.read_text()) - except Exception as e: - print(f"Error loading state: {e}", file=sys.stderr) - return None - - -def get_system_resources() -> dict[str, str]: - """Get system resource usage.""" - import subprocess - - try: - # Memory info - mem = subprocess.check_output(['free', '-h'], text=True) - mem_lines = mem.strip().split('\n') - mem_data = mem_lines[1].split() - - # CPU load - uptime = subprocess.check_output(['uptime'], text=True) - load = uptime.split('load average:')[1].strip() - - # Disk usage - df = subprocess.check_output(['df', '-h', str(SYSTEM_DIR.parent)], text=True) - disk_line = df.strip().split('\n')[1] - disk_usage = disk_line.split()[4] - - return { - "memory_total": mem_data[1], - "memory_used": mem_data[2], - "memory_free": mem_data[3], - "cpu_load": load, - "disk_usage": disk_usage - } - except Exception as e: - return {"error": str(e)} - - -def format_timestamp(iso_timestamp: str) -> str: - """Format ISO timestamp to readable format.""" - try: - dt = datetime.fromisoformat(iso_timestamp) - return dt.strftime("%Y-%m-%d %H:%M:%S") - except: - return iso_timestamp - - -def show_status(watch: bool = False, interval: int = 5): - """Show Fireteam status.""" - - while True: - # Clear screen if watching - if watch: - print("\033[2J\033[H") # Clear screen and move cursor to top - - print("=" * 60) - print("🔥 FIRETEAM STATUS") - print("=" * 60) - print() - - # Process status - proc_status = get_process_status() - if proc_status["running"]: - print(f"Status: ✅ RUNNING (PID: {proc_status['pid']})") - else: - print("Status: ⏹️ STOPPED") - print() - - # Project state - state = load_state() - if state: - print("📁 Project 
State:") - print("-" * 60) - print(f" Project: {state.get('project_dir', 'N/A')}") - goal = state.get('goal', 'N/A') - if len(goal) > 80: - goal = goal[:77] + "..." - print(f" Goal: {goal}") - print(f" Status: {state.get('status', 'N/A').upper()}") - print(f" Cycle: {state.get('cycle_number', 0)}") - print(f" Completion: {state.get('completion_percentage', 0)}%") - print(f" Git Branch: {state.get('git_branch', 'N/A')}") - print(f" Started: {format_timestamp(state.get('started_at', 'N/A'))}") - print(f" Updated: {format_timestamp(state.get('updated_at', 'N/A'))}") - - if state.get('completed'): - print(f" ✅ COMPLETED: {format_timestamp(state.get('completed_at', 'N/A'))}") - else: - print("📁 No active project") - - print() - print("=" * 60) - - # System resources (if requested) - if watch: - resources = get_system_resources() - if "error" not in resources: - print() - print("💻 System Resources:") - print("-" * 60) - print(f" Memory: {resources['memory_used']} / {resources['memory_total']} used") - print(f" CPU Load: {resources['cpu_load']}") - print(f" Disk: {resources['disk_usage']} used") - print() - print("=" * 60) - print(f" Refreshing every {interval}s... (Ctrl+C to stop)") - - if not watch: - break - - try: - time.sleep(interval) - except KeyboardInterrupt: - print("\n\n👋 Stopped monitoring") - break - - -def show_logs(lines: int = 20, follow: bool = False): - """Show recent log entries.""" - log_files = sorted(LOGS_DIR.glob("orchestrator_*.log")) - if not log_files: - print("No log files found") - return - - latest_log = log_files[-1] - print(f"📄 Latest log: {latest_log.name}") - print("=" * 60) - - if follow: - import subprocess - subprocess.run(["tail", "-f", str(latest_log)]) - else: - log_content = latest_log.read_text().split('\n') - for line in log_content[-lines:]: - print(line) - - -def main(): - parser = argparse.ArgumentParser( - description="Fireteam status and monitoring", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - fireteam-status # Show current status - fireteam-status --watch # Monitor status (refresh every 5s) - fireteam-status --logs # Show recent logs - fireteam-status --logs --follow # Tail logs in real-time - """ - ) - - parser.add_argument( - "--watch", - action="store_true", - help="Watch mode - refresh status every N seconds" - ) - - parser.add_argument( - "--interval", - type=int, - default=5, - help="Refresh interval for watch mode (default: 5 seconds)" - ) - - parser.add_argument( - "--logs", - action="store_true", - help="Show recent log entries" - ) - - parser.add_argument( - "--follow", - action="store_true", - help="Follow log output (tail -f)" - ) - - parser.add_argument( - "--lines", - type=int, - default=20, - help="Number of log lines to show (default: 20)" - ) - - args = parser.parse_args() - - if args.logs: - show_logs(lines=args.lines, follow=args.follow) - else: - show_status(watch=args.watch, interval=args.interval) - - -if __name__ == "__main__": - main() diff --git a/cli/start-agent b/cli/start-agent deleted file mode 100755 index c29d9d9..0000000 --- a/cli/start-agent +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Start the Claude Agent System - -set -e - -SYSTEM_DIR="/home/claude/claude-agent-system" -PID_FILE="$SYSTEM_DIR/orchestrator.pid" - -# Parse arguments -PROJECT_DIR="" -GOAL="" - -while [[ $# -gt 0 ]]; do - case $1 in - --project-dir) - PROJECT_DIR="$2" - shift 2 - ;; - --prompt) - GOAL="$2" - shift 2 - ;; - *) - echo "Unknown option: $1" - echo "Usage: start-agent --project-dir --prompt " - exit 
1 - ;; - esac -done - -# Validate arguments -if [ -z "$PROJECT_DIR" ] || [ -z "$GOAL" ]; then - echo "Error: Both --project-dir and --prompt are required" - echo "Usage: start-agent --project-dir --prompt " - exit 1 -fi - -# Check if already running -if [ -f "$PID_FILE" ]; then - PID=$(cat "$PID_FILE") - if ps -p "$PID" > /dev/null 2>&1; then - echo "Agent system is already running (PID: $PID)" - echo "Use 'stop-agent' to stop it first" - exit 1 - else - # Stale PID file - rm -f "$PID_FILE" - fi -fi - -echo "Starting Claude Agent System..." -echo "Project: $PROJECT_DIR" -echo "Goal: $GOAL" -echo "" - -# Start orchestrator in background -nohup python3 "$SYSTEM_DIR/src/orchestrator.py" \ - --project-dir "$PROJECT_DIR" \ - --goal "$GOAL" \ - > "$SYSTEM_DIR/logs/system.log" 2>&1 & - -ORCHESTRATOR_PID=$! -echo $ORCHESTRATOR_PID > "$PID_FILE" - -echo "Agent system started (PID: $ORCHESTRATOR_PID)" -echo "Use 'agent-progress' to check status" -echo "Use 'stop-agent' to stop the system" diff --git a/cli/stop-agent b/cli/stop-agent deleted file mode 100755 index 94e4232..0000000 --- a/cli/stop-agent +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Stop the Claude Agent System - -set -e - -SYSTEM_DIR="/home/claude/claude-agent-system" -PID_FILE="$SYSTEM_DIR/orchestrator.pid" - -if [ ! -f "$PID_FILE" ]; then - echo "Agent system is not running (no PID file found)" - exit 0 -fi - -PID=$(cat "$PID_FILE") - -if ! ps -p "$PID" > /dev/null 2>&1; then - echo "Agent system is not running (stale PID file)" - rm -f "$PID_FILE" - exit 0 -fi - -echo "Stopping Claude Agent System (PID: $PID)..." - -# Send SIGTERM for graceful shutdown -kill -TERM "$PID" - -# Wait for process to exit (up to 30 seconds) -TIMEOUT=30 -ELAPSED=0 - -while ps -p "$PID" > /dev/null 2>&1; do - if [ $ELAPSED -ge $TIMEOUT ]; then - echo "Process did not stop gracefully, forcing..." - kill -KILL "$PID" 2>/dev/null || true - break - fi - sleep 1 - ELAPSED=$((ELAPSED + 1)) -done - -# Kill any remaining Claude CLI processes -pkill -f "claude --dangerously-skip-permissions" 2>/dev/null || true - -# Clean up PID file -rm -f "$PID_FILE" - -echo "Agent system stopped" diff --git a/commands/fireteam.md b/commands/fireteam.md new file mode 100644 index 0000000..6f76c7b --- /dev/null +++ b/commands/fireteam.md @@ -0,0 +1,22 @@ +--- +description: Toggle fireteam autonomous execution mode +allowed-tools: [] +--- + +# /fireteam Command + +Toggle fireteam mode on or off. + +## Usage +- `/fireteam on` - Enable fireteam mode for this session +- `/fireteam off` - Disable fireteam mode + +## When "on" +- Set session state to enable fireteam +- Write `{"enabled": true}` to `~/.claude/fireteam_state.json` +- Confirm: "Fireteam mode enabled. All tasks will use multi-phase execution." + +## When "off" +- Clear session state +- Write `{"enabled": false}` to `~/.claude/fireteam_state.json` +- Confirm: "Fireteam mode disabled. Returning to normal Claude Code behavior." diff --git a/docs/DEPLOYMENT_STATUS.md b/docs/DEPLOYMENT_STATUS.md deleted file mode 100644 index 18a7614..0000000 --- a/docs/DEPLOYMENT_STATUS.md +++ /dev/null @@ -1,219 +0,0 @@ -# Fireteam Documentation - Deployment Status - -**Status**: ✅ **PRODUCTION READY** -**Date**: October 17, 2025 -**Cycle**: 3 -**Completion**: 98% - -## Summary - -The Fireteam documentation website is fully functional and ready for deployment to Mintlify. All critical blockers have been resolved, and comprehensive testing has validated the site's functionality. 
- -## ✅ Completed Items - -### Core Infrastructure -- ✅ **24 comprehensive MDX pages** (~29,000 words) -- ✅ **mint.json** fully configured with navigation, branding, and settings -- ✅ **package.json** with Mintlify CLI dependency -- ✅ **.gitignore** comprehensive file exclusions -- ✅ **README.md** with deployment and contribution guidelines -- ✅ **Visual assets** created (logo/light.svg, logo/dark.svg, favicon.svg) - -### Content Quality -- ✅ All technical details verified against source code - - COMPLETION_THRESHOLD = 95% ✓ - - AGENT_TIMEOUTS: 600s/1800s/600s ✓ - - Test results: 11 projects, 100% success, 94.1% avg ✓ - - Average cycles: 3.7 ✓ -- ✅ Professional technical writing throughout -- ✅ Comprehensive code examples -- ✅ Proper cross-referencing between pages -- ✅ SEO-optimized frontmatter on all pages - -### Technical Validation -- ✅ **MDX parsing errors**: All resolved (6 instances fixed with HTML entities) -- ✅ **Dev server**: Runs without errors on http://localhost:3000 -- ✅ **Navigation**: All 24 pages load correctly -- ✅ **Components**: Mintlify components render properly (Card, Tip, Warning, CodeGroup, etc.) -- ✅ **Links**: Internal and external links functional -- ✅ **GitHub URL**: Correct (https://github.com/darkresearch/fireteam) - -### Documentation Coverage - -#### Getting Started (2 pages) -- ✅ introduction.mdx -- ✅ quickstart.mdx - -#### Core Concepts (3 pages) -- ✅ architecture.mdx -- ✅ agents.mdx -- ✅ cycles.mdx - -#### Installation & Setup (3 pages) -- ✅ installation.mdx -- ✅ environment.mdx -- ✅ requirements.mdx - -#### Configuration (3 pages) -- ✅ config-file.mdx -- ✅ timeouts.mdx -- ✅ sudo-setup.mdx - -#### CLI Tools (4 pages) -- ✅ overview.mdx -- ✅ start-agent.mdx -- ✅ fireteam-status.mdx -- ✅ stop-agent.mdx - -#### Performance & Testing (2 pages) -- ✅ test-results.mdx -- ✅ benchmarks.mdx - -#### Advanced Topics (3 pages) -- ✅ state-management.mdx -- ✅ improvements.mdx -- ✅ troubleshooting.mdx - -#### API Reference (4 pages) -- ✅ overview.mdx -- ✅ state-manager.mdx -- ✅ agents.mdx -- ✅ configuration.mdx - -## Testing Results - -### Dev Server Test -``` -✓ Server started successfully -✓ No MDX parsing errors -✓ No runtime warnings or errors -✓ Accessible at http://localhost:3000 -``` - -### Page Load Test (Sample) -``` -✓ /introduction → 200 OK -✓ /quickstart → 200 OK -✓ /core-concepts/architecture → 200 OK -✓ /performance/test-results → 200 OK -✓ /api/agents → 200 OK -✓ /troubleshooting/troubleshooting → 200 OK -``` - -### Component Rendering -``` -✓ and render correctly -✓ , , callouts display properly -✓ for multi-language examples works -✓ Code blocks have syntax highlighting -✓ Tables render correctly -``` - -### Technical Accuracy -``` -✓ COMPLETION_THRESHOLD = 95% -✓ AGENT_TIMEOUTS = {planner: 600s, executor: 1800s, reviewer: 600s} -✓ Test results: 11 projects, 100% success rate, 94.1% avg completion -✓ Average cycles: 3.7 per project -✓ GitHub URL: https://github.com/darkresearch/fireteam -``` - -## ⚠️ Known Limitations (Non-blocking) - -### Visual Assets ✅ RESOLVED (Cycle 2) -- ✅ Logo files created: /logo/light.svg, /logo/dark.svg -- ✅ Favicon created: /favicon.svg -- ✅ All visual assets now present and functional -- **Status**: Complete - no limitations remaining - -## 🔧 Post-Deployment Configuration - -### Analytics Setup (Optional) - -The `mint.json` file includes a placeholder PostHog analytics key on line 130: -```json -"analytics": { - "posthog": { - "apiKey": "phc_placeholder_key_fireteam_docs" - } -} -``` - -**Impact**: Analytics will not collect 
data with the placeholder key, but the site functions perfectly without it. - -**To enable analytics after deployment:** - -1. **Create PostHog account** at https://posthog.com -2. **Create a new project** for Fireteam Documentation -3. **Copy your API key** from project settings (format: `phc_...`) -4. **Update mint.json** line 130 with your real key: - ```json - "analytics": { - "posthog": { - "apiKey": "phc_YOUR_REAL_KEY_HERE" - } - } - ``` -5. **Commit and redeploy** to Mintlify - -**Alternative**: If analytics are not needed, remove the entire `analytics` section from `mint.json`. - -**Priority**: P3 - Optional feature, no impact on core functionality - -## 🚀 Deployment Instructions - -### Local Development -```bash -cd /home/claude/fireteam-docs -npm install -npx mintlify dev -# Site runs on http://localhost:3000 -``` - -### Deploy to Mintlify -1. Create account at https://mintlify.com -2. Connect GitHub repository -3. Point to /home/claude/fireteam-docs directory -4. Mintlify auto-deploys from main branch - -Alternatively, follow instructions in README.md. - -## Metrics - -- **Total Pages**: 24 MDX files -- **Total Word Count**: ~29,000 words -- **Code Examples**: 100+ code blocks -- **Mintlify Components**: 50+ component instances -- **Internal Links**: 100+ cross-references -- **Navigation Groups**: 8 major sections -- **Visual Assets**: ✅ Logo and favicon created -- **Dev Server Status**: ✅ Passing -- **MDX Parsing**: ✅ No errors -- **Technical Accuracy**: ✅ 100% verified - -## Success Criteria Met - -✅ mint.json fully configured with navigation -✅ All 24 MDX pages complete and accurate -✅ package.json configured for Mintlify CLI -✅ README.md with deployment instructions -✅ .gitignore comprehensive -✅ Dev server runs without errors -✅ All navigation links functional -✅ Components render correctly -✅ Code examples work and are accurate -✅ Professional technical writing quality -✅ No placeholder or "Coming soon" content -✅ SEO-friendly page descriptions -✅ Mobile-responsive (Mintlify handles) - -## Conclusion - -The Fireteam documentation project has achieved **98% completion** and is **exceptionally polished and production-ready**. All critical functionality works correctly, content is comprehensive and accurate, visual assets are created, and the site is ready to deploy to Mintlify. - -### Cycle Progress -- **Cycle 1 (97%)**: Core documentation complete, production-ready -- **Cycle 2 (98%)**: Visual assets created, broken links fixed, analytics documented -- **Cycle 3 (Targeting 100%)**: Final documentation polish and consistency verification - -**Recommendation**: Deploy immediately. The documentation is complete, accurate, and professionally executed. diff --git a/docs/QUICK_START_GUIDE.md b/docs/QUICK_START_GUIDE.md deleted file mode 100644 index 07156ea..0000000 --- a/docs/QUICK_START_GUIDE.md +++ /dev/null @@ -1,125 +0,0 @@ -# Quick Start Guide - Fireteam Documentation - -## For Developers - -### Test Locally -```bash -cd /home/claude/fireteam-docs -npm install -npx mintlify dev -``` -Visit: http://localhost:3000 - -### Make Changes -1. Edit any `.mdx` file in the project -2. Changes auto-reload in dev server -3. Verify in browser - -### Add New Page -1. Create `new-page.mdx` in appropriate directory -2. Add frontmatter: - ```yaml - --- - title: "Page Title" - description: "Page description" - --- - ``` -3. Add to `mint.json` navigation -4. Test in dev server - -## For Deployment - -### Option 1: Mintlify Cloud (Recommended) -1. Visit https://mintlify.com -2. 
Sign up / log in -3. Connect GitHub repo: `darkresearch/fireteam` -4. Set docs directory: `/fireteam-docs` (or root if you move files) -5. Deploy automatically - -### Option 2: Self-Hosted -```bash -npx mintlify build -# Outputs static site to _site/ -# Deploy _site/ to any static host (Vercel, Netlify, etc.) -``` - -## Project Structure - -``` -fireteam-docs/ -├── mint.json # Main config file -├── package.json # Dependencies -├── README.md # Full documentation -├── introduction.mdx # Homepage -├── quickstart.mdx # Getting started -├── core-concepts/ -│ ├── architecture.mdx -│ ├── agents.mdx -│ └── cycles.mdx -├── installation/ -│ ├── installation.mdx -│ ├── environment.mdx -│ └── requirements.mdx -├── configuration/ -│ ├── config-file.mdx -│ ├── timeouts.mdx -│ └── sudo-setup.mdx -├── cli-tools/ -│ ├── overview.mdx -│ ├── start-agent.mdx -│ ├── fireteam-status.mdx -│ └── stop-agent.mdx -├── performance/ -│ ├── test-results.mdx -│ └── benchmarks.mdx -├── advanced/ -│ ├── state-management.mdx -│ └── improvements.mdx -├── troubleshooting/ -│ └── troubleshooting.mdx -└── api/ - ├── overview.mdx - ├── state-manager.mdx - ├── agents.mdx - └── configuration.mdx -``` - -## Key Files - -- **mint.json**: Navigation, branding, colors, settings -- **package.json**: Mintlify CLI version -- **.gitignore**: Excludes node_modules, build artifacts - -## Common Tasks - -### Update Navigation -Edit `mint.json` → `navigation` array - -### Change Branding -Edit `mint.json` → `colors`, `name`, `logo` - -### Add Components -Use Mintlify components in MDX: -- ``, `` -- ``, ``, `` -- ``, `` -- ``, `` - -See: https://mintlify.com/docs/components - -### Fix Broken Links -Run locally and click through navigation to test all links. - -## Status - -✅ **PRODUCTION READY** - All 24 pages complete and tested -✅ No MDX parsing errors -✅ All technical details verified -✅ Ready to deploy immediately - -## Next Steps - -1. ✅ Review DEPLOYMENT_STATUS.md -2. ⏳ (Optional) Add logo/favicon assets -3. ⏳ Deploy to Mintlify -4. ⏳ Share docs URL with team diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 474bce3..0000000 --- a/docs/README.md +++ /dev/null @@ -1,246 +0,0 @@ -# Fireteam Documentation - -Official documentation for [Fireteam](https://github.com/darkresearch/fireteam) - an autonomous multi-agent software development system powered by Claude AI. - -## About - -This documentation site is built with [Mintlify](https://mintlify.com) and provides comprehensive guides, API references, and examples for using Fireteam to build software autonomously. - -## Running Locally - -### Prerequisites - -- Node.js 18+ (LTS recommended) -- npm or yarn - -### Installation - -```bash -# Install dependencies -npm install - -# Or with yarn -yarn install -``` - -### Development Server - -Start the local development server: - -```bash -npm run dev -``` - -The documentation will be available at `http://localhost:3000`. 
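-
-These npm scripts are thin wrappers around the Mintlify CLI. Below is a minimal sketch of the corresponding `package.json` entries, given as an assumption for illustration (the actual file in this repo may differ): `mintlify dev` and `mintlify build` are the commands referenced in the quick start guide, and the `preview` script is omitted because its underlying command is not shown in these docs.
-
-```json
-{
-  "scripts": {
-    "dev": "mintlify dev",
-    "build": "mintlify build"
-  }
-}
-```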
- -### Building - -To build the documentation: - -```bash -npm run build -``` - -### Preview Production Build - -```bash -npm run preview -``` - -## Documentation Structure - -``` -fireteam-docs/ -├── mint.json # Mintlify configuration -├── package.json # Dependencies -├── introduction.mdx # Homepage -├── quickstart.mdx # Getting started guide -├── core-concepts/ # Architecture & concepts -│ ├── architecture.mdx -│ ├── agents.mdx -│ └── cycles.mdx -├── installation/ # Setup guides -│ ├── installation.mdx -│ ├── environment.mdx -│ └── requirements.mdx -├── configuration/ # Configuration docs -│ ├── config-file.mdx -│ ├── timeouts.mdx -│ └── sudo-setup.mdx -├── cli-tools/ # CLI reference -│ ├── overview.mdx -│ ├── start-agent.mdx -│ ├── fireteam-status.mdx -│ └── stop-agent.mdx -├── performance/ # Test results & benchmarks -│ ├── test-results.mdx -│ └── benchmarks.mdx -├── advanced/ # Advanced topics -│ ├── state-management.mdx -│ └── improvements.mdx -├── troubleshooting/ # Common issues -│ └── troubleshooting.mdx -└── api/ # API reference - ├── overview.mdx - ├── state-manager.mdx - ├── agents.mdx - └── configuration.mdx -``` - -## Deploying to Mintlify - -### Option 1: Mintlify Dashboard - -1. Sign up at [Mintlify](https://mintlify.com) -2. Connect your GitHub repository -3. Deploy from the dashboard - -### Option 2: Mintlify CLI - -```bash -# Install Mintlify CLI globally -npm install -g mintlify - -# Deploy -mintlify deploy -``` - -### Environment Setup - -If deploying manually, configure these secrets: - -- `MINTLIFY_PROJECT_ID` - Your Mintlify project ID -- `GITHUB_TOKEN` - For GitHub integration (optional) - -## Contributing - -### Adding New Pages - -1. Create MDX file in appropriate directory -2. Add to navigation in `mint.json`: - -```json -{ - "group": "Your Section", - "pages": [ - "path/to/your-page" - ] -} -``` - -3. Test locally with `npm run dev` -4. Submit pull request - -### Content Guidelines - -- Use clear, concise language -- Include code examples where relevant -- Add Mintlify components for better UX: - - ``, ``, `` for callouts - - `` for multi-language examples - - `` for FAQs - - `` for feature highlights -- Follow existing page structure and formatting - -### MDX Frontmatter - -Every page should have frontmatter: - -```mdx ---- -title: "Page Title" -description: "Brief description for SEO" ---- -``` - -## Mintlify Components - -### Callouts - -```mdx -Helpful tip for users -Important warning -Additional information -``` - -### Code Groups - -```mdx - - -\`\`\`bash Ubuntu -sudo apt install package -\`\`\` - -\`\`\`bash macOS -brew install package -\`\`\` - - -``` - -### Cards - -```mdx - - - - Description - - - -``` - -### Accordions - -```mdx - - - -Answer content - - - -``` - -## Troubleshooting - -### Port Already in Use - -```bash -# Kill process on port 3000 -lsof -ti:3000 | xargs kill -9 - -# Or use different port -PORT=3001 npm run dev -``` - -### Build Errors - -```bash -# Clear cache and rebuild -rm -rf node_modules package-lock.json -npm install -npm run dev -``` - -### Broken Links - -Mintlify will warn about broken internal links during `npm run dev`. Check console output. 
- -## Links - -- **Live Docs:** https://docs.fireteam.dev (when deployed) -- **Fireteam GitHub:** https://github.com/darkresearch/fireteam -- **Mintlify Docs:** https://mintlify.com/docs -- **Report Issues:** https://github.com/darkresearch/fireteam/issues - -## Acknowledgments - -- Built with [Mintlify](https://mintlify.com) -- Powered by [Claude AI](https://claude.ai) -- Created by the Fireteam team - ---- - -**Need help?** Open an issue on [GitHub](https://github.com/darkresearch/fireteam/issues) or check the [troubleshooting guide](/troubleshooting/troubleshooting). diff --git a/docs/advanced/improvements.mdx b/docs/advanced/improvements.mdx deleted file mode 100644 index a35d0a6..0000000 --- a/docs/advanced/improvements.mdx +++ /dev/null @@ -1,431 +0,0 @@ ---- -title: "System Improvements" -description: "Completed enhancements and ongoing improvements to Fireteam" ---- - -## Overview - -Fireteam has undergone continuous improvement based on real-world testing and user feedback. This page documents completed enhancements, their impact, and ongoing improvement efforts. - -## Completed Improvements - -### 1. Configurable Agent Timeouts ✅ - -**Problem:** Fixed 5-minute planner and 10-minute executor timeouts caused failures on complex projects. - -**Solution:** Made timeouts configurable in `config.py`: - -```python -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes (was 5) - "reviewer": 600, # 10 minutes - "executor": 1800 # 30 minutes (was 10) -} -``` - -**Impact:** -- Zero timeout failures in production -- Complex projects complete successfully -- High success rate across diverse project types - -**Status:** ✅ Implemented in config.py - -### 2. Sudo Password Support ✅ - -**Problem:** Projects requiring system packages (Node.js, build tools) failed without passwordless sudo. - -**Solution:** Added `SUDO_PASSWORD` environment variable: - -```bash -# .env file -SUDO_PASSWORD=your_password_here -``` - -**Implementation:** - -```python -# config.py -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) - -def has_sudo_access(): - return SUDO_PASSWORD is not None -``` - -**Impact:** -- Prevents 5-10 wasted cycles on package installation -- GitHub Analyzer: Would have saved 8 cycles (~2 hours) -- Enables TypeScript/Node.js projects without manual setup - -**Status:** ✅ Implemented and documented - -### 3. Parse Failure Handling ✅ - -**Problem:** Reviewer output parse failures caused `completion_percentage = 0`, triggering unnecessary cycles. - -**Example failure:** -``` -Cycle 1: 92% → Parse failure → 0% → Wasted retry cycle -``` - -**Solution:** Use last known completion % on parse failures: - -```python -def update_completion_percentage(parsed_value, logger): - if parsed_value is None: - last_known = state.get("completion_percentage", 0) - logger.warning(f"Parse failure, using last known: {last_known}%") - return last_known - return parsed_value -``` - -**Safety valve:** After 3 consecutive parse failures, reset to 0 to prevent infinite loops. - -**Impact:** -- Eliminates wasted cycles from benign parse errors -- Maintains safety with consecutive failure detection -- GitHub Analyzer: Would have prevented Cycle 1 regression - -**Status:** ✅ Implemented in StateManager - -### 4. Agent Drift Detection ✅ - -**Problem:** Agents sometimes worked on features outside project scope (scope creep). 
- -**Example:** -``` -Goal: "Build a CLI tool" -Agent created: CLI + npm publishing + CI/CD + deployment automation -``` - -**Solution:** Goal alignment checks every 3 cycles: - -```python -if cycle_num > 0 and cycle_num % 3 == 0: - logger.info(f"GOAL ALIGNMENT CHECK (Cycle {cycle_num})") - logger.info(f"Original Goal: {self.goal}") - logger.info("⚠️ Reminder: Ensure all work aligns with original goal!") -``` - -**Impact:** -- Reduces scope creep -- Keeps agents focused on core requirements -- Improves cycle efficiency - -**Status:** ✅ Implemented in orchestrator.py - -### 5. State Isolation Between Projects ✅ - -**Problem:** Residual state from previous projects could contaminate new runs. - -**Solution:** Complete state reset on `start-agent`: - -```python -def initialize_project(project_dir, goal): - # Wipe previous state completely - if state_file.exists(): - state_file.unlink() - - # Create fresh state - return new_clean_state(project_dir, goal) -``` - -**Impact:** -- Zero cross-project contamination -- Predictable behavior for each run -- Clean slate for every project - -**Status:** ✅ Implemented in StateManager - -## Ongoing Improvements - -### 6. Environment Dependency Detection 🔄 - -**Problem:** Agents don't proactively detect language runtime requirements. - -**Current behavior:** -``` -Goal: "Build a TypeScript CLI" -→ Cycle 0: Try to run tsc → Not found → Fail -→ Cycle 1-7: Various Node.js installation attempts -→ Cycle 8: Finally successful (binary download) -``` - -**Proposed solution:** -```python -def detect_environment_needs(goal: str) -> dict: - needs = { - "nodejs": "typescript" in goal.lower() or "ts" in goal, - "ruby": "ruby" in goal.lower() or "rails" in goal, - "rust": "rust" in goal.lower(), - # ... - } - return needs - -def setup_environment(needs: dict): - if needs["nodejs"] and not nodejs_installed(): - install_nodejs() # With sudo support -``` - -**Expected impact:** -- Save 5-10 cycles on environment setup -- Zero wasted cycles on missing runtimes -- Proactive dependency installation - -**Status:** 🔄 Planned for v2.0 - -### 7. Monotonic Completion Enforcement 🔄 - -**Problem:** Completion % can regress without actual code regression. - -**Observed regressions:** -``` -CSV Analyzer: 93% → 96% → 92% (dropped 4%) -JSON Parser: 88% → 85% → 92% -``` - -**Proposed solution:** -```python -def enforce_monotonic_completion(new_pct, old_pct): - if new_pct < old_pct: - # Allow slight variations (noise) - if old_pct - new_pct <= 2: - return old_pct - # Significant drop - require justification - logger.warning(f"Completion dropped {old_pct}% → {new_pct}%") - # Use old unless reviewer explicitly justifies - return max(new_pct, old_pct) -``` - -**Expected impact:** -- Smoother completion curves -- Fewer validation resets -- More predictable convergence - -**Status:** 🔄 Under consideration - -### 8. Adaptive Timeouts 🔄 - -**Problem:** Later cycles tend to be faster (smaller changes) but use same timeouts as early cycles. 
- -**Observation:** -``` -Cycle 0: Executor uses 25 minutes (83% of 30min timeout) -Cycle 2: Executor uses 12 minutes (40% of 30min timeout) -Cycle 5: Executor uses 8 minutes (27% of 30min timeout) -``` - -**Proposed solution:** -```python -def get_adaptive_timeout(agent: str, cycle: int) -> int: - base_timeout = AGENT_TIMEOUTS[agent] - - if cycle <= 1: - return base_timeout # Full timeout for early cycles - - # Reduce by 20-30% for later cycles - reduction = min(0.3, (cycle - 1) * 0.1) - return int(base_timeout * (1 - reduction)) -``` - -**Expected impact:** -- Faster failure detection in later cycles -- Reduced waiting on hung processes -- More efficient resource usage - -**Status:** 🔄 Experimental - -### 9. Enhanced Logging and Telemetry 🔄 - -**Problem:** Limited visibility into agent decision-making and performance patterns. - -**Proposed enhancements:** -- Structured logging (JSON format option) -- Performance metrics (cycle duration, API latency) -- Decision tracking (why agent chose specific approach) -- Resource monitoring (CPU, memory, disk per phase) - -**Example structured log:** -```json -{ - "timestamp": "2025-10-17T14:30:22", - "cycle": 2, - "phase": "execution", - "duration": 892.5, - "api_calls": 12, - "completion_before": 88, - "completion_after": 92, - "files_modified": 8, - "tests_passed": 15, - "tests_failed": 0 -} -``` - -**Expected impact:** -- Better debugging -- Performance analytics -- Optimization opportunities -- Predictive completion estimates - -**Status:** 🔄 Design phase - -### 10. Multi-Project Queue 🔄 - -**Problem:** Can only run one project at a time; sequential execution only. - -**Proposed solution:** -```bash -# Queue multiple projects -fireteam-queue add ~/project1 "Goal 1" -fireteam-queue add ~/project2 "Goal 2" -fireteam-queue add ~/project3 "Goal 3" - -# Start queue processor -fireteam-queue start - -# Monitor queue -fireteam-queue status -``` - -**Expected impact:** -- Batch processing of multiple projects -- Unattended overnight runs -- Better resource utilization - -**Status:** 🔄 Future consideration (v2.0+) - -## Impact Analysis - -### Before vs. 
After Improvements - -| Metric | Before | After | Improvement | -|--------|--------|-------|-------------| -| **Timeout failures** | ~10% of cycles | 0% | ✅ 100% reduction | -| **Parse failure waste** | 1-2 cycles | 0 | ✅ Eliminated | -| **Sudo issues** | 8 wasted cycles | 0 | ✅ With sudo config | -| **Scope creep** | Frequent | Rare | ✅ 80% reduction | -| **State contamination** | Occasional | Never | ✅ 100% prevention | - -### Real-World Impact: GitHub Analyzer - -**Original run (before improvements):** -- 19 cycles total -- 8 cycles wasted on Node.js (no sudo) -- 1 cycle wasted on parse failure -- Added deployment features (scope creep) -- **Time:** ~5 hours - -**Projected with improvements:** -- Would install Node.js in Cycle 0 (with sudo) -- No parse failure cycle -- Goal alignment check would prevent scope creep -- **Estimated:** ~10 cycles, 2-3 hours (40-60% faster) - -## Testing Validation - -### Improvements Validated in Production - -✅ **Timeout increases:** Zero timeouts in production use -✅ **State isolation:** Multiple sequential projects, no contamination -✅ **Parse failure handling:** Robust recovery mechanisms validated -✅ **Agent drift detection:** Goal alignment checks active - -### Still Needs Testing - -⚠️ **Environment detection:** Not yet implemented -⚠️ **Monotonic completion:** Under consideration -⚠️ **Adaptive timeouts:** Experimental phase - -## Future Roadmap - -### v1.1 (Next Release) - -- ✅ All completed improvements (already released) -- 🔄 Environment dependency detection -- 🔄 Enhanced logging/telemetry -- 🔄 Monotonic completion enforcement - -### v2.0 (Major Update) - -- 🔄 Adaptive timeouts -- 🔄 Multi-project queue -- 🔄 Web UI for monitoring -- 🔄 Plugin system for custom agents - -### v3.0 (Vision) - -- Distributed execution (multiple machines) -- Language model agnostic (support for other LLMs) -- Advanced code analysis (security, performance) -- Integration with CI/CD pipelines - -## Contributing Improvements - -### How to Propose Improvements - -1. **Identify issue:** Observe patterns in your usage -2. **Quantify impact:** How many cycles wasted? How often? -3. **Propose solution:** Code or detailed description -4. **Test locally:** Validate with diverse projects -5. **Submit:** GitHub issue or pull request - -### Example Improvement Proposal - -```markdown -## Improvement: Intelligent Retry Delays - -**Problem:** Fixed 5-second retry delay wastes time on quick failures -but doesn't help with long API outages. - -**Observed:** -- Network hiccup: 5sec delay sufficient -- API outage: 5sec useless, need 60+ seconds - -**Proposed Solution:** -Exponential backoff: 5s → 15s → 45s - -**Expected Impact:** -- Faster recovery from transient issues -- Better handling of prolonged outages -- 30% reduction in wasted retry time - -**Testing Plan:** -- Simulate network failures -- Measure retry success rates -- Compare fixed vs. exponential delays -``` - -## Improvement Metrics - -### How We Measure Success - -1. **Cycle reduction:** Fewer cycles to same completion % -2. **Time savings:** Faster project completion -3. **Success rate:** More projects reaching ≥90% -4. **Reliability:** Fewer failures and errors -5. 
**User satisfaction:** Easier to use, better results - -### Current Performance (Post-Improvements) - -- ✅ **High success rate** - Projects consistently reach 90%+ completion -- ✅ **Strong completion quality** - Exceeds 90% target threshold -- ✅ **Efficient cycles** - Average 3-4 cycles per project -- ✅ **Minimal timeout failures** - Rare with current timeouts -- ✅ **No state issues** - Isolation working correctly - -## Next Steps - - - - - Leverage improvements via configuration - - - - Understand improved state handling - - - - Contribute your own improvements - - - diff --git a/docs/advanced/state-management.mdx b/docs/advanced/state-management.mdx deleted file mode 100644 index e762ccf..0000000 --- a/docs/advanced/state-management.mdx +++ /dev/null @@ -1,522 +0,0 @@ ---- -title: "State Management" -description: "Understanding Fireteam's state isolation, persistence, and recovery mechanisms" ---- - -## Overview - -Fireteam uses a sophisticated state management system to maintain project continuity across cycles while ensuring complete isolation between different projects. State is stored in JSON format and includes all information needed to resume or analyze project progress. - -## State File Location - -```bash -/home/claude/fireteam/state/current.json -``` - - -This file is **gitignored** and local to the Fireteam installation. It is **not** stored in your project directory. - - -## State Structure - -### Complete State Schema - -```json -{ - "project_dir": "/home/claude/my-project", - "goal": "Build a Python CLI calculator", - "status": "executing", - "cycle_number": 2, - "completion_percentage": 92, - "validation_checks": 0, - "git_branch": "agent-20251017-143022", - "current_plan": "...", - "last_execution_result": "...", - "last_review": "...", - "started_at": "2025-10-17T14:30:22.123456", - "updated_at": "2025-10-17T14:45:18.789012", - "completed": false, - "completed_at": null -} -``` - -### Field Descriptions - - - Absolute path to the project directory - - - - Original project objective/prompt - - - - Current phase: `planning`, `executing`, `reviewing` - - - - Current cycle number (0-indexed) - - - - Latest reviewer estimate (0-100) - - - - Consecutive high reviews (≥95%) for completion validation - - - - Active git branch name (e.g., `agent-20251017-143022`) - - - - Latest plan from Planner agent - - - - Latest execution output from Executor agent - - - - Latest review from Reviewer agent - - - - ISO 8601 timestamp when project started - - - - ISO 8601 timestamp of last state update - - - - Whether project has completed - - - - ISO 8601 timestamp when project completed (null if not completed) - - -## State Lifecycle - -### 1. Initialization - -When `start-agent` is called: - -```python -state = { - "project_dir": "/home/claude/project", - "goal": "Build a CLI app", - "status": "planning", - "cycle_number": 0, - "completion_percentage": 0, - "validation_checks": 0, - "git_branch": "agent-20251017-143022", - "started_at": "2025-10-17T14:30:22", - # Other fields initialized to empty/null -} -``` - -### 2. 
Cycle Updates - -During each cycle phase: - -**Planning:** -```python -state["status"] = "planning" -state["current_plan"] = planner_result -``` - -**Execution:** -```python -state["status"] = "executing" -state["last_execution_result"] = executor_result -``` - -**Review:** -```python -state["status"] = "reviewing" -state["last_review"] = reviewer_result -state["completion_percentage"] = completion_pct -state["updated_at"] = current_timestamp -``` - -**Cycle increment:** -```python -state["cycle_number"] += 1 -``` - -### 3. Completion - -When project completes: - -```python -state["completed"] = True -state["completed_at"] = "2025-10-17T15:45:30" -state["status"] = "completed" -``` - -### 4. Reset (New Project) - -When starting a new project: - -```python -# Previous state completely wiped -state = new_state_for_new_project() -``` - - -State isolation ensures no cross-contamination between projects. Each run starts with a clean slate. - - -## State Persistence - -### When State is Saved - -State is saved at critical points: - -1. **After planning** - `current_plan` updated -2. **After execution** - `last_execution_result` updated -3. **After review** - `completion_percentage`, `last_review` updated -4. **On shutdown** - Final state persisted -5. **On error** - Last known good state preserved - -### Atomic Writes - -State writes are atomic: - -```python -# Write to temporary file -temp_file.write(json.dumps(state)) - -# Atomic move -os.replace(temp_file, state_file) -``` - -This prevents corruption from crashes during writes. - -### Backup Strategy - -**Fireteam doesn't backup state**, but you can: - -```bash -# Manual backup -cp /home/claude/fireteam/state/current.json ~/state-backup-$(date +%s).json - -# Automated backup script -#!/bin/bash -while true; do - if [ -f /home/claude/fireteam/state/current.json ]; then - cp /home/claude/fireteam/state/current.json \ - ~/fireteam-backups/state-$(date +%Y%m%d-%H%M%S).json - fi - sleep 300 # Every 5 minutes -done -``` - -## Parse Failure Handling - -### The Problem - -Sometimes reviewer output can't be parsed to extract completion %: - -``` -Review text without clear percentage... -``` - -**Previous behavior:** -- Parse failure → `completion_percentage = 0` -- Triggered unnecessary cycles - -### The Solution - -**Improved parse failure handling:** - -```python -def update_completion_percentage(parsed_value, logger): - if parsed_value is None: - # Parse failure - use last known value - last_known = state.get("completion_percentage", 0) - logger.warning(f"Parse failure, using last known: {last_known}%") - return last_known - else: - # Valid parse - use new value - return parsed_value -``` - -**Safety valve:** If multiple consecutive parse failures (3+), reset to 0 to prevent infinite loops. - - -This improvement prevents wasted cycles from benign parse errors while maintaining safety. - - -## State Inspection - -### View Current State - -```bash -# Pretty-print JSON -cat /home/claude/fireteam/state/current.json | python3 -m json.tool - -# Or use jq -cat /home/claude/fireteam/state/current.json | jq . 
-``` - -### Extract Specific Fields - -```bash -# Get completion percentage -jq '.completion_percentage' /home/claude/fireteam/state/current.json - -# Get cycle number -jq '.cycle_number' /home/claude/fireteam/state/current.json - -# Get goal -jq '.goal' /home/claude/fireteam/state/current.json -``` - -### Monitor State Changes - -```bash -# Watch state file -watch -n 5 'cat /home/claude/fireteam/state/current.json | jq "{cycle: .cycle_number, completion: .completion_percentage, status: .status}"' -``` - -## State Isolation - -### How Isolation Works - -**Between projects:** - -```bash -# Project 1 -start-agent --project-dir ~/app1 --prompt "Goal 1" -# State: { project_dir: "~/app1", goal: "Goal 1", ... } - -stop-agent - -# Project 2 - COMPLETELY FRESH STATE -start-agent --project-dir ~/app2 --prompt "Goal 2" -# State: { project_dir: "~/app2", goal: "Goal 2", ... } -# Previous state completely wiped -``` - -**Why isolation matters:** -- Prevents plan contamination -- Avoids execution context bleed -- Ensures clean git history per project -- Eliminates cross-project bugs - -### State Reset on Start - -```python -def initialize_project(project_dir, goal): - # Completely wipe previous state - if state_file.exists(): - state_file.unlink() - - # Create fresh state - return { - "project_dir": project_dir, - "goal": goal, - "cycle_number": 0, - # All other fields fresh - } -``` - -## Troubleshooting State Issues - -### Corrupted State File - -**Symptoms:** -- `fireteam-status` shows "Error loading state" -- JSON parse errors -- Unexpected behavior - -**Diagnosis:** -```bash -# Validate JSON -cat /home/claude/fireteam/state/current.json | python3 -m json.tool -``` - -**Fix:** -```bash -# Backup corrupted state -mv /home/claude/fireteam/state/current.json ~/corrupted-state.json - -# Stop agent (will fail gracefully) -stop-agent - -# Start fresh (state recreated) -start-agent --project-dir ~/project --prompt "Goal" -``` - -### Stale State - -**Symptoms:** -- `fireteam-status` shows old project -- No orchestrator running - -**Fix:** -```bash -# Verify no process -ps aux | grep orchestrator - -# Remove stale state -rm -f /home/claude/fireteam/state/current.json -rm -f /home/claude/fireteam/orchestrator.pid -``` - -### State-Git Mismatch - -**Problem:** State shows cycle 5, but git only has 3 commits - -**Cause:** Commits failed or state advanced without commits - -**Investigation:** -```bash -# Check state -jq '.cycle_number' state/current.json - -# Check git commits -cd ~/project && git log --oneline | wc -l - -# View logs for commit errors -grep "commit" /home/claude/fireteam/logs/orchestrator_*.log -``` - -## Advanced State Management - -### Reading State Programmatically - -```python -import json - -with open('/home/claude/fireteam/state/current.json') as f: - state = json.load(f) - -print(f"Project: {state['project_dir']}") -print(f"Cycle: {state['cycle_number']}") -print(f"Completion: {state['completion_percentage']}%") -print(f"Status: {state['status']}") -``` - -### Monitoring State Changes - -```python -import json -import time -from watchdog.observers import Observer -from watchdog.events import FileSystemEventHandler - -class StateMonitor(FileSystemEventHandler): - def on_modified(self, event): - if event.src_path.endswith('current.json'): - with open(event.src_path) as f: - state = json.load(f) - print(f"Cycle {state['cycle_number']}: {state['completion_percentage']}%") - -observer = Observer() -observer.schedule(StateMonitor(), '/home/claude/fireteam/state') -observer.start() -``` - -### 
State-Based Alerts - -```bash -#!/bin/bash -# alert-on-completion.sh - -while true; do - if [ -f /home/claude/fireteam/state/current.json ]; then - COMPLETION=$(jq '.completion_percentage' /home/claude/fireteam/state/current.json) - if [ "$COMPLETION" -ge 95 ]; then - notify-send "Fireteam" "Project at ${COMPLETION}%!" - break - fi - fi - sleep 30 -done -``` - -## State Directory Structure - -``` -/home/claude/fireteam/state/ -├── current.json # Active project state -└── .gitignore # Ensures current.json never committed -``` - -**Note:** Only one state file at a time (current project). - -## State vs. Git History - -### State: Runtime Information - -- Current cycle, phase, completion -- Agent outputs (plan, execution, review) -- Transient, wiped between projects - -### Git: Persistent Project History - -- Code changes per cycle -- Commit messages with completion % -- Permanent, survives state resets - - -Think of state as "RAM" (temporary) and git history as "disk" (permanent) for your project. - - -## Best Practices - -### 1. Don't Manually Edit State - - -Manually editing `current.json` can cause undefined behavior. Let Fireteam manage state. - - -### 2. Backup State for Analysis - -```bash -# After interesting runs -cp /home/claude/fireteam/state/current.json ~/analysis/project-X-state.json -``` - -### 3. Inspect State on Failures - -```bash -# When something goes wrong -fireteam-status # Quick view -cat state/current.json | jq . # Detailed view -``` - -### 4. Use Git for History - -```bash -# Don't rely on state for history -cd ~/project -git log --oneline # Permanent record -``` - -## Next Steps - - - - - See state management enhancements - - - - Monitor state in real-time - - - - Resolve state-related issues - - - - Understand Fireteam's overall design - - - diff --git a/docs/api/agents.mdx b/docs/api/agents.mdx deleted file mode 100644 index 469b7ed..0000000 --- a/docs/api/agents.mdx +++ /dev/null @@ -1,135 +0,0 @@ ---- -title: "Agents API" -description: "API reference for Fireteam's agent classes" ---- - -## Agent Class Hierarchy - -``` -BaseAgent (abstract) - ├── PlannerAgent - ├── ExecutorAgent - └── ReviewerAgent -``` - -## BaseAgent (Abstract) - -Base class for all agents providing common functionality. - -**Location:** `/home/claude/fireteam/src/agents/base.py` - -### Constructor - -```python -BaseAgent(agent_type: str, logger, timeout: int = None) -``` - -**Parameters:** -- `agent_type` (str): Agent identifier ("planner", "executor", "reviewer") -- `logger`: Python logger instance -- `timeout` (int): Timeout in seconds (from config if not specified) - -### Methods - -#### execute() - -```python -def execute(**kwargs) -> dict -``` - -Abstract method implemented by subclasses. - -**Returns:** dict with `success` boolean and agent-specific results - -#### _call_claude() - -```python -def _call_claude(prompt: str, cwd: str) -> str -``` - -Internal method to invoke Claude CLI with retry logic. - -## PlannerAgent - -Creates and updates project plans. - -**Location:** `/home/claude/fireteam/src/agents/planner.py` - -### execute() - -```python -def execute( - project_dir: str, - goal: str, - cycle_number: int, - previous_plan: str = None, - last_execution_result: str = None, - last_review: str = None -) -> dict -``` - -**Returns:** -```python -{ - "success": bool, - "plan": str, - "error": str # if success=False -} -``` - -## ExecutorAgent - -Executes tasks from the plan. 
- -**Location:** `/home/claude/fireteam/src/agents/executor.py` - -### execute() - -```python -def execute( - project_dir: str, - goal: str, - plan: str, - cycle_number: int -) -> dict -``` - -**Returns:** -```python -{ - "success": bool, - "execution_result": str, - "error": str # if success=False -} -``` - -## ReviewerAgent - -Reviews code and estimates completion. - -**Location:** `/home/claude/fireteam/src/agents/reviewer.py` - -### execute() - -```python -def execute( - project_dir: str, - goal: str, - plan: str, - execution_result: str, - cycle_number: int, - is_validation: bool = False -) -> dict -``` - -**Returns:** -```python -{ - "success": bool, - "review": str, - "completion_percentage": int, # 0-100 or None if parse failed - "error": str # if success=False -} -``` - -See [API Overview](/api/overview) for usage examples. diff --git a/docs/api/configuration.mdx b/docs/api/configuration.mdx deleted file mode 100644 index 4805773..0000000 --- a/docs/api/configuration.mdx +++ /dev/null @@ -1,129 +0,0 @@ ---- -title: "Configuration API" -description: "Configuration system reference and environment variables" ---- - -## Configuration Module - -**Location:** `/home/claude/fireteam/src/config.py` - -## Constants - -### System Paths - -```python -SYSTEM_DIR: str = "/home/claude/fireteam" -STATE_DIR: str = os.path.join(SYSTEM_DIR, "state") -LOGS_DIR: str = os.path.join(SYSTEM_DIR, "logs") -CLI_DIR: str = os.path.join(SYSTEM_DIR, "cli") -``` - -### Claude CLI - -```python -CLAUDE_CLI: str = "claude" -DANGEROUSLY_SKIP_PERMISSIONS: str = "--dangerously-skip-permissions" -``` - -### Agent Configuration - -```python -MAX_RETRIES: int = 3 -RETRY_DELAY: int = 5 # seconds - -AGENT_TIMEOUTS: dict = { - "planner": 600, # 10 minutes - "reviewer": 600, # 10 minutes - "executor": 1800 # 30 minutes -} -``` - -### Completion Thresholds - -```python -COMPLETION_THRESHOLD: int = 95 # percentage -VALIDATION_CHECKS_REQUIRED: int = 3 # consecutive checks -``` - -### Git Configuration - -```python -GIT_USER_NAME: str = os.environ.get("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL: str = os.environ.get("GIT_USER_EMAIL", "fireteam@darkresearch.ai") -``` - -### Logging - -```python -LOG_LEVEL: str = "INFO" -LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -``` - -### Sudo Configuration - -```python -SUDO_PASSWORD: str | None = os.getenv("SUDO_PASSWORD", None) - -def has_sudo_access() -> bool: - """Check if sudo password is available.""" - return SUDO_PASSWORD is not None -``` - -## Environment Variables - -### Supported Variables - -| Variable | Type | Default | Description | -|----------|------|---------|-------------| -| `GIT_USER_NAME` | string | "fireteam" | Git committer name | -| `GIT_USER_EMAIL` | string | "fireteam@darkresearch.ai" | Git committer email | -| `SUDO_PASSWORD` | string | None | Sudo password for system operations | - -### Loading Environment - -```python -from pathlib import Path -from dotenv import load_dotenv - -env_file = Path(__file__).parent / ".env" -if env_file.exists(): - load_dotenv(env_file) -``` - -## Usage Examples - -### Accessing Configuration - -```python -import config - -# Use timeout values -timeout = config.AGENT_TIMEOUTS["executor"] - -# Check sudo access -if config.has_sudo_access(): - # Perform sudo operation - pass - -# Git configuration -print(f"Commits by: {config.GIT_USER_NAME} <{config.GIT_USER_EMAIL}>") -``` - -### Modifying Configuration - -Edit `/home/claude/fireteam/config.py`: - -```python -# Custom timeouts for large projects 
-AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes - "executor": 3600, # 60 minutes - "reviewer": 900 # 15 minutes -} - -# Stricter completion -COMPLETION_THRESHOLD = 98 -VALIDATION_CHECKS_REQUIRED = 5 -``` - -See [Configuration Guide](/configuration/config-file) for details. diff --git a/docs/api/estimate-complexity.mdx b/docs/api/estimate-complexity.mdx new file mode 100644 index 0000000..78ab868 --- /dev/null +++ b/docs/api/estimate-complexity.mdx @@ -0,0 +1,137 @@ +--- +title: estimate_complexity() +description: Estimate task complexity before execution +--- + +# estimate_complexity() + +Estimates the complexity of a task. Used internally by `execute()` when no mode is specified, but can also be called directly. + +## Signature + +```python +async def estimate_complexity( + goal: str, + context: str | None = None, +) -> ComplexityLevel +``` + +## Parameters + + + The task to analyze. + + + + Additional context about the task. + + +## Returns + +Returns a `ComplexityLevel` enum value: + +```python +class ComplexityLevel(Enum): + TRIVIAL = "trivial" # Single-line changes + SIMPLE = "simple" # Self-contained tasks + MODERATE = "moderate" # Multi-file changes + COMPLEX = "complex" # Architectural changes +``` + +## Examples + +### Basic Usage + +```python +from fireteam import estimate_complexity + +complexity = await estimate_complexity( + goal="Fix the typo in README.md", +) +# Returns: ComplexityLevel.TRIVIAL +``` + +### With Context + +```python +complexity = await estimate_complexity( + goal="Add user authentication", + context="Using FastAPI with existing User model and database", +) +# Returns: ComplexityLevel.MODERATE +``` + +### Use Result for Mode Selection + +```python +from fireteam import estimate_complexity, execute, ExecutionMode, ComplexityLevel + +complexity = await estimate_complexity( + goal="Implement new feature", +) + +# Custom mode selection logic +if complexity == ComplexityLevel.COMPLEX: + mode = ExecutionMode.FULL +else: + mode = ExecutionMode.SIMPLE + +result = await execute( + project_dir="/path/to/project", + goal="Implement new feature", + mode=mode, +) +``` + +## Complexity Guidelines + +### TRIVIAL + +- Fix typos +- Add comments +- Simple formatting + +```python +await estimate_complexity("Fix typo in README") +# → ComplexityLevel.TRIVIAL +``` + +### SIMPLE + +- Single function implementation +- Add logging +- Fix obvious bugs + +```python +await estimate_complexity("Add logging to auth module") +# → ComplexityLevel.SIMPLE +``` + +### MODERATE + +- Refactor a module +- Add feature with tests +- Fix complex bug + +```python +await estimate_complexity("Refactor user service") +# → ComplexityLevel.MODERATE +``` + +### COMPLEX + +- Architectural changes +- New subsystems +- Major refactoring + +```python +await estimate_complexity("Redesign authentication system") +# → ComplexityLevel.COMPLEX +``` + +## Implementation Notes + +- Uses a single Claude API call with no tools +- Response is parsed to extract complexity level +- Defaults to SIMPLE if response is unclear +- Handles case-insensitive responses diff --git a/docs/api/execute.mdx b/docs/api/execute.mdx new file mode 100644 index 0000000..eea33b3 --- /dev/null +++ b/docs/api/execute.mdx @@ -0,0 +1,146 @@ +--- +title: execute() +description: Execute a task with adaptive complexity handling +--- + +# execute() + +The main function for executing tasks. Automatically estimates complexity and selects the appropriate execution strategy. 
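+
+Because `execute()` is declared `async`, the examples below assume an async caller. From a synchronous script you can drive it with `asyncio.run`, for example:
+
+```python
+import asyncio
+
+from fireteam import execute
+
+
+async def main() -> None:
+    result = await execute(
+        project_dir="/path/to/project",
+        goal="Fix the login bug",
+    )
+    print(result.success, result.completion_percentage)
+
+
+asyncio.run(main())
+```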
+ +## Signature + +```python +async def execute( + project_dir: str | Path, + goal: str, + context: str | None = None, + mode: ExecutionMode | None = None, + run_tests: bool = True, +) -> ExecutionResult +``` + +## Parameters + + + Path to the project directory. Will be resolved to an absolute path. + + + + The task to accomplish. Should be clear and specific. + + + + Additional context to help Claude understand the task. Can include error logs, requirements, or relevant information. + + + + Execution mode to use. If `None`, complexity is estimated and mode is selected automatically. + + + + Whether to run tests after file edits. When `True`, quality hooks are enabled. + + +## Returns + +Returns an `ExecutionResult` with the following fields: + +```python +@dataclass +class ExecutionResult: + success: bool # Whether the task completed successfully + mode: ExecutionMode # The execution mode used + output: str | None # Execution output + error: str | None # Error message if failed + completion_percentage: int # 0-100 completion estimate + metadata: dict # Additional info (plan, review, etc.) +``` + +## Examples + +### Basic Usage + +```python +from fireteam import execute + +result = await execute( + project_dir="/path/to/project", + goal="Fix the login bug", +) + +if result.success: + print(f"Done! Output: {result.output}") +``` + +### With Context + +```python +result = await execute( + project_dir="/path/to/project", + goal="Fix the authentication error", + context=""" + Error from logs: + TypeError: 'NoneType' object is not subscriptable + at auth.py:42 in validate_token() + """, +) +``` + +### Specify Mode + +```python +from fireteam import execute, ExecutionMode + +# Force full planning cycle +result = await execute( + project_dir="/path/to/project", + goal="Refactor the user module", + mode=ExecutionMode.FULL, +) +``` + +### Disable Tests + +```python +result = await execute( + project_dir="/path/to/project", + goal="Add experimental feature", + run_tests=False, +) +``` + +## Execution Flow + +1. **Complexity Estimation** (if mode not specified) + - Analyzes goal and context + - Returns TRIVIAL, SIMPLE, MODERATE, or COMPLEX + +2. **Mode Selection** + - TRIVIAL → SINGLE_TURN + - SIMPLE → SIMPLE + - MODERATE → MODERATE + - COMPLEX → FULL + +3. **Execution** (varies by mode) + - SINGLE_TURN: Direct execution + - SIMPLE: Execute only + - MODERATE: Execute + review + - FULL: Plan + execute + validation reviews + +4. **Result** + - Returns ExecutionResult with success status, output, and metadata + +## Error Handling + +The function catches exceptions and returns them in the result: + +```python +result = await execute( + project_dir="/nonexistent/path", + goal="Do something", +) + +if not result.success: + print(f"Error: {result.error}") + # Error: Project directory not found +``` diff --git a/docs/api/overview.mdx b/docs/api/overview.mdx deleted file mode 100644 index e65d5f5..0000000 --- a/docs/api/overview.mdx +++ /dev/null @@ -1,419 +0,0 @@ ---- -title: "API Overview" -description: "Technical reference for Fireteam's Python architecture" ---- - -## Overview - -Fireteam is built as a modular Python system with clean separation between orchestration, agent execution, and state management. This section provides technical documentation for developers who want to understand or extend Fireteam's internals. 
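-
-The code samples on this page assume Fireteam's `src/` directory is on the import path, using the same pattern as the programmatic example further down:
-
-```python
-import sys
-
-# Make the orchestrator, agents, and state modules importable.
-sys.path.insert(0, "/home/claude/fireteam/src")
-
-from orchestrator import Orchestrator  # noqa: E402
-```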
- -## Architecture Components - - - - - Main control loop managing agent lifecycle and cycle execution - - - - Specialized agents (Planner, Executor, Reviewer) powered by Claude - - - - Persistent state management with isolation and recovery - - - - Centralized configuration via config.py - - - - -## Project Structure - -``` -fireteam/ -├── src/ # Source code directory -│ ├── orchestrator.py # Main orchestration loop -│ ├── config.py # System configuration -│ ├── __init__.py -│ ├── agents/ -│ │ ├── __init__.py -│ │ ├── base.py # Base agent class -│ │ ├── planner.py # Planner agent implementation -│ │ ├── executor.py # Executor agent implementation -│ │ └── reviewer.py # Reviewer agent implementation -│ └── state/ -│ └── manager.py # State management module -├── state/ # Runtime state data (gitignored) -│ └── current.json # Active project state -├── cli/ -│ ├── start-agent # Start command -│ ├── stop-agent # Stop command -│ └── fireteam-status # Status tool -└── logs/ # Orchestrator logs -``` - -## Core Classes - -### Orchestrator - -Main control class managing the agent system lifecycle. - -**Location:** `/home/claude/fireteam/src/orchestrator.py` - -**Key methods:** -- `__init__(project_dir, goal)` - Initialize orchestrator -- `run()` - Main execution loop -- `run_cycle(state)` - Execute single cycle -- `check_completion(state)` - Validation logic -- `commit_changes(cycle, message)` - Git integration - -**Usage:** -```python -orchestrator = Orchestrator( - project_dir="/home/claude/project", - goal="Build a CLI calculator" -) -orchestrator.run() -``` - -### BaseAgent - -Abstract base class for all agents. - -**Location:** `/home/claude/fireteam/src/agents/base.py` - -**Key methods:** -- `execute(**kwargs)` - Main execution method (abstract) -- `_call_claude(prompt, cwd)` - Claude CLI interaction -- `_parse_output(output)` - Output parsing - -### PlannerAgent - -Creates and updates project plans. - -**Location:** `/home/claude/fireteam/src/agents/planner.py` - -**Input:** -- `goal`: Project objective -- `cycle_number`: Current cycle -- `previous_plan`: Last plan (if any) -- `last_execution_result`: Executor output -- `last_review`: Reviewer feedback - -**Output:** -- `plan`: Structured project plan - -### ExecutorAgent - -Implements tasks from the plan. - -**Location:** `/home/claude/fireteam/src/agents/executor.py` - -**Input:** -- `goal`: Project objective -- `plan`: Current plan to execute -- `cycle_number`: Current cycle - -**Output:** -- `execution_result`: Summary of work done - -### ReviewerAgent - -Reviews code and estimates completion. - -**Location:** `/home/claude/fireteam/src/agents/reviewer.py` - -**Input:** -- `goal`: Project objective -- `plan`: Current plan -- `execution_result`: What was implemented -- `cycle_number`: Current cycle -- `is_validation`: Whether in validation mode - -**Output:** -- `review`: Review text -- `completion_percentage`: Estimated completion (0-100) - -### StateManager - -Manages project state persistence. 
- -**Location:** `/home/claude/fireteam/src/state/manager.py` - -**Key methods:** -- `initialize_project(dir, goal)` - Create fresh state -- `load_state()` - Load current state -- `update_state(updates)` - Update state fields -- `increment_cycle()` - Advance cycle counter -- `mark_completed()` - Mark project complete - -## Configuration System - -### config.py Structure - -```python -# System paths -SYSTEM_DIR = "/home/claude/fireteam" -STATE_DIR = os.path.join(SYSTEM_DIR, "state") -LOGS_DIR = os.path.join(SYSTEM_DIR, "logs") - -# Agent timeouts (seconds) -AGENT_TIMEOUTS = { - "planner": 600, - "reviewer": 600, - "executor": 1800 -} - -# Completion thresholds -COMPLETION_THRESHOLD = 95 -VALIDATION_CHECKS_REQUIRED = 3 - -# Git configuration -GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "fireteam@darkresearch.ai") - -# Optional sudo password -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) -``` - -See [Configuration Reference](/configuration/config-file) for details. - -## Agent Communication Flow - -``` -Orchestrator - ↓ (passes goal, context) -PlannerAgent → creates plan - ↓ (passes plan, goal) -ExecutorAgent → implements plan - ↓ (passes execution result, plan) -ReviewerAgent → reviews & estimates completion - ↓ (passes completion %) -Orchestrator → checks validation → commits to git → next cycle -``` - -## Extending Fireteam - -### Adding a Custom Agent - -```python -# agents/custom_agent.py -from agents.base import BaseAgent - -class CustomAgent(BaseAgent): - def __init__(self, logger): - super().__init__("custom", logger) - - def execute(self, **kwargs): - prompt = self._build_prompt(**kwargs) - output = self._call_claude(prompt, kwargs['project_dir']) - result = self._parse_output(output) - - return { - "success": True, - "custom_result": result - } -``` - -### Modifying Agent Prompts - -Agent prompts are built in each agent's `_build_prompt()` method: - -```python -# agents/planner.py -def _build_prompt(self, goal, cycle_number, ...): - return f""" - You are a planning agent for project: {goal} - - Current cycle: {cycle_number} - - Previous plan: {previous_plan} - - Create an updated plan... 
- """ -``` - -### Custom State Fields - -```python -# Extend state with custom fields -state_manager.update_state({ - "custom_metric": 42, - "custom_flag": True -}) -``` - -## API Examples - -### Programmatic Project Execution - -```python -import sys -sys.path.insert(0, '/home/claude/fireteam/src') - -from orchestrator import Orchestrator - -# Create orchestrator -orch = Orchestrator( - project_dir="/home/claude/my-project", - goal="Build a Python CLI calculator" -) - -# Run project -exit_code = orch.run() - -print(f"Project completed with exit code: {exit_code}") -``` - -### Reading State Programmatically - -```python -import json - -with open('/home/claude/fireteam/state/current.json') as f: - state = json.load(f) - -print(f"Cycle: {state['cycle_number']}") -print(f"Completion: {state['completion_percentage']}%") -``` - -### Custom Logging - -```python -import logging - -# Configure custom logger -logging.basicConfig( - level=logging.DEBUG, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('custom.log'), - logging.StreamHandler() - ] -) - -logger = logging.getLogger("fireteam-custom") -``` - -## Integration Points - -### Git Integration - -Fireteam automatically: -- Initializes git repo if needed -- Creates timestamped branch -- Commits after each cycle -- Pushes to remote (if configured) - -**Manual git operations:** -```python -# In orchestrator.py -def commit_changes(self, cycle_number, message_suffix): - subprocess.run(["git", "add", "."], cwd=self.project_dir) - subprocess.run( - ["git", "commit", "-m", f"Cycle {cycle_number}: {message_suffix}"], - cwd=self.project_dir - ) -``` - -### Claude CLI Integration - -All agents use: -```python -# Base agent method -def _call_claude(self, prompt, cwd): - result = subprocess.run( - [ - "claude", - "--dangerously-skip-permissions", - "--prompt", prompt, - "--cwd", cwd - ], - capture_output=True, - text=True, - timeout=self.timeout - ) - return result.stdout -``` - -## Error Handling - -### Retry Logic - -```python -# config.py -MAX_RETRIES = 3 -RETRY_DELAY = 5 # seconds - -# Implemented in agents -for attempt in range(MAX_RETRIES): - try: - return self._call_claude(prompt, cwd) - except subprocess.TimeoutExpired: - if attempt < MAX_RETRIES - 1: - time.sleep(RETRY_DELAY) - continue - raise -``` - -### Graceful Degradation - -- Agent failures don't crash orchestrator -- State preserved on errors -- Logged for debugging -- Manual intervention possible - -## Testing - -### Running Unit Tests - -```python -# tests/test_state_manager.py -import pytest -from state.manager import StateManager - -def test_state_initialization(): - sm = StateManager() - state = sm.initialize_project("/tmp/test", "Test goal") - assert state["cycle_number"] == 0 - assert state["goal"] == "Test goal" -``` - -### Mock Agent Testing - -```python -# tests/test_orchestrator.py -from unittest.mock import Mock, patch - -def test_cycle_execution(): - with patch('agents.planner.PlannerAgent') as mock_planner: - mock_planner.return_value.execute.return_value = { - "success": True, - "plan": "Test plan" - } - # Test orchestrator cycle... 
-``` - -## Next Steps - - - - - Detailed StateManager class documentation - - - - Agent class hierarchy and methods - - - - Configuration system reference - - - - View source code - - - diff --git a/docs/api/state-manager.mdx b/docs/api/state-manager.mdx deleted file mode 100644 index f5d417e..0000000 --- a/docs/api/state-manager.mdx +++ /dev/null @@ -1,121 +0,0 @@ ---- -title: "StateManager API" -description: "Detailed API reference for Fireteam's StateManager class" ---- - -## StateManager Class - -The StateManager class handles all project state persistence, ensuring clean isolation between projects and reliable state recovery. - -**Location:** `/home/claude/fireteam/state/manager.py` - -## Class Reference - -### Constructor - -```python -StateManager() -``` - -No parameters required. Automatically configures paths from `config.py`. - -## Methods - -### initialize_project() - -Create a fresh project state. - -```python -def initialize_project(project_dir: str, goal: str) -> dict -``` - -**Parameters:** -- `project_dir` (str): Absolute path to project directory -- `goal` (str): Project objective - -**Returns:** dict - Fresh state with initialized fields - -**Example:** -```python -sm = StateManager() -state = sm.initialize_project("/home/claude/project", "Build a CLI calculator") -``` - -### load_state() - -Load current state from disk. - -```python -def load_state() -> dict | None -``` - -**Returns:** dict or None if no state exists - -**Example:** -```python -state = sm.load_state() -if state: - print(f"Cycle: {state['cycle_number']}") -``` - -### update_state() - -Update specific state fields. - -```python -def update_state(updates: dict) -> dict -``` - -**Parameters:** -- `updates` (dict): Fields to update - -**Returns:** dict - Updated complete state - -**Example:** -```python -state = sm.update_state({ - "status": "executing", - "current_plan": "New plan text" -}) -``` - -### update_completion_percentage() - -Update completion % with parse failure handling. - -```python -def update_completion_percentage(parsed_value: int | None, logger) -> int -``` - -**Parameters:** -- `parsed_value` (int|None): Parsed completion % or None if parse failed -- `logger`: Logger instance for warnings - -**Returns:** int - Completion % to use (parsed or last known) - -**Example:** -```python -completion = sm.update_completion_percentage(92, logger) -``` - -### increment_cycle() - -Advance to next cycle. - -```python -def increment_cycle() -> dict -``` - -**Returns:** dict - Updated state with incremented cycle_number - -### mark_completed() - -Mark project as completed. - -```python -def mark_completed() -> dict -``` - -**Returns:** dict - State with completed=True and timestamp - -See [API Overview](/api/overview) for more details. diff --git a/docs/api/types.mdx b/docs/api/types.mdx new file mode 100644 index 0000000..35ebbb8 --- /dev/null +++ b/docs/api/types.mdx @@ -0,0 +1,155 @@ +--- +title: Types +description: Type definitions for Fireteam +--- + +# Types + +This page documents the types used by Fireteam. + +## ExecutionMode + +Enum for execution strategies. 
+ +```python +from fireteam import ExecutionMode + +class ExecutionMode(Enum): + SINGLE_TURN = "single_turn" # Direct Opus call, minimal tools + SIMPLE = "simple" # Execute only + MODERATE = "moderate" # Execute + single review + FULL = "full" # Plan + execute + validation reviews +``` + +### Values + +| Value | Description | +|-------|-------------| +| `SINGLE_TURN` | Minimal execution for trivial tasks | +| `SIMPLE` | Execute without review | +| `MODERATE` | Execute with single review | +| `FULL` | Plan + execute + multiple validation reviews | + +## ComplexityLevel + +Enum for task complexity classification. + +```python +from fireteam import ComplexityLevel + +class ComplexityLevel(Enum): + TRIVIAL = "trivial" # Single-line changes + SIMPLE = "simple" # Self-contained tasks + MODERATE = "moderate" # Multi-file changes + COMPLEX = "complex" # Architectural changes +``` + +### Mapping to ExecutionMode + +| ComplexityLevel | ExecutionMode | +|-----------------|---------------| +| `TRIVIAL` | `SINGLE_TURN` | +| `SIMPLE` | `SIMPLE` | +| `MODERATE` | `MODERATE` | +| `COMPLEX` | `FULL` | + +## ExecutionResult + +Dataclass containing execution results. + +```python +from dataclasses import dataclass, field +from fireteam import ExecutionMode + +@dataclass +class ExecutionResult: + success: bool + mode: ExecutionMode + output: str | None = None + error: str | None = None + completion_percentage: int = 0 + metadata: dict = field(default_factory=dict) +``` + +### Fields + +| Field | Type | Description | +|-------|------|-------------| +| `success` | `bool` | Whether the task completed successfully | +| `mode` | `ExecutionMode` | The execution mode that was used | +| `output` | `str \| None` | Execution output/result text | +| `error` | `str \| None` | Error message if failed | +| `completion_percentage` | `int` | 0-100 completion estimate | +| `metadata` | `dict` | Additional info (plan, review, etc.) | + +### Example + +```python +from fireteam import execute + +result = await execute( + project_dir="/path/to/project", + goal="Fix the bug", +) + +# Access fields +print(result.success) # True +print(result.mode) # ExecutionMode.SIMPLE +print(result.output) # "Fixed the null pointer exception..." 
+print(result.completion_percentage) # 100 +print(result.metadata) # {"review": "...", "plan": "..."} +``` + +## Hook Types + +### Pre-configured Hook Sets + +```python +from fireteam.hooks import ( + QUALITY_HOOKS, # Run tests, block questions + AUTONOMOUS_HOOKS, # Block user interaction + DEBUG_HOOKS, # Log tool usage +) +``` + +### Hook Function Signature + +```python +async def hook_function( + event: dict, # Hook event data + context: Any, # Execution context + config: Any, # Configuration +) -> dict: + """ + Returns: + Empty dict to continue normally + Dict with hookSpecificOutput to modify behavior + """ + return {} +``` + +## Constants + +### COMPLEXITY_TO_MODE + +Mapping from complexity to execution mode: + +```python +from fireteam.api import COMPLEXITY_TO_MODE + +COMPLEXITY_TO_MODE = { + ComplexityLevel.TRIVIAL: ExecutionMode.SINGLE_TURN, + ComplexityLevel.SIMPLE: ExecutionMode.SIMPLE, + ComplexityLevel.MODERATE: ExecutionMode.MODERATE, + ComplexityLevel.COMPLEX: ExecutionMode.FULL, +} +``` + +### Prompts + +System prompts used by Fireteam: + +```python +from fireteam.api import EXECUTOR_PROMPT, REVIEWER_PROMPT, PLANNER_PROMPT +from fireteam.complexity import COMPLEXITY_PROMPT +``` diff --git a/docs/cli-tools/fireteam-status.mdx b/docs/cli-tools/fireteam-status.mdx deleted file mode 100644 index b49fb55..0000000 --- a/docs/cli-tools/fireteam-status.mdx +++ /dev/null @@ -1,548 +0,0 @@ ---- -title: "fireteam-status" -description: "Monitor running Fireteam projects, view logs, and check system resources" ---- - -## Synopsis - -```bash -fireteam-status [OPTIONS] -``` - -## Description - -The `fireteam-status` command provides real-time monitoring of Fireteam projects. It shows project state, completion progress, cycle information, and system resource usage. Supports one-time status checks, live monitoring, and log viewing. - -## Options - - - Enable watch mode - refresh status every N seconds - - - - Refresh interval for watch mode (seconds) - - - - Show recent log entries instead of status - - - - Follow log output in real-time (like `tail -f`) - - - - Number of log lines to display - - -## Basic Usage - -### One-Time Status Check - -```bash -fireteam-status -``` - -**Output:** -``` -============================================================ -🔥 FIRETEAM STATUS -============================================================ - -Status: ✅ RUNNING (PID: 12345) - -📁 Project State: ------------------------------------------------------------- - Project: /home/claude/bitcoin-cli - Goal: Build a Bitcoin price checker CLI using Python - Status: EXECUTING - Cycle: 2 - Completion: 92% - Git Branch: agent-20251017-143022 - Started: 2025-10-17 14:30:22 - Updated: 2025-10-17 14:45:18 - -============================================================ -``` - -### Live Monitoring - -```bash -fireteam-status --watch -``` - -Updates screen every 5 seconds with latest status. 
- -**Custom refresh interval:** -```bash -fireteam-status --watch --interval 10 -``` - -### View Logs - -```bash -# Show last 20 lines -fireteam-status --logs - -# Show last 50 lines -fireteam-status --logs --lines 50 - -# Follow logs in real-time -fireteam-status --logs --follow -``` - -## Status Information - -### Process Status - -Indicates if Fireteam orchestrator is running: - -- **✅ RUNNING**: Active project execution -- **⏹️ STOPPED**: No active project - -### Project State - -When a project is running, displays: - -| Field | Description | -|-------|-------------| -| **Project** | Absolute path to project directory | -| **Goal** | Project objective (truncated if >80 chars) | -| **Status** | Current phase (PLANNING, EXECUTING, REVIEWING) | -| **Cycle** | Current cycle number (0-indexed) | -| **Completion** | Latest reviewer estimate (0-100%) | -| **Git Branch** | Active git branch name | -| **Started** | Project start timestamp | -| **Updated** | Last state update timestamp | - -### System Resources (Watch Mode) - -In watch mode, shows resource usage: - -``` -💻 System Resources: ------------------------------------------------------------- - Memory: 4.2G / 16G used - CPU Load: 1.23, 1.45, 1.67 - Disk: 45% used -``` - -## Watch Mode - -### Starting Watch Mode - -```bash -fireteam-status --watch -``` - -**Features:** -- Screen clears and refreshes automatically -- Shows system resources -- Updates at configurable interval -- Exit with `Ctrl+C` - -### Optimal Use Cases - -**Long-running projects:** -```bash -# Terminal 1: Start project -start-agent --project-dir ~/complex-app --prompt "Goal" - -# Terminal 2: Monitor continuously -fireteam-status --watch --interval 3 -``` - -**Quick checks:** -```bash -# Check every 30 seconds -fireteam-status --watch --interval 30 -``` - -### Watch Mode Tips - - -Use `tmux` or `screen` to keep watch mode running in background: - -```bash -tmux new -s fireteam-monitor -fireteam-status --watch -# Detach: Ctrl+B, D -``` - - -## Log Viewing - -### Show Recent Logs - -```bash -fireteam-status --logs -``` - -Shows last 20 lines from latest orchestrator log. - -### Follow Logs - -```bash -fireteam-status --logs --follow -``` - -Continuously streams new log entries (like `tail -f`). - -### Custom Line Count - -```bash -# Show last 100 lines -fireteam-status --logs --lines 100 - -# Show last 5 lines -fireteam-status --logs --lines 5 -``` - -### Log Analysis - -**Find errors:** -```bash -fireteam-status --logs --lines 500 | grep ERROR -``` - -**Check completion progress:** -```bash -fireteam-status --logs --lines 200 | grep "Completion:" -``` - -**View cycle starts:** -```bash -fireteam-status --logs --lines 300 | grep "CYCLE" -``` - -## Understanding Output - -### Completion Percentage - -The completion percentage comes from the Reviewer agent: - -- **0-50%**: Early stages, basic structure -- **50-75%**: Core functionality implemented -- **75-90%**: Most features complete, refinement needed -- **90-95%**: Nearly complete, minor polish -- **95-100%**: Validation mode, triple-checking - - -When completion reaches ≥95%, Fireteam enters validation mode requiring 3 consecutive high reviews before completing. 
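-
-In simplified terms, the completion gate looks like this (a sketch built from the `COMPLETION_THRESHOLD` and `VALIDATION_CHECKS_REQUIRED` constants in `config.py`, not the actual `Orchestrator.check_completion()` implementation):
-
-```python
-COMPLETION_THRESHOLD = 95        # from config.py
-VALIDATION_CHECKS_REQUIRED = 3   # consecutive high reviews needed
-
-def check_completion(completion_pct: int, validation_checks: int) -> tuple[bool, int]:
-    """Return (project_complete, updated_validation_checks)."""
-    if completion_pct >= COMPLETION_THRESHOLD:
-        validation_checks += 1
-        return validation_checks >= VALIDATION_CHECKS_REQUIRED, validation_checks
-    # A review below the threshold resets the validation streak.
-    return False, 0
-```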
- - -### Cycle Progression - -**Normal pattern:** - -``` -Cycle 0: 85% → 88% → 92% -Cycle 1: 92% → 95% → 95% -Cycle 2: 95% → 96% → 98% -Validation: 98% → 98% → 98% ✅ Complete -``` - -**Issue indicators:** - -``` -Cycle 5: 92% → 88% ⚠️ Regression -Cycle 6: 88% → 88% ⚠️ Stagnation -``` - -### Status Phases - -| Phase | Meaning | Typical Duration | -|-------|---------|------------------| -| **PLANNING** | Creating/updating project plan | 2-10 minutes | -| **EXECUTING** | Implementing tasks from plan | 5-30 minutes | -| **REVIEWING** | Analyzing code, running tests | 3-10 minutes | - -## Real-World Examples - -### Example 1: Monitoring Simple Project - -```bash -$ fireteam-status - -============================================================ -🔥 FIRETEAM STATUS -============================================================ - -Status: ✅ RUNNING (PID: 54321) - -📁 Project State: ------------------------------------------------------------- - Project: /home/claude/weather-cli - Goal: Build a Python CLI that fetches weather data - Status: REVIEWING - Cycle: 1 - Completion: 95% - Git Branch: agent-20251017-140000 - Started: 2025-10-17 14:00:00 - Updated: 2025-10-17 14:35:12 - -============================================================ -``` - -**Interpretation:** -- Cycle 1 (likely near completion) -- 95% complete (in validation range) -- Currently reviewing -- Total runtime: ~35 minutes - -### Example 2: Watch Mode During Complex Build - -```bash -$ fireteam-status --watch --interval 10 -``` - -**Updates every 10 seconds:** - -``` -[Screen refreshes automatically] - -============================================================ -🔥 FIRETEAM STATUS -============================================================ - -Status: ✅ RUNNING (PID: 67890) - -📁 Project State: ------------------------------------------------------------- - Project: /home/claude/github-analyzer - Goal: Build a TypeScript CLI tool that analyzes GitHub... - Status: EXECUTING - Cycle: 8 - Completion: 78% - Git Branch: agent-20251017-120000 - Started: 2025-10-17 12:00:00 - Updated: 2025-10-17 14:22:45 - -💻 System Resources: ------------------------------------------------------------- - Memory: 6.8G / 16G used - CPU Load: 2.34, 1.98, 1.76 - Disk: 52% used - -============================================================ - Refreshing every 10s... (Ctrl+C to stop) -``` - -### Example 3: Following Logs - -```bash -$ fireteam-status --logs --follow -``` - -**Live output:** -``` -📄 Latest log: orchestrator_20251017_140000.log -============================================================ -2025-10-17 14:30:22 - orchestrator - INFO - CYCLE 2 - Starting -2025-10-17 14:30:22 - orchestrator - INFO - PHASE 1: Planning -2025-10-17 14:33:15 - orchestrator - INFO - Planning completed -2025-10-17 14:33:15 - orchestrator - INFO - PHASE 2: Execution -2025-10-17 14:48:22 - orchestrator - INFO - Execution completed -2025-10-17 14:48:22 - orchestrator - INFO - PHASE 3: Review -2025-10-17 14:52:10 - orchestrator - INFO - Review completed - Completion: 96% -[waiting for new entries...] 
-``` - -## Troubleshooting - -### "No active project" shown but agent is running - -**Cause:** State file missing or corrupted - -**Solution:** -```bash -# Check for state file -ls -la /home/claude/fireteam/state/current.json - -# If missing, might be early initialization -# Wait a few seconds and check again -sleep 5 -fireteam-status -``` - -### Watch mode not updating - -**Cause:** Process might have crashed - -**Solution:** -```bash -# Exit watch mode (Ctrl+C) -# Check process status -ps aux | grep orchestrator - -# View logs for errors -fireteam-status --logs --lines 50 | grep ERROR -``` - -### "Error loading state" message - -**Cause:** Malformed JSON in state file - -**Solution:** -```bash -# View state file -cat /home/claude/fireteam/state/current.json | python3 -m json.tool - -# If corrupted, stop and restart -stop-agent -start-agent --project-dir ~/project --prompt "Goal" -``` - -### Logs not showing - -**Cause:** No log files or wrong directory - -**Solution:** -```bash -# Check log directory -ls -la /home/claude/fireteam/logs/ - -# Verify orchestrator logs exist -ls -la /home/claude/fireteam/logs/orchestrator_*.log - -# If missing, check if Fireteam is actually running -ps aux | grep orchestrator -``` - -## Advanced Usage - -### Scripting with fireteam-status - -**Wait for completion:** -```bash -#!/bin/bash -while fireteam-status | grep -q "RUNNING"; do - echo "Still running... $(date)" - sleep 60 -done -echo "Project complete!" -``` - -**Alert on high completion:** -```bash -#!/bin/bash -while true; do - COMPLETION=$(fireteam-status | grep "Completion:" | awk '{print $2}' | tr -d '%') - if [ "$COMPLETION" -ge 95 ]; then - notify-send "Fireteam" "Project at ${COMPLETION}% - nearing completion!" - break - fi - sleep 30 -done -``` - -**Extract cycle number:** -```bash -CYCLE=$(fireteam-status | grep "Cycle:" | awk '{print $2}') -echo "Current cycle: $CYCLE" -``` - -### Combining with Other Tools - -**Monitor with tmux:** -```bash -tmux new-session -d -s fireteam 'fireteam-status --watch' -tmux attach -t fireteam -``` - -**Log to file:** -```bash -fireteam-status --logs --follow > fireteam-monitor.log & -``` - -**Parse JSON state directly:** -```bash -python3 << EOF -import json -with open('/home/claude/fireteam/state/current.json') as f: - state = json.load(f) - print(f"Cycle: {state['cycle_number']}") - print(f"Completion: {state['completion_percentage']}%") -EOF -``` - -## Output Formats - -### Exit Codes - -| Code | Condition | -|------|-----------| -| `0` | Success, status displayed | -| `1` | Error (invalid arguments, etc.) | - -### JSON Output (Not Yet Supported) - -Future enhancement to support: -```bash -fireteam-status --json -``` - -Would output: -```json -{ - "running": true, - "pid": 12345, - "project": "/home/claude/project", - "cycle": 2, - "completion": 95, - "status": "reviewing" -} -``` - -## Best Practices - -### 1. Monitor Long-Running Projects - - -For projects expected to take >1 hour, use watch mode in a separate terminal or tmux session. - - -### 2. Check Logs on Failures - -If completion percentage doesn't increase for 2+ cycles: -```bash -fireteam-status --logs --lines 100 | grep ERROR -``` - -### 3. Use Appropriate Refresh Intervals - -- **Fast iteration** (simple projects): `--interval 3` -- **Normal monitoring**: `--interval 5` (default) -- **Background checks**: `--interval 30` -- **Minimal overhead**: `--interval 60` - -### 4. 
Understand Completion Trends - -Track completion over time: -```bash -fireteam-status --logs | grep "Completion:" | tail -10 -``` - -Look for steady increases vs. stagnation. - -## Next Steps - - - - - Learn how to launch new Fireteam projects - - - - Gracefully shutdown running projects - - - - Resolve common issues with monitoring - - - - Deep dive into state files and persistence - - - diff --git a/docs/cli-tools/overview.mdx b/docs/cli-tools/overview.mdx deleted file mode 100644 index fe22063..0000000 --- a/docs/cli-tools/overview.mdx +++ /dev/null @@ -1,404 +0,0 @@ ---- -title: "CLI Tools Overview" -description: "Command-line tools for managing and monitoring Fireteam projects" ---- - -## Overview - -Fireteam provides a suite of command-line tools for starting, stopping, and monitoring autonomous development projects. These tools are installed globally during setup and provide a simple interface to the powerful multi-agent system. - -## Available Commands - - - - - Launch a new Fireteam project with a specific goal - - - - Monitor running projects and view system status - - - - Gracefully shutdown running Fireteam instances - - - - Legacy alias for fireteam-status (deprecated) - - - - -## Installation Location - -CLI tools are installed to `~/.local/bin/` during Fireteam setup: - -```bash -~/.local/bin/ -├── start-agent # Start a project -├── stop-agent # Stop running project -├── fireteam-status # View status and logs -└── agent-progress # Legacy status command -``` - - -Ensure `~/.local/bin` is in your PATH. The setup script automatically adds this to your shell configuration. - - -**Verify installation:** - -```bash -which fireteam-status -# Output: /home/claude/.local/bin/fireteam-status - -fireteam-status --help -``` - -## Quick Reference - -### Starting a Project - -```bash -start-agent --project-dir ~/my-project --prompt "Your project goal here" -``` - -### Checking Status - -```bash -# One-time status check -fireteam-status - -# Live monitoring (updates every 5s) -fireteam-status --watch - -# View logs -fireteam-status --logs -``` - -### Stopping a Project - -```bash -stop-agent -``` - -## Common Workflows - -### Workflow 1: Start and Monitor - -```bash -# Start a new project -start-agent --project-dir ~/bitcoin-cli --prompt "Build a Bitcoin price checker CLI using Python" - -# Monitor progress in real-time -fireteam-status --watch - -# When satisfied, stop (or let it complete) -stop-agent -``` - -### Workflow 2: Background Execution - -```bash -# Start project -start-agent --project-dir ~/api-server --prompt "Build a REST API with FastAPI" - -# Check on it later -fireteam-status - -# View logs to see what happened -fireteam-status --logs --follow -``` - -### Workflow 3: Multiple Projects (Sequential) - -```bash -# Project 1 -start-agent --project-dir ~/project1 --prompt "Goal 1" -# ... wait for completion or stop ... -stop-agent - -# Project 2 (fresh state) -start-agent --project-dir ~/project2 --prompt "Goal 2" -``` - - -Fireteam can only run **one project at a time**. Starting a new project while one is running will fail. Use `stop-agent` first. 
- - -## Command Features - -### Global Flags - -All CLI tools support standard behavior: - -- **Exit codes**: 0 for success, 1 for error -- **Error output**: Sent to stderr -- **Help text**: Available via `--help` flag - -### Shell Completion - -Add shell completion for better UX: - - - -```bash Bash -# Add to ~/.bashrc -complete -W "--project-dir --prompt --watch --logs --follow --interval" start-agent fireteam-status -``` - -```zsh Zsh -# Add to ~/.zshrc -autoload -U compinit -compinit -``` - - - -## Environment Integration - -### Working Directory - -CLI tools work from any directory but operate on specific project directories: - -```bash -# Can run from anywhere -cd ~ -start-agent --project-dir /home/claude/projects/myapp --prompt "Build app" - -# Fireteam works in /home/claude/projects/myapp -``` - -### Process Management - -- **Background execution**: Projects run as background processes -- **PID tracking**: Process ID stored in `orchestrator.pid` -- **Graceful shutdown**: SIGTERM signal for clean exit -- **Automatic cleanup**: Stale PID files removed automatically - -### State Isolation - -Each project run creates isolated state: - -- State stored in `/home/claude/fireteam/state/current.json` -- Completely reset between projects -- No cross-contamination between runs - -## Logging - -All CLI tools generate logs: - -**Orchestrator logs:** -```bash -/home/claude/fireteam/logs/orchestrator_YYYYMMDD_HHMMSS.log -``` - -**System log (when running in background):** -```bash -/home/claude/fireteam/logs/system.log -``` - -**View logs:** - -```bash -# Show recent entries -fireteam-status --logs - -# Follow live -fireteam-status --logs --follow - -# Show specific number of lines -fireteam-status --logs --lines 50 -``` - -## Troubleshooting - -### Command Not Found - -**Problem:** `start-agent: command not found` - -**Solution:** - -```bash -# Add to PATH -export PATH="$HOME/.local/bin:$PATH" - -# Make permanent -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc -``` - -### Already Running Error - -**Problem:** "Agent system is already running" - -**Solution:** - -```bash -# Stop existing project -stop-agent - -# Or check status -fireteam-status - -# Force cleanup if needed -rm -f /home/claude/fireteam/orchestrator.pid -``` - -### Permission Denied - -**Problem:** Permission errors when executing commands - -**Solution:** - -```bash -# Ensure scripts are executable -chmod +x ~/.local/bin/start-agent -chmod +x ~/.local/bin/stop-agent -chmod +x ~/.local/bin/fireteam-status -``` - -### Stale PID File - -**Problem:** Status shows running but nothing is actually running - -**Solution:** - -CLI tools automatically clean up stale PID files. If issues persist: - -```bash -rm -f /home/claude/fireteam/orchestrator.pid -fireteam-status # Should show STOPPED -``` - -## Best Practices - -### 1. Always Use Absolute Paths - -```bash -# Good -start-agent --project-dir /home/claude/projects/myapp --prompt "Goal" - -# Avoid relative paths (can cause issues) -start-agent --project-dir ../myapp --prompt "Goal" -``` - -### 2. Monitor Progress - - -Use `fireteam-status --watch` to monitor long-running projects. This helps you understand agent behavior and catch issues early. - - -### 3. Check Logs for Errors - -If a project fails or behaves unexpectedly: - -```bash -fireteam-status --logs --lines 100 | grep ERROR -``` - -### 4. 
Graceful Shutdown - -Always use `stop-agent` instead of `kill`: - -```bash -# Good - graceful shutdown -stop-agent - -# Avoid - can corrupt state -kill -9 $(cat /home/claude/fireteam/orchestrator.pid) -``` - -### 5. One Project at a Time - -Wait for projects to complete or stop them before starting new ones: - -```bash -# Check if running -fireteam-status - -# Stop if needed -stop-agent - -# Start new project -start-agent --project-dir ~/new-project --prompt "Goal" -``` - -## Advanced Usage - -### Scripting Fireteam - -Automate project creation with shell scripts: - -```bash -#!/bin/bash -# auto-fireteam.sh - -# Stop any running project -stop-agent 2>/dev/null || true - -# Start new project -start-agent --project-dir "$1" --prompt "$2" - -# Wait for completion -while fireteam-status | grep -q "RUNNING"; do - sleep 30 -done - -# Show final status -fireteam-status -``` - -Usage: - -```bash -./auto-fireteam.sh ~/my-app "Build a CLI calculator" -``` - -### Monitoring with Watch - -Combine with standard Unix tools: - -```bash -# Monitor in terminal split/tmux -watch -n 5 'fireteam-status' - -# Alert on completion -while fireteam-status | grep -q "RUNNING"; do sleep 60; done && notify-send "Fireteam Complete" -``` - -### Log Analysis - -Extract useful information from logs: - -```bash -# Show all error messages -grep ERROR /home/claude/fireteam/logs/orchestrator_*.log - -# Show completion percentages -grep "Completion:" /home/claude/fireteam/logs/orchestrator_*.log - -# Count cycles -grep "CYCLE" /home/claude/fireteam/logs/orchestrator_*.log | wc -l -``` - -## Next Steps - - - - - Learn how to start projects and write effective goals - - - - Master status monitoring and log viewing - - - - Understand graceful shutdown procedures - - - - Build your first project with Fireteam - - - diff --git a/docs/cli-tools/start-agent.mdx b/docs/cli-tools/start-agent.mdx deleted file mode 100644 index 912a274..0000000 --- a/docs/cli-tools/start-agent.mdx +++ /dev/null @@ -1,467 +0,0 @@ ---- -title: "start-agent" -description: "Launch new Fireteam projects with autonomous development goals" ---- - -## Synopsis - -```bash -start-agent --project-dir --prompt -``` - -## Description - -The `start-agent` command launches a new Fireteam project, initializing the multi-agent system to autonomously build software according to your specified goal. The system runs in the background, executing Plan → Execute → Review cycles until the project reaches 95%+ completion with triple validation. - -## Options - -### Required Arguments - - - Absolute path to the project directory. Created if it doesn't exist. - - - - The project goal/objective. Should be clear, specific, and actionable. - - -## Basic Usage - -### Simple Project - -```bash -start-agent --project-dir ~/calculator --prompt "Build a Python command-line calculator with basic arithmetic operations" -``` - -### API Project - -```bash -start-agent \ - --project-dir ~/api-server \ - --prompt "Create a REST API with FastAPI for managing a todo list. Include CRUD operations, SQLite database, and proper error handling." -``` - -### TypeScript Project - -```bash -start-agent \ - --project-dir ~/github-analyzer \ - --prompt "Build a TypeScript CLI tool that analyzes GitHub repositories and shows contributor statistics using the GitHub API" -``` - -## Writing Effective Goals - -### Goal Structure - -A well-written goal should include: - -1. **Technology stack** (Python, TypeScript, FastAPI, etc.) -2. **Project type** (CLI, API, web scraper, etc.) -3. 
**Core features** (what it should do) -4. **Quality requirements** (tests, error handling, documentation) - -### Good Goal Examples - - - - - -``` -Build a Python CLI application that checks Bitcoin prices using the CoinGecko API. -Include: -- Real-time price fetching -- Support for multiple currencies (USD, EUR, GBP) -- Error handling for API failures -- Formatted output with colors -- Unit tests -``` - -**Why it's good:** -- Specifies language (Python) -- Defines type (CLI) -- Lists specific features -- Includes quality requirements - - - - -``` -Create a REST API server using FastAPI for a note-taking application. -Features: -- CRUD endpoints for notes -- SQLite database -- Input validation -- Error responses -- Basic tests -``` - -**Why it's good:** -- Clear framework (FastAPI) -- Specific functionality (note-taking) -- Technical requirements listed - - - - -``` -Build a web scraper in Python that gets headlines from Hacker News and saves them to a file -``` - -**Why it's acceptable:** -- Basic requirements clear -- Missing quality requirements -- Could benefit from more detail - - - - -``` -Make me an app -``` - -**Why it's poor:** -- No technology specified -- No feature definition -- Completely ambiguous -- Will likely produce unsatisfactory results - - - - -### Goal Writing Tips - - -**Be specific about the tech stack.** "Build a Python CLI" is better than "Build a CLI" because it prevents the agent from choosing unfamiliar technologies. - - - -**Include edge cases and quality requirements.** Mentioning "error handling" and "tests" significantly improves code quality. - - - -**Avoid scope creep.** Don't request deployment automation, npm publishing, or Docker containerization unless specifically needed. Focus on core functionality. - - -## What Happens When You Start - -### 1. Pre-flight Checks - -``` -✓ Validate arguments -✓ Check for running instances -✓ Ensure project directory accessible -``` - -### 2. Initialization - -``` -✓ Create project directory (if needed) -✓ Initialize git repository -✓ Create timestamped branch (agent-YYYYMMDD-HHMMSS) -✓ Initialize project state -``` - -### 3. Background Execution - -``` -✓ Start orchestrator process -✓ Save PID to orchestrator.pid -✓ Begin Plan → Execute → Review cycles -``` - -### 4. Confirmation - -``` -Agent system started (PID: 12345) -Use 'fireteam-status' to check status -Use 'stop-agent' to stop the system -``` - -## Monitoring Execution - -After starting a project, monitor progress: - -```bash -# One-time status check -fireteam-status - -# Live monitoring -fireteam-status --watch - -# View logs -fireteam-status --logs --follow -``` - -See [fireteam-status](/cli-tools/fireteam-status) for detailed monitoring options. 
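If you want to confirm the background process directly, the PID recorded at startup can be checked from the shell. A quick sketch (the PID file path shown is the default install location):

```bash
# Read the PID saved by start-agent and verify the orchestrator is alive
PID=$(cat /home/claude/fireteam/orchestrator.pid)
if kill -0 "$PID" 2>/dev/null; then
    echo "Orchestrator running (PID: $PID)"
else
    echo "No orchestrator process found for PID $PID"
fi
```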
- -## Project Directory Structure - -Fireteam automatically creates and manages: - -``` -~/your-project/ -├── .git/ # Git repository (auto-created) -│ └── refs/heads/ -│ └── agent-20251017-143022 # Timestamped branch -├── src/ # Source code (agent-created) -├── tests/ # Test files (agent-created) -├── requirements.txt # Python deps (if applicable) -├── package.json # Node deps (if applicable) -└── README.md # Documentation (agent-created) -``` - -## Git Integration - -### Automatic Repository Setup - -- **New directories**: Git repo initialized automatically -- **Existing repos**: New branch created from current HEAD -- **Branch naming**: `agent-YYYYMMDD-HHMMSS` format - -### Commit Behavior - -Each cycle produces a commit: - -``` -Cycle 0: 85% complete -Cycle 1: 92% complete -Cycle 2: 96% complete -``` - -### Remote Pushing - -If a git remote exists, commits are automatically pushed: - -```bash -cd ~/your-project -git remote add origin git@github.com:user/repo.git - -# Subsequent cycles automatically push -``` - -## Real-World Examples - -### Example 1: Weather CLI (2 cycles, 95% completion) - -**Command:** -```bash -start-agent \ - --project-dir ~/weather-cli \ - --prompt "Build a Python CLI that fetches weather data from OpenWeatherMap API. Include current weather, 5-day forecast, and temperature in Celsius/Fahrenheit." -``` - -**Timeline:** -- Cycle 0: Created project structure, API integration, basic features (88%) -- Cycle 1: Added error handling, tests, formatting (95%) -- Validation: Triple-checked, completed - -**Result:** Production-ready weather CLI in ~40 minutes - -### Example 2: Task Manager (1 cycle, 92% completion) - -**Command:** -```bash -start-agent \ - --project-dir ~/task-manager \ - --prompt "Build a Python CLI task manager with SQLite. Support adding, listing, completing, and deleting tasks. Include due dates and priority levels." -``` - -**Timeline:** -- Cycle 0: Complete implementation with database, CRUD, CLI (92%) -- Validation: Met threshold, completed - -**Result:** Fully functional task manager in ~25 minutes (single cycle!) - -### Example 3: REST API Server (1 cycle, 92% completion) - -**Command:** -```bash -start-agent \ - --project-dir ~/notes-api \ - --prompt "Create a FastAPI REST API for note-taking. Include endpoints for creating, reading, updating, and deleting notes. Use SQLite for storage and include input validation." -``` - -**Timeline:** -- Cycle 0: Full API with all CRUD endpoints, database, validation, tests (92%) -- Validation: Completed - -**Result:** Production-ready API in ~30 minutes (single cycle!) - -## Advanced Usage - -### Continuing Previous Work - -To continue from a specific git commit: - -```bash -cd ~/existing-project -git checkout specific-commit - -# Fireteam creates new branch from this point -start-agent --project-dir ~/existing-project --prompt "Add feature X" -``` - -### Template-Based Projects - -Start from a template directory: - -```bash -cp -r ~/templates/fastapi-template ~/my-api -start-agent --project-dir ~/my-api --prompt "Add user authentication to this FastAPI template" -``` - -### Multi-Language Projects - -```bash -start-agent \ - --project-dir ~/fullstack-app \ - --prompt "Build a full-stack app with Python FastAPI backend and basic HTML/JS frontend for a todo list" -``` - - -Multi-language projects take longer (typically 5-7 cycles) due to increased complexity. 
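Once a run finishes (or you stop it), you can review everything the agents committed on their branch before merging it back. A possible workflow, assuming your mainline branch is named `main` and reusing the branch name from the earlier weather CLI example:

```bash
cd ~/weather-cli

# List branches created by Fireteam runs
git branch --list 'agent-*'

# Inspect the commits and the combined diff the agents produced
git log --oneline main..agent-20251017-140000
git diff main...agent-20251017-140000

# Merge once you're satisfied with the changes
git checkout main
git merge agent-20251017-140000
```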
- - -## Troubleshooting - -### "Agent system is already running" - -**Cause:** Another project is currently running - -**Solution:** - -```bash -# Check what's running -fireteam-status - -# Stop it -stop-agent - -# Try again -start-agent --project-dir ~/new-project --prompt "Goal" -``` - -### "Permission denied" on project directory - -**Cause:** Insufficient permissions - -**Solution:** - -```bash -# Use directory you own -start-agent --project-dir ~/projects/myapp --prompt "Goal" - -# Or fix permissions -sudo chown -R $USER:$USER /path/to/project -``` - -### Process starts but immediately stops - -**Cause:** Likely Python or Claude CLI issues - -**Solution:** - -```bash -# Check logs for errors -fireteam-status --logs - -# Verify Python and Claude CLI -python3 --version -claude --version - -# Check Claude authentication -claude auth status -``` - -### Git initialization fails - -**Cause:** Git not configured or not installed - -**Solution:** - -```bash -# Configure git -git config --global user.name "Your Name" -git config --global user.email "you@example.com" - -# Or set in .env -echo 'GIT_USER_NAME=Your Name' >> /home/claude/fireteam/.env -echo 'GIT_USER_EMAIL=you@example.com' >> /home/claude/fireteam/.env -``` - -## Error Codes - -| Code | Meaning | Solution | -|------|---------|----------| -| `0` | Success | Project started | -| `1` | Missing arguments | Provide --project-dir and --prompt | -| `1` | Already running | Stop existing project first | -| `1` | Invalid directory | Check path and permissions | - -## Best Practices - -### 1. Use Descriptive Project Directories - -```bash -# Good - descriptive names ---project-dir ~/bitcoin-price-checker ---project-dir ~/fastapi-todo-api - -# Avoid - generic names ---project-dir ~/project1 ---project-dir ~/test -``` - -### 2. Write Goals in Active Voice - -```bash -# Good -"Build a CLI that checks Bitcoin prices" - -# Less clear -"A Bitcoin price checking CLI should be built" -``` - -### 3. Specify Testing Requirements - -```bash -# Good - includes testing -"Build a calculator with unit tests for all operations" - -# Basic - no testing mentioned -"Build a calculator" -``` - -### 4. Set Realistic Scopes - - -Projects that complete in 1-3 cycles typically have: -- Clear, focused objectives -- Single primary function -- 200-500 lines of code -- Standard tech stack - - -Complex projects (5+ cycles): -- Multiple integrated features -- 1000+ lines of code -- Advanced architectures -- Novel tech combinations - -## Next Steps - - - - - Learn how to check project status and view logs - - - - Gracefully shutdown running projects - - - - Complete walkthrough of your first project - - - diff --git a/docs/cli-tools/stop-agent.mdx b/docs/cli-tools/stop-agent.mdx deleted file mode 100644 index cc4b0f0..0000000 --- a/docs/cli-tools/stop-agent.mdx +++ /dev/null @@ -1,515 +0,0 @@ ---- -title: "stop-agent" -description: "Gracefully shutdown running Fireteam projects" ---- - -## Synopsis - -```bash -stop-agent -``` - -## Description - -The `stop-agent` command gracefully shuts down the running Fireteam orchestrator and all associated agent processes. It sends a SIGTERM signal for clean shutdown, waits up to 30 seconds for graceful exit, and performs cleanup of PID files and orphaned processes. - -## Basic Usage - -```bash -stop-agent -``` - -**Expected output:** -``` -Stopping Fireteam (PID: 12345)... -Agent system stopped -``` - -## What stop-agent Does - -### 1. 
Validation -- Checks if orchestrator is running -- Reads PID from `/home/claude/fireteam/orchestrator.pid` -- Validates process exists - -### 2. Graceful Shutdown -- Sends **SIGTERM** signal (signal 15) -- Orchestrator receives shutdown signal -- Current agent phase completes (if possible) -- State file saved -- Git commit finalized - -### 3. Timeout Handling -- Waits up to **30 seconds** for graceful exit -- If still running after timeout, sends **SIGKILL** (signal 9) -- Forces immediate termination - -### 4. Cleanup -- Terminates any orphaned Claude CLI processes -- Removes PID file -- Leaves state file intact (for later inspection) - -## Shutdown Behavior - -### Graceful Shutdown (Normal) - -When you run `stop-agent`, the orchestrator: - -1. **Receives SIGTERM signal** -2. **Completes current operation** (if in progress): - - Planning phase: Saves partial plan - - Execution phase: Allows current command to finish - - Review phase: Completes review -3. **Updates state file** with current progress -4. **Commits changes** to git (if any) -5. **Exits cleanly** - - -Graceful shutdown preserves all progress. You can inspect the state and git commits to understand what was accomplished. - - -### Forced Shutdown (After 30s) - -If the process doesn't exit within 30 seconds: - -1. **SIGKILL sent** (force termination) -2. **Immediate process death** -3. **State might not be saved** (last cycle's state available) -4. **Current operation incomplete** - - -Forced shutdown can leave the project in an inconsistent state. Use only as last resort. - - -## When to Stop - -### Stop Early If: - -❌ **Agent is off-track** -```bash -# Check what it's doing -fireteam-status --logs - -# If building wrong features, stop -stop-agent -``` - -❌ **Wasting cycles on blockers** -```bash -# If stuck for 3+ cycles on same issue -fireteam-status # Shows Cycle: 8, Completion: 75% (unchanged) -stop-agent -``` - -❌ **Resource constraints** -```bash -# If system resources exhausted -fireteam-status --watch # Shows high CPU/memory -stop-agent -``` - -### Let It Continue If: - -✅ **Making steady progress** -``` -Cycle 0: 85% → Cycle 1: 92% → Cycle 2: 95% -``` - -✅ **In validation mode** -``` -Completion: 96% (validation check 2/3) -``` - -✅ **Near completion** -``` -Cycle 4, Completion: 94% - likely 1-2 cycles to finish -``` - -## State Preservation - -### What's Preserved on Stop - -✅ Git commits up to last completed cycle -✅ State file with last known progress -✅ All project files -✅ Log files - -### What's Lost on Stop - -❌ Current in-progress cycle (not committed yet) -❌ Agent's in-memory state -❌ Partial plan/execution/review - -### Inspecting After Stop - -```bash -# Check final state -cat /home/claude/fireteam/state/current.json | python3 -m json.tool - -# View git history -cd ~/your-project -git log --oneline - -# See what was accomplished -git diff HEAD~3..HEAD # Last 3 cycles -``` - -## Troubleshooting - -### "Agent system is not running" - -**Situation 1: Already stopped** -```bash -$ stop-agent -Agent system is not running (no PID file found) -``` - -**This is normal** - nothing to stop. - -**Situation 2: Stale PID file** -```bash -$ stop-agent -Agent system is not running (stale PID file) -``` - -**Handled automatically** - stale PID file removed. - -### Process Won't Stop (Hangs) - -**Symptoms:** -- `stop-agent` waits 30 seconds -- Shows "Process did not stop gracefully, forcing..." 
-- Eventually completes - -**Causes:** -- Agent stuck in long-running operation -- Network call hanging -- Infinite loop in agent code - -**Manual intervention:** -```bash -# Find PID -cat /home/claude/fireteam/orchestrator.pid - -# Force kill -kill -9 - -# Or kill all related processes -pkill -9 -f "claude-agent-system" -pkill -9 -f "claude --dangerously-skip-permissions" - -# Cleanup -rm -f /home/claude/fireteam/orchestrator.pid -``` - -### Multiple Stop Attempts - -**Problem:** Running `stop-agent` multiple times - -**Behavior:** -```bash -$ stop-agent -Stopping Fireteam (PID: 12345)... -Agent system stopped - -$ stop-agent -Agent system is not running (no PID file found) -``` - -**This is safe** - subsequent stops are no-ops. - -### State Corruption After Force Kill - -**Problem:** Forced shutdown corrupted state file - -**Symptoms:** -```bash -fireteam-status -Error loading state: Expecting property name enclosed in double quotes -``` - -**Solution:** -```bash -# Backup corrupted state -cp /home/claude/fireteam/state/current.json ~/state-backup.json - -# Remove corrupted state -rm /home/claude/fireteam/state/current.json - -# Start fresh (state will be recreated) -start-agent --project-dir ~/project --prompt "Continue previous work" -``` - -## Real-World Scenarios - -### Scenario 1: Agent Drift Detected - -```bash -# Check what agent is doing -$ fireteam-status --logs --lines 50 -... -2025-10-17 15:30:00 - INFO - Creating npm deployment scripts -2025-10-17 15:31:00 - INFO - Setting up CI/CD pipeline -... - -# Goal was just "Build a CLI", not deployment! -$ stop-agent -Stopping Fireteam (PID: 12345)... -Agent system stopped - -# Review commits, potentially reset -$ cd ~/project -$ git log --oneline -abc1234 Cycle 3: Added deployment automation -def5678 Cycle 2: 95% complete -... - -# Revert unwanted changes -$ git reset --hard def5678 -``` - -### Scenario 2: Timeout Issues - -```bash -# Agent stuck in execution phase for 30+ minutes -$ fireteam-status -Status: EXECUTING -Cycle: 5 -Completion: 88% -Updated: 2025-10-17 14:00:00 # 40 minutes ago! - -# Likely timed out, stop and investigate -$ stop-agent -Stopping Fireteam (PID: 12345)... -[waits 30 seconds] -Process did not stop gracefully, forcing... -Agent system stopped - -# Check logs for timeout errors -$ fireteam-status --logs --lines 100 | grep -i timeout -``` - -### Scenario 3: Completion Satisfaction - -```bash -# Check status -$ fireteam-status -Status: ✅ RUNNING -Cycle: 2 -Completion: 92% - -# Good enough for prototype, stop here -$ stop-agent -Stopping Fireteam (PID: 12345)... -Agent system stopped - -# Test the project -$ cd ~/project -$ python main.py -# Works! Ship it. -``` - -## Stopping vs. Completion - -### Manual Stop - -**When:** You decide to stop -**How:** `stop-agent` -**Result:** Project at current completion level - -**Use cases:** -- Early prototype sufficient -- Agent off-track -- Time constraints -- Resource limits - -### Automatic Completion - -**When:** Fireteam decides project is complete -**How:** 3 consecutive ≥95% reviews -**Result:** High-quality, validated project - -**Indicators:** -``` -Validation check 1/3: 96% -Validation check 2/3: 97% -Validation check 3/3: 98% ✅ -PROJECT COMPLETED SUCCESSFULLY -``` - -**Behavior:** -- Orchestrator stops automatically -- Final commit created -- State marked as completed -- PID file removed - - -After automatic completion, `fireteam-status` shows: -``` -Status: ⏹️ STOPPED -✅ COMPLETED: 2025-10-17 15:45:30 -``` - - -## Best Practices - -### 1. 
Stop Before Starting New Projects - -```bash -# Always stop before starting new work -stop-agent - -# Verify stopped -fireteam-status - -# Start new project -start-agent --project-dir ~/new-project --prompt "Goal" -``` - -### 2. Check Progress Before Stopping - -```bash -# See what you'd be stopping -fireteam-status - -# View recent work -fireteam-status --logs --lines 50 - -# Make informed decision -stop-agent # or let it continue -``` - -### 3. Don't Interrupt Validation - -If in validation mode, let it complete: - -```bash -$ fireteam-status -Completion: 96% -Validation check 2/3 - -# Wait for third check (usually < 30 minutes) -# Then automatic completion -``` - -### 4. Save Important State - -Before stopping, if you might want to continue: - -```bash -# Backup state -cp /home/claude/fireteam/state/current.json ~/project-state-backup.json - -# Then stop -stop-agent -``` - -## Advanced Usage - -### Stopping from Scripts - -```bash -#!/bin/bash -# auto-stop-on-error.sh - -# Monitor logs for errors -fireteam-status --logs --follow | while read line; do - if echo "$line" | grep -qi "critical error"; then - echo "Critical error detected, stopping..." - stop-agent - exit 1 - fi -done -``` - -### Timed Auto-Stop - -```bash -#!/bin/bash -# stop-after-2-hours.sh - -sleep 7200 # 2 hours -if fireteam-status | grep -q "RUNNING"; then - echo "Time limit reached, stopping agent" - stop-agent -fi -``` - -### Conditional Stop - -```bash -#!/bin/bash -# stop-if-stalled.sh - -PREV_CYCLE=$(fireteam-status | grep "Cycle:" | awk '{print $2}') -sleep 1800 # 30 minutes - -CURR_CYCLE=$(fireteam-status | grep "Cycle:" | awk '{print $2}') - -if [ "$PREV_CYCLE" == "$CURR_CYCLE" ]; then - echo "No progress in 30 minutes, stopping" - stop-agent -fi -``` - -## Exit Codes - -| Code | Meaning | -|------|---------| -| `0` | Successfully stopped or was not running | -| `1` | Error during shutdown (rare) | - -## Alternatives to Stopping - -### Option 1: Let It Complete - -Most reliable approach: -```bash -# Just let validation finish -# Usually 1-3 more cycles -``` - -### Option 2: Monitor and Decide - -```bash -# Watch for a while -fireteam-status --watch --interval 10 - -# Stop if necessary -# Ctrl+C to exit watch, then stop-agent -``` - -### Option 3: Review and Continue - -```bash -# Stop to inspect -stop-agent - -# Review commits -cd ~/project && git log --oneline - -# Continue if needed (new run, fresh state) -start-agent --project-dir ~/project --prompt "Add feature X" -``` - -## Next Steps - - - - - Launch new Fireteam projects - - - - Check progress and view logs - - - - Understand state preservation - - - - Resolve shutdown issues - - - diff --git a/docs/concepts/complexity.mdx b/docs/concepts/complexity.mdx new file mode 100644 index 0000000..8ac7c94 --- /dev/null +++ b/docs/concepts/complexity.mdx @@ -0,0 +1,108 @@ +--- +title: Complexity Estimation +description: How Fireteam classifies task complexity +--- + +# Complexity Estimation + +Fireteam automatically estimates task complexity to select the appropriate execution strategy. This page explains how complexity estimation works. + +## Complexity Levels + + + + Single-line changes, typo fixes, adding comments + + + Self-contained changes, single-file modifications + + + Multi-file changes, requires some planning + + + Architectural changes, major refactoring + + + +## How It Works + +When you call `execute()` without specifying a mode, Fireteam: + +1. Sends your goal and context to Claude +2. Claude analyzes the scope and returns a complexity level +3. 
Fireteam maps the complexity to an execution mode + +```python +from fireteam import estimate_complexity + +complexity = await estimate_complexity( + goal="Add user authentication", + context="Using FastAPI with existing User model", +) +# Returns: ComplexityLevel.MODERATE +``` + +## Complexity to Mode Mapping + +| Complexity | Mode | Behavior | +|------------|------|----------| +| TRIVIAL | SINGLE_TURN | Single Claude call, minimal tools | +| SIMPLE | SIMPLE | Execute only, no review | +| MODERATE | MODERATE | Execute + single review | +| COMPLEX | FULL | Plan + execute + validation reviews | + +## Classification Guidelines + +### TRIVIAL Tasks + +- Fix typos +- Add/remove comments +- Rename a single variable +- Simple formatting changes + +### SIMPLE Tasks + +- Implement a single function +- Add logging to existing code +- Fix a straightforward bug +- Update configuration values + +### MODERATE Tasks + +- Refactor a module +- Add a new feature with tests +- Fix a bug requiring investigation +- Update multiple related files + +### COMPLEX Tasks + +- Major architectural changes +- Implement new subsystems +- Large-scale refactoring +- Cross-cutting concerns + +## Manual Override + +You can bypass complexity estimation by specifying the mode directly: + +```python +from fireteam import execute, ExecutionMode + +# Force FULL mode for thorough execution +result = await execute( + project_dir="/path/to/project", + goal="Add simple logging", + mode=ExecutionMode.FULL, # Override complexity estimation +) +``` + +## Customizing Estimation + +The complexity estimation uses a prompt that you can customize via `COMPLEXITY_PROMPT`: + +```python +from fireteam.complexity import COMPLEXITY_PROMPT + +# View the default prompt +print(COMPLEXITY_PROMPT) +``` diff --git a/docs/concepts/execution-modes.mdx b/docs/concepts/execution-modes.mdx new file mode 100644 index 0000000..03c3b2e --- /dev/null +++ b/docs/concepts/execution-modes.mdx @@ -0,0 +1,156 @@ +--- +title: Execution Modes +description: Understanding Fireteam's execution strategies +--- + +# Execution Modes + +Fireteam uses different execution strategies based on task complexity. Each mode balances thoroughness against efficiency. 
+ +## Mode Overview + +``` +SINGLE_TURN → Execute +SIMPLE → Execute +MODERATE → Execute → Review +FULL → Plan → Execute → Review (×3) +``` + +## SINGLE_TURN Mode + +**For:** Trivial tasks like typo fixes, adding comments + +**Behavior:** +- Single Claude SDK call +- Minimal tool access +- No review phase +- Fastest execution + +```python +from fireteam import execute, ExecutionMode + +result = await execute( + project_dir="/path/to/project", + goal="Fix the typo in README.md", + mode=ExecutionMode.SINGLE_TURN, +) +``` + +## SIMPLE Mode + +**For:** Self-contained tasks that don't require planning + +**Behavior:** +- Execute phase only +- Full tool access (Read, Write, Edit, Bash) +- No review phase +- Quick execution + +```python +result = await execute( + project_dir="/path/to/project", + goal="Add logging to the auth module", + mode=ExecutionMode.SIMPLE, +) +``` + +## MODERATE Mode + +**For:** Tasks requiring some verification + +**Behavior:** +- Execute phase with full tools +- Single review phase afterward +- Extracts completion percentage +- Good balance of speed and quality + +```python +result = await execute( + project_dir="/path/to/project", + goal="Refactor the user service", + mode=ExecutionMode.MODERATE, +) + +# Result includes review info +print(result.completion_percentage) # e.g., 85 +print(result.metadata.get("review")) # Review output +``` + +## FULL Mode + +**For:** Complex tasks requiring planning and validation + +**Behavior:** +1. **Planning Phase**: Analyze goal, create implementation plan +2. **Execution Phase**: Implement the plan +3. **Validation Phase**: Multiple reviews until 3 consecutive >95% completion + +```python +result = await execute( + project_dir="/path/to/project", + goal="Redesign the authentication system", + mode=ExecutionMode.FULL, +) + +# Result includes all phases +print(result.metadata.get("plan")) # Implementation plan +print(result.metadata.get("review")) # Final review +print(result.completion_percentage) # Should be >95% +``` + +### Validation Requirements + +FULL mode requires **3 consecutive reviews** scoring **>95%** completion before succeeding. This ensures: + +- No premature completion +- Consistent quality across reviews +- Thorough verification of changes + +If validation fails, the result will have `success=False` with an error explaining why. + +## Mode Selection + +### Automatic (Recommended) + +Let Fireteam choose based on complexity estimation: + +```python +result = await execute( + project_dir="/path/to/project", + goal="Your task here", + # mode not specified - auto-detect +) + +print(f"Used mode: {result.mode}") +``` + +### Manual Override + +Force a specific mode: + +```python +# Be thorough with a simple task +result = await execute( + project_dir="/path/to/project", + goal="Add a comment", + mode=ExecutionMode.FULL, +) + +# Be quick with a complex task (not recommended) +result = await execute( + project_dir="/path/to/project", + goal="Refactor everything", + mode=ExecutionMode.SINGLE_TURN, +) +``` + +## Tool Access by Mode + +| Mode | Read | Write | Edit | Bash | Glob | Grep | +|------|------|-------|------|------|------|------| +| SINGLE_TURN | Limited | Limited | Limited | No | No | No | +| SIMPLE | Yes | Yes | Yes | Yes | Yes | Yes | +| MODERATE | Yes | Yes | Yes | Yes | Yes | Yes | +| FULL | Yes | Yes | Yes | Yes | Yes | Yes | + +Note: Planner and Reviewer phases in FULL mode have read-only access. 
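## Handling Results

Whatever mode ends up running, the returned result exposes the same fields shown above, so calling code can branch on the outcome. A minimal sketch (`result.error` is an assumption based on the validation note; the other fields appear in the examples above):

```python
from fireteam import execute, ExecutionMode

async def run_task(project_dir: str, goal: str):
    result = await execute(project_dir=project_dir, goal=goal)

    print(f"Mode used: {result.mode}")
    if not result.success:
        # FULL mode reports failure when validation never reaches
        # three consecutive reviews above 95%.
        print(f"Task did not complete: {getattr(result, 'error', 'unknown')}")
        return result

    # MODERATE and FULL modes attach review information
    if result.mode in (ExecutionMode.MODERATE, ExecutionMode.FULL):
        print(f"Completion: {result.completion_percentage}%")
        print(result.metadata.get("review"))
    return result
```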
diff --git a/docs/concepts/hooks.mdx b/docs/concepts/hooks.mdx new file mode 100644 index 0000000..19bdaf4 --- /dev/null +++ b/docs/concepts/hooks.mdx @@ -0,0 +1,156 @@ +--- +title: Quality Hooks +description: Enforce quality with SDK hooks +--- + +# Quality Hooks + +Fireteam includes SDK hooks that enforce quality during execution. These hooks run tests after edits and can block user interaction for fully autonomous operation. + +## Built-in Hook Sets + +### QUALITY_HOOKS + +The default hooks for quality enforcement: + +- **Run tests after edits**: Automatically runs `pytest`, `npm test`, `cargo test`, etc. after Write/Edit operations +- **Block user questions**: Prevents Claude from asking questions (fully autonomous) + +```python +from fireteam import execute + +# QUALITY_HOOKS enabled by default when run_tests=True +result = await execute( + project_dir="/path/to/project", + goal="Add feature", + run_tests=True, # Default +) +``` + +### AUTONOMOUS_HOOKS + +For fully autonomous operation without any user interaction: + +- Blocks `AskUserQuestion` tool + +```python +from fireteam.hooks import AUTONOMOUS_HOOKS +``` + +### DEBUG_HOOKS + +For debugging and logging: + +- Logs all tool usage + +```python +from fireteam.hooks import DEBUG_HOOKS +``` + +## Test Detection + +Fireteam automatically detects your test framework: + +| Framework | Detection | Command | +|-----------|-----------|---------| +| pytest | `pytest.ini`, `pyproject.toml`, `setup.py`, `tests/` | `pytest -x --tb=short` | +| npm | `package.json` | `npm test` | +| cargo | `Cargo.toml` | `cargo test` | +| go | `go.mod` | `go test ./...` | +| make | `Makefile` with `test:` target | `make test` | + +## How Hooks Work + +### PreToolUse Hooks + +Run before a tool is used. Can approve, deny, or modify the tool call. + +```python +async def block_user_questions(event, context, config): + """Block AskUserQuestion for autonomous operation.""" + if event.get("hook_event_name") != "PreToolUse": + return {} + if event.get("tool_name") != "AskUserQuestion": + return {} + + return { + "hookSpecificOutput": { + "permissionDecision": "deny", + "permissionDecisionReason": "Autonomous mode - no user interaction", + } + } +``` + +### PostToolUse Hooks + +Run after a tool completes. Can provide feedback to Claude. 
+ +```python +async def run_tests_after_edit(event, context, config): + """Run tests after Write/Edit operations.""" + if event.get("hook_event_name") != "PostToolUse": + return {} + if event.get("tool_name") not in ["Edit", "Write"]: + return {} + + # Run tests and return results + success, output = run_tests_sync(cwd, test_command) + + if success: + return {} # No feedback needed + + return { + "hookSpecificOutput": { + "additionalContext": f"Tests failed:\n{output}", + } + } +``` + +## Disabling Hooks + +To disable test running and hooks: + +```python +result = await execute( + project_dir="/path/to/project", + goal="Add experimental feature", + run_tests=False, # Disables QUALITY_HOOKS +) +``` + +## Custom Hooks + +Create custom hooks for your use case: + +```python +from fireteam.hooks import create_test_hooks + +# Get the default test hooks configuration +hooks = create_test_hooks() + +# Modify or extend as needed +print(hooks) +# { +# "PreToolUse": [...], +# "PostToolUse": [...], +# } +``` + +## Hook Events + +| Event | When | Purpose | +|-------|------|---------| +| PreToolUse | Before tool execution | Approve/deny/modify | +| PostToolUse | After tool execution | Provide feedback | + +## Test Output + +When tests fail, the hook provides feedback to Claude: + +``` +Tests failed after editing src/auth.py: +FAILED tests/test_auth.py::test_login - AssertionError +1 failed, 5 passed +``` + +Claude receives this feedback and can fix the issue before continuing. diff --git a/docs/configuration/config-file.mdx b/docs/configuration/config-file.mdx deleted file mode 100644 index c9ea378..0000000 --- a/docs/configuration/config-file.mdx +++ /dev/null @@ -1,452 +0,0 @@ ---- -title: "Configuration File Reference" -description: "Configuration options for timeouts, completion thresholds, and validation criteria" ---- - -## Overview - -Fireteam's behavior is controlled through `config.py`, located in the root of the Fireteam installation directory (`/home/claude/fireteam/config.py`). This file contains all system-wide settings for agent timeouts, completion thresholds, git configuration, and more. - -## Configuration File Location - -```bash -/home/claude/fireteam/config.py -``` - - -Changes to `config.py` require restarting any running Fireteam instances to take effect. - - -## Core Settings - -### System Paths - -```python -SYSTEM_DIR = "/home/claude/fireteam" -STATE_DIR = os.path.join(SYSTEM_DIR, "state") -LOGS_DIR = os.path.join(SYSTEM_DIR, "logs") -CLI_DIR = os.path.join(SYSTEM_DIR, "cli") -``` - -**Description:** -- `SYSTEM_DIR`: Root directory for Fireteam installation -- `STATE_DIR`: Where project state files are stored (`state/current.json`) -- `LOGS_DIR`: Location for orchestrator and system logs -- `CLI_DIR`: Directory containing CLI executables (`start-agent`, `stop-agent`, etc.) - - -**Do not modify these paths** unless you've relocated the Fireteam installation. Changing these can break CLI tools and state management. - - -## Claude CLI Configuration - -```python -CLAUDE_CLI = "claude" -DANGEROUSLY_SKIP_PERMISSIONS = "--dangerously-skip-permissions" -``` - -**Description:** -- `CLAUDE_CLI`: Command to invoke Claude CLI (assumes `claude` is in PATH) -- `DANGEROUSLY_SKIP_PERMISSIONS`: Flag enabling fully autonomous operation without permission prompts - - -The `--dangerously-skip-permissions` flag allows agents to execute file operations, install packages, and run commands without manual approval. 
This is essential for autonomous operation but should only be used in controlled environments. - - -## Agent Timeouts - -```python -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes - "reviewer": 600, # 10 minutes - "executor": 1800 # 30 minutes -} -``` - -**Description:** -- `planner`: Maximum time (seconds) for planning phase -- `reviewer`: Maximum time (seconds) for review phase -- `executor`: Maximum time (seconds) for execution phase - -### Why These Values? - -These timeout values are optimized for typical project workflows: - -- **Planner (10 min)**: Complex planning, codebase analysis, and task breakdown -- **Executor (30 min)**: Handles installations, builds, implementation, and complex tasks -- **Reviewer (10 min)**: Code review, validation, and completion analysis - - -See [Timeout Configuration](/configuration/timeouts) for detailed rationale and guidance on adjusting these values. - - -### Customizing Timeouts - -**Increase timeouts for:** -- Large codebases (10,000+ lines) -- Complex build processes -- Extensive test suites -- Slow network environments - -**Example:** - -```python -AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes for large projects - "reviewer": 900, # 15 minutes - "executor": 3600 # 60 minutes for complex builds -} -``` - -**Decrease timeouts for:** -- Simple projects -- Fast iteration cycles -- Time-constrained testing - -## Retry Configuration - -```python -MAX_RETRIES = 3 -RETRY_DELAY = 5 # seconds -``` - -**Description:** -- `MAX_RETRIES`: Number of retry attempts for failed agent calls -- `RETRY_DELAY`: Delay (seconds) between retry attempts - -**Behavior:** -- On timeout or API error, Fireteam retries up to 3 times -- 5-second delay between attempts prevents API rate limiting -- After 3 failures, the cycle is aborted and logged - -**Customization:** - -```python -# More aggressive retries -MAX_RETRIES = 5 -RETRY_DELAY = 3 - -# Conservative retries -MAX_RETRIES = 2 -RETRY_DELAY = 10 -``` - -## Completion Thresholds - -These settings control when the infinite loop terminates. - -```python -COMPLETION_THRESHOLD = 95 # percentage -VALIDATION_CHECKS_REQUIRED = 3 # consecutive checks needed -``` - -**Configuration options:** -- `COMPLETION_THRESHOLD`: Minimum completion percentage to enter validation mode (default: 95%) -- `VALIDATION_CHECKS_REQUIRED`: Number of consecutive validations required (default: 3) - -### Validation Behavior - -1. Reviewer scores completion (0-100%) based on the original goal each cycle -2. When score reaches COMPLETION_THRESHOLD, validation mode activates -3. System requires VALIDATION_CHECKS_REQUIRED consecutive cycles above threshold -4. If score drops below threshold, validation counter resets and cycle continues - -**Examples:** -- `98% + 5 checks`: Higher quality bar, longer runtime -- `90% + 2 checks`: Lower quality bar, shorter runtime - - -The multi-validation requirement prevents termination on a single high score. The system continues running until it demonstrates consistent quality across multiple cycles. - - -**Example configurations:** - -```python -# Higher quality requirements (longer runtime expected) -COMPLETION_THRESHOLD = 98 -VALIDATION_CHECKS_REQUIRED = 5 - -# Lower quality requirements (shorter runtime expected) -COMPLETION_THRESHOLD = 90 -VALIDATION_CHECKS_REQUIRED = 2 -``` - - -Lower thresholds will terminate earlier with potentially less complete implementations. The 95%/3 default reflects a balance between quality and iteration time. 
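To make the termination rule concrete, the loop behaves roughly like the sketch below. This is an illustration of the logic described above, not the orchestrator's actual code; it assumes `config.py` is importable (for example, when run from the Fireteam root):

```python
from config import COMPLETION_THRESHOLD, VALIDATION_CHECKS_REQUIRED

def should_terminate(review_scores: list[int]) -> bool:
    """Return True once enough consecutive reviews meet the threshold."""
    consecutive = 0
    for score in review_scores:
        if score >= COMPLETION_THRESHOLD:
            consecutive += 1
            if consecutive >= VALIDATION_CHECKS_REQUIRED:
                return True
        else:
            consecutive = 0  # dipping below the threshold resets validation
    return False

# With the defaults (95% and 3 checks):
# [92, 96, 97, 98] -> True
# [96, 97, 93, 96] -> False (the 93% score reset the counter)
```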
- - -## Git Configuration - -```python -GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "fireteam@darkresearch.ai") -``` - -**Description:** -- `GIT_USER_NAME`: Git committer name (overrideable via `.env`) -- `GIT_USER_EMAIL`: Git committer email (overrideable via `.env`) - -**Priority order:** -1. Environment variables in `.env` file (highest priority) -2. Default values in `config.py` -3. Global git config (not used by Fireteam) - -**Example `.env` override:** - -```bash -GIT_USER_NAME="Jane Developer" -GIT_USER_EMAIL="jane@company.com" -``` - -See [Environment Setup](/installation/environment) for `.env` configuration. - -## Logging Configuration - -```python -LOG_LEVEL = "INFO" -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" -``` - -**Description:** -- `LOG_LEVEL`: Logging verbosity (DEBUG, INFO, WARNING, ERROR, CRITICAL) -- `LOG_FORMAT`: Log message format string - -**Available log levels:** - -| Level | Description | When to use | -|-------|-------------|-------------| -| `DEBUG` | Detailed diagnostic info | Debugging agent behavior, state changes | -| `INFO` | General informational messages | Default, normal operation | -| `WARNING` | Warning messages | Non-critical issues | -| `ERROR` | Error messages | Failures and exceptions | -| `CRITICAL` | Critical errors | System-level failures | - -**Example for debugging:** - -```python -LOG_LEVEL = "DEBUG" # Verbose output for troubleshooting -``` - - -Use `DEBUG` level when troubleshooting agent failures or unexpected behavior. Logs are stored in `/home/claude/fireteam/logs/`. - - -## Sudo Password Configuration - -```python -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) - -def has_sudo_access(): - """Check if sudo password is available.""" - return SUDO_PASSWORD is not None -``` - -**Description:** -- `SUDO_PASSWORD`: Optional sudo password for system operations (from `.env`) -- `has_sudo_access()`: Helper function to check if sudo is configured - -**Usage:** - -Add to `.env` file: - -```bash -SUDO_PASSWORD=your_secure_password_here -``` - - -**Security considerations:** -- Store `.env` file outside version control (already in `.gitignore`) -- Use file permissions: `chmod 600 .env` -- Prefer passwordless sudo over storing passwords - - -See [Sudo Setup](/configuration/sudo-setup) for configuration details. 
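If you build custom tooling around Fireteam, `has_sudo_access()` can guard operations that need elevated privileges. A sketch, assuming a Debian-style system; the package name is only illustrative:

```python
import subprocess
from config import SUDO_PASSWORD, has_sudo_access

def install_system_package(name: str) -> bool:
    """Install a system package non-interactively, or skip if sudo is unavailable."""
    if not has_sudo_access():
        print(f"No sudo password configured; skipping install of {name}")
        return False
    # `sudo -S` reads the password from stdin, enabling non-interactive use
    result = subprocess.run(
        ["sudo", "-S", "apt-get", "install", "-y", name],
        input=SUDO_PASSWORD + "\n",
        text=True,
        capture_output=True,
    )
    return result.returncode == 0
```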
- -## Environment Variable Loading - -```python -from pathlib import Path -from dotenv import load_dotenv - -env_file = Path(__file__).parent / ".env" -if env_file.exists(): - load_dotenv(env_file) -``` - -**Behavior:** -- Loads `.env` file from Fireteam root directory -- Environment variables override config.py defaults -- Missing `.env` file is non-fatal (uses defaults) - -**Supported environment variables:** - -| Variable | Description | Default | -|----------|-------------|---------| -| `GIT_USER_NAME` | Git committer name | `"fireteam"` | -| `GIT_USER_EMAIL` | Git committer email | `"fireteam@darkresearch.ai"` | -| `SUDO_PASSWORD` | Sudo password for system ops | `None` | - -## Complete Configuration Example - -Here's a fully customized `config.py` for a large-scale project: - -```python -""" -Configuration settings for Fireteam - Large Project Setup -""" - -import os -from pathlib import Path -from dotenv import load_dotenv - -# Load environment variables -env_file = Path(__file__).parent / ".env" -if env_file.exists(): - load_dotenv(env_file) - -# System paths -SYSTEM_DIR = "/home/claude/fireteam" -STATE_DIR = os.path.join(SYSTEM_DIR, "state") -LOGS_DIR = os.path.join(SYSTEM_DIR, "logs") -CLI_DIR = os.path.join(SYSTEM_DIR, "cli") - -# Claude CLI configuration -CLAUDE_CLI = "claude" -DANGEROUSLY_SKIP_PERMISSIONS = "--dangerously-skip-permissions" - -# Agent configuration - Extended for large projects -MAX_RETRIES = 5 -RETRY_DELAY = 10 # seconds - -# Agent timeouts (extended for complex builds) -AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes - "reviewer": 900, # 15 minutes - "executor": 3600 # 60 minutes -} - -# Completion thresholds - Stricter for production -COMPLETION_THRESHOLD = 98 # percentage -VALIDATION_CHECKS_REQUIRED = 5 # consecutive checks - -# Git configuration -GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "fireteam@darkresearch.ai") - -# Logging - Debug mode -LOG_LEVEL = "DEBUG" -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - -# Sudo password -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) - -def has_sudo_access(): - """Check if sudo password is available.""" - return SUDO_PASSWORD is not None -``` - -## Configuration Best Practices - -### 1. Start with Defaults - - -The default configuration is optimized for most use cases. Only modify settings when you have a specific need. - - -### 2. Document Custom Changes - -Add comments explaining why you modified values: - -```python -# Extended executor timeout for Docker builds (typically 45+ minutes) -AGENT_TIMEOUTS = { - "executor": 3600 # 60 minutes -} -``` - -### 3. Test Configuration Changes - -After modifying `config.py`, test with a simple project: - -```bash -start-agent --project-dir ~/test-config --prompt "Build a simple Hello World Python app" -``` - -### 4. Version Control Config - -Keep your customized `config.py` in version control, but **exclude `.env`**: - -```bash -# .gitignore already includes -.env -state/current.json -logs/ -``` - -### 5. Environment-Specific Settings - -Use `.env` for environment-specific values (passwords, credentials), keep shared settings in `config.py`. 
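A quick way to confirm which values are actually in effect after `.env` loading is to import the config module directly (run from the Fireteam root so `config` is importable):

```python
# Sanity-check the effective configuration after .env overrides
import config

print("Git identity:", config.GIT_USER_NAME, config.GIT_USER_EMAIL)
print("Agent timeouts:", config.AGENT_TIMEOUTS)
print("Completion threshold:", config.COMPLETION_THRESHOLD)
print("Validation checks required:", config.VALIDATION_CHECKS_REQUIRED)
print("Sudo available:", config.has_sudo_access())
```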
- -## Troubleshooting Configuration - -### Changes Not Taking Effect - -**Problem:** Modified `config.py` but behavior unchanged - -**Solution:** Restart Fireteam: - -```bash -stop-agent -start-agent --project-dir ~/project --prompt "Your goal" -``` - -### Import Errors - -**Problem:** `ModuleNotFoundError` or import issues - -**Solution:** Ensure `python-dotenv` is installed: - -```bash -cd /home/claude/fireteam -pip install python-dotenv -``` - -### Timeout Issues - -**Problem:** Agents timing out frequently - -**Solution:** Increase timeout values in `AGENT_TIMEOUTS`. See [Timeout Configuration](/configuration/timeouts). - -### State Corruption - -**Problem:** Invalid state after config changes - -**Solution:** Reset state: - -```bash -stop-agent -rm /home/claude/fireteam/state/current.json -start-agent # Fresh state -``` - -## Next Steps - - - - - Deep dive on timeout values and optimization - - - - Configure passwordless sudo or SUDO_PASSWORD - - - - Configure your .env file - - - - Resolve common configuration issues - - - diff --git a/docs/configuration/sudo-setup.mdx b/docs/configuration/sudo-setup.mdx deleted file mode 100644 index ae08f9f..0000000 --- a/docs/configuration/sudo-setup.mdx +++ /dev/null @@ -1,479 +0,0 @@ ---- -title: "Sudo Access Configuration" -description: "Configure passwordless sudo or SUDO_PASSWORD for system-level operations" ---- - -## Overview - -Fireteam may require sudo access for system-level operations during project execution, such as: -- Installing system packages (apt, yum, brew) -- Configuring system services -- Modifying system files -- Setting up development environments (Node.js, Ruby, etc.) - -This guide covers two approaches: **passwordless sudo** (recommended) and **SUDO_PASSWORD** environment variable. - -## Why Sudo Access Matters - -### Real-World Impact - - -**Test Case:** GitHub Analyzer (TypeScript project) wasted **8 cycles** (approximately 2 hours) attempting to install Node.js because sudo access was blocked by password requirement. - - -**Without sudo access:** -- Agents cannot install system packages -- Repeated failed installation attempts -- Wasted cycles and time -- Agents forced to find workarounds (downloading binaries, etc.) - -**With sudo access:** -- Seamless system package installation -- Faster environment setup -- Fewer wasted cycles -- More reliable project completion - -## Option 1: Passwordless Sudo (Recommended) - -Passwordless sudo allows your user to run `sudo` commands without entering a password. This is the most secure and efficient approach for Fireteam. - -### Benefits - -✅ **Most secure**: No passwords stored in files -✅ **Most efficient**: No password prompt delays -✅ **Audit trail**: All sudo commands logged to system logs -✅ **Granular control**: Can limit to specific commands (optional) - -### Setup Instructions - -#### Step 1: Edit sudoers File - - -**CRITICAL:** Always use `visudo` to edit the sudoers file. Syntax errors can lock you out of sudo access! 
- - -```bash -sudo visudo -``` - -#### Step 2: Add Passwordless Rule - -Add one of the following lines to the sudoers file: - - - -```bash Full Sudo Access (Simplest) -# Replace 'username' with your actual username -username ALL=(ALL) NOPASSWD: ALL -``` - -```bash Limited to Package Managers (More Secure) -# Ubuntu/Debian -username ALL=(ALL) NOPASSWD: /usr/bin/apt, /usr/bin/apt-get, /usr/bin/dpkg - -# CentOS/RHEL -username ALL=(ALL) NOPASSWD: /usr/bin/yum, /usr/bin/dnf, /usr/bin/rpm - -# macOS -username ALL=(ALL) NOPASSWD: /usr/local/bin/brew -``` - -```bash Limited to Specific Commands (Most Secure) -# Allow only specific installations -username ALL=(ALL) NOPASSWD: /usr/bin/apt install nodejs, /usr/bin/apt install python3 -``` - - - - -To find your username, run: `whoami` - - -#### Step 3: Save and Exit - -In `visudo`: -- Press `Ctrl+O` to save (in nano) -- Press `Ctrl+X` to exit -- Or use vim commands (`:wq`) if using vim - -#### Step 4: Verify Passwordless Sudo - -Test that sudo works without password: - -```bash -sudo -n ls /root -``` - -**Expected output:** List of files (no password prompt) - -**If prompted for password:** Recheck sudoers syntax - -### Troubleshooting Passwordless Sudo - -#### "sudo: a password is required" - -**Cause:** Sudoers rule not applied or syntax error - -**Solution:** -1. Verify username is correct: `whoami` -2. Check sudoers file: `sudo visudo` -3. Ensure rule is at the **end** of the file (overrides earlier rules) - -#### Syntax Error in Sudoers - -**Symptom:** `visudo` shows error on save - -**Solution:** -- Don't save! Press `e` to re-edit -- Check syntax: username, ALL, NOPASSWD capitalization -- Use exact format: `username ALL=(ALL) NOPASSWD: ALL` - -#### Multiple User Entries - -**Problem:** Multiple rules for same user - -**Solution:** Keep only the **last** (most permissive) rule: - -```bash -# WRONG - conflicting rules -username ALL=(ALL) ALL -username ALL=(ALL) NOPASSWD: ALL - -# RIGHT - single rule -username ALL=(ALL) NOPASSWD: ALL -``` - -## Option 2: SUDO_PASSWORD in .env - -If passwordless sudo is not possible (shared systems, policy restrictions), use the `SUDO_PASSWORD` environment variable. - -### Setup Instructions - -#### Step 1: Create/Edit .env File - -```bash -cd /home/claude/fireteam -nano .env -``` - -#### Step 2: Add SUDO_PASSWORD - -```bash -SUDO_PASSWORD=your_actual_sudo_password_here -``` - - -**Security Risk:** This stores your password in plaintext. Only use if passwordless sudo is not an option. - - -#### Step 3: Secure the .env File - -Restrict file permissions to owner-only: - -```bash -chmod 600 /home/claude/fireteam/.env -``` - -Verify permissions: - -```bash -ls -la /home/claude/fireteam/.env -``` - -**Expected output:** `-rw-------` (600 permissions) - -#### Step 4: Verify .env is in .gitignore - -Ensure `.env` is **never** committed to version control: - -```bash -cat /home/claude/fireteam/.gitignore | grep .env -``` - -**Expected output:** `.env` - -If missing, add it: - -```bash -echo ".env" >> /home/claude/fireteam/.gitignore -``` - -### How SUDO_PASSWORD Works - -When configured, Fireteam agents can execute sudo commands like: - -```bash -echo "$SUDO_PASSWORD" | sudo -S apt install nodejs -``` - -The `-S` flag reads password from stdin, enabling non-interactive sudo. 
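-
-The same pattern can be wrapped in Python. This is only a sketch of the stdin-based approach described above; the helper name and its use of `subprocess` are assumptions, not Fireteam's actual implementation:
-
-```python
-import os
-import subprocess
-
-def run_with_sudo(args):
-    """Run a command via sudo, feeding SUDO_PASSWORD on stdin (sudo -S)."""
-    password = os.environ.get("SUDO_PASSWORD")
-    if not password:
-        raise RuntimeError("SUDO_PASSWORD is not set")
-    return subprocess.run(
-        ["sudo", "-S", *args],
-        input=password + "\n",  # -S reads the password from stdin
-        text=True,
-        capture_output=True,
-        check=True,
-    )
-
-# Example: run_with_sudo(["apt", "install", "-y", "nodejs"])
-```
-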
- -### Testing SUDO_PASSWORD - -Create a test script: - -```bash -#!/bin/bash -source /home/claude/fireteam/.env -echo "$SUDO_PASSWORD" | sudo -S ls /root -``` - -Run it: - -```bash -chmod +x test_sudo.sh -./test_sudo.sh -``` - -**Expected:** List of files (no prompt) - -## Security Considerations - -### Risk Comparison - -| Method | Security Level | Auditability | Convenience | -|--------|---------------|--------------|-------------| -| **Passwordless sudo** | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | -| **SUDO_PASSWORD** | ⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ | -| **No sudo** | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐ | - -### Best Practices - - - -**For Passwordless Sudo:** -1. ✅ Use on dedicated development machines -2. ✅ Limit to specific commands when possible -3. ✅ Regularly audit sudo logs: `sudo journalctl -u sudo` -4. ✅ Rotate sudo passwords periodically (if system requires it) - -**For SUDO_PASSWORD:** -1. ✅ Set `.env` file permissions to 600 -2. ✅ Never commit `.env` to version control -3. ✅ Use a unique password (not your user login password) -4. ✅ Rotate password regularly -5. ✅ Delete `.env` when not actively using Fireteam - -**General:** -1. ✅ Use Fireteam only on trusted systems -2. ✅ Monitor orchestrator logs for unexpected sudo usage -3. ✅ Audit project changes in git commits -4. ✅ Run Fireteam in isolated environments (VMs, containers) - - - -### Understanding the Risks - -#### What Could Go Wrong? - -**With passwordless sudo:** -- Any process running as your user can execute sudo commands -- Malicious code in a project could perform system-level operations -- Agent errors could accidentally modify system files - -**Mitigation:** -- Only run Fireteam on projects you trust -- Review git commits to understand changes -- Use VM or container isolation -- Limit sudo to specific commands - -**With SUDO_PASSWORD:** -- All risks of passwordless sudo, plus: -- Password exposure if `.env` file leaked -- Password readable by any process with file access - -**Mitigation:** -- All passwordless sudo mitigations, plus: -- Strict file permissions (600) -- Dedicated sudo password (not your main password) -- Delete `.env` when not in use - -## Choosing the Right Approach - -### Use Passwordless Sudo If: - -✅ You have administrator access to the system -✅ You're on a dedicated development machine -✅ You trust the projects Fireteam will build -✅ You want maximum efficiency - -### Use SUDO_PASSWORD If: - -✅ Passwordless sudo is not allowed (corporate policy) -✅ Shared system with multiple users -✅ You need sudo occasionally but can't configure sudoers -✅ You're comfortable with the security tradeoff - -### Use No Sudo If: - -✅ Projects don't require system packages -✅ All dependencies installable without sudo (pip, npm --prefix, etc.) -✅ Maximum security is required -✅ You're on a restricted/shared system - - -**Recommendation:** Start with passwordless sudo on a dedicated dev machine. If that's not possible, use SUDO_PASSWORD with proper security measures. As a last resort, work without sudo and manually install system dependencies. - - -## Verifying Sudo Configuration - -### Check Current Sudo Status - -Run this diagnostic script: - -```bash -#!/bin/bash - -echo "=== Fireteam Sudo Diagnostic ===" -echo - -# Check passwordless sudo -echo "1. Testing passwordless sudo..." -if sudo -n true 2>/dev/null; then - echo " ✅ Passwordless sudo: ENABLED" -else - echo " ❌ Passwordless sudo: DISABLED" -fi -echo - -# Check SUDO_PASSWORD in .env -echo "2. Checking SUDO_PASSWORD in .env..." 
-if [ -f /home/claude/fireteam/.env ]; then - if grep -q "SUDO_PASSWORD=" /home/claude/fireteam/.env; then - echo " ✅ SUDO_PASSWORD: CONFIGURED" - - # Check permissions - PERMS=$(stat -c %a /home/claude/fireteam/.env) - if [ "$PERMS" = "600" ]; then - echo " ✅ .env permissions: SECURE (600)" - else - echo " ⚠️ .env permissions: INSECURE ($PERMS) - should be 600" - fi - else - echo " ❌ SUDO_PASSWORD: NOT SET" - fi -else - echo " ❌ .env file: NOT FOUND" -fi -echo - -# Check .gitignore -echo "3. Checking .gitignore..." -if grep -q "^\.env$" /home/claude/fireteam/.gitignore 2>/dev/null; then - echo " ✅ .env in .gitignore: YES" -else - echo " ⚠️ .env in .gitignore: NO - add it!" -fi -echo - -# Summary -echo "=== Summary ===" -if sudo -n true 2>/dev/null; then - echo "✅ Sudo access: READY (passwordless)" -elif [ -f /home/claude/fireteam/.env ] && grep -q "SUDO_PASSWORD=" /home/claude/fireteam/.env; then - echo "✅ Sudo access: READY (via SUDO_PASSWORD)" -else - echo "❌ Sudo access: NOT CONFIGURED" - echo " See: https://docs.fireteam.dev/configuration/sudo-setup" -fi -``` - -Save as `check_sudo.sh`, make executable, and run: - -```bash -chmod +x check_sudo.sh -./check_sudo.sh -``` - -## Impact on Project Performance - -### With Sudo Access (Recommended) - -**Example: TypeScript Project (Node.js required)** - -✅ **Cycle 0:** Detect Node.js needed → `sudo apt install nodejs` → Success -✅ **Cycle 1-3:** Build project, run tests, complete -✅ **Total:** 4 cycles, ~60 minutes - -### Without Sudo Access - -**Same TypeScript Project** - -❌ **Cycle 0:** Detect Node.js needed → `sudo apt install nodejs` → **FAILED** (password required) -❌ **Cycle 1:** Retry `sudo apt install nodejs` → **FAILED** -❌ **Cycle 2:** Try `apt install nodejs` (no sudo) → **FAILED** (permission denied) -❌ **Cycle 3-7:** Various installation attempts → **FAILED** -⚠️ **Cycle 8:** Download Node.js binary to ~/.local/bin → **SUCCESS** (workaround) -✅ **Cycle 9-19:** Build project, resolve issues, complete -❌ **Total:** 20 cycles, ~5 hours (8 wasted cycles) - - -**Real test data:** GitHub Analyzer project wasted 8 cycles (approximately 2 hours) due to lack of sudo access. Final completion: 94%, but with significant efficiency loss. - - -## Troubleshooting - -### "Permission denied" on system operations - -**Symptom:** Agents fail to install packages, get permission errors - -**Diagnosis:** -```bash -# Check sudo access -sudo -n true 2>/dev/null && echo "Sudo OK" || echo "Sudo blocked" - -# Check SUDO_PASSWORD -grep SUDO_PASSWORD /home/claude/fireteam/.env -``` - -**Solution:** Configure one of the sudo methods above - -### SUDO_PASSWORD not working - -**Symptom:** Password set in .env but sudo still fails - -**Diagnosis:** -```bash -# Test SUDO_PASSWORD -source /home/claude/fireteam/.env -echo "$SUDO_PASSWORD" | sudo -S ls /root -``` - -**Possible issues:** -1. Password incorrect → Update .env -2. .env not loaded → Check file exists -3. Special characters in password → Escape or use quotes - -### Passwordless sudo not working - -**Symptom:** Configured in sudoers but still prompts for password - -**Diagnosis:** -```bash -# Check sudoers entry -sudo visudo -c # Check syntax -sudo -l -U $(whoami) # List sudo privileges -``` - -**Solution:** -1. Verify username is correct -2. Ensure NOPASSWD rule is last in file -3. 
Check for conflicting rules above - -## Next Steps - - - - - Review complete configuration options - - - - Configure other .env variables - - - - Resolve sudo-related issues - - - diff --git a/docs/configuration/timeouts.mdx b/docs/configuration/timeouts.mdx deleted file mode 100644 index a63040d..0000000 --- a/docs/configuration/timeouts.mdx +++ /dev/null @@ -1,472 +0,0 @@ ---- -title: "Timeout Configuration" -description: "Understanding and optimizing agent timeout values for optimal performance" ---- - -## Overview - -Agent timeouts are critical to Fireteam's performance. They determine how long each agent (Planner, Executor, Reviewer) can run before being terminated. Properly configured timeouts ensure efficient operation while preventing hung processes. - -## Default Timeout Values - -```python -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes - "reviewer": 600, # 10 minutes - "executor": 1800 # 30 minutes -} -``` - -These values are configured in `/home/claude/fireteam/config.py` and were optimized based on real-world usage patterns. - -## Why These Specific Values? - -| Agent | Timeout | Rationale | -|-------|---------|-----------| -| **Planner** | 10 min | Complex codebase analysis, plan creation, task breakdown | -| **Executor** | 30 min | Package installation, builds, test suites, code implementation | -| **Reviewer** | 10 min | Code review, test execution, completion analysis | - -## Planner Timeout: 10 Minutes - -### Why 10 Minutes? - -The Planner agent performs: -- Complete codebase analysis -- Git history examination -- Plan creation/updates -- Task breakdown and prioritization - -**Test findings:** -- Most planning cycles: 2-5 minutes -- Complex projects: 7-9 minutes -- Edge cases: Up to 10 minutes - - -Original timeout was 5 minutes but was increased to 10 minutes after observing occasional timeouts on complex projects during testing. - - -### When to Increase Planner Timeout - -Increase to **15-20 minutes** if: -- Very large codebases (10,000+ lines) -- Monorepo architecture -- Complex dependency graphs -- Multiple interrelated modules - -```python -AGENT_TIMEOUTS = { - "planner": 1200, # 20 minutes for large projects - # ... -} -``` - -### When to Decrease Planner Timeout - -Decrease to **5-7 minutes** if: -- Small, focused projects -- Simple architectures -- Rapid iteration needs -- Testing/debugging cycles - -## Executor Timeout: 30 Minutes - -### Why 30 Minutes? - -The Executor agent handles: -- Package/dependency installation -- Code implementation -- Build processes -- Test suite execution -- Documentation generation - -**Test findings:** -- Simple implementations: 5-10 minutes -- Medium complexity: 15-20 minutes -- Complex builds: 25-30 minutes -- TypeScript/Node.js setup: Up to 30 minutes (including environment setup) - - -The GitHub Analyzer project (TypeScript) initially failed with a 10-minute timeout. After increasing to 30 minutes, it completed successfully in 19 cycles with 94% completion. - - -### When to Increase Executor Timeout - -Increase to **45-60 minutes** if: -- Docker builds (multi-stage builds) -- Extensive test suites (>1000 tests) -- Large package installations (ML libraries, etc.) -- Cross-compilation tasks -- Database migrations - -```python -AGENT_TIMEOUTS = { - "executor": 3600, # 60 minutes for complex builds - # ... 
-} -``` - -**Example scenarios:** -- Installing TensorFlow, PyTorch: 15-30 minutes -- Building Rust projects: 20-40 minutes -- Running Selenium/E2E tests: 10-30 minutes -- Docker image builds: 20-60 minutes - -### When to Decrease Executor Timeout - -Decrease to **15-20 minutes** if: -- Pure Python projects (no compilation) -- Minimal dependencies -- No build step required -- Unit tests only (< 5 minutes) - -## Reviewer Timeout: 10 Minutes - -### Why 10 Minutes? - -The Reviewer agent performs: -- Complete code review -- Test execution -- Functionality verification -- Completion percentage calculation -- Gap/issue identification - -**Test findings:** -- Most review cycles: 3-7 minutes -- Complex reviews: 8-10 minutes -- With test execution: Up to 10 minutes - -### When to Increase Reviewer Timeout - -Increase to **15-20 minutes** if: -- Running comprehensive test suites during review -- Performance benchmarking -- Security scanning -- Large codebase reviews (10,000+ lines) - -```python -AGENT_TIMEOUTS = { - "reviewer": 1200, # 20 minutes for thorough reviews - # ... -} -``` - -### When to Decrease Reviewer Timeout - -Decrease to **5-7 minutes** if: -- Small projects -- No test execution required -- Quick validation cycles -- Documentation-only changes - -## Real-World Timeout Examples - -### Example 1: Simple Python CLI (Weather App) - -**Project characteristics:** -- 200 lines of code -- 2 dependencies (requests, python-dotenv) -- Basic API integration -- Unit tests (< 1 minute) - -**Optimal timeouts:** - -```python -AGENT_TIMEOUTS = { - "planner": 300, # 5 minutes - "executor": 900, # 15 minutes - "reviewer": 300 # 5 minutes -} -``` - -**Result:** Completed in 2 cycles, ~30 minutes total - -### Example 2: REST API Server (FastAPI) - -**Project characteristics:** -- 500 lines of code -- 5-10 dependencies -- Database integration (SQLite) -- API endpoint tests - -**Optimal timeouts (defaults work well):** - -```python -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes - "executor": 1800, # 30 minutes - "reviewer": 600 # 10 minutes -} -``` - -**Result:** Completed in 1 cycle with 92% completion ⚡ - -### Example 3: TypeScript CLI (GitHub Analyzer) - -**Project characteristics:** -- 1000+ lines of TypeScript -- Node.js + npm setup -- 15+ dependencies -- 206 tests -- Complex build process - -**Optimal timeouts:** - -```python -AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes - "executor": 3600, # 60 minutes (build + test suite) - "reviewer": 1200 # 20 minutes (run all tests) -} -``` - -**Result:** Completed in 19 cycles with 94% completion -(Note: 8 cycles wasted on Node.js installation due to sudo issues) - -### Example 4: Machine Learning Project - -**Project characteristics:** -- Large dependencies (TensorFlow, pandas, numpy) -- Model training code -- Data preprocessing pipeline -- Notebook integration - -**Optimal timeouts:** - -```python -AGENT_TIMEOUTS = { - "planner": 1200, # 20 minutes (complex planning) - "executor": 5400, # 90 minutes (dependency install + training) - "reviewer": 1800 # 30 minutes (validation runs) -} -``` - -## Timeout Behavior - -### What Happens on Timeout? - -1. **Agent process terminated** after timeout duration -2. **Retry logic triggered** (up to 3 attempts by default) -3. **Exponential backoff** between retries (5 seconds delay) -4. **Cycle failure** if all retries exhausted -5. **Error logged** to orchestrator log - - -Timeouts **do not** corrupt project state. The state file is only updated on successful cycle completion. - - -### Timeout vs. 
Completion Time - -**Important distinction:** -- **Timeout**: Maximum allowed time -- **Actual completion**: Usually much shorter - -**Example:** Executor with 30-minute timeout typically completes in 15-20 minutes. - -## Performance Impact - -### Timeout Too Short - -**Symptoms:** -- Frequent agent timeouts -- Repeated retry cycles -- Failed installations/builds -- Incomplete implementations - -**Impact:** -- Wasted cycles (each timeout wastes full timeout duration) -- Project delays -- Potential project failure - -**Example:** GitHub Analyzer with 10-minute executor timeout: -- 8 wasted cycles trying to install Node.js -- Each timeout = 10 minutes wasted -- Total waste: ~80 minutes - -### Timeout Too Long - -**Symptoms:** -- Hung processes not killed promptly -- Excessive waiting on failures -- Resource waste on dead agents - -**Impact:** -- Delayed failure detection -- Wasted computational resources -- Longer time to recovery - -**Example:** Planner with 60-minute timeout: -- Normal planning: 5 minutes -- On failure: Wait 60 minutes to detect -- Waste: 55 minutes per failed cycle - -## Adaptive Timeout Strategy - -### Context-Based Timeouts - -Consider adjusting timeouts based on: - -**Project type:** -- Python CLI: Shorter timeouts (5-15-5) -- TypeScript/Build: Longer timeouts (15-60-20) -- ML/Data: Extended timeouts (20-90-30) - -**Cycle number:** -- Early cycles (0-2): Full timeouts (more uncertainty) -- Later cycles (3+): Can reduce by 20-30% (smaller changes) - -**Current example (not yet implemented):** - -```python -def get_timeout(agent: str, cycle: int) -> int: - """Get adaptive timeout based on cycle number.""" - base = AGENT_TIMEOUTS[agent] - - # Reduce timeout for later cycles - if cycle > 3: - return int(base * 0.7) # 30% reduction - return base -``` - -## Monitoring Timeout Efficiency - -### Check Logs for Timing Data - -Orchestrator logs show actual agent execution times: - -```bash -grep "completed" /home/claude/fireteam/logs/orchestrator_*.log -``` - -**Example output:** - -``` -2025-10-15 14:32:10 - Planning completed (4m 23s) -2025-10-15 14:48:35 - Execution completed (16m 25s) -2025-10-15 14:55:12 - Review completed (6m 37s) -``` - -### Optimization Process - -1. **Baseline**: Run project with default timeouts -2. **Analyze**: Check logs for actual completion times -3. **Adjust**: Set timeouts to 1.5x-2x actual times -4. **Test**: Verify no timeouts occur -5. **Iterate**: Fine-tune based on results - -## Timeout Configuration Checklist - - - -**Initial Setup:** -- [ ] Start with default timeouts (10-30-10) -- [ ] Run a test project to completion -- [ ] Analyze orchestrator logs for actual times - -**For Your Project:** -- [ ] Identify project type (Python/TypeScript/ML/etc.) -- [ ] Estimate build complexity (simple/medium/complex) -- [ ] Consider dependency installation time -- [ ] Factor in test suite duration - -**Adjustment:** -- [ ] Set timeouts to 1.5-2x expected duration -- [ ] Add buffer for network variability -- [ ] Test with a similar project -- [ ] Monitor for timeout errors - -**Validation:** -- [ ] Zero timeout errors in logs -- [ ] Reasonable completion times -- [ ] No excessive waiting on failures -- [ ] Consistent cycle duration - - - -## Best Practices - -### 1. Start Conservative - - -Begin with default or slightly higher timeouts. You can always decrease them after observing actual performance. - - -### 2. Account for Network Latency - -Claude API calls vary with network conditions. Add 10-20% buffer for network variability. - -### 3. 
Consider CI/CD Environments - -If running Fireteam in CI/CD: -- Shared resources may slow agents -- Network may be throttled -- Increase timeouts by 25-50% - -### 4. Document Custom Timeouts - -Always comment why you changed timeouts: - -```python -# Increased for large ML dependencies (TensorFlow = 20min install) -AGENT_TIMEOUTS = { - "executor": 3600 # 60 minutes -} -``` - -### 5. Test After Changes - -Validate timeout changes with a representative project before production use. - -## Troubleshooting Timeouts - -### Frequent Planner Timeouts - -**Possible causes:** -- Very large codebase (10k+ lines) -- Complex git history -- Network latency to Claude API - -**Solutions:** -- Increase planner timeout to 15-20 minutes -- Simplify project structure -- Check network connectivity - -### Frequent Executor Timeouts - -**Possible causes:** -- Long package installations -- Complex build processes -- Extensive test suites -- System package install blocked (no sudo) - -**Solutions:** -- Increase executor timeout to 45-60 minutes -- Configure sudo access ([Sudo Setup](/configuration/sudo-setup)) -- Pre-install system dependencies -- Optimize build process - -### Frequent Reviewer Timeouts - -**Possible causes:** -- Running full test suite in review -- Performance benchmarks -- Large codebase review - -**Solutions:** -- Increase reviewer timeout to 15-20 minutes -- Optimize test execution -- Consider parallel test execution - -## Next Steps - - - - - Complete configuration file documentation - - - - Resolve timeout-related issues - - - diff --git a/docs/core-concepts/agents.mdx b/docs/core-concepts/agents.mdx deleted file mode 100644 index 7402bc6..0000000 --- a/docs/core-concepts/agents.mdx +++ /dev/null @@ -1,638 +0,0 @@ ---- -title: "Agents" -description: "Understanding the specialized AI agents that power Fireteam's autonomous development" ---- - -## Agent Architecture - -Fireteam employs three specialized AI agents, each with a distinct role in the development cycle. All agents are powered by **Claude AI** through the Claude CLI, with specialized prompts that guide their behavior. 
- -``` -┌─────────────────────────────────────────────────┐ -│ BaseAgent (Abstract) │ -│ - Claude CLI invocation │ -│ - Timeout management │ -│ - Retry logic with exponential backoff │ -│ - Error handling │ -└─────────────────────────────────────────────────┘ - │ - ┌──────────────┼──────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ Planner │ │ Executor │ │ Reviewer │ - │ Agent │ │ Agent │ │ Agent │ - └──────────┘ └──────────┘ └──────────┘ -``` - -## BaseAgent Class - -All agents inherit from **BaseAgent** (`agents/base.py`), which provides common functionality: - -### Core Features - -**Claude CLI Integration**: -```python -claude --print --dangerously-skip-permissions "" -``` - -**Configurable Timeouts**: -- Planner: 10 minutes (600 seconds) -- Executor: 30 minutes (1800 seconds) -- Reviewer: 10 minutes (600 seconds) - -**Retry Logic**: -- Maximum retries: 3 attempts -- Retry delay: 5 seconds -- Exponential backoff on failures -- Detailed error logging - -**Error Handling**: -- Timeout exceptions -- Command execution errors -- Network failures -- Graceful degradation - -### BaseAgent Methods - -```python -class BaseAgent: - def __init__(self, agent_type: str, logger: Logger): - self.agent_type = agent_type - self.timeout = AGENT_TIMEOUTS[agent_type] - self.max_retries = MAX_RETRIES - - def _build_command(self, prompt: str, project_dir: str) -> list: - """Build Claude CLI command with prompt.""" - - def _execute_command(self, cmd: list, project_dir: str) -> dict: - """Execute command with retry logic.""" - - def execute(self, **kwargs) -> dict: - """Must be implemented by subclasses.""" -``` - -## Planner Agent - -The **Planner Agent** (`agents/planner.py`) is the strategic thinker that creates and updates project plans. - -### Role & Responsibilities - - - Creates comprehensive, actionable project plans that break down goals into concrete tasks - - -**Key Responsibilities**: -- Analyze the project goal -- Break down work into actionable tasks -- Organize tasks in logical order -- Identify key milestones -- Consider edge cases and testing requirements -- Update plans based on execution feedback -- Adapt to blockers and changes - -### Input & Output - -**Cycle 0 (Initial Planning)**: -- **Input**: Project goal only -- **Output**: Comprehensive initial plan - -**Cycle 1+ (Plan Updates)**: -- **Input**: Goal, previous plan, execution result, review feedback -- **Output**: Updated plan with progress and adjusted priorities - -### Prompt Strategy - -#### Initial Plan Prompt (Cycle 0) - -The Planner receives a focused prompt to create the first plan: - -``` -You are a Planner Agent in an autonomous multi-agent system. - -PROJECT GOAL: -{goal} - -YOUR TASK: -Create a comprehensive, actionable project plan to achieve this goal. - -Your plan should: -1. Break down the goal into clear, concrete tasks -2. Organize tasks in logical order -3. Identify key milestones -4. Consider edge cases and testing requirements -5. Aim for production-ready quality - -OUTPUT FORMAT: -- Overview/Summary -- Task breakdown with priorities -- Key milestones -- Testing strategy -- Success criteria -``` - -#### Update Plan Prompt (Cycle 1+) - -For subsequent cycles, the Planner gets full context: - -``` -PROJECT GOAL: {goal} -CYCLE NUMBER: {cycle_number} -PREVIOUS PLAN: {previous_plan} -LAST EXECUTION RESULT: {execution_result} -LAST REVIEW: {review_feedback} - -YOUR TASK: -Update the project plan based on progress and feedback. - -Consider: -1. What has been completed successfully? -2. 
What issues or blockers were encountered? -3. What tasks remain? -4. What adjustments are needed? -5. Are we ready for final validation? -``` - -### Timeout Rationale - -**10 minutes** - Why? - -- Planning is primarily analytical work (reading, thinking, organizing) -- No code execution or testing required -- Claude can analyze codebases quickly -- Planning typically completes in 3-7 minutes -- Longer timeouts would slow cycle throughput - - -If your projects require complex planning, you can increase the planner timeout in `config.py`: - -```python -AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes -} -``` - - -### Example Plan Output - -```markdown -# Project Plan - Cycle 2 - -## Progress Summary -- ✅ Core API integration complete -- ✅ Basic CLI interface working -- ⚠️ Error handling needs improvement -- ❌ Tests incomplete - -## Updated Task List - -### High Priority -1. [IN PROGRESS] Add comprehensive error handling - - Retry logic for API failures - - Network timeout handling - - Invalid response validation - -2. [TODO] Complete test suite - - Unit tests for API client - - Integration tests for CLI - - Edge case coverage - -### Medium Priority -3. [TODO] Documentation polish - - Usage examples - - API documentation - - Installation instructions - -## Next Steps -Focus on error handling in this cycle. The Executor should prioritize -tasks 1 and 2 to move toward 95% completion threshold. -``` - -## Executor Agent - -The **Executor Agent** (`agents/executor.py`) is the hands-on builder that transforms plans into working code. - -### Role & Responsibilities - - - Executes planned tasks by writing production-ready, tested code - - -**Key Responsibilities**: -- Implement tasks from the current plan -- Write clean, production-quality code -- Create tests for implementations -- Handle errors gracefully -- Document code and usage -- Leave codebase in functional state - -### Input & Output - -**Input**: -- Project goal -- Current plan from Planner -- Cycle number -- Project directory path - -**Output**: -- Summary of work completed -- Files created/modified -- Issues encountered -- Remaining work - -### Prompt Strategy - -The Executor receives a task-focused prompt: - -``` -You are an Executor Agent in an autonomous multi-agent system. - -PROJECT GOAL: {goal} -CYCLE NUMBER: {cycle_number} -CURRENT PLAN: {plan} - -YOUR TASK: -Execute the tasks outlined in the plan. You should: - -1. Work through tasks systematically -2. Create/modify files as needed -3. Write clean, production-ready code -4. Test your implementations -5. Handle errors gracefully -6. Document your work - -IMPORTANT: -- Focus on the NEXT actionable tasks from the plan -- Write actual, working code (not pseudocode) -- Test thoroughly before considering tasks complete -- If you encounter blockers, document them clearly -- Leave the codebase in a functional state - -OUTPUT FORMAT: -Provide a summary of: -- What you accomplished -- What files you created/modified -- Any issues encountered -- What still needs to be done - -Work efficiently and aim for quality. Do not leave placeholders or -incomplete implementations. -``` - -### Timeout Rationale - -**30 minutes** - Why? 
- -- Implementation is the most time-intensive phase -- Includes writing code, running tests, debugging -- May involve installing dependencies (`pip install`, `npm install`) -- Large projects may need significant file operations -- Implementation work is the most time-intensive phase - - -The executor has **3x the timeout** of other agents because implementation includes: -- Writing multiple files -- Running test suites -- Installing and configuring dependencies -- Debugging failures -- Building/compiling code - - -### Example Execution Output - -``` -# Execution Summary - Cycle 2 - -## Accomplished -✅ Added comprehensive error handling to API client - - Implemented retry logic with exponential backoff (3 attempts) - - Added network timeout handling (10s default) - - Validate API responses before parsing - -✅ Expanded test suite - - Added 12 new unit tests for API client (tests/test_api.py:25-150) - - Added integration tests for CLI (tests/test_cli.py) - - All 18 tests passing - -✅ Improved documentation - - Updated README with installation steps - - Added usage examples - - Documented error codes - -## Files Modified -- src/api_client.py (75 lines changed) -- src/cli.py (23 lines changed) -- tests/test_api.py (120 lines added) -- tests/test_cli.py (new file, 85 lines) -- README.md (45 lines changed) - -## Issues Encountered -- None - -## Remaining Work -- Performance optimization (low priority) -- Additional edge case tests (nice-to-have) - -## Status -Project is feature-complete and production-ready. Ready for review. -``` - -## Reviewer Agent - -The **Reviewer Agent** (`agents/reviewer.py`) is the quality control expert that validates progress and estimates completion. - -### Role & Responsibilities - - - Critically evaluates the codebase, runs tests, and estimates completion percentage - - -**Key Responsibilities**: -- Examine the entire codebase -- Run tests and verify functionality -- Compare implementation against goal -- Identify gaps, bugs, or incomplete features -- Calculate honest completion percentage (0-100%) -- Provide actionable feedback - -### Input & Output - -**Input**: -- Project goal -- Current plan -- Execution result summary -- Cycle number -- Validation mode flag (true when completion ≥95%) - -**Output**: -- Completion percentage (0-100%) -- Review summary -- What's working well -- What's incomplete or broken -- Next steps - -### Prompt Strategy - -#### Standard Review Prompt - -``` -You are a Reviewer Agent in an autonomous multi-agent system. - -PROJECT GOAL: {goal} -CYCLE NUMBER: {cycle_number} -CURRENT PLAN: {plan} -LATEST EXECUTION RESULT: {execution_result} - -YOUR TASK: -Review the project's current state and assess progress. - -You should: -1. Examine the codebase thoroughly -2. Check what has been implemented vs. planned -3. Test functionality where possible -4. Identify gaps, issues, or incomplete work -5. Assess production-readiness -6. Provide an honest completion estimate - -COMPLETION CRITERIA: -- 0%: Nothing started -- 25%: Basic structure in place -- 50%: Core functionality implemented -- 75%: Most features working, needs polish -- 90%: Feature complete, needs testing -- 95%: Production-ready with comprehensive testing -- 100%: Perfect, nothing more needed - -OUTPUT FORMAT: -Your response MUST include a completion percentage in this format: -COMPLETION: XX% - -Then provide: -- Summary of current state -- What's working well -- What's incomplete or broken -- What needs to be done next -- Whether ready for production - -Be honest and critical. 
Don't inflate percentages. -``` - -#### Validation Mode (≥95% Completion) - -When the project reaches ≥95%, the Reviewer enters **critical validation mode**: - -``` -VALIDATION MODE: -This is a validation check. The system believes the project is >95% -complete. - -Be CRITICAL and thorough. Check for: -- Edge cases that might not be handled -- Missing error handling -- Incomplete features -- Testing gaps -- Production-readiness issues - -Only confirm high completion if truly production-ready. -``` - -This prevents false positives and ensures genuine quality. - -### Completion Percentage Extraction - -The Reviewer's output is parsed to extract the completion percentage: - -```python -# Primary pattern: "COMPLETION: XX%" -match = re.search(r'COMPLETION:\s*(\d+)%', output, re.IGNORECASE) - -# Fallback: Any percentage in output -if not match: - match = re.search(r'(\d+)%', output) - -# Default: 0% if parsing fails -completion = int(match.group(1)) if match else 0 -``` - -### Timeout Rationale - -**10 minutes** - Why? - -- Review is analytical (code reading, test running) -- No code writing or large file operations -- Test suites typically run in seconds to minutes -- Claude can scan codebases efficiently -- Similar to Planner in complexity - -### Example Review Output - -``` -COMPLETION: 98% - -## Summary of Current State - -The Bitcoin CLI project is **production-ready** with comprehensive -functionality, testing, and documentation. - -## What's Working Well - -✅ **Core Functionality** -- CoinGecko API integration working flawlessly -- Clean CLI interface with argparse -- Formatted output (price, 24h change, market cap) - -✅ **Error Handling** -- Retry logic with exponential backoff -- Network timeout handling (10s) -- Invalid response validation -- Graceful degradation on API failures - -✅ **Testing** -- 18 unit tests, all passing -- Integration tests cover main workflows -- Edge cases tested (network failures, invalid responses) - -✅**Documentation** -- Clear README with installation and usage -- Code comments explain complex logic -- Error messages are user-friendly - -## What's Incomplete or Broken - -⚠️ **Minor Polish Items** (not blocking) -- Could add support for multiple cryptocurrencies (scope creep) -- Performance could be optimized (not needed for CLI) - -## Next Steps - -Project is ready for production deployment. No critical work remains. -The minor polish items are nice-to-haves that exceed the original goal. - -## Production Ready? - -**YES** - This project meets the goal, has comprehensive testing, handles -errors gracefully, and is documented. Recommend validation completion. -``` - -## Agent Communication - -Agents **do not communicate directly**. The Orchestrator manages all data flow: - -``` -Cycle N: - 1. Orchestrator calls Planner - - Passes: goal, previous_plan, last_review - - Receives: new_plan - - 2. Orchestrator calls Executor - - Passes: goal, new_plan - - Receives: execution_result - - 3. Orchestrator calls Reviewer - - Passes: goal, new_plan, execution_result - - Receives: review, completion_percentage - - 4. Orchestrator saves all outputs to state.json - - 5. Orchestrator commits changes to Git - - 6. 
Cycle repeats -``` - -This architecture ensures: -- **Clean separation of concerns**: Each agent has a single, well-defined role -- **Stateless agents**: No persistent memory between invocations -- **Reproducible behavior**: Same inputs → same outputs -- **Easy debugging**: All communication logged and tracked - -## Timeout Configuration - -All agent timeouts are configurable in `config.py`: - -```python -# agents/base.py -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes - "reviewer": 600, # 10 minutes - "executor": 1800 # 30 minutes -} -``` - -### When to Adjust Timeouts - -**Increase Planner timeout** if: -- Working with very large, complex codebases -- Planning requires extensive analysis -- Projects have many dependencies to review - -**Increase Executor timeout** if: -- Projects require long build/compile times -- Large test suites take >10 minutes to run -- Dependency installation is slow (Node.js projects) - -**Increase Reviewer timeout** if: -- Test suites take >5 minutes to run -- Projects have extensive codebases to scan -- Complex integration tests needed - - -Increasing timeouts will slow down cycle throughput. Only adjust if you're seeing frequent timeouts in logs. - - -## Performance Insights - -Typical performance patterns: - -### Planner Performance -- **Average time**: 2-5 minutes per cycle -- **Timeout rate**: Very low (rarely times out) -- **Success rate**: Very high - -### Executor Performance -- **Average time**: 10-20 minutes per cycle -- **Timeout rate**: Low (mainly on first cycle of complex projects) -- **Success rate**: High -- **Bottleneck**: Dependency installation (Node.js, large Python packages) - -### Reviewer Performance -- **Average time**: 3-7 minutes per cycle -- **Timeout rate**: <2% -- **Success rate**: 98%+ -- **Accuracy**: Completion estimates within ±5% of reality - -## Best Practices - -### For Better Planning -- Provide detailed, specific goals -- Include desired tech stack in goal description -- Mention testing and documentation requirements - -### For Better Execution -- Plans should have clear, actionable tasks -- Avoid vague instructions like "improve code quality" -- Specify exact features and acceptance criteria - -### For Better Reviews -- Let Reviewer be critical (it's designed to be) -- Don't second-guess low completion percentages -- Trust the triple-validation system - -## Next Steps - - - - Learn how agents work together in cycles - - - - Optimize timeouts for your projects - - - - See the big picture - - - - How state flows between agents - - diff --git a/docs/core-concepts/architecture.mdx b/docs/core-concepts/architecture.mdx deleted file mode 100644 index 5b2b212..0000000 --- a/docs/core-concepts/architecture.mdx +++ /dev/null @@ -1,510 +0,0 @@ ---- -title: "Architecture" -description: "Deep dive into Fireteam's multi-agent orchestration architecture and cycle-based execution model" ---- - -## System Overview - -Fireteam is built on a **multi-agent orchestration architecture** where specialized AI agents work in coordinated cycles to build complete software projects autonomously. 
- -``` -┌─────────────────────────────────────────────────────────┐ -│ ORCHESTRATOR │ -│ (Infinite Loop) │ -└─────────────────────────────────────────────────────────┘ - │ - ▼ - ┌────────────────────────────────┐ - │ CYCLE N (repeats) │ - └────────────────────────────────┘ - │ - ┌────────────────┼────────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌────────┐ ┌──────────┐ ┌──────────┐ - │ PLANNER│ │ EXECUTOR │ │ REVIEWER │ - │ Agent │ → │ Agent │ → │ Agent │ - └────────┘ └──────────┘ └──────────┘ - │ │ │ - └────────────────┼────────────────┘ - │ - ▼ - ┌─────────────┐ - │ GIT COMMIT │ - └─────────────┘ - │ - ▼ - ┌──────────────────────┐ - │ ≥95% for 3 cycles? │ - │ Yes → Complete │ - │ No → Next Cycle │ - └──────────────────────┘ -``` - -## Core Components - -### 1. Orchestrator - -The **Orchestrator** (`orchestrator.py`) is the mission commander that coordinates all agents and manages the execution loop. - -**Responsibilities**: -- Initialize and validate Git repository -- Load and manage project state -- Execute agents in sequence (Planner → Executor → Reviewer) -- Handle agent failures and retries -- Create Git commits after each cycle -- Validate completion criteria (3 consecutive 95%+ reviews) -- Manage graceful shutdown - -**Key Features**: -- **Infinite loop** until completion or manual stop -- **State persistence** between cycles -- **Error recovery** with configurable retries -- **Signal handling** for graceful shutdown (SIGTERM, SIGINT) - -```python -# Simplified orchestrator loop -while not completion_validated: - # Plan phase - plan = planner_agent.execute(goal, prev_plan, last_review) - - # Execute phase - execution_result = executor_agent.execute(plan, goal) - - # Review phase - review = reviewer_agent.execute(goal, plan, execution_result) - - # Git commit - git_commit(cycle_number, completion_percentage) - - # Check completion - if review.completion >= 95: - validation_count += 1 - if validation_count >= 3: - completion_validated = True - else: - validation_count = 0 - - cycle_number += 1 -``` - -### 2. State Manager - -The **State Manager** (`state/manager.py`) provides isolated state management for each project. - -**State Schema**: -```json -{ - "project_dir": "/absolute/path/to/project", - "goal": "Project objective description", - "status": "planning|executing|reviewing|completed", - "cycle_number": 0, - "completion_percentage": 0, - "validation_checks": 0, - "git_branch": "agent-20251017-143022", - "current_plan": "Latest plan from Planner agent", - "last_execution_result": "Latest Executor output", - "last_review": "Latest Reviewer output", - "created_at": "2025-10-17T14:30:22Z", - "updated_at": "2025-10-17T15:45:33Z" -} -``` - -**State Isolation**: -- State is **completely reset** between projects -- Prevents cross-contamination of goals, plans, or context -- Each project starts fresh with no residual data - - -State files are stored in `state/current.json` and are gitignored. Never commit state files. - - -### 3. Specialized Agents - -Three specialized agents handle different phases of development: - - - - Creates and updates project plans based on goals and progress - - - Implements tasks from the plan with production-ready code - - - Tests functionality and estimates completion percentage - - - -Each agent is a **subclass of BaseAgent** (`agents/base.py`) and implements: -- Specialized prompt generation -- Claude CLI invocation -- Output parsing and validation -- Timeout management -- Retry logic - -Learn more about agents in the [Agents](/core-concepts/agents) documentation. 
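-
-As an illustrative sketch of that contract (the exact constructor and method signatures are assumptions based on the `BaseAgent` outline, not Fireteam's actual code), a specialized agent mainly supplies its own prompt and delegates CLI invocation, timeouts, and retries to the base class:
-
-```python
-from agents.base import BaseAgent
-
-class PlannerAgent(BaseAgent):
-    def __init__(self, logger):
-        super().__init__(agent_type="planner", logger=logger)
-
-    def execute(self, goal, project_dir, previous_plan=None, last_review=None):
-        # Specialized prompt generation (abbreviated here)
-        prompt = f"You are a Planner Agent.\n\nPROJECT GOAL:\n{goal}\n"
-        if previous_plan:
-            prompt += f"\nPREVIOUS PLAN:\n{previous_plan}\nLAST REVIEW:\n{last_review}\n"
-        # Claude CLI invocation, output handling, and retries live in BaseAgent
-        cmd = self._build_command(prompt, project_dir)
-        return self._execute_command(cmd, project_dir)
-```
-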
- -## Execution Flow - -### Phase 1: Initialization - -When you run `start-agent`, the orchestrator: - -1. **Validates the project directory** - - Creates directory if it doesn't exist - - Checks for write permissions - -2. **Initializes Git repository** - - Runs `git init` if not already a repo - - Creates timestamped branch (e.g., `agent-20251017-143022`) - - Makes initial commit with Fireteam metadata - -3. **Loads or creates state** - - Checks for existing `state/current.json` - - Creates fresh state for new projects - - Validates state schema - -4. **Configures Git** - - Sets user name and email from environment or config - - Configures branch tracking - -### Phase 2: Cycle Execution - -Each cycle consists of four phases: - -#### Planning (Planner Agent) - -**Input**: -- Project goal (constant across all cycles) -- Previous plan (if any) -- Last review output (if any) -- Current cycle number - -**Process**: -- Analyzes what's been built so far -- Reviews feedback from the Reviewer -- Creates or updates actionable task list -- Prioritizes remaining work - -**Output**: -- Structured project plan -- Task breakdown -- Implementation guidance - -**Timeout**: 10 minutes (configurable) - -#### Execution (Executor Agent) - -**Input**: -- Current plan from Planner -- Project goal -- Project directory path - -**Process**: -- Reads the plan and goal -- Implements tasks with actual working code -- Creates files, writes tests, adds documentation -- Runs tests to verify functionality - -**Output**: -- Working code implementations -- Test files -- Documentation updates -- Summary of work completed - -**Timeout**: 30 minutes (configurable) - - -The Executor has the longest timeout because implementation is the most time-intensive phase, especially for complex projects. - - -#### Review (Reviewer Agent) - -**Input**: -- Project goal -- Current plan -- Execution result summary -- Project directory path - -**Process**: -- Scans the entire codebase -- Runs tests and checks functionality -- Compares against the goal -- Identifies gaps or issues -- Calculates completion percentage (0-100%) - -**Output**: -- Completion percentage (0-100%) -- List of completed features -- List of remaining work -- Quality assessment - -**Timeout**: 10 minutes (configurable) - -#### Git Commit - -After each cycle, the orchestrator: - -1. **Stages all changes** - ```bash - git add -A - ``` - -2. **Creates descriptive commit** - ```bash - git commit -m "Cycle N: " - ``` - -3. **Pushes to remote** (if origin exists) - ```bash - git push -u origin - ``` - -This creates a complete audit trail of the project's development. - -### Phase 3: Validation - -After the Review phase, the orchestrator checks completion: - -```python -if completion_percentage >= 95: - validation_checks += 1 - - if validation_checks >= 3: - # Project complete! - mark_as_completed() - shutdown() - else: - # Continue validation cycles - continue_to_next_cycle() -else: - # Reset validation counter - validation_checks = 0 - continue_to_next_cycle() -``` - -**Why 3 consecutive checks?** - -A single high score could be a false positive. Three consecutive ≥95% reviews ensure: -- Consistent quality across cycles -- No regression in functionality -- Thorough validation from multiple angles -- Genuine project completion - - -The Reviewer is instructed to take a **fresh, critical look** each time, not just rubber-stamp previous assessments. - - -## Data Flow - -### Information Passing Between Agents - -Agents don't communicate directly. 
The orchestrator passes outputs as inputs: - -``` -Cycle N: - Planner Input: (goal, plan[N-1], review[N-1]) - ↓ - Planner Output: plan[N] - ↓ - Executor Input: (goal, plan[N]) - ↓ - Executor Output: execution_result[N] - ↓ - Reviewer Input: (goal, plan[N], execution_result[N]) - ↓ - Reviewer Output: review[N] (includes completion %) - ↓ - State Update: Save all outputs to state.json - ↓ - Git Commit: Commit all file changes -``` - -### State Persistence - -After each agent execution, state is saved to disk: - -```python -state_manager.update({ - "cycle_number": current_cycle, - "status": "reviewing", # or "planning", "executing" - "current_plan": plan_output, - "last_execution_result": execution_output, - "last_review": review_output, - "completion_percentage": review.completion, - "validation_checks": validation_count, - "updated_at": datetime.now().isoformat() -}) -``` - -This enables: -- **Crash recovery**: Restart from last saved state -- **Progress monitoring**: External tools can read state -- **Debugging**: Examine state at any point - -## Git Integration - -### Repository Structure - -Fireteam creates this Git structure: - -``` -project-dir/ -├── .git/ -│ └── (Fireteam-managed Git repo) -├── src/ -│ └── (Agent-written code) -├── tests/ -│ └── (Agent-written tests) -├── README.md (Agent-written docs) -├── requirements.txt (or package.json) -└── (other agent-created files) -``` - -### Branch Strategy - -Fireteam uses **timestamped feature branches**: - -``` -main (untouched) - │ - └── agent-20251017-143022 (Fireteam working branch) -``` - -**Benefits**: -- Main branch stays clean -- Easy to compare before/after -- Multiple Fireteam runs create separate branches -- You decide when to merge to main - -### Commit Messages - -Automatic commits follow this format: - -``` -Cycle 0: Initial implementation with API integration - -- Created main application structure -- Implemented CoinGecko API client -- Added error handling and retries -- Wrote initial test suite - -Completion: 88% -``` - -Clear, descriptive, and includes completion percentage. - -## Error Handling - -### Agent Failure Recovery - -When an agent fails (timeout, error, crash): - -1. **Retry Logic**: Up to 3 retries with exponential backoff -2. **State Preservation**: Current state is saved before retry -3. **Logging**: Detailed error logs in `logs/orchestrator_*.log` -4. **Graceful Degradation**: If all retries fail, stop gracefully - -```python -for attempt in range(MAX_RETRIES): - try: - result = agent.execute() - return result - except TimeoutError: - if attempt < MAX_RETRIES - 1: - time.sleep(RETRY_DELAY * (2 ** attempt)) # Exponential backoff - continue - else: - raise -``` - -### Parse Failure Handling - -If the Reviewer's output can't be parsed (invalid format): - -- **Use last known completion %**: Prevents completion from resetting to 0% -- **Log warning**: Alert but don't crash -- **Continue cycle**: Allow system to self-correct - -### State Corruption Recovery - -If `state/current.json` is corrupted: - -1. Stop the agent: `stop-agent` -2. Remove corrupt state: `rm state/current.json` -3. Restart with fresh state - -The Git history is preserved, so no code is lost. 
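-
-The recommended recovery is the manual reset above, but the same fallback can be made explicit in code. This sketch assumes the state schema shown earlier and is not Fireteam's actual state manager:
-
-```python
-import json
-from pathlib import Path
-
-STATE_FILE = Path("/home/claude/fireteam/state/current.json")
-
-def load_state():
-    """Load saved state, starting fresh if the file is missing or corrupt."""
-    try:
-        with STATE_FILE.open() as f:
-            return json.load(f)
-    except (FileNotFoundError, json.JSONDecodeError):
-        # Corrupt or missing state: start over; Git history still preserves all code
-        return {
-            "status": "planning",
-            "cycle_number": 0,
-            "completion_percentage": 0,
-            "validation_checks": 0,
-        }
-```
-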
- -## Scalability & Performance - -### Resource Management - -**CPU**: Light usage - primarily orchestration logic -**Memory**: ~100-500MB depending on project size -**Disk**: Logs and state files (typically <100MB) -**Network**: Claude API calls (minimal bandwidth) - -### Timing Characteristics - -Based on real test data: - -- **Single-cycle projects**: 20-30 minutes -- **Typical projects (2-3 cycles)**: 45-90 minutes -- **Complex projects (5+ cycles)**: 2-4 hours -- **Average**: ~50 minutes per project - -### Bottlenecks - -**Primary bottleneck**: Agent execution time (Claude API) - -**Mitigation strategies**: -- Optimized timeouts per agent role -- Retry logic with exponential backoff -- State caching to avoid redundant work - -## Security Considerations - -### Dangerous Permissions Flag - -Fireteam uses `--dangerously-skip-permissions` with Claude CLI to enable fully autonomous operation. - - -This allows agents to execute any command without confirmation. Only use Fireteam in isolated environments or with projects you trust. - - -### Sudo Operations - -Some operations (Node.js installation, system packages) may require sudo: - -- **Option 1**: Configure passwordless sudo (see [Sudo Setup](/configuration/sudo-setup)) -- **Option 2**: Set `SUDO_PASSWORD` in `.env` file - -**Security Note**: Never commit `.env` with passwords to Git. - -### Git Credentials - -If pushing to remote repositories, ensure: -- SSH keys are configured, OR -- Git credential helper is set up -- Fireteam will push to `origin` if it exists - -## Next Steps - - - - Deep dive into Planner, Executor, and Reviewer agents - - - - How cycles work and validation logic - - - - Optimize agent timeouts for your projects - - - - Advanced state management patterns - - diff --git a/docs/core-concepts/cycles.mdx b/docs/core-concepts/cycles.mdx deleted file mode 100644 index 72cc294..0000000 --- a/docs/core-concepts/cycles.mdx +++ /dev/null @@ -1,578 +0,0 @@ ---- -title: "Cycles" -description: "Understanding the cycle-based execution model and multi-cycle validation system" ---- - -## What is a Cycle? - -A cycle is one complete iteration through all three agent phases: **Plan → Execute → Review**. Fireteam runs these cycles in an infinite loop until validation criteria are met. - -``` -Cycle 0: Plan → Execute → Review → Git Commit → 88% complete -Cycle 1: Plan → Execute → Review → Git Commit → 95% complete ✓ (1/3) -Cycle 2: Plan → Execute → Review → Git Commit → 98% complete ✓ (2/3) -Cycle 3: Plan → Execute → Review → Git Commit → 98% complete ✓ (3/3) - → Validation requirements met, project complete -``` - -Each cycle produces a completion score (0-100%). The system requires multiple consecutive cycles above the threshold (default: 3 cycles ≥95%) before terminating. - -## Cycle Anatomy - -### Phase 1: Planning - -**Duration**: Typically 2-5 minutes (timeout: 10 minutes) - -**What Happens**: -1. Planner Agent receives project goal and previous cycle outputs -2. Analyzes current state of the codebase -3. Reviews feedback from last Review phase -4. Creates or updates the project plan -5. 
Outputs structured task breakdown - -**Input**: -- Project goal (constant) -- Previous plan (if cycle > 0) -- Last execution result (if cycle > 0) -- Last review feedback (if cycle > 0) - -**Output**: -- Updated project plan with task priorities - -**State Update**: -```json -{ - "status": "planning", - "current_plan": "new_plan_content" -} -``` - -### Phase 2: Execution - -**Duration**: Typically 10-20 minutes (timeout: 30 minutes) - -**What Happens**: -1. Executor Agent reads the current plan -2. Implements the next actionable tasks -3. Writes production-ready code (no placeholders) -4. Creates or updates test files -5. Runs tests to verify functionality -6. Documents changes -7. Outputs summary of work completed - -**Input**: -- Project goal -- Current plan from Planner -- Project directory path - -**Output**: -- Execution summary -- List of files created/modified -- Issues encountered -- Remaining work - -**State Update**: -```json -{ - "status": "executing", - "last_execution_result": "execution_summary" -} -``` - - -The Executor has the longest timeout (30 minutes) because implementation includes writing code, installing dependencies, running tests, and debugging. - - -### Phase 3: Review - -**Duration**: Typically 3-7 minutes (timeout: 10 minutes) - -**What Happens**: -1. Reviewer Agent scans the entire codebase -2. Runs tests and checks functionality -3. Compares implementation against goal -4. Identifies gaps, bugs, or incomplete features -5. Estimates completion percentage (0-100%) -6. Provides detailed feedback - -**Input**: -- Project goal -- Current plan -- Execution result summary -- Validation mode flag (true if previous completion ≥95%) - -**Output**: -- Review summary -- **Completion percentage (0-100%)** -- What's working -- What's incomplete -- Next steps - -**State Update**: -```json -{ - "status": "reviewing", - "last_review": "review_content", - "completion_percentage": 95 -} -``` - -### Phase 4: Git Commit - -**Duration**: Typically <30 seconds - -**What Happens**: -1. Check for file changes with `git status --porcelain` -2. Stage all changes with `git add .` -3. Create commit with descriptive message -4. Push to remote origin (if exists) - -**Commit Message Format**: -``` -Cycle N: XX% complete - -- Summary of changes made in this cycle -- Key features implemented -- Tests added -``` - -**Example**: -``` -Cycle 2: 95% complete - -- Added comprehensive error handling -- Implemented retry logic with exponential backoff -- Expanded test suite (18 tests passing) -- Updated README with usage examples -``` - - -Every cycle creates a Git commit, providing a complete audit trail of the project's development. You can `git log` to see the entire history. 
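-
-A minimal sketch of that commit step, following the git commands listed above (the helper name is illustrative, not Fireteam's actual function):
-
-```python
-import subprocess
-
-def commit_cycle(project_dir, cycle_number, completion_pct, summary):
-    """Stage and commit all changes for one cycle, skipping empty cycles."""
-    changes = subprocess.run(
-        ["git", "status", "--porcelain"],
-        cwd=project_dir, capture_output=True, text=True, check=True,
-    )
-    if not changes.stdout.strip():
-        return  # no file changes this cycle, nothing to commit
-    subprocess.run(["git", "add", "."], cwd=project_dir, check=True)
-    message = f"Cycle {cycle_number}: {completion_pct}% complete\n\n{summary}"
-    subprocess.run(["git", "commit", "-m", message], cwd=project_dir, check=True)
-```
-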
- - -## Cycle Progression - -### Cycle 0 - Initial Build - -The first cycle starts from scratch: - -**Planning**: -- Receives only the project goal (no previous context) -- Creates comprehensive initial plan -- Breaks down all requirements - -**Execution**: -- Creates project structure -- Implements core functionality -- Writes initial tests -- Adds basic documentation - -**Review**: -- Assesses what was built -- Typically estimates 70-90% completion -- Identifies remaining work - -**Typical Result**: 70-90% completion - -### Cycle 1+ - Refinement - -Subsequent cycles build on existing work: - -**Planning**: -- Reviews previous plan, execution, and review -- Focuses on gaps identified by Reviewer -- Adjusts priorities based on feedback - -**Execution**: -- Implements missing features -- Improves error handling -- Expands test coverage -- Polishes documentation - -**Review**: -- Re-evaluates entire codebase -- Updates completion percentage -- May enter validation mode (if ≥95%) - -**Typical Result**: Incremental improvement toward 95%+ - -## Completion Validation System - -The validation system prevents the cycle from terminating prematurely by requiring consistent completion scores across multiple cycles. - -### The Problem - -AI assistants typically stop when they determine they're "done," without objective criteria or validation. There's no mechanism to ensure actual completeness. - -### Validation Approach - -Fireteam requires 3 consecutive review cycles scoring ≥95% (by default) before terminating. This means: - -- Completion threshold is configurable (default: 95%) -- Number of required validations is configurable (default: 3) -- If score drops below threshold during validation, counter resets -- System continues running until validation requirements are met - -### Validation Logic - -```python -def check_completion(completion_pct, validation_checks): - if completion_pct >= 95: - validation_checks += 1 - - if validation_checks >= 3: - return True # Project complete! - else: - validation_checks = 0 # Reset if percentage drops - - return False # Continue cycling -``` - -### Validation States - - - - Reviewer performs standard assessment. Validation counter stays at 0. - - - - Reviewer enters **critical validation mode** with extra scrutiny: - - Checks for edge cases - - Verifies error handling - - Confirms testing completeness - - Validates production readiness - - - - After 3 consecutive validations, project is marked complete and system stops. - - - -### Example Validation Flow - -``` -Cycle 0: 88% → validation_checks = 0 -Cycle 1: 95% → validation_checks = 1 (enters validation mode) -Cycle 2: 93% → validation_checks = 0 (dropped below 95%, reset!) -Cycle 3: 96% → validation_checks = 1 (restart validation) -Cycle 4: 97% → validation_checks = 2 -Cycle 5: 98% → validation_checks = 3 → COMPLETE! ✓ -``` - - -If completion percentage drops below the threshold during validation, the validation counter resets to 0. The system will continue cycling until it achieves the required consecutive validations. - - - -Complex projects may require many cycles to meet validation requirements. We've observed runs exceeding 37 hours with 50+ cycles before achieving three consecutive scores ≥95%. - - -## Validation Mode Details - -When `completion_percentage >= 95%`, the Reviewer receives special instructions: - -``` -VALIDATION MODE: -This is a validation check. The system believes the project is >95% -complete. - -Be CRITICAL and thorough. 
Check for: -- Edge cases that might not be handled -- Missing error handling -- Incomplete features -- Testing gaps -- Production-readiness issues - -Only confirm high completion if truly production-ready. -``` - -This ensures each validation is a **fresh, critical look** rather than rubber-stamping previous assessments. - -## Goal Alignment Checks - -Every 3 cycles, Fireteam performs an automatic **goal alignment check**: - -``` -════════════════════════════════════════════════════════ -GOAL ALIGNMENT CHECK (Cycle 3) -════════════════════════════════════════════════════════ -Original Goal: Build a Python CLI tool that fetches the -current Bitcoin price from CoinGecko API... - -⚠️ Reminder: Ensure all work aligns with original goal! -════════════════════════════════════════════════════════ -``` - -This prevents **scope creep** where agents might add features beyond the original goal. - -## State Persistence - -After each phase, state is saved to `state/current.json`: - -```json -{ - "project_dir": "/path/to/project", - "goal": "Build a Bitcoin price checker...", - "status": "reviewing", - "cycle_number": 2, - "completion_percentage": 95, - "validation_checks": 1, - "git_branch": "agent-20251017-143022", - "current_plan": "# Updated Plan\n...", - "last_execution_result": "Execution Summary...", - "last_review": "COMPLETION: 95%\n...", - "created_at": "2025-10-17T14:30:22Z", - "updated_at": "2025-10-17T15:45:33Z" -} -``` - -This enables: -- **Crash recovery**: Resume from last saved state -- **Progress monitoring**: External tools read state -- **Debugging**: Examine state history - -## Cycle Performance - -Typical performance patterns: - -Most projects complete in 1-3 cycles, with more complex projects occasionally requiring additional iterations for refinement and edge case handling. - -**Average**: 3-4 cycles per project - -### Time Per Cycle - -**Typical cycle duration**: -- **Cycle 0**: 20-30 minutes (initial build) -- **Cycle 1+**: 15-25 minutes (refinement) - -**Total project time**: -- **1-cycle projects**: 20-30 minutes -- **2-3 cycle projects**: 45-90 minutes -- **Average**: ~50 minutes per project - -### Completion Progression - -Typical completion percentage progression: - -``` -Cycle 0: 70-90% (initial implementation) -Cycle 1: 85-95% (refinement) -Cycle 2: 92-98% (polish + validation 1) -Cycle 3: 95-100% (validation 2-3) -``` - - -Well-defined projects with familiar tech stacks (Python, common libraries) tend to complete in fewer cycles than projects requiring environment setup (TypeScript, Node.js). - - -## Cycle Interruption & Recovery - -### Graceful Shutdown - -Use `stop-agent` to gracefully stop between cycles: - -```bash -stop-agent -``` - -This: -1. Sends `SIGTERM` signal to orchestrator -2. Waits for current agent to complete -3. Saves state to disk -4. Commits any pending changes -5. Exits cleanly - - -Never kill the process manually (`kill -9`, `Ctrl+C`). Always use `stop-agent` to preserve state. - - -### Mid-Cycle Interruption - -If an agent times out or fails mid-cycle: - -**Retry Logic**: -1. First attempt fails → wait 5 seconds → retry -2. Second attempt fails → wait 10 seconds → retry -3. Third attempt fails → mark cycle as failed, stop gracefully - -**State Preservation**: -- State is saved before each phase -- Partial work is committed to Git -- Logs capture error details - -### Recovery from Crash - -If the system crashes unexpectedly: - -1. **Check state**: `cat /home/claude/fireteam/state/current.json` -2. **Check Git**: All committed work is preserved -3. 
**Restart**: Run `start-agent` with same goal -4. **State recovery**: System loads last saved state and resumes - -## Advanced Cycle Concepts - -### Parse Failure Handling - -If the Reviewer's output can't be parsed (missing "COMPLETION: XX%" marker): - -**Old Behavior** (problematic): -- Default to 0% completion -- Reset validation progress incorrectly - -**New Behavior** (improved): -- Use **last known completion percentage** -- Log warning but don't crash -- Allow system to self-correct in next cycle - -```python -# StateManager handles parse failures gracefully -if parsed_completion == 0: - logger.warning("Parse failure detected, using last known completion") - return last_known_completion # Prevents regression -``` - -### Completion Percentage Regression - -What if completion percentage drops? - -**Example**: -``` -Cycle 0: 93% -Cycle 1: 96% -Cycle 2: 92% ← Dropped! -``` - -**System Response**: -- Reset validation counter to 0 -- Log warning about regression -- Continue cycling normally -- Agents should investigate why quality dropped - -This can happen if: -- Tests break due to refactoring -- New features introduce bugs -- Reviewer becomes more critical - -### Infinite Loop Protection - -**Question**: What if the project never reaches 95%? - -**Answer**: Manual intervention required. - -**Monitoring**: -```bash -fireteam-status --watch -``` - -If stuck at 85-90% for 5+ cycles, consider: -1. Checking logs for blockers -2. Reviewing the goal (too ambitious?) -3. Stopping and examining code manually -4. Adjusting the goal for next run - -## Configuration Options - -All cycle-related settings in `config.py`: - -```python -# Completion thresholds -COMPLETION_THRESHOLD = 95 # Percentage to trigger validation -VALIDATION_CHECKS_REQUIRED = 3 # Consecutive checks needed - -# Agent timeouts (in seconds) -AGENT_TIMEOUTS = { - "planner": 600, # 10 minutes - "executor": 1800, # 30 minutes - "reviewer": 600 # 10 minutes -} - -# Retry logic -MAX_RETRIES = 3 -RETRY_DELAY = 5 # seconds (with exponential backoff) -``` - -## Monitoring Cycles - -### Real-Time Monitoring - -```bash -fireteam-status --watch -``` - -Output: -``` -╔══════════════════════════════════════════════════════════════╗ -║ 🔥 FIRETEAM STATUS 🔥 ║ -╚══════════════════════════════════════════════════════════════╝ - -Status: ⚡ RUNNING - -Current Progress: - Cycle: 2 - Phase: Reviewing - Completion: 95% - Validation: 1/3 checks - -Last Updated: 2025-10-17 14:35:42 -``` - -### Log Examination - -View detailed cycle logs: - -```bash -tail -f /home/claude/fireteam/logs/orchestrator_*.log -``` - -Look for: -- Phase transitions -- Completion percentages -- Validation check counts -- Error messages -- Git commit confirmations - -## Best Practices - -### For Faster Cycles - -✅ **Write specific goals**: Clear requirements → faster implementation - -✅ **Choose familiar tech stacks**: Python projects complete faster than TypeScript - -✅ **Start medium-sized**: 100-300 line projects are the sweet spot - -### For Better Quality - -✅ **Trust validation**: Let the triple-check system work - -✅ **Monitor validation mode**: Pay attention when completion ≥95% - -✅ **Review Git history**: Check commits to understand progression - -### For Debugging - -✅ **Watch logs**: Use `fireteam-status --follow` for real-time logs - -✅ **Check state**: Examine `state/current.json` for current status - -✅ **Review commits**: `git log` shows work completed each cycle - -## Next Steps - - - - Learn what each agent does in each cycle phase - - - - Optimize cycle timing 
for your projects - - - - Track cycles in real-time - - - - Understand state persistence and recovery - - diff --git a/docs/installation/environment.mdx b/docs/installation/environment.mdx deleted file mode 100644 index ad8c8c3..0000000 --- a/docs/installation/environment.mdx +++ /dev/null @@ -1,402 +0,0 @@ ---- -title: "Environment Variables" -description: "Configure Fireteam environment variables for optimal operation" ---- - -## Overview - -Fireteam uses environment variables for configuration that may vary between installations or contain sensitive information. These variables are stored in a `.env` file that is automatically loaded by the system. - -## .env File Location - -The `.env` file should be located in the Fireteam installation directory: - -``` -/home/claude/fireteam/.env -``` - - -The `.env` file is automatically ignored by Git (in `.gitignore`). Never commit this file to version control as it may contain passwords. - - -## Creating Your .env File - -### Step 1: Copy the Example - -Fireteam includes an example file: - -```bash -cd /home/claude/fireteam -cp .env.example .env -``` - -### Step 2: Edit Variables - -Open `.env` in your text editor: - -```bash -nano .env -# or -vim .env -# or -code .env -``` - -## Environment Variables Reference - -### SUDO_PASSWORD - -**Purpose**: Password for sudo operations - -**Required**: No (optional) - -**Description**: Some operations require system-level access (installing Node.js, system packages, etc.). If your system doesn't have passwordless sudo configured, set this variable. - -**Example**: -```bash -SUDO_PASSWORD=your_password_here -``` - -**Security Notes**: -- ✅ File is gitignored (won't be committed) -- ✅ Only readable by your user (set permissions: `chmod 600 .env`) -- ⚠️ Plain text - consider passwordless sudo instead -- ⚠️ Only use on development/personal machines - -**When You Need This**: -- TypeScript/Node.js projects (may need Node.js installation) -- Projects requiring system package installation -- Docker or containerization projects - -**When You Don't Need This**: -- Passwordless sudo is configured -- Only building Python projects (pip doesn't need sudo) -- Running in isolated/containerized environment - - -To configure passwordless sudo instead of using a password, see the [Sudo Setup Guide](/configuration/sudo-setup). - - -### GIT_USER_NAME - -**Purpose**: Name for Git commits - -**Required**: No (has default) - -**Default**: `"fireteam"` (from `config.py`) - -**Description**: The name that appears in Git commit author fields. - -**Example**: -```bash -GIT_USER_NAME=Jane Developer -``` - -**Overrides**: `config.GIT_USER_NAME` - -**Where It Appears**: -```bash -git log -# Author: Jane Developer -``` - -### GIT_USER_EMAIL - -**Purpose**: Email for Git commits - -**Required**: No (has default) - -**Default**: `"fireteam@darkresearch.ai"` (from `config.py`) - -**Description**: The email that appears in Git commit author fields. - -**Example**: -```bash -GIT_USER_EMAIL=jane@example.com -``` - -**Overrides**: `config.GIT_USER_EMAIL` - - -If you plan to push Fireteam-created projects to GitHub or other services, use your actual Git credentials. 
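As a rough illustration of how these overrides take effect, the sketch below applies the `.env` values (falling back to the `config.py` defaults) to a single project repository rather than your global Git config. `configure_git_identity` is a hypothetical helper shown for illustration, not a function Fireteam exposes.

```python
import os
import subprocess

# Defaults mirror config.py; values from .env take precedence when set.
GIT_USER_NAME = os.getenv("GIT_USER_NAME", "fireteam")
GIT_USER_EMAIL = os.getenv("GIT_USER_EMAIL", "fireteam@darkresearch.ai")

def configure_git_identity(project_dir: str) -> None:
    """Set the commit author for one project repository only (no --global)."""
    for key, value in (("user.name", GIT_USER_NAME), ("user.email", GIT_USER_EMAIL)):
        subprocess.run(["git", "config", key, value], cwd=project_dir, check=True)
```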
- - -## Complete .env Example - -Here's a fully configured example: - -```bash -# Fireteam Environment Variables - -# ============================================ -# Sudo Configuration -# ============================================ -# Password for sudo operations (system package installation) -# Leave empty or comment out if using passwordless sudo -SUDO_PASSWORD=your_secure_password - -# ============================================ -# Git Configuration -# ============================================ -# Git user name for commits -# Appears in: git log, GitHub, etc. -GIT_USER_NAME=Your Name - -# Git user email for commits -# Should match your GitHub/GitLab account if pushing -GIT_USER_EMAIL=your.email@example.com -``` - -## Environment Variable Loading - -### How Loading Works - -Environment variables are loaded by `config.py` using `python-dotenv`: - -```python -# config.py -from dotenv import load_dotenv -import os - -# Load .env file if it exists -env_file = Path(__file__).parent / ".env" -if env_file.exists(): - load_dotenv(env_file) - -# Access variables -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) -GIT_USER_NAME = os.getenv("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL = os.getenv("GIT_USER_EMAIL", "fireteam@darkresearch.ai") -``` - -### Priority Order - -1. **Environment variables** (`.env` file) -2. **Config.py defaults** (fallback values) -3. **System environment** (if set before running) - -## Validation - -### Check if Variables are Loaded - -Test that your `.env` file is being read: - -```bash -cd /home/claude/fireteam -python3 -c "import config; print(f'Git User: {config.GIT_USER_NAME}'); print(f'Git Email: {config.GIT_USER_EMAIL}'); print(f'Sudo configured: {config.has_sudo_access()}')" -``` - -Expected output: -``` -Git User: Your Name -Git Email: your.email@example.com -Sudo configured: True -``` - -### Verify Sudo Password Works - -If you set `SUDO_PASSWORD`, verify it works: - -```bash -# This should succeed without prompting for password -echo $SUDO_PASSWORD | sudo -S echo "Sudo works!" -``` - - -Be careful running commands that expose your password in command history. Consider passwordless sudo for better security. - - -## Security Best Practices - -### File Permissions - -Restrict `.env` file access to your user only: - -```bash -chmod 600 /home/claude/fireteam/.env -``` - -This prevents other users from reading your passwords. - -### Verify Gitignore - -Confirm `.env` is in `.gitignore`: - -```bash -grep "^\.env$" /home/claude/fireteam/.gitignore -``` - -Should output: `.env` - -### Check Git Status - -Ensure `.env` is not tracked: - -```bash -cd /home/claude/fireteam -git status -``` - -`.env` should NOT appear in the output. - -### Backup Securely - -If backing up `.env`: -- ✅ Use encrypted storage -- ✅ Don't email or message it -- ✅ Don't upload to cloud storage unencrypted -- ❌ Don't commit to version control - -## Troubleshooting - -### Variables Not Loading - -**Problem**: Changed `.env` but values aren't updating - -**Solution**: -1. Verify `.env` file location: `/home/claude/fireteam/.env` -2. Check file syntax (no spaces around `=`) -3. Restart any running Fireteam processes: `stop-agent` -4. Test loading: `python3 -c "import config; print(config.GIT_USER_NAME)"` - -### Sudo Password Not Working - -**Problem**: `SUDO_PASSWORD` set but agents still fail with permission errors - -**Symptoms**: -``` -[sudo] password for user: -Error: sudo operation failed -``` - -**Solutions**: - -1. 
**Verify password is correct**: - ```bash - echo $SUDO_PASSWORD | sudo -S whoami - ``` - -2. **Check .env syntax**: - ```bash - # Correct - SUDO_PASSWORD=mypassword - - # Incorrect (no quotes, no spaces) - SUDO_PASSWORD = "mypassword" - ``` - -3. **Test sudo access**: - ```bash - cd /home/claude/fireteam - python3 -c "import config; print('Has sudo:', config.has_sudo_access())" - ``` - -### Git Commits Show Wrong Author - -**Problem**: Git commits show "fireteam" instead of your name - -**Cause**: `GIT_USER_NAME` and `GIT_USER_EMAIL` not set in `.env` - -**Solution**: -```bash -echo 'GIT_USER_NAME=Your Name' >> .env -echo 'GIT_USER_EMAIL=your@email.com' >> .env - -# Or configure globally for all git repos -git config --global user.name "Your Name" -git config --global user.email "your@email.com" -``` - - -`.env` values take precedence over global Git config for Fireteam projects. - - -## Advanced Configuration - -### Using System Environment Variables - -Instead of `.env`, you can set variables in your shell: - -```bash -# In ~/.bashrc or ~/.zshrc -export SUDO_PASSWORD=your_password -export GIT_USER_NAME="Your Name" -export GIT_USER_EMAIL="your@email.com" -``` - -Then reload: -```bash -source ~/.bashrc -``` - -**Pros**: -- Available to all processes -- No file to manage - -**Cons**: -- Visible in process listings -- Persists in shell history -- Less secure than file permissions - -### Per-Project Configuration - -While `.env` is global for Fireteam, Git settings can be per-project: - -```bash -cd /path/to/your/project - -# Set Git config for this project only -git config user.name "Different Name" -git config user.email "different@email.com" -``` - -Fireteam will respect these project-specific settings when making commits. - -### Environment-Specific .env Files - -For different environments (development, production): - -```bash -# Development -cp .env.example .env.dev -# Edit .env.dev - -# Production -cp .env.example .env.prod -# Edit .env.prod - -# Load specific file -ln -sf .env.dev .env # Use dev config -# or -ln -sf .env.prod .env # Use prod config -``` - -## Default Values - -If `.env` is not created or variables are not set: - -| Variable | Default Value | Source | -|----------|---------------|--------| -| `SUDO_PASSWORD` | `None` | config.py | -| `GIT_USER_NAME` | `"fireteam"` | config.py | -| `GIT_USER_EMAIL` | `"fireteam@darkresearch.ai"` | config.py | - -## Next Steps - - - - Configure passwordless sudo (more secure alternative) - - - - Learn about config.py settings - - - - Back to installation guide - - - - Start building projects - - diff --git a/docs/installation/installation.mdx b/docs/installation/installation.mdx deleted file mode 100644 index baa84be..0000000 --- a/docs/installation/installation.mdx +++ /dev/null @@ -1,439 +0,0 @@ ---- -title: "Installation" -description: "Complete installation guide for Fireteam autonomous multi-agent system" ---- - -## Prerequisites - -Before installing Fireteam, ensure your system meets these requirements: - - - - **Required**: Python 3.12 or higher - - Check your version: - ```bash - python3 --version - ``` - - If you need to install or upgrade Python: - - **Ubuntu/Debian**: `sudo apt update && sudo apt install python3.12` - - **macOS (Homebrew)**: `brew install python@3.12` - - **From source**: [python.org/downloads](https://www.python.org/downloads/) - - - - **Required**: Git for version control - - Check if installed: - ```bash - git --version - ``` - - Install if needed: - - **Ubuntu/Debian**: `sudo apt install git` - - **macOS**: 
`brew install git` or use Xcode Command Line Tools - - **Configuration**: - ```bash - git config --global user.name "Your Name" - git config --global user.email "your.email@example.com" - ``` - - - - **Required**: Claude CLI for agent execution - - Install following the [official guide](https://docs.claude.com/en/docs/claude-code/installation) - - Verify installation: - ```bash - claude --version - ``` - - The Claude CLI must be accessible in your PATH. - - - - **Tested on**: - - Ubuntu 20.04+ - - Debian 11+ - - macOS 12+ - - **Should work on**: - - Other Linux distributions - - WSL2 on Windows (untested) - - - -## Installation Steps - -### Step 1: Clone the Repository - -```bash -# Clone Fireteam -git clone https://github.com/darkresearch/fireteam.git - -# Navigate to directory -cd fireteam -``` - - -You can clone to any directory. The default location used in examples is `/home/claude/fireteam`. - - -### Step 2: Run the Setup Script - -Fireteam includes an automated setup script that handles installation: - -```bash -bash setup.sh -``` - -The setup script performs these actions: - - - - Creates `~/.local/bin` if it doesn't exist - - - - Creates symlinks in `~/.local/bin` for: - - `start-agent` - - `stop-agent` - - `agent-progress` - - `fireteam-status` - - - - Sets execute permissions on all CLI scripts and orchestrator - - - - Creates `logs/` and `state/` directories - - - - Checks for Python 3, Git, and Claude CLI - - - - Adds `~/.local/bin` to your PATH in `~/.bashrc` (if not already present) - - - -**Expected Output**: -``` -========================================== -Fireteam Setup -========================================== - -Adding /home/user/.local/bin to PATH... -Creating CLI command symlinks... -Python 3: Python 3.12.0 -Claude CLI: Claude Code v0.x.x -Git: git version 2.34.1 - -========================================== -Setup Complete! -========================================== - -Available commands: - start-agent --project-dir --prompt "" - stop-agent - agent-progress - -Example: - start-agent --project-dir /home/user/my-project --prompt "Build a Python calculator" - -Note: You may need to restart your shell or run: - source ~/.bashrc -``` - -### Step 3: Update Your Shell - -The setup script adds `~/.local/bin` to your PATH. Reload your shell configuration: - -**For Bash (most Linux)**: -```bash -source ~/.bashrc -``` - -**For Zsh (macOS default)**: -```bash -# Add to ~/.zshrc if not done automatically -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshrc -source ~/.zshrc -``` - -**Alternative**: Close and reopen your terminal. - -### Step 4: Verify Installation - -Check that all CLI tools are accessible: - -```bash -# Check PATH -echo $PATH | grep .local/bin - -# Verify commands exist -which start-agent -which stop-agent -which fireteam-status - -# Test status command -fireteam-status -``` - -Expected output: -``` -/home/user/.local/bin/start-agent -/home/user/.local/bin/stop-agent -/home/user/.local/bin/fireteam-status - -No active Fireteam session found. -``` - - -If you see "command not found", your PATH isn't configured correctly. See [Troubleshooting](#troubleshooting) below. 
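If you prefer to script the verification instead of running the commands one by one, a small check like the one below covers the same ground. It assumes the default command names created by `setup.sh`; the script itself is illustrative and not shipped with Fireteam.

```python
import shutil
import sys

def verify_install() -> bool:
    """Check that the Fireteam CLI symlinks and core prerequisites are on PATH."""
    ok = True
    for tool in ("start-agent", "stop-agent", "fireteam-status", "git", "claude"):
        path = shutil.which(tool)
        print(f"{'✓' if path else '✗'} {tool}: {path or 'not found'}")
        ok = ok and path is not None
    if sys.version_info < (3, 12):
        print(f"✗ Python {sys.version_info.major}.{sys.version_info.minor} found, need 3.12+")
        ok = False
    return ok

if __name__ == "__main__":
    sys.exit(0 if verify_install() else 1)
```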
- - -## Directory Structure - -After installation, Fireteam's structure looks like this: - -``` -/home/claude/fireteam/ -├── agents/ -│ ├── __init__.py -│ ├── base.py # Base agent class -│ ├── planner.py # Planner agent -│ ├── executor.py # Executor agent -│ └── reviewer.py # Reviewer agent -├── state/ -│ ├── manager.py # State management -│ └── current.json # Active state (created on first run) -├── cli/ -│ ├── start-agent # Start command -│ ├── stop-agent # Stop command -│ ├── agent-progress # Progress command (legacy) -│ └── fireteam-status # Status command (recommended) -├── logs/ # Log directory (created by setup) -│ └── orchestrator_*.log -├── orchestrator.py # Main orchestration logic -├── config.py # Configuration settings -├── setup.sh # Installation script -├── .env.example # Example environment file -├── .env # Your environment file (create this) -├── .gitignore -└── README.md -``` - -## Environment Configuration - -### Create .env File - -Copy the example environment file: - -```bash -cd /home/claude/fireteam -cp .env.example .env -nano .env # or use your preferred editor -``` - -### Configure Variables - -Edit `.env` with your settings: - -```bash -# Sudo password for system-level operations -# Used when agents need to install system packages -SUDO_PASSWORD=your_password_here - -# Git configuration (optional) -# Overrides default values in config.py -GIT_USER_NAME=Your Name -GIT_USER_EMAIL=your.email@example.com -``` - - -**Security**: Never commit `.env` to version control. It's already in `.gitignore`, but be cautious. - - - -If you have passwordless sudo configured, you can leave `SUDO_PASSWORD` empty or commented out. - - -See [Environment Variables](/installation/environment) for detailed configuration options. - -## Troubleshooting - -### "command not found: start-agent" - -**Problem**: CLI tools not in PATH - -**Solution 1** - Manual PATH update: -```bash -export PATH="$HOME/.local/bin:$PATH" -``` - -**Solution 2** - Add to shell config permanently: -```bash -# For Bash -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc - -# For Zsh (macOS) -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshrc -source ~/.zshrc -``` - -**Solution 3** - Verify symlinks exist: -```bash -ls -la ~/.local/bin/ | grep agent -``` - -If symlinks are missing, re-run `bash setup.sh`. - -### "Claude CLI not found" - -**Problem**: Claude CLI not installed or not in PATH - -**Solution**: -1. Install Claude CLI following [official docs](https://docs.claude.com/en/docs/claude-code/installation) -2. Verify it's in PATH: `which claude` -3. 
Test it works: `claude --version` - -### "Python 3.12+ required" - -**Problem**: Python version too old - -**Check version**: -```bash -python3 --version -``` - -**Solution** - Install Python 3.12+: -```bash -# Ubuntu/Debian -sudo apt update -sudo apt install python3.12 - -# macOS -brew install python@3.12 -``` - -### Permission Errors - -**Problem**: `Permission denied` when running CLI tools - -**Solution**: -```bash -# Make scripts executable -chmod +x ~/.local/bin/{start-agent,stop-agent,fireteam-status} - -# Or re-run setup -bash setup.sh -``` - -### Git Configuration Issues - -**Problem**: Git commits fail with "Please tell me who you are" - -**Solution**: -```bash -git config --global user.name "Your Name" -git config --global user.email "your.email@example.com" - -# Or set in .env file -echo 'GIT_USER_NAME=Your Name' >> .env -echo 'GIT_USER_EMAIL=your.email@example.com' >> .env -``` - -## Uninstallation - -To remove Fireteam: - -```bash -# Remove CLI tools -rm ~/.local/bin/start-agent -rm ~/.local/bin/stop-agent -rm ~/.local/bin/agent-progress -rm ~/.local/bin/fireteam-status - -# Remove Fireteam directory -rm -rf /home/claude/fireteam - -# Remove PATH entry from .bashrc (manual edit) -# Open ~/.bashrc and remove the line: -# export PATH="$HOME/.local/bin:$PATH" -``` - -## Updating Fireteam - -To update to the latest version: - -```bash -cd /home/claude/fireteam - -# Pull latest changes -git pull origin main - -# Re-run setup -bash setup.sh - -# Reload shell -source ~/.bashrc -``` - - -Updates will preserve your `.env` file and existing project data. - - -## Next Steps - - - - Configure environment variables and Git settings - - - - Detailed dependency information - - - - Build your first project - - - - Customize timeouts and thresholds - - - -## System Requirements - -### Minimum Requirements - -- **CPU**: 1 core (2+ recommended) -- **RAM**: 512MB available -- **Disk**: 100MB for Fireteam + space for projects -- **Network**: Internet connection for Claude API - -### Recommended Requirements - -- **CPU**: 2+ cores -- **RAM**: 2GB+ available -- **Disk**: 1GB+ for logs and multiple projects -- **OS**: Ubuntu 20.04+ or macOS 12+ - -### Tested Configurations - -| OS | Python | Git | Claude CLI | Status | -|----|--------|-----|------------|--------| -| Ubuntu 22.04 | 3.12 | 2.34 | 0.x.x | ✅ Fully tested | -| Ubuntu 20.04 | 3.12 | 2.25 | 0.x.x | ✅ Works | -| macOS 13 | 3.12 | 2.39 | 0.x.x | ✅ Works | -| Debian 11 | 3.11 | 2.30 | 0.x.x | ⚠️ Python 3.12+ required | - -## Installation Complete! - -You're now ready to start building autonomous projects with Fireteam. Head over to the [Quickstart Guide](/quickstart) to create your first project. diff --git a/docs/installation/requirements.mdx b/docs/installation/requirements.mdx deleted file mode 100644 index 1815517..0000000 --- a/docs/installation/requirements.mdx +++ /dev/null @@ -1,335 +0,0 @@ ---- -title: "System Requirements" -description: "Prerequisites and dependencies needed to run Fireteam successfully" ---- - -## Overview - -Fireteam has minimal system requirements but does need specific tools installed and configured properly. This guide covers all prerequisites for running Fireteam successfully. - -## Required Software - -### Python 3.12+ - -Fireteam requires **Python 3.12 or higher** for optimal performance and compatibility. - - -Python 3.12+ includes important performance improvements and type system enhancements that Fireteam relies on. 
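If you call Fireteam's modules from your own scripts, you can fail fast on an unsupported interpreter with a guard like this (a minimal sketch, not part of Fireteam itself):

```python
import sys

MIN_PYTHON = (3, 12)

if sys.version_info < MIN_PYTHON:
    raise SystemExit(
        f"Fireteam requires Python {MIN_PYTHON[0]}.{MIN_PYTHON[1]}+, "
        f"found {sys.version_info.major}.{sys.version_info.minor}"
    )
```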
- - -**Check your Python version:** - -```bash -python3 --version -``` - -**Installing Python 3.12+:** - - - -```bash Ubuntu/Debian -sudo apt update -sudo apt install python3.12 python3.12-venv python3-pip -``` - -```bash macOS (Homebrew) -brew install python@3.12 -``` - -```bash From Source -# Download from python.org -wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz -tar -xzf Python-3.12.0.tgz -cd Python-3.12.0 -./configure --enable-optimizations -make -j$(nproc) -sudo make altinstall -``` - - - -### Git - -Git is required for Fireteam's version control integration. Every project is automatically managed as a git repository with automatic commits and branch management. - -**Check Git installation:** - -```bash -git --version -``` - -**Installing Git:** - - - -```bash Ubuntu/Debian -sudo apt update -sudo apt install git -``` - -```bash macOS -# Git comes with Xcode Command Line Tools -xcode-select --install - -# Or via Homebrew -brew install git -``` - -```bash CentOS/RHEL -sudo yum install git -``` - - - -**Configure Git:** - -```bash -git config --global user.name "Your Name" -git config --global user.email "your.email@example.com" -``` - - -These git credentials will be used by default, but you can override them using environment variables in the `.env` file. - - -### Claude CLI - -The Claude Command Line Interface is the **core requirement** for Fireteam. All agents communicate with Claude AI through the CLI. - -**Check Claude CLI installation:** - -```bash -claude --version -``` - -**Installing Claude CLI:** - -Follow the [official Claude CLI installation guide](https://docs.claude.com/en/docs/claude-code/installation). - -**Quick installation:** - -```bash -# Download and install (example - check official docs for latest) -curl -fsSL https://claude.ai/install.sh | sh -``` - -**Authentication:** - -After installation, authenticate with your Anthropic API key: - -```bash -claude auth -``` - - -Fireteam requires an authenticated Claude CLI to function. Without proper authentication, all agent operations will fail. - - -## Optional Dependencies - -### Node.js (for TypeScript/JavaScript projects) - -While not required for Fireteam itself, Node.js is needed if you're building TypeScript or JavaScript projects. - -**Recommended: Node.js 18+ LTS** - -```bash -# Check Node.js version -node --version - -# Install via nvm (recommended) -curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash -nvm install 18 -nvm use 18 -``` - - -Having Node.js pre-installed significantly reduces setup time when building TypeScript projects. - - -### Passwordless Sudo (Recommended) - -For projects requiring system-level package installation, passwordless sudo access significantly improves efficiency. - -**Configure passwordless sudo:** - -```bash -sudo visudo -``` - -Add this line (replace `username` with your username): - -``` -username ALL=(ALL) NOPASSWD: ALL -``` - -**Alternative:** Use `SUDO_PASSWORD` in `.env` file (see [Sudo Setup](/configuration/sudo-setup)). - -## Python Dependencies - -Fireteam manages its own Python dependencies automatically during setup. Key dependencies include: - -- `python-dotenv`: Environment variable management -- `subprocess` (stdlib): Process execution -- `pathlib` (stdlib): Path handling -- Standard library modules for state management and logging - -**No manual pip installation needed** - the `setup.sh` script handles all dependencies. 
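Before starting a TypeScript goal, you can confirm that the optional Node.js toolchain described above is already in place with a quick check like this (illustrative only, assuming the recommended Node.js 18+):

```python
import shutil
import subprocess

def node_toolchain_ready(min_major: int = 18) -> bool:
    """Return True if node and npm are installed and node meets the recommended major version."""
    if not (shutil.which("node") and shutil.which("npm")):
        return False
    version = subprocess.run(["node", "--version"], capture_output=True, text=True).stdout
    # `node --version` prints something like "v18.19.0"
    return int(version.lstrip("v").split(".")[0]) >= min_major

if __name__ == "__main__":
    print("Node.js ready" if node_toolchain_ready()
          else "Node.js missing - TypeScript projects will need extra setup time")
```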
- -## System Resources - -### Minimum Requirements - -- **CPU**: 2 cores (4+ recommended for faster cycles) -- **RAM**: 4GB (8GB+ recommended) -- **Disk**: 1GB free space for Fireteam + space for your projects -- **Network**: Internet connection for Claude API calls - -### Recommended Resources - -- **CPU**: 4+ cores for parallel operations -- **RAM**: 16GB for large codebases -- **Disk**: SSD with 10GB+ free space -- **Network**: Stable broadband connection (Claude API requires consistent connectivity) - - -Average project completion takes ~50 minutes and makes 15-30 API calls to Claude. Plan for adequate network bandwidth. - - -## Platform Support - -### Supported Operating Systems - -✅ **Linux** (Ubuntu 20.04+, Debian 11+, CentOS 8+, Arch Linux) -✅ **macOS** (11.0 Big Sur or later) -⚠️ **Windows** (via WSL2 recommended, native support experimental) - - -Windows native support is experimental. For best results on Windows, use **WSL2** (Windows Subsystem for Linux 2) with Ubuntu. - - -### Shell Requirements - -Fireteam CLI tools require a POSIX-compatible shell: - -- ✅ bash (recommended) -- ✅ zsh -- ✅ fish (with bash compatibility) -- ⚠️ sh (limited features) - -## Verification Checklist - -Before installing Fireteam, verify all requirements: - - - -- [ ] Python 3.12+ installed and accessible via `python3` -- [ ] Git installed and configured with username/email -- [ ] Claude CLI installed and authenticated -- [ ] Internet connection available -- [ ] At least 4GB RAM available -- [ ] 1GB+ free disk space -- [ ] POSIX-compatible shell (bash/zsh) -- [ ] (Optional) Node.js 18+ for TypeScript/JS projects -- [ ] (Optional) Passwordless sudo or SUDO_PASSWORD ready - - - -## Testing Your Environment - -Run these commands to verify your environment is ready: - -```bash -# Check Python -python3 --version -python3 -c "import sys; print('✓ Python OK' if sys.version_info >= (3, 12) else '✗ Upgrade to 3.12+')" - -# Check Git -git --version && echo "✓ Git OK" - -# Check Claude CLI -claude --version && echo "✓ Claude CLI OK" - -# Check Claude authentication -claude auth status && echo "✓ Claude authenticated" - -# Check disk space -df -h . | awk 'NR==2 {print "Free space: " $4}' - -# Optional: Check Node.js -node --version 2>/dev/null && echo "✓ Node.js OK" || echo "⚠ Node.js not installed (optional)" -``` - - -If all checks pass, you're ready to proceed with [Fireteam installation](/installation/installation)! - - -## Troubleshooting - -### Python Version Issues - -**Problem:** `python3 --version` shows Python 3.11 or lower - -**Solution:** Install Python 3.12+ and ensure it's the default: - -```bash -# Add to ~/.bashrc or ~/.zshrc -alias python3='/usr/bin/python3.12' -``` - -### Claude CLI Not Found - -**Problem:** `claude: command not found` - -**Solution:** Ensure Claude CLI is in your PATH: - -```bash -# Check if installed -which claude - -# If not in PATH, add to ~/.bashrc or ~/.zshrc -export PATH="$HOME/.local/bin:$PATH" -source ~/.bashrc -``` - -### Git Not Configured - -**Problem:** Git operations fail with identity errors - -**Solution:** Configure git user and email: - -```bash -git config --global user.name "Your Name" -git config --global user.email "you@example.com" -``` - -### Insufficient Permissions - -**Problem:** Permission denied errors during project operations - -**Solution:** Either: -1. Set up passwordless sudo (recommended) -2. Add `SUDO_PASSWORD` to `.env` file -3. 
Run projects in directories where you have write permissions - -## Next Steps - - - - - Complete the Fireteam installation with setup.sh - - - - Configure your .env file for optimal operation - - - - Customize Fireteam behavior via config.py - - - - Build your first project with Fireteam - - - diff --git a/docs/introduction.mdx b/docs/introduction.mdx index ba1cae8..60a391b 100644 --- a/docs/introduction.mdx +++ b/docs/introduction.mdx @@ -1,186 +1,75 @@ --- -title: "Introduction" -description: "A lightweight multi-agent wrapper around Claude that enables perpetual execution with objective completion criteria" +title: Introduction +description: Multi-phase autonomous task execution with Claude --- -## What is Fireteam? +# Fireteam -Fireteam is a lightweight wrapper around Claude that enables perpetual execution. It's a multi-agent orchestration system we built internally and decided to open-source. - -**The problem:** Claude (and other AI assistants) stop when they decide they're "done" - often prematurely. You can't control when they stop or enforce objective completion criteria. - -**Our solution:** Fireteam orchestrates four specialized Claude instances in a loop with an objective review system: - -- **Planner**: Analyzes the codebase and creates/updates project plans -- **Executor**: Implements code based on the plan -- **Reviewer**: Scores completion percentage (0-100%) against the original goal -- **Orchestrator**: Manages the cycle and enforces completion criteria - -The system runs in an infinite loop until it achieves 95%+ completion three consecutive times. This validation requirement prevents premature stopping and enables runs lasting hours, days, or longer. - - -**Why "Fireteam"?** In military terminology, a fireteam is the smallest unit - typically four people. This reflects our minimal multi-agent architecture: four Claude instances working together. - - -## How It Works - -``` -Orchestrator (Infinite Loop) - ↓ -[Plan] → [Execute] → [Review] → [Git Commit] - ↑___________________________________| -``` - -1. **Planning Phase**: Analyzes the goal and creates actionable tasks -2. **Execution Phase**: Implements code based on the plan -3. **Review Phase**: Scores completion (0-100%) based on the original goal -4. **Git Commit**: Commits all changes -5. **Loop**: Repeat until validation criteria met (default: 3 consecutive cycles ≥95%) - -The loop continues indefinitely until the completion threshold is met consistently. With default settings, this means three consecutive reviews scoring ≥95%. +Fireteam is a Python library for adaptive task execution using Claude Agent SDK. It automatically estimates task complexity and selects the appropriate execution strategy. ## Key Features - - Runs until completion criteria are met, regardless of duration - - - Requires consistent completion scores across multiple cycles + + Automatically estimates task complexity and selects the best execution mode - - Completion criteria are configurable, not subjective + + Plan, execute, and review phases for complex tasks - - Every cycle creates a commit with progress tracking + + Built-in hooks for running tests and enforcing quality - - Reviewer scores against original goal (0-100%) - - - Clean state separation between projects + + One function to execute any task: `execute()` -## What Can Fireteam Build? 
- -Fireteam works on projects that Claude can build, but with the ability to iterate over multiple cycles: - -### Python Projects -- **CLI Tools**: Command-line applications with argument parsing, file I/O -- **APIs**: REST APIs with FastAPI, Flask, including authentication and validation -- **Web Scrapers**: BeautifulSoup-based scrapers with error handling -- **Data Tools**: CSV/JSON analyzers, data processors with pandas -- **Database Apps**: SQLite CRUD applications with proper schema design - -### TypeScript/Node.js Projects -- **CLI Tools**: TypeScript command-line applications -- **API Integrations**: GitHub, cryptocurrency, weather, and other API clients -- **Testing Frameworks**: Comprehensive test suites with proper coverage - - -TypeScript projects may require additional setup time for Node.js installation. Python projects typically complete faster due to pre-installed runtime. - - -## Example Projects - -### Bitcoin Price Checker -A CLI tool that fetches current Bitcoin price from CoinGecko API with: -- API key management -- Rate limiting and retry logic -- Error handling -- Formatted console output - -### REST API Server -A complete FastAPI note-taking server with: -- Full CRUD operations -- Request validation -- Error handling -- Database integration - -### Task Manager CLI -A SQLite-based task management system with: -- Database schema design -- CRUD operations -- Command-line interface -- Data persistence - -## Key Characteristics - -**Perpetual execution:** Runs in an infinite loop until completion criteria are met. We've had runs exceeding 37 hours. - -**Configurable stopping criteria:** You set the completion threshold (default: 95%) and validation requirements (default: 3 consecutive passes). The system enforces these objectively. +## How It Works -**Cycle-based validation:** Each cycle produces a completion score (0-100%). Single high scores aren't sufficient - the system requires consistent quality across multiple cycles. +When you call `execute()`, Fireteam: -**Git integration:** Every cycle creates a commit, providing a complete history of the development process. +1. **Estimates complexity** - Analyzes the goal to determine if it's trivial, simple, moderate, or complex +2. **Selects execution mode** - Maps complexity to the appropriate execution strategy +3. **Executes the task** - Runs the appropriate phases (plan, execute, review) +4. **Returns results** - Provides success status, output, and completion percentage -**No placeholder code:** The Reviewer scoring penalizes incomplete implementations, so the system tends to produce working code rather than stubs. +```python +from fireteam import execute -## Use Cases +result = await execute( + project_dir="/path/to/project", + goal="Fix the authentication bug", +) - - - Build MVPs and proof-of-concepts in hours instead of days. Perfect for validating ideas quickly. - +print(f"Success: {result.success}") +print(f"Completion: {result.completion_percentage}%") +``` - - Generate complete, working examples to learn new technologies. Study the code Fireteam produces. - +## Execution Modes - - Create CLI tools, web scrapers, and automation scripts without writing boilerplate. - +| Complexity | Mode | Phases | +|------------|------|--------| +| TRIVIAL | SINGLE_TURN | Direct execution | +| SIMPLE | SIMPLE | Execute only | +| MODERATE | MODERATE | Execute + review | +| COMPLEX | FULL | Plan + execute + validation reviews | - - Build type-safe API integrations with proper error handling and retry logic. 
- +## When to Use Fireteam - - Generate comprehensive test suites and documentation for existing projects. - - +Fireteam is ideal for: -## Getting Started +- **Autonomous task execution** - Let Claude complete tasks without constant supervision +- **Complex refactoring** - Multi-file changes with validation +- **Feature implementation** - Plan, implement, and verify new features +- **Bug fixing** - Analyze, fix, and confirm resolution -Ready to deploy your first Fireteam? +## Next Steps - - Get Fireteam running in 5 minutes + + Get started with Fireteam in 5 minutes - - Understand how the system works - - - Detailed setup instructions - - - Customize timeouts and thresholds + + Full API documentation - -## Open Source - -Fireteam is open source and available on GitHub: - -[https://github.com/darkresearch/fireteam](https://github.com/darkresearch/fireteam) - -Contributions, issues, and feedback are welcome! - -## Next Steps - - - - Follow the [Quickstart Guide](/quickstart) to install and run your first project - - - Learn how agents work together in the [Core Concepts](/core-concepts/architecture) section - - - Optimize timeouts and thresholds in [Configuration](/configuration/config-file) - - - Track progress with [fireteam-status](/cli-tools/fireteam-status) - - diff --git a/docs/mint.json b/docs/mint.json index e630779..f960b52 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -55,53 +55,19 @@ ] }, { - "group": "Core Concepts", + "group": "Concepts", "pages": [ - "core-concepts/architecture", - "core-concepts/agents", - "core-concepts/cycles" - ] - }, - { - "group": "Installation & Setup", - "pages": [ - "installation/installation", - "installation/environment", - "installation/requirements" - ] - }, - { - "group": "Configuration", - "pages": [ - "configuration/config-file", - "configuration/timeouts", - "configuration/sudo-setup" - ] - }, - { - "group": "CLI Tools", - "pages": [ - "cli-tools/overview", - "cli-tools/start-agent", - "cli-tools/fireteam-status", - "cli-tools/stop-agent" - ] - }, - { - "group": "Advanced Topics", - "pages": [ - "advanced/state-management", - "advanced/improvements", - "troubleshooting/troubleshooting" + "concepts/complexity", + "concepts/execution-modes", + "concepts/hooks" ] }, { "group": "API Reference", "pages": [ - "api/overview", - "api/state-manager", - "api/agents", - "api/configuration" + "api/execute", + "api/estimate-complexity", + "api/types" ] } ], @@ -117,10 +83,5 @@ }, "search": { "prompt": "Search Fireteam docs..." - }, - "analytics": { - "posthog": { - "apiKey": "phc_placeholder_key_fireteam_docs" - } } } diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index b69761b..16e73c0 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -1,434 +1,156 @@ --- -title: "Quickstart" -description: "Installation and first project walkthrough" +title: Quickstart +description: Get started with Fireteam in 5 minutes --- -## Prerequisites - -Before installing Fireteam, ensure you have the following installed: - - - - Required for running the Fireteam orchestrator and agents - - - Used for version control and automatic commits - - - Powers the autonomous agents - - - Linux or macOS (tested on Ubuntu) - - - - -Need help installing Claude CLI? Check the [official installation guide](https://docs.claude.com/en/docs/claude-code/installation). 
- - ## Installation -### Step 1: Clone the Repository - -```bash -git clone https://github.com/darkresearch/fireteam.git -cd fireteam -``` - -### Step 2: Run Setup Script - -Fireteam includes an automated setup script that: -- Creates necessary directories -- Installs the CLI tools to `~/.local/bin` -- Sets up the environment - -```bash -bash setup.sh -``` - -The setup script will: -1. Create `state/` and `logs/` directories -2. Copy CLI tools (`start-agent`, `stop-agent`, `fireteam-status`) to `~/.local/bin` -3. Make the tools executable -4. Verify the installation - -### Step 3: Update Your PATH - -If `~/.local/bin` isn't already in your PATH, add it: - -```bash -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc -``` - -For macOS using zsh: +Install Fireteam using pip: ```bash -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshrc -source ~/.zshrc +pip install fireteam ``` -### Step 4: Configure Environment Variables (Optional) - -Create a `.env` file in the Fireteam directory: - -```bash -cd /home/claude/fireteam -nano .env -``` - -Add your configuration: - -```bash -# Git configuration -GIT_USER_NAME="Your Name" -GIT_USER_EMAIL="your.email@example.com" - -# Optional: Sudo password for system-level operations -# SUDO_PASSWORD=your_password_here -``` - - -Never commit your `.env` file to version control. It's already in `.gitignore`. - - -### Step 5: Verify Installation - -Check that the CLI tools are accessible: - -```bash -which fireteam-status -# Should output: /home/your-username/.local/bin/fireteam-status - -fireteam-status -# Should output: No active Fireteam session found. -``` - -## Your First Project - -Let's build a Bitcoin price checker CLI tool - a real-world project that Fireteam completed at 98% in just 3 cycles during testing. - -### Step 1: Start the Agent - -```bash -start-agent --project-dir ~/bitcoin-cli --prompt "Build a Python CLI tool that fetches the current Bitcoin price from the CoinGecko API. Include error handling, retry logic, and formatted output. The tool should accept command-line arguments and display the price in USD." -``` - - -Be specific in your goal description. Include desired features, error handling requirements, and output format. - - -### Step 2: Monitor Progress - -In a separate terminal, watch the progress in real-time: - -```bash -fireteam-status --watch -``` - -You'll see output like: - -``` -╔══════════════════════════════════════════════════════════════╗ -║ 🔥 FIRETEAM STATUS 🔥 ║ -╚══════════════════════════════════════════════════════════════╝ - -Status: ⚡ RUNNING - -Project Information: - Directory: /home/user/bitcoin-cli - Branch: agent-20251017-143022 - Goal: Build a Python CLI tool that fetches the current Bitcoin... - -Current Progress: - Cycle: 2 - Phase: Executing - Completion: 95% - Validation: 0/3 checks - -Last Updated: 2025-10-17 14:35:42 -``` - -### Step 3: Understanding Execution - -Fireteam runs in an infinite loop until validation criteria are met. 
Here's a typical execution flow: - -**Cycle 0 - Initial Build** (~30 min) -- **Planning**: Analyzes goal, creates project structure plan -- **Execution**: Implements core functionality, API integration, CLI -- **Review**: Tests the implementation → 88% complete - -**Cycle 1 - Refinement** (~25 min) -- **Planning**: Reviews gaps, plans improvements -- **Execution**: Adds error handling, retry logic, better output -- **Review**: Tests enhanced version → 95% complete (validation 1/3) ✓ - -**Cycle 2 - Validation** (~20 min) -- **Planning**: Critical review of remaining gaps -- **Execution**: Adds documentation, edge case handling -- **Review**: Comprehensive test → 98% complete (validation 2/3) ✓ - -**Cycle 3 - Final Validation** (~20 min) -- **Planning**: Final polish opportunities -- **Execution**: Minor refinements -- **Review**: Final verification → 98% complete (validation 3/3) ✓ -- Validation requirements met, project terminates - -This example shows 4 cycles (~1.5 hours). Complex projects may take significantly longer - we've observed 50+ cycles over 30+ hours. The system continues until it achieves the configured validation requirements (default: 3 consecutive reviews ≥95%). +Requires Python 3.10+ and a valid `ANTHROPIC_API_KEY` environment variable. - -Runtime varies significantly by project complexity. Simple projects may complete in 1-2 hours. Complex projects can run for many hours or days. Monitor progress with `fireteam-status --watch`. - - -### Step 4: Check the Results - -Once complete, explore your new project: - -```bash -cd ~/bitcoin-cli -ls -la -``` - -You'll find: -- **Source code**: `bitcoin_cli.py` or similar -- **Tests**: `test_bitcoin_cli.py` -- **Documentation**: `README.md` with usage instructions -- **Git history**: Complete commit log of all changes -- **Requirements**: `requirements.txt` with dependencies - -Test the CLI: +## Basic Usage -```bash -# Install dependencies -pip install -r requirements.txt - -# Run the tool -python bitcoin_cli.py -``` - -### Step 5: Explore the Git History - -Every cycle creates automatic commits: - -```bash -git log --oneline -``` - -Output: -``` -a3f8b2c Cycle 3: Final validation - Documentation polish -e7d9a1b Cycle 2: Enhanced error handling and retry logic -4c2e8f0 Cycle 1: Refactored output formatting -b1a7d3e Cycle 0: Initial implementation with API integration -9f4e2a8 Initial commit - Fireteam project initialization -``` - -## What Just Happened? - -Fireteam autonomously: - -1. ✅ Created a complete Python CLI project structure -2. ✅ Implemented CoinGecko API integration -3. ✅ Added comprehensive error handling and retry logic -4. ✅ Created formatted console output -5. ✅ Wrote unit tests with good coverage -6. ✅ Generated complete documentation -7. ✅ Committed all changes to Git with descriptive messages -8. ✅ Validated the implementation through 3 review cycles - -**All without human intervention after the initial goal.** - -## Project Monitoring - -### Real-Time Watching - -Keep the `--watch` flag running to see live updates: - -```bash -fireteam-status --watch -``` - -Updates every 3 seconds with current cycle, phase, and completion percentage. - -### Detailed Logs - -View full agent logs: +### Execute a Task -```bash -fireteam-status --logs -``` +The simplest way to use Fireteam is with the `execute()` function: -Shows the last 50 lines of orchestrator logs with detailed agent outputs. 
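The status output is built from the state file the orchestrator saves after every phase, so you can also layer your own monitoring on top of it. A minimal sketch, assuming the default install location and the documented state fields (`cycle_number`, `status`, `completion_percentage`, `validation_checks`):

```python
import json
import time
from pathlib import Path

STATE_FILE = Path("/home/claude/fireteam/state/current.json")

def print_status() -> None:
    """Print a one-line summary from Fireteam's saved state."""
    if not STATE_FILE.exists():
        print("No active Fireteam session found.")
        return
    state = json.loads(STATE_FILE.read_text())
    print(
        f"cycle={state.get('cycle_number')} "
        f"phase={state.get('status')} "
        f"completion={state.get('completion_percentage')}% "
        f"validation={state.get('validation_checks')}/3"
    )

if __name__ == "__main__":
    while True:              # crude equivalent of --watch
        print_status()
        time.sleep(3)        # fireteam-status --watch refreshes every 3 seconds
```

For day-to-day use, `fireteam-status` remains the supported interface; a script like this is only handy for feeding progress into dashboards or other tooling.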
+```python +import asyncio +from fireteam import execute -### Follow Mode +async def main(): + result = await execute( + project_dir="/path/to/your/project", + goal="Fix the bug in auth.py where users can't log in", + ) -Stream logs in real-time: + if result.success: + print(f"Task completed!") + print(f"Output: {result.output}") + print(f"Completion: {result.completion_percentage}%") + else: + print(f"Task failed: {result.error}") -```bash -fireteam-status --follow +asyncio.run(main()) ``` -Perfect for debugging or watching the agents work. - -## Stopping a Project +### Add Context -To gracefully stop Fireteam: +Provide additional context to help Claude understand the task: -```bash -stop-agent +```python +result = await execute( + project_dir="/path/to/project", + goal="Fix the authentication bug", + context=""" + Error logs show: + - NullPointerException at auth.py:42 + - Users report login failures after password reset + """, +) ``` -This: -- Sends a shutdown signal to the orchestrator -- Waits for the current agent to complete -- Saves the current state -- Commits any pending changes - - -Do not kill the process manually (`Ctrl+C` or `kill -9`). Always use `stop-agent` for graceful shutdown. - - -## Common First Project Ideas - -Try these proven project ideas that work great with Fireteam: - - - - **Goal**: "Build a Python CLI that fetches weather data from OpenWeatherMap API for any city" - - **Expected**: 95% in 2 cycles - +### Specify Execution Mode - - **Goal**: "Create a SQLite-based task manager CLI with CRUD operations" +Force a specific execution mode instead of auto-detection: - **Expected**: 92% in 1 cycle - - - - **Goal**: "Build a Python script to scrape Hacker News front page headlines using BeautifulSoup" - - **Expected**: 92% in 1 cycle - - - - **Goal**: "Create a FastAPI note-taking API with full CRUD endpoints and validation" - - **Expected**: 92% in 1 cycle - - - -## Tips for Better Results - -### Writing Effective Goals +```python +from fireteam import execute, ExecutionMode - -**Good goals are specific, actionable, and include desired features.** - +# Use full plan+execute+review cycle for complex tasks +result = await execute( + project_dir="/path/to/project", + goal="Refactor the entire authentication module", + mode=ExecutionMode.FULL, +) -✅ **Good Example**: -``` -Build a Python CLI calculator that supports basic arithmetic operations -(+, -, *, /), handles division by zero, accepts command-line arguments, -and provides helpful usage instructions. -``` - -❌ **Bad Example**: -``` -Make a calculator +# Use single-turn for trivial tasks +result = await execute( + project_dir="/path/to/project", + goal="Add a comment explaining the login function", + mode=ExecutionMode.SINGLE_TURN, +) ``` -### Goal Best Practices - -1. **Specify the language**: "Python CLI" or "TypeScript Node.js app" -2. **List key features**: API endpoints, error handling, data validation -3. **Mention tech stack**: FastAPI, SQLite, BeautifulSoup, etc. -4. **Include quality requirements**: Error handling, tests, documentation -5. 
**Be realistic**: Start with well-defined, medium-sized projects - -### Optimal Project Complexity - -**Sweet Spot**: Medium complexity, well-defined requirements -- CLI tools (100-300 lines) -- API clients with 3-5 endpoints -- Simple web scrapers -- CRUD applications with database - -**Too Simple**: May complete too quickly for validation -- Single-function scripts -- Hello World programs +### Disable Test Running -**Too Complex**: May require many cycles -- Full web applications with frontend -- Complex distributed systems -- Projects requiring multiple services +By default, Fireteam runs tests after edits. Disable this if needed: -## Troubleshooting - -### "Command not found: start-agent" - -Your PATH isn't configured correctly. Add `~/.local/bin`: - -```bash -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc +```python +result = await execute( + project_dir="/path/to/project", + goal="Add experimental feature", + run_tests=False, # Don't run tests after edits +) ``` -### "Claude CLI not found" +## Complexity Estimation -Install the Claude CLI: +Use `estimate_complexity()` to understand how Fireteam will handle a task: -```bash -# Follow the official guide -# https://docs.claude.com/en/docs/claude-code/installation -``` +```python +from fireteam import estimate_complexity, ComplexityLevel -### "Permission denied" errors +complexity = await estimate_complexity( + goal="Add user authentication with OAuth", + context="Using FastAPI and existing user model", +) -Make CLI tools executable: +print(f"Complexity: {complexity}") +# ComplexityLevel.MODERATE -```bash -chmod +x ~/.local/bin/{start-agent,stop-agent,fireteam-status} +# Map to execution mode +if complexity == ComplexityLevel.TRIVIAL: + print("Will use SINGLE_TURN mode") +elif complexity == ComplexityLevel.SIMPLE: + print("Will use SIMPLE mode") +elif complexity == ComplexityLevel.MODERATE: + print("Will use MODERATE mode (execute + review)") +elif complexity == ComplexityLevel.COMPLEX: + print("Will use FULL mode (plan + execute + reviews)") ``` -### Agent timeouts +## Understanding Results -Some projects need more time. Edit `config.py`: +The `ExecutionResult` contains: ```python -AGENT_TIMEOUTS = { - "planner": 900, # 15 minutes for complex planning - "reviewer": 900, # 15 minutes for thorough review - "executor": 2400 # 40 minutes for large implementations -} -``` +result = await execute(project_dir=".", goal="Fix bug") -### More Help +# Check success +if result.success: + # Task completed successfully + print(result.output) # Execution output + print(result.completion_percentage) # 0-100 + print(result.metadata) # Additional info (plan, review, etc.) +else: + # Task failed + print(result.error) # Error message -Check the [Troubleshooting Guide](/troubleshooting/troubleshooting) for detailed solutions. +# Always available +print(result.mode) # ExecutionMode used +``` ## Next Steps - - Learn how agents coordinate in cycles + + Learn about the different execution strategies - - - Optimize agent timeouts for your projects + + Understand how tasks are classified - - - Master the fireteam-status command + + Configure test running and quality enforcement + + + Full API documentation - -## Congratulations! - -You've successfully installed Fireteam and built your first autonomous project. The system is now ready to tackle more complex challenges. - -Experiment with different project types, monitor the agents' work, and watch as Fireteam builds production-ready code autonomously. 
- -**Happy building!** 🔥 diff --git a/docs/troubleshooting/troubleshooting.mdx b/docs/troubleshooting/troubleshooting.mdx deleted file mode 100644 index 7a19064..0000000 --- a/docs/troubleshooting/troubleshooting.mdx +++ /dev/null @@ -1,559 +0,0 @@ ---- -title: "Troubleshooting" -description: "Common issues and solutions for Fireteam" ---- - -## Common Issues - -### Installation & Setup - - - - - -**Symptoms:** -```bash -$ start-agent -bash: start-agent: command not found -``` - -**Cause:** CLI tools not in PATH - -**Solution:** -```bash -# Add to PATH -export PATH="$HOME/.local/bin:$PATH" - -# Make permanent -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc -source ~/.bashrc - -# Verify -which start-agent -``` - - - - - -**Symptoms:** -``` -Error: claude: command not found -Error: Claude CLI not authenticated -``` - -**Solution:** -```bash -# Install Claude CLI -curl -fsSL https://claude.ai/install.sh | sh - -# Authenticate -claude auth - -# Verify -claude --version -claude auth status -``` - - - - - -**Symptoms:** -``` -Error: Python 3.12+ required, found 3.10 -``` - -**Solution:** -```bash -# Check version -python3 --version - -# Install Python 3.12+ -sudo apt install python3.12 # Ubuntu/Debian -brew install python@3.12 # macOS - -# Update alias -alias python3='/usr/bin/python3.12' -``` - - - - - -### Runtime Issues - - - - - -**Symptoms:** -``` -Agent system is already running (PID: 12345) -Use 'stop-agent' to stop it first -``` - -**Solution:** -```bash -# Check what's running -fireteam-status - -# Stop it -stop-agent - -# If stuck, force kill -kill -9 $(cat /home/claude/fireteam/orchestrator.pid) -rm /home/claude/fireteam/orchestrator.pid - -# Try again -start-agent --project-dir ~/project --prompt "Goal" -``` - - - - - -**Symptoms:** -``` -ERROR - Agent timeout after 600 seconds -ERROR - Planner/Executor/Reviewer exceeded timeout -``` - -**Solution:** -```python -# Edit /home/claude/fireteam/config.py -AGENT_TIMEOUTS = { - "planner": 1200, # 20 minutes - "executor": 3600, # 60 minutes - "reviewer": 1200 # 20 minutes -} -``` - -**Or check network/resources:** -```bash -# Network latency -ping api.anthropic.com - -# System resources -htop # Check CPU/memory -df -h # Check disk space -``` - - - - - -**Symptoms:** -``` -Permission denied: apt install nodejs -Permission denied: /usr/local/bin -``` - -**Solution:** - -**Option 1: Passwordless sudo (recommended)** -```bash -sudo visudo -# Add: username ALL=(ALL) NOPASSWD: ALL -``` - -**Option 2: SUDO_PASSWORD in .env** -```bash -echo 'SUDO_PASSWORD=your_password' >> /home/claude/fireteam/.env -chmod 600 /home/claude/fireteam/.env -``` - -See [Sudo Setup](/configuration/sudo-setup) for details. 
- - - - - -**Symptoms:** -``` -Error loading state: Expecting property name -fireteam-status shows incorrect data -``` - -**Solution:** -```bash -# Backup corrupted state -cp /home/claude/fireteam/state/current.json ~/corrupted-state.json - -# Validate JSON -cat /home/claude/fireteam/state/current.json | python3 -m json.tool - -# If invalid, remove and restart -rm /home/claude/fireteam/state/current.json -stop-agent -start-agent --project-dir ~/project --prompt "Goal" -``` - - - - - -**Symptoms:** -``` -Git initialization error -fatal: unable to auto-detect email address -``` - -**Solution:** -```bash -# Configure git -git config --global user.name "Your Name" -git config --global user.email "you@example.com" - -# Or use .env -echo 'GIT_USER_NAME=Your Name' >> /home/claude/fireteam/.env -echo 'GIT_USER_EMAIL=you@example.com' >> /home/claude/fireteam/.env -``` - - - - - -### Performance Issues - - - - - -**Symptoms:** -``` -Cycle 5: 88% -Cycle 6: 88% -Cycle 7: 88% -``` - -**Diagnosis:** -```bash -# Check logs for errors -fireteam-status --logs --lines 100 | grep ERROR - -# View recent activity -fireteam-status --logs --lines 50 -``` - -**Solutions:** - -1. **Environment issue:** Install missing dependencies -2. **Scope issue:** Goal too vague, stop and restart with clearer goal -3. **Blocker:** Agent stuck on unsolvable problem, review and manually fix -4. **Stop and inspect:** -```bash -stop-agent -cd ~/project -git log --oneline -# Review what was accomplished -``` - - - - - -**Symptoms:** -- Building features not requested -- Creating deployment automation for simple CLI -- Adding unnecessary complexity - -**Solution:** - -**Immediate:** -```bash -# Stop agent -stop-agent - -# Check git commits -cd ~/project && git log --oneline - -# Revert unwanted changes -git reset --hard -``` - -**Prevention:** -- Write more specific goals -- Monitor goal alignment logs (every 3 cycles) -- Stop early if drift detected - - - - - -**Causes:** -- Large codebase (10k+ lines) -- Extensive dependencies -- Slow network -- Resource constraints - -**Solutions:** - -1. **Increase timeouts:** -```python -AGENT_TIMEOUTS = { - "executor": 3600 # 60 minutes -} -``` - -2. **Pre-install dependencies:** -```bash -cd ~/project -npm install # or pip install -r requirements.txt -``` - -3. **Check resources:** -```bash -htop # CPU/memory -iostat # Disk I/O -``` - - - - - -### Project-Specific Issues - - - - - -**Symptoms:** -``` -tsc: command not found -npm: command not found -Multiple cycles trying to install Node.js -``` - -**Solution:** - -**Pre-install Node.js:** -```bash -# Via nvm (recommended) -curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.0/install.sh | bash -nvm install 18 -nvm use 18 - -# Or system package -sudo apt install nodejs npm # Ubuntu -brew install node # macOS -``` - -**Or configure sudo:** -See [Sudo Setup](/configuration/sudo-setup) - - - - - -**Symptoms:** -- Reviewer says 95% complete -- Tests actually fail when you run them - -**Investigation:** -```bash -cd ~/project - -# Run tests manually -pytest # or npm test, etc. 
- -# Check what reviewer saw -fireteam-status --logs | grep -A 10 "Review completed" -``` - -**Solution:** -- Fix failing tests manually -- Or restart with goal: "Fix all failing tests" - - - - - -**Symptoms:** -``` -ModuleNotFoundError: No module named 'requests' -ImportError: cannot import name 'FastAPI' -``` - -**Solution:** -```bash -cd ~/project - -# Check requirements file -cat requirements.txt # or package.json - -# Install manually -pip install -r requirements.txt -# or -npm install - -# Or let agent fix it -start-agent --project-dir ~/project --prompt "Fix all missing dependencies and imports" -``` - - - - - -## Debugging Strategies - -### 1. Check Logs First - -```bash -# Recent activity -fireteam-status --logs --lines 50 - -# Error messages -fireteam-status --logs | grep ERROR - -# Full context -fireteam-status --logs --lines 500 > debug.log -``` - -### 2. Inspect State - -```bash -# Current state -cat /home/claude/fireteam/state/current.json | python3 -m json.tool - -# Key fields -jq '{cycle: .cycle_number, completion: .completion_percentage, status: .status}' state/current.json -``` - -### 3. Review Git History - -```bash -cd ~/project - -# See what was done -git log --oneline - -# Detailed diff -git show HEAD # Latest commit -git diff HEAD~3..HEAD # Last 3 cycles -``` - -### 4. Test Manually - -```bash -cd ~/project - -# Run the project -python main.py # or node index.js, etc. - -# Run tests -pytest # or npm test - -# Check for issues -``` - -### 5. Incremental Debugging - -```bash -# Stop agent -stop-agent - -# Fix issue manually -# ... make changes ... - -# Restart to continue -start-agent --project-dir ~/project --prompt "Continue and complete remaining work" -``` - -## Error Messages Reference - -### Common Error Patterns - -| Error | Cause | Solution | -|-------|-------|----------| -| `timeout after N seconds` | Agent exceeded time limit | Increase timeout in config.py | -| `command not found` | Missing CLI tool or PATH issue | Install tool or add to PATH | -| `Permission denied` | Sudo required | Configure sudo access | -| `No module named X` | Missing Python package | `pip install X` | -| `ENOENT: no such file` | Missing file/directory | Check project structure | -| `Parse error` | Invalid JSON/code | Check state file or recent changes | -| `Git error` | Git not configured | Configure git user/email | - -## Getting Help - -### Information to Provide - -When seeking help, include: - -1. **Error message:** -```bash -fireteam-status --logs --lines 100 > error.log -``` - -2. **State:** -```bash -cat /home/claude/fireteam/state/current.json > state.json -``` - -3. **Configuration:** -```bash -cat /home/claude/fireteam/config.py > config.txt -``` - -4. **Environment:** -```bash -python3 --version -claude --version -git --version -uname -a -``` - -5. 
**Project goal:** -``` -Goal: "Build a Python CLI calculator" -Cycle: 8 -Completion: 75% (stuck) -``` - -### Where to Get Help - -- **GitHub Issues:** https://github.com/darkresearch/fireteam/issues -- **Documentation:** https://docs.fireteam.dev -- **Claude Code Docs:** https://docs.claude.com/en/docs/claude-code - -## Preventive Measures - -### Before Starting Projects - -✅ Verify all requirements installed -✅ Configure sudo access if needed -✅ Check disk space (1GB+ free) -✅ Test Claude CLI authentication -✅ Write clear, specific goals - -### During Execution - -✅ Monitor first 1-2 cycles closely -✅ Check for environment errors early -✅ Watch for agent drift (goal alignment) -✅ Review git commits periodically - -### After Completion - -✅ Test the project manually -✅ Review all git commits -✅ Check for missing tests -✅ Validate error handling - -## Next Steps - - - - - Optimize settings to prevent issues - - - - Fix permission issues - - - - Report bugs and request features - - - diff --git a/hooks/hooks.json b/hooks/hooks.json new file mode 100644 index 0000000..29257c8 --- /dev/null +++ b/hooks/hooks.json @@ -0,0 +1,9 @@ +{ + "hooks": [ + { + "event": "UserPromptSubmit", + "matcher": "*", + "command": "python -m fireteam.claude_hooks.user_prompt_submit" + } + ] +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..098694c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "fireteam" +version = "0.1.0" +description = "Multi-phase autonomous task execution with complexity estimation, planning, execution, and review" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "claude-agent-sdk>=0.1.4", + "python-dotenv>=1.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", +] + +[tool.setuptools] +packages = ["fireteam"] + +[tool.setuptools.package-dir] +fireteam = "src" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +asyncio_mode = "auto" +markers = [ + "unit: Unit tests (fast, no external dependencies)", + "integration: Integration tests (require API key, use --run-integration)", + "slow: Slow running tests", +] diff --git a/requirements.txt b/requirements.txt index 9566e13..c517ac3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,11 +6,6 @@ claude-agent-sdk>=0.1.4 # Environment management python-dotenv>=1.0.0 -# Memory layer - local vector storage and embeddings -chromadb>=1.0.0 -transformers>=4.50.0 -torch>=2.5.0 -sentence-transformers>=2.2.0 - # Testing pytest>=7.0.0 +pytest-asyncio>=0.21.0 diff --git a/service/claude-agent.service b/service/claude-agent.service deleted file mode 100644 index 48bb10d..0000000 --- a/service/claude-agent.service +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -Description=Claude Agent System -After=network.target - -[Service] -Type=simple -User=claude -Group=claude -WorkingDirectory=/home/claude/claude-agent-system -Environment="PATH=/usr/local/bin:/usr/bin:/bin:/home/claude/.local/bin" -ExecStart=/bin/bash -c 'if [ -f /home/claude/claude-agent-system/orchestrator.pid ]; then exec tail -f /dev/null; fi' -Restart=on-failure -RestartSec=5s -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target diff --git a/src/__init__.py b/src/__init__.py index 68dfd2d..45bf8bc 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,2 +1,31 @@ 
-"""Fireteam - Autonomous multi-agent system for long-running project execution.""" +""" +Fireteam - Adaptive task execution using Claude Agent SDK. +Minimal layer on top of SDK that adds: +- Complexity estimation (auto-select execution mode) +- Quality hooks (auto-run tests after code changes) + +Usage: + from fireteam import execute, ExecutionMode + + result = await execute( + project_dir="/path/to/project", + goal="Fix the bug in auth.py", + ) +""" + +from .api import execute +from .models import ExecutionMode, ExecutionResult +from .complexity import ComplexityLevel, estimate_complexity +from .hooks import QUALITY_HOOKS, AUTONOMOUS_HOOKS, create_test_hooks + +__all__ = [ + "execute", + "ExecutionMode", + "ExecutionResult", + "ComplexityLevel", + "estimate_complexity", + "QUALITY_HOOKS", + "AUTONOMOUS_HOOKS", + "create_test_hooks", +] diff --git a/src/agents/__init__.py b/src/agents/__init__.py deleted file mode 100644 index a619d29..0000000 --- a/src/agents/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Agent wrappers for Claude sub-agents.""" - -from .planner import PlannerAgent -from .executor import ExecutorAgent -from .reviewer import ReviewerAgent - -__all__ = ['PlannerAgent', 'ExecutorAgent', 'ReviewerAgent'] diff --git a/src/agents/base.py b/src/agents/base.py deleted file mode 100644 index 715afcc..0000000 --- a/src/agents/base.py +++ /dev/null @@ -1,280 +0,0 @@ -""" -Base agent class for Claude sub-agents. -Provides common functionality for invoking Claude Agent SDK with specialized prompts. -""" - -import logging -import time -import os -import asyncio -from typing import Any -import config - - -class BaseAgent: - """Base class for all specialized agents using Claude Agent SDK.""" - - def __init__(self, agent_type: str, logger: logging.Logger | None = None, memory_manager=None): - self.agent_type = agent_type - self.logger = logger or logging.getLogger(f"agent.{agent_type}") - self.memory = memory_manager # Injected by orchestrator - self.max_retries = config.MAX_RETRIES - self.retry_delay = config.RETRY_DELAY - self.timeout = config.AGENT_TIMEOUTS.get(agent_type, 600) # Default 10 min if not specified - self._execution_context = {} # Store for memory retrieval - - def get_system_prompt(self) -> str: - """ - Get the system prompt for this agent. - Must be implemented by subclasses to define agent identity and core guidelines. 
- """ - raise NotImplementedError("Subclasses must implement get_system_prompt()") - - async def _execute_with_sdk(self, prompt: str, project_dir: str) -> dict[str, Any]: - """Execute prompt using Claude Agent SDK, automatically injecting memories into system prompt.""" - try: - self.logger.info(f"[{self.agent_type.upper()}] Initializing Claude Agent SDK...") - - # Import SDK and error types - from claude_agent_sdk import ( - ClaudeSDKClient, - ClaudeAgentOptions, - CLINotFoundError, - CLIConnectionError, - ProcessError - ) - - # Get base system prompt - base_system_prompt = self.get_system_prompt() - - # Automatic memory retrieval (happens silently to agent) - memory_context = self._retrieve_and_format_memories() - - # Inject memories into system prompt - enhanced_system_prompt = base_system_prompt - if memory_context: - enhanced_system_prompt += "\n" + memory_context - self.logger.debug(f"[{self.agent_type.upper()}] System prompt enhanced with memories") - - # Configure SDK options - # Note: API key is read from ANTHROPIC_API_KEY environment variable - self.logger.info(f"[{self.agent_type.upper()}] Configuring SDK with model: {config.SDK_MODEL}") - options = ClaudeAgentOptions( - allowed_tools=config.SDK_ALLOWED_TOOLS, - permission_mode=config.SDK_PERMISSION_MODE, - model=config.SDK_MODEL, - cwd=project_dir, # Set working directory for Claude Code - system_prompt=enhanced_system_prompt # Enhanced with memories - ) - - # Execute with SDK with timeout - self.logger.info(f"[{self.agent_type.upper()}] Connecting to Claude CLI (timeout: {self.timeout}s)...") - async with ClaudeSDKClient(options=options) as client: - # Set working directory - os.chdir(project_dir) - - # Send the query - self.logger.info(f"[{self.agent_type.upper()}] Sending query to Claude...") - await client.query(prompt) - self.logger.info(f"[{self.agent_type.upper()}] Query sent, waiting for response...") - - output_text = "" - message_count = 0 - async for message in client.receive_response(): - message_count += 1 - self.logger.info(f"[{self.agent_type.upper()}] Received message {message_count}: {type(message).__name__}") - - # Collect all text from the response - if hasattr(message, 'content'): - if isinstance(message.content, str): - output_text += message.content - elif isinstance(message.content, list): - for block in message.content: - if hasattr(block, 'text'): - output_text += block.text - elif isinstance(block, dict) and 'text' in block: - output_text += block['text'] - elif isinstance(message, str): - output_text += message - elif isinstance(message, dict): - # Try common keys - output_text += message.get('content', '') or message.get('text', '') - - # Validate we got actual output - if not output_text or len(output_text.strip()) == 0: - error_msg = "SDK returned empty output - Claude may have failed silently" - self.logger.error(error_msg) - return { - "success": False, - "output": None, - "error": error_msg - } - - return { - "success": True, - "output": output_text, - "error": None - } - - except Exception as e: - # Try to import error types for better error messages - try: - from claude_agent_sdk import CLINotFoundError, CLIConnectionError, ProcessError - - if isinstance(e, CLINotFoundError): - self.logger.error("Claude Code CLI not found - check that 'claude' is in PATH") - elif isinstance(e, CLIConnectionError): - self.logger.error("Failed to connect to Claude Code CLI - check if CLI is responsive") - elif isinstance(e, ProcessError): - self.logger.error(f"Claude Code CLI process error: {str(e)}") - else: - 
self.logger.error(f"SDK execution error: {str(e)}") - except ImportError: - self.logger.error(f"SDK execution error: {str(e)}") - - return { - "success": False, - "output": None, - "error": str(e) - } - - def _execute_command(self, prompt: str, project_dir: str) -> dict[str, Any]: - """Execute Claude Agent SDK with retry logic and timeout.""" - for attempt in range(self.max_retries): - try: - self.logger.info(f"[{self.agent_type.upper()}] Starting attempt {attempt + 1}/{self.max_retries} (timeout: {self.timeout}s)") - - # Run async SDK call in sync context with timeout - start_time = time.time() - try: - # Use wait_for to enforce timeout - result = asyncio.run( - asyncio.wait_for( - self._execute_with_sdk(prompt, project_dir), - timeout=self.timeout - ) - ) - except asyncio.TimeoutError: - elapsed = time.time() - start_time - error_msg = f"SDK call timed out after {elapsed:.1f}s (limit: {self.timeout}s)" - self.logger.error(f"[{self.agent_type.upper()}] {error_msg}") - return { - "success": False, - "output": None, - "error": error_msg - } - - elapsed = time.time() - start_time - self.logger.info(f"[{self.agent_type.upper()}] SDK call completed in {elapsed:.1f}s") - - if result["success"]: - self.logger.info(f"{self.agent_type} completed successfully") - return result - else: - self.logger.warning(f"{self.agent_type} failed") - self.logger.warning(f"error: {result['error']}") - - if attempt < self.max_retries - 1: - time.sleep(self.retry_delay) - continue - else: - return result - - except Exception as e: - self.logger.error(f"{self.agent_type} error: {str(e)}") - if attempt < self.max_retries - 1: - time.sleep(self.retry_delay) - continue - else: - return { - "success": False, - "output": None, - "error": str(e) - } - - return { - "success": False, - "output": None, - "error": f"Failed after {self.max_retries} attempts" - } - - def _build_memory_context_query(self) -> str: - """ - Build context query for semantic search. - Override in subclasses to customize based on agent type. - Access self._execution_context for execute() parameters. - """ - return "" - - def _get_relevant_memory_types(self) -> list[str]: - """ - Return memory types relevant to this agent. - Override in subclasses. - """ - return [] # All types by default - - def _retrieve_and_format_memories(self) -> str: - """Automatically retrieve and format relevant memories.""" - if not self.memory: - return "" - - # Build context query - context_query = self._build_memory_context_query() - if not context_query: - return "" - - self.logger.info(f"[{self.agent_type.upper()}] Retrieving memories...") - start_time = time.time() - - # Semantic search - memories = self.memory.search( - query=context_query, - limit=config.MEMORY_SEARCH_LIMIT, - memory_types=self._get_relevant_memory_types() or None - ) - - elapsed = time.time() - start_time - self.logger.info(f"[{self.agent_type.upper()}] Retrieved {len(memories)} memories in {elapsed:.2f}s") - - if not memories: - self.logger.info(f"[{self.agent_type.upper()}] No relevant memories found") - return "" - - # Format for injection (cleaner template) - memory_lines = [] - for mem in memories: - mem_type = mem.get('type', 'learning').replace('_', ' ').title() - content = mem.get('content', '') - cycle = mem.get('cycle', '?') - memory_lines.append(f"• {mem_type} (Cycle {cycle}): {content}") - - memory_text = f""" ---- -BACKGROUND KNOWLEDGE FROM PREVIOUS WORK: -(You have access to these learnings from earlier cycles) - -{"\n".join(memory_lines)} - -Use this background knowledge naturally. 
Don't explicitly reference cycles. ---- -""" - - return memory_text - - def execute(self, **kwargs) -> dict[str, Any]: - """ - Template method - handles memory injection automatically. - Subclasses should NOT override this - override _do_execute instead. - """ - # Store execution context for memory retrieval - self._execution_context = kwargs - - # Call subclass implementation - return self._do_execute(**kwargs) - - def _do_execute(self, **kwargs) -> dict[str, Any]: - """ - Subclass implementation of execute logic. - Subclasses override this instead of execute(). - """ - raise NotImplementedError("Subclasses must implement _do_execute()") diff --git a/src/agents/executor.py b/src/agents/executor.py deleted file mode 100644 index 6a849dd..0000000 --- a/src/agents/executor.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Executor Agent - Responsible for executing planned tasks. -""" - -from typing import Any -from .base import BaseAgent - - -class ExecutorAgent(BaseAgent): - """Agent responsible for executing planned tasks.""" - - def __init__(self, logger=None, memory_manager=None): - super().__init__("executor", logger, memory_manager) - - def get_system_prompt(self) -> str: - """Return the system prompt defining the Executor Agent's identity and guidelines.""" - return """You are an Executor Agent in an autonomous multi-agent system. - -YOUR ROLE: -You are responsible for executing tasks according to project plans. You work alongside a Planner Agent (who creates the plan) and a Reviewer Agent (who assesses your work). - -CORE RESPONSIBILITIES: -1. Work through tasks systematically -2. Create/modify files as needed -3. Write clean, production-ready code -4. Test your implementations -5. Handle errors gracefully -6. Document your work - -EXECUTION PRINCIPLES: -- Focus on the NEXT actionable tasks from the plan -- Write actual, working code (not pseudocode) -- Test thoroughly before considering tasks complete -- If you encounter blockers, document them clearly -- Leave the codebase in a functional state -- Never leave placeholders or incomplete implementations - -QUALITY STANDARDS: -- Production-ready code quality -- Proper error handling -- Clean, maintainable implementations -- Thorough testing -- Clear documentation - -OUTPUT FORMAT: -Always provide a summary of: -- What you accomplished -- What files you created/modified -- Any issues encountered -- What still needs to be done - -Work efficiently and aim for quality.""" - - def _build_memory_context_query(self) -> str: - """Build context query for execution.""" - plan = self._execution_context.get('plan', '') - goal = self._execution_context.get('goal', '') - return f"Implementing plan: {plan}. Goal: {goal}" - - def _get_relevant_memory_types(self) -> list[str]: - """Executor cares about failed approaches, traces, code locations.""" - return ["failed_approach", "trace", "code_location"] - - def _do_execute( - self, - project_dir: str, - goal: str, - plan: str, - cycle_number: int - ) -> dict[str, Any]: - """ - Execute tasks according to the plan. 
- - Args: - project_dir: Path to project directory - goal: Project goal/objective - plan: Current plan to execute - cycle_number: Current cycle number - - Returns: - Dict with success status and execution results - """ - prompt = self._build_execution_prompt(goal, plan, cycle_number) - - # Execute via Claude Agent SDK - result = self._execute_command(prompt, project_dir) - - if result["success"]: - return { - "success": True, - "execution_result": result["output"], - "raw_output": result["output"] - } - else: - return { - "success": False, - "execution_result": None, - "error": result["error"] - } - - def _build_execution_prompt(self, goal: str, plan: str, cycle_number: int) -> str: - """Build prompt for task execution.""" - return f"""Execute the tasks outlined in the plan. - -PROJECT GOAL: -{goal} - -CYCLE NUMBER: {cycle_number} - -CURRENT PLAN: -{plan}""" diff --git a/src/agents/planner.py b/src/agents/planner.py deleted file mode 100644 index 3e1463f..0000000 --- a/src/agents/planner.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Planner Agent - Responsible for creating and updating project plans. -""" - -import json -from typing import Any -from .base import BaseAgent - - -class PlannerAgent(BaseAgent): - """Agent responsible for creating and updating project plans.""" - - def __init__(self, logger=None, memory_manager=None): - super().__init__("planner", logger, memory_manager) - - def get_system_prompt(self) -> str: - """Return the system prompt defining the Planner Agent's identity and guidelines.""" - return """You are a Planner Agent in an autonomous multi-agent system. - -YOUR ROLE: -You are responsible for creating and updating comprehensive project plans to achieve given goals. You work alongside an Executor Agent (who implements the plan) and a Reviewer Agent (who assesses progress). - -CORE RESPONSIBILITIES: -1. Break down goals into clear, concrete tasks -2. Organize tasks in logical order -3. Identify key milestones -4. Consider edge cases and testing requirements -5. Aim for production-ready quality -6. Update plans based on execution feedback and reviews - -PLANNING PRINCIPLES: -- Be specific and actionable - avoid vague or abstract tasks -- Consider dependencies between tasks -- Include testing and validation steps -- Plan for error handling and edge cases -- Adjust plans dynamically based on progress - -OUTPUT FORMAT: -Always provide your plan as a structured markdown document with: -- Overview/Summary (for initial plans) or Progress Summary (for updates) -- Task breakdown with priorities -- Key milestones -- Testing strategy (initial) or Remaining work (updates) -- Success criteria or Next steps - -Your plans guide the Executor Agent's work and should be clear enough for autonomous execution.""" - - def _build_memory_context_query(self) -> str: - """Build context query for planning.""" - goal = self._execution_context.get('goal', '') - last_review = self._execution_context.get('last_review', '') - return f"Planning to achieve: {goal}. Recent feedback: {last_review}" - - def _get_relevant_memory_types(self) -> list[str]: - """Planner cares about decisions, failed approaches, learnings.""" - return ["decision", "failed_approach", "learning"] - - def _do_execute( - self, - project_dir: str, - goal: str, - cycle_number: int, - previous_plan: str | None = None, - last_execution_result: str | None = None, - last_review: str | None = None - ) -> dict[str, Any]: - """ - Create or update project plan based on current state. 
- - Args: - project_dir: Path to project directory - goal: Project goal/objective - cycle_number: Current cycle number - previous_plan: Previous plan (if any) - last_execution_result: Result from last execution (if any) - last_review: Review from last cycle (if any) - - Returns: - Dict with success status and plan - """ - # Build context-aware prompt - if cycle_number == 0: - prompt = self._build_initial_plan_prompt(goal) - else: - prompt = self._build_update_plan_prompt( - goal, previous_plan, last_execution_result, last_review, cycle_number - ) - - # Execute via Claude Agent SDK - result = self._execute_command(prompt, project_dir) - - if result["success"]: - # Extract plan from output - plan = self._extract_plan(result["output"]) - return { - "success": True, - "plan": plan, - "raw_output": result["output"] - } - else: - return { - "success": False, - "plan": None, - "error": result["error"] - } - - def _build_initial_plan_prompt(self, goal: str) -> str: - """Build prompt for initial plan creation.""" - return f"""Create a comprehensive, actionable project plan to achieve this goal. - -PROJECT GOAL: -{goal} - -Be specific and actionable. This plan will guide the Executor Agent.""" - - def _build_update_plan_prompt( - self, - goal: str, - previous_plan: str, - last_execution_result: str | None, - last_review: str | None, - cycle_number: int - ) -> str: - """Build prompt for plan updates based on progress.""" - return f"""Update the project plan based on progress and feedback. - -PROJECT GOAL: -{goal} - -CYCLE NUMBER: {cycle_number} - -PREVIOUS PLAN: -{previous_plan} - -LAST EXECUTION RESULT: -{last_execution_result or "No execution yet"} - -LAST REVIEW: -{last_review or "No review yet"} - -Consider: -1. What has been completed successfully? -2. What issues or blockers were encountered? -3. What tasks remain? -4. What adjustments are needed? -5. Are we ready for final validation?""" - - def _extract_plan(self, output: str) -> str: - """Extract plan from Claude output.""" - # For now, return the full output as the plan - # Could add more sophisticated parsing if needed - return output.strip() diff --git a/src/agents/reviewer.py b/src/agents/reviewer.py deleted file mode 100644 index 94b94b6..0000000 --- a/src/agents/reviewer.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -Reviewer Agent - Responsible for assessing project status and completion. -""" - -import re -from typing import Any -from .base import BaseAgent - - -class ReviewerAgent(BaseAgent): - """Agent responsible for reviewing progress and estimating completion.""" - - def __init__(self, logger=None, memory_manager=None): - super().__init__("reviewer", logger, memory_manager) - - def get_system_prompt(self) -> str: - """Return the system prompt defining the Reviewer Agent's identity and guidelines.""" - return """You are a Reviewer Agent in an autonomous multi-agent system. - -YOUR ROLE: -You are responsible for reviewing project progress and assessing completion percentage. You work alongside a Planner Agent (who creates plans) and an Executor Agent (who implements them). - -CORE RESPONSIBILITIES: -1. Examine the codebase thoroughly -2. Check what has been implemented vs. planned -3. Test functionality where possible -4. Identify gaps, issues, or incomplete work -5. Assess production-readiness -6. 
Provide honest completion estimates - -COMPLETION CRITERIA: -- 0%: Nothing started -- 25%: Basic structure in place -- 50%: Core functionality implemented -- 75%: Most features working, needs polish -- 90%: Feature complete, needs testing -- 95%: Production-ready with comprehensive testing -- 100%: Perfect, nothing more needed - -REVIEW PRINCIPLES: -- Be honest and critical - don't inflate percentages -- Verify actual functionality, not just code existence -- Check for edge cases and error handling -- Assess testing coverage -- Consider production-readiness -- In validation mode, be extra thorough and critical - -OUTPUT FORMAT: -Your response MUST include a completion percentage in this exact format: -COMPLETION: XX% - -Then provide: -- Summary of current state -- What's working well -- What's incomplete or broken -- What needs to be done next -- Whether ready for production - -MEMORY EXTRACTION: -As you review, identify key learnings: -1. **Patterns**: Architectural patterns discovered (e.g., "All DB calls use async/await") -2. **Decisions**: Technical decisions made (e.g., "Chose SQLite for simpler deployment") -3. **Failed Approaches**: What was tried but failed (e.g., "Tried bcrypt but Node 18 issues") -4. **Code Locations**: Where things are (e.g., "Auth middleware in src/auth/jwt.js") - -Format in your review using: -LEARNING[type]: content - -Example: -LEARNING[pattern]: All database operations use connection pooling -LEARNING[decision]: Using JWT tokens with 24h expiry for sessions -LEARNING[failed_approach]: Attempted websockets but had CORS issues -LEARNING[code_location]: User authentication logic in src/auth/handler.py""" - - def _build_memory_context_query(self) -> str: - """Build context query for review.""" - execution_result = self._execution_context.get('execution_result', '') - plan = self._execution_context.get('plan', '') - return f"Reviewing implementation: {execution_result}. Original plan: {plan}" - - def _get_relevant_memory_types(self) -> list[str]: - """Reviewer cares about patterns, decisions, learnings.""" - return ["learning", "decision", "pattern"] - - def _do_execute( - self, - project_dir: str, - goal: str, - plan: str, - execution_result: str, - cycle_number: int, - is_validation: bool = False - ) -> dict[str, Any]: - """ - Review project progress and estimate completion percentage. 
- - Args: - project_dir: Path to project directory - goal: Project goal/objective - plan: Current plan - execution_result: Result from last execution - cycle_number: Current cycle number - is_validation: Whether this is a validation check - - Returns: - Dict with success status, review, and completion percentage - """ - prompt = self._build_review_prompt( - goal, plan, execution_result, cycle_number, is_validation - ) - - # Execute via Claude Agent SDK - result = self._execute_command(prompt, project_dir) - - if result["success"]: - # Extract completion percentage from output - completion_pct = self._extract_completion_percentage(result["output"]) - # Extract learnings from output - learnings = self._extract_learnings(result["output"]) - return { - "success": True, - "review": result["output"], - "completion_percentage": completion_pct, - "learnings": learnings, - "raw_output": result["output"] - } - else: - return { - "success": False, - "review": None, - "completion_percentage": 0, - "learnings": [], - "error": result["error"] - } - - def _build_review_prompt( - self, - goal: str, - plan: str, - execution_result: str, - cycle_number: int, - is_validation: bool - ) -> str: - """Build prompt for project review.""" - validation_note = "" - if is_validation: - validation_note = """ -VALIDATION MODE: -This is a validation check. The system believes the project is >95% complete. -Be CRITICAL and thorough. Check for: -- Edge cases that might not be handled -- Missing error handling -- Incomplete features -- Testing gaps -- Production-readiness issues - -Only confirm high completion if truly production-ready. -""" - - return f"""Review the project's current state and assess progress. - -PROJECT GOAL: -{goal} - -CYCLE NUMBER: {cycle_number} - -CURRENT PLAN: -{plan} - -LATEST EXECUTION RESULT: -{execution_result} - -{validation_note}""" - - def _extract_completion_percentage(self, output: str) -> int: - """Extract completion percentage from review output.""" - # Look for "COMPLETION: XX%" pattern - match = re.search(r'COMPLETION:\s*(\d+)%', output, re.IGNORECASE) - if match: - return int(match.group(1)) - - # Fallback: look for any percentage - match = re.search(r'(\d+)%', output) - if match: - return int(match.group(1)) - - # Default to 0 if no percentage found - self.logger.warning("Could not extract completion percentage from review") - return 0 - - def _extract_learnings(self, review_text: str) -> list[dict]: - """Parse structured learnings from review.""" - learnings = [] - - # Match pattern: LEARNING[type]: content - pattern = r'LEARNING\[(\w+)\]:\s*(.+?)(?=\n|$)' - matches = re.findall(pattern, review_text, re.MULTILINE) - - for match in matches: - learning_type = match[0].lower() - content = match[1].strip() - learnings.append({ - "type": learning_type, - "content": content - }) - - return learnings diff --git a/src/api.py b/src/api.py new file mode 100644 index 0000000..99e8c7d --- /dev/null +++ b/src/api.py @@ -0,0 +1,103 @@ +""" +Public API for fireteam library. + +Provides adaptive task execution using Claude Agent SDK primitives. +Minimal layer on top of SDK - complexity estimation + execution mode selection. + +Usage: + import fireteam + + result = await fireteam.execute( + project_dir="/path/to/project", + goal="Fix the bug in auth.py", + context="Error logs: ...", + ) +""" + +import logging +from pathlib import Path + +from . 
import config +from .complexity import ComplexityLevel, estimate_complexity +from .hooks import QUALITY_HOOKS, create_test_hooks +from .models import ExecutionMode, ExecutionResult, LoopConfig +from .loops import single_turn, moderate_loop, full_loop + + +# Map complexity levels to execution modes +# SIMPLE is now treated as SINGLE_TURN (no separate mode) +COMPLEXITY_TO_MODE = { + ComplexityLevel.TRIVIAL: ExecutionMode.SINGLE_TURN, + ComplexityLevel.SIMPLE: ExecutionMode.SINGLE_TURN, # Merged with SINGLE_TURN + ComplexityLevel.MODERATE: ExecutionMode.MODERATE, + ComplexityLevel.COMPLEX: ExecutionMode.FULL, +} + + +async def execute( + project_dir: str | Path, + goal: str, + mode: ExecutionMode | None = None, + context: str = "", + run_tests: bool = True, + test_command: list[str] | None = None, + max_iterations: int | None = None, + logger: logging.Logger | None = None, +) -> ExecutionResult: + """ + Execute a task with appropriate complexity handling. + + Args: + project_dir: Path to the project directory + goal: Task description + mode: Execution mode (None = auto-detect from complexity) + context: Additional context (crash logs, etc.) + run_tests: Run tests after code changes (default: True) + test_command: Custom test command (auto-detected if None) + max_iterations: Maximum loop iterations for MODERATE/FULL modes (None = infinite) + logger: Optional logger + + Returns: + ExecutionResult with success status and output + """ + project_dir = Path(project_dir).resolve() + log = logger or logging.getLogger("fireteam") + + # Configure quality hooks + hooks = None + if run_tests: + hooks = create_test_hooks(test_command=test_command) if test_command else QUALITY_HOOKS + log.info("Quality hooks enabled") + + # Auto-detect mode if not specified + if mode is None: + log.info("Estimating task complexity...") + complexity = await estimate_complexity(goal, context, project_dir=project_dir) + mode = COMPLEXITY_TO_MODE[complexity] + log.info(f"Complexity: {complexity.value} -> Mode: {mode.value}") + + # Use config default if max_iterations not explicitly provided + effective_max_iterations = max_iterations if max_iterations is not None else config.MAX_ITERATIONS + + # Dispatch based on mode + if mode == ExecutionMode.SINGLE_TURN: + return await single_turn(project_dir, goal, context, hooks, log) + + elif mode == ExecutionMode.MODERATE: + cfg = LoopConfig( + max_iterations=effective_max_iterations, + parallel_reviewers=1, + majority_required=1, + ) + return await moderate_loop(project_dir, goal, context, hooks, cfg, log) + + elif mode == ExecutionMode.FULL: + cfg = LoopConfig( + max_iterations=effective_max_iterations, + parallel_reviewers=3, + majority_required=2, + ) + return await full_loop(project_dir, goal, context, hooks, cfg, log) + + else: + return ExecutionResult(success=False, mode=mode, error=f"Unknown mode: {mode}") diff --git a/src/claude_hooks/__init__.py b/src/claude_hooks/__init__.py new file mode 100644 index 0000000..b1c3b1b --- /dev/null +++ b/src/claude_hooks/__init__.py @@ -0,0 +1,10 @@ +""" +Claude Code hooks for fireteam plugin. + +These hooks integrate fireteam with Claude Code's hook system, +enabling the /fireteam on|off mode toggle. 
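+
+The toggle state is read from ~/.claude/fireteam_state.json; for example, a
+file containing {"enabled": true} switches fireteam mode on, while a missing
+or unreadable file is treated as off (see is_fireteam_enabled in
+user_prompt_submit.py).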
+""" + +from .user_prompt_submit import is_fireteam_enabled + +__all__ = ["is_fireteam_enabled"] diff --git a/src/claude_hooks/user_prompt_submit.py b/src/claude_hooks/user_prompt_submit.py new file mode 100644 index 0000000..98a3822 --- /dev/null +++ b/src/claude_hooks/user_prompt_submit.py @@ -0,0 +1,62 @@ +""" +Claude Code hook that intercepts user prompts when fireteam mode is enabled. + +When fireteam mode is ON: +1. Reads the user's task from stdin +2. Invokes fireteam.execute() with the task +3. Returns the result to Claude Code +""" +import sys +import json +import asyncio +from pathlib import Path + + +def is_fireteam_enabled() -> bool: + """Check session state for fireteam mode.""" + state_file = Path.home() / ".claude" / "fireteam_state.json" + if state_file.exists(): + try: + state = json.loads(state_file.read_text()) + return state.get("enabled", False) + except (json.JSONDecodeError, IOError): + return False + return False + + +async def main(): + """Main hook entry point.""" + input_data = json.loads(sys.stdin.read()) + + if not is_fireteam_enabled(): + # Fireteam mode is OFF - pass through normally + print(json.dumps({})) + return + + # Fireteam mode is ON - inject orchestration context + user_prompt = input_data.get("prompt", "") + cwd = input_data.get("cwd", ".") + + # Import and run fireteam + from fireteam import execute + + result = await execute( + project_dir=cwd, + goal=user_prompt, + ) + + # Return result to Claude Code + output = { + "hookSpecificOutput": { + "additionalContext": f"Fireteam completed with {result.completion_percentage}% completion.\n\nResult:\n{result.output}", + } + } + + if not result.success: + output["hookSpecificOutput"]["additionalContext"] += f"\n\nError: {result.error}" + + print(json.dumps(output)) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/complexity.py b/src/complexity.py new file mode 100644 index 0000000..485ffac --- /dev/null +++ b/src/complexity.py @@ -0,0 +1,97 @@ +""" +Complexity estimation for adaptive execution mode selection. + +Fireteam estimates task complexity to choose the appropriate execution mode: +- TRIVIAL: Single Opus turn (direct SDK call, no agents) +- SIMPLE: Executor only +- MODERATE: Executor + single Reviewer +- COMPLEX: Full Planner + Executor + triple Reviewer +""" + +from enum import Enum +from pathlib import Path + +from claude_agent_sdk import query, ClaudeAgentOptions + +from . import config +from .prompts import COMPLEXITY_PROMPT + + +class ComplexityLevel(Enum): + """Task complexity levels.""" + TRIVIAL = "trivial" # Single turn, no agents + SIMPLE = "simple" # Executor only + MODERATE = "moderate" # Executor + single Reviewer + COMPLEX = "complex" # Full Planner + Executor + triple Reviewer + + +# Read-only tools for codebase exploration during complexity estimation +EXPLORATION_TOOLS = ["Glob", "Grep", "Read"] + + +async def estimate_complexity( + goal: str, + context: str = "", + project_dir: str | Path | None = None, +) -> ComplexityLevel: + """ + Estimate task complexity by asking Opus. + + When project_dir is provided, Claude can explore the codebase using + read-only tools (Glob, Grep, Read) to make a more accurate estimate. 
+ + Args: + goal: The task description + context: Additional context (e.g., crash logs, file contents) + project_dir: Project directory for codebase exploration (optional) + + Returns: + ComplexityLevel indicating how to execute this task + """ + prompt = COMPLEXITY_PROMPT.format(goal=goal, context=context or "None provided") + + # Enable codebase exploration if project_dir is provided + if project_dir: + options = ClaudeAgentOptions( + allowed_tools=EXPLORATION_TOOLS, + permission_mode="plan", # Read-only mode + model=config.SDK_MODEL, + cwd=str(Path(project_dir).resolve()), + setting_sources=config.SDK_SETTING_SOURCES, + ) + else: + # No tools - quick estimation without codebase access + options = ClaudeAgentOptions( + allowed_tools=[], + max_turns=1, + model=config.SDK_MODEL, + ) + + result_text = "" + async for message in query(prompt=prompt, options=options): + if hasattr(message, "result"): + result_text = message.result + elif hasattr(message, "content"): + # Capture final text response after tool use + if isinstance(message.content, str): + result_text = message.content + elif isinstance(message.content, list): + for block in message.content: + if hasattr(block, "text"): + result_text = block.text + + # Parse the response - look for complexity level keywords + result_upper = result_text.strip().upper() + + # Check for explicit complexity keywords (last occurrence wins for multi-turn) + if "COMPLEX" in result_upper: + return ComplexityLevel.COMPLEX + elif "MODERATE" in result_upper: + return ComplexityLevel.MODERATE + elif "TRIVIAL" in result_upper: + return ComplexityLevel.TRIVIAL + elif "SIMPLE" in result_upper: + return ComplexityLevel.SIMPLE + else: + # Default to SIMPLE if unclear + return ComplexityLevel.SIMPLE diff --git a/src/config.py b/src/config.py index 29a546d..bca70b7 100644 --- a/src/config.py +++ b/src/config.py @@ -1,5 +1,7 @@ """ Configuration settings for Fireteam. + +Minimal configuration - most behavior comes from SDK defaults and CLAUDE.md. """ import os @@ -7,73 +9,24 @@ from dotenv import load_dotenv # Load environment variables from .env file -# Look in repo root (parent of src directory) env_file = Path(__file__).parent.parent / ".env" if env_file.exists(): load_dotenv(env_file) -# System paths - configurable via FIRETEAM_DIR environment variable -# Defaults to /home/claude/fireteam for standalone mode -# Can be set to /app for containerized environments (e.g., terminal-bench) -SYSTEM_DIR = os.getenv("FIRETEAM_DIR", "/home/claude/fireteam") -STATE_DIR = os.path.join(SYSTEM_DIR, "state") -LOGS_DIR = os.path.join(SYSTEM_DIR, "logs") -CLI_DIR = os.path.join(SYSTEM_DIR, "cli") - # Claude Agent SDK configuration -# Note: API key is lazy-loaded to allow --help and other non-API operations -def get_anthropic_api_key(): - """Get ANTHROPIC_API_KEY, raising error only when actually needed.""" - api_key = os.getenv("ANTHROPIC_API_KEY") - if not api_key: - raise ValueError( - "ANTHROPIC_API_KEY environment variable must be set. " - "Set it in your environment or in .env file." 
- ) - return api_key - -# SDK options +SDK_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-opus-4-5-20251101") SDK_ALLOWED_TOOLS = ["Read", "Write", "Bash", "Edit", "Grep", "Glob"] -# Autonomous operation SDK_PERMISSION_MODE = "bypassPermissions" -# Using latest claude sonnet 4.5 -SDK_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-5-20250929") - -# Agent configuration -MAX_RETRIES = 3 -RETRY_DELAY = 5 # seconds +SDK_SETTING_SOURCES = ["project"] # Auto-load CLAUDE.md -# Agent timeouts (in seconds) -# Can be overridden via FIRETEAM_AGENT_TIMEOUT_* env vars (e.g., FIRETEAM_AGENT_TIMEOUT_PLANNER=120) -# Shorter timeouts in CI to fail fast instead of hanging -DEFAULT_TIMEOUT = int(os.getenv("FIRETEAM_DEFAULT_TIMEOUT", "600")) # 10 minutes default -AGENT_TIMEOUTS = { - "planner": int(os.getenv("FIRETEAM_AGENT_TIMEOUT_PLANNER", DEFAULT_TIMEOUT)), - "reviewer": int(os.getenv("FIRETEAM_AGENT_TIMEOUT_REVIEWER", DEFAULT_TIMEOUT)), - "executor": int(os.getenv("FIRETEAM_AGENT_TIMEOUT_EXECUTOR", str(DEFAULT_TIMEOUT * 3))) # 30 min default -} +# Completion validation +COMPLETION_THRESHOLD = 95 # percentage required +VALIDATION_CHECKS_REQUIRED = 3 # consecutive reviews needed -# Completion thresholds -COMPLETION_THRESHOLD = 95 # percentage -VALIDATION_CHECKS_REQUIRED = 3 # consecutive checks needed - -# Git configuration -GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "fireteam") -GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "fireteam@darkresearch.ai") +# Loop configuration +# None = infinite iterations (default), set via FIRETEAM_MAX_ITERATIONS env var +_max_iter = os.getenv("FIRETEAM_MAX_ITERATIONS") +MAX_ITERATIONS: int | None = int(_max_iter) if _max_iter else None # Logging -LOG_LEVEL = os.getenv("LOG_LEVEL", os.getenv("FIRETEAM_LOG_LEVEL", "INFO")).upper() -LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - -# Sudo password for system operations (optional) -# Set in .env file: SUDO_PASSWORD=your_password_here -SUDO_PASSWORD = os.getenv("SUDO_PASSWORD", None) - -# Memory configuration -MEMORY_DIR = os.path.join(SYSTEM_DIR, "memory") -MEMORY_EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B" -MEMORY_SEARCH_LIMIT = 10 # How many memories to retrieve per query - -def has_sudo_access(): - """Check if sudo password is available.""" - return SUDO_PASSWORD is not None +LOG_LEVEL = os.getenv("FIRETEAM_LOG_LEVEL", "INFO").upper() diff --git a/src/hooks.py b/src/hooks.py new file mode 100644 index 0000000..b785baf --- /dev/null +++ b/src/hooks.py @@ -0,0 +1,299 @@ +""" +SDK Hooks for automatic quality enforcement. + +Provides PostToolUse hooks that run tests after code changes, +giving Claude immediate feedback when tests fail. +""" + +import asyncio +import logging +import subprocess +from pathlib import Path +from typing import Any + +from claude_agent_sdk import HookMatcher + + +# Default test commands to try (in order of preference) +DEFAULT_TEST_COMMANDS = [ + ["pytest", "-x", "--tb=short"], # Python + ["npm", "test"], # Node.js + ["cargo", "test"], # Rust + ["go", "test", "./..."], # Go + ["make", "test"], # Makefile-based +] + + +def detect_test_command(project_dir: Path) -> list[str] | None: + """ + Detect the appropriate test command for a project. + Returns None if no test framework is detected. 
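+
+    Detection is ordered rather than exhaustive: for example, a project that
+    contains both pyproject.toml and package.json resolves to the pytest
+    command, because the Python markers are checked first.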
+ """ + # Check for Python (pytest/pyproject.toml) + if (project_dir / "pytest.ini").exists() or \ + (project_dir / "pyproject.toml").exists() or \ + (project_dir / "setup.py").exists() or \ + (project_dir / "tests").is_dir(): + return ["pytest", "-x", "--tb=short"] + + # Check for Node.js + if (project_dir / "package.json").exists(): + return ["npm", "test"] + + # Check for Rust + if (project_dir / "Cargo.toml").exists(): + return ["cargo", "test"] + + # Check for Go + if (project_dir / "go.mod").exists(): + return ["go", "test", "./..."] + + # Check for Makefile with test target + makefile = project_dir / "Makefile" + if makefile.exists(): + content = makefile.read_text() + if "test:" in content: + return ["make", "test"] + + return None + + +def run_tests_sync(project_dir: Path, test_command: list[str], timeout: int = 120) -> tuple[bool, str]: + """ + Run tests synchronously and return (success, output). + """ + try: + result = subprocess.run( + test_command, + cwd=project_dir, + capture_output=True, + text=True, + timeout=timeout, + ) + + output = result.stdout + result.stderr + success = result.returncode == 0 + + return success, output + + except subprocess.TimeoutExpired: + return False, f"Tests timed out after {timeout}s" + except FileNotFoundError: + return False, f"Test command not found: {test_command[0]}" + except Exception as e: + return False, f"Error running tests: {e}" + + +async def run_tests_after_edit(input_data: dict, tool_use_id: str | None, context: Any) -> dict: + """ + PostToolUse hook: Run tests after any Edit/Write operation. + + Provides feedback to Claude if tests fail, allowing immediate correction. + """ + # Only process PostToolUse events + if input_data.get("hook_event_name") != "PostToolUse": + return {} + + tool_name = input_data.get("tool_name", "") + + # Only run for file modification tools + if tool_name not in ("Edit", "Write"): + return {} + + # Get project directory from context + cwd = input_data.get("cwd", "") + if not cwd: + return {} + + project_dir = Path(cwd) + + # Detect test command + test_command = detect_test_command(project_dir) + if not test_command: + # No test framework detected - skip + return {} + + # Get the file that was modified + tool_input = input_data.get("tool_input", {}) + modified_file = tool_input.get("file_path", "unknown") + + # Run tests + success, output = await asyncio.to_thread( + run_tests_sync, project_dir, test_command + ) + + if success: + # Tests passed - no feedback needed + return {} + + # Tests failed - provide feedback to Claude + # Truncate output if too long + max_output_len = 2000 + if len(output) > max_output_len: + output = output[:max_output_len] + "\n... (output truncated)" + + feedback = f"""Tests failed after editing {modified_file}. + +Command: {' '.join(test_command)} + +Output: +{output} + +Please fix the failing tests before continuing.""" + + return { + "hookSpecificOutput": { + "hookEventName": input_data["hook_event_name"], + "additionalContext": feedback, + } + } + + +async def log_tool_usage(input_data: dict, tool_use_id: str | None, context: Any) -> dict: + """ + PostToolUse hook: Log all tool usage for debugging/auditing. 
+ """ + if input_data.get("hook_event_name") != "PostToolUse": + return {} + + tool_name = input_data.get("tool_name", "") + tool_input = input_data.get("tool_input", {}) + + logger = logging.getLogger("fireteam.hooks") + logger.debug(f"Tool used: {tool_name}, input: {tool_input}") + + return {} + + +async def block_user_questions(input_data: dict, tool_use_id: str | None, context: Any) -> dict: + """ + PreToolUse hook: Block AskUserQuestion in autonomous mode. + + Fireteam runs autonomously without user interaction. If Claude tries to + ask a clarifying question, we deny it and tell Claude to proceed with + its best judgment. + + This is a belt+suspenders approach - AskUserQuestion should also not be + in allowed_tools, but this hook catches it if it somehow gets through. + """ + if input_data.get("hook_event_name") != "PreToolUse": + return {} + + tool_name = input_data.get("tool_name", "") + + if tool_name == "AskUserQuestion": + return { + "hookSpecificOutput": { + "hookEventName": input_data["hook_event_name"], + "permissionDecision": "deny", + "permissionDecisionReason": ( + "This is an autonomous execution - no user is available to answer questions. " + "Proceed with your best judgment based on the available context. " + "Make reasonable assumptions and document them in your work." + ), + } + } + + return {} + + +def create_test_hooks( + test_command: list[str] | None = None, + test_timeout: int = 120, +) -> dict[str, list]: + """ + Create hook configuration for automatic test running. + + Args: + test_command: Explicit test command to use (auto-detected if None) + test_timeout: Timeout in seconds for test execution + + Returns: + Hook configuration dict to pass to ClaudeAgentOptions + """ + + async def test_hook(input_data: dict, tool_use_id: str | None, context: Any) -> dict: + """Custom test hook with configured command and timeout.""" + if input_data.get("hook_event_name") != "PostToolUse": + return {} + + tool_name = input_data.get("tool_name", "") + if tool_name not in ("Edit", "Write"): + return {} + + cwd = input_data.get("cwd", "") + if not cwd: + return {} + + project_dir = Path(cwd) + + # Use configured command or auto-detect + cmd = test_command or detect_test_command(project_dir) + if not cmd: + return {} + + tool_input = input_data.get("tool_input", {}) + modified_file = tool_input.get("file_path", "unknown") + + success, output = await asyncio.to_thread( + run_tests_sync, project_dir, cmd, test_timeout + ) + + if success: + return {} + + max_output_len = 2000 + if len(output) > max_output_len: + output = output[:max_output_len] + "\n... (output truncated)" + + feedback = f"""Tests failed after editing {modified_file}. 
+ +Command: {' '.join(cmd)} + +Output: +{output} + +Please fix the failing tests before continuing.""" + + return { + "hookSpecificOutput": { + "hookEventName": input_data["hook_event_name"], + "additionalContext": feedback, + } + } + + return { + "PreToolUse": [ + # Block AskUserQuestion in autonomous mode + HookMatcher(matcher="AskUserQuestion", hooks=[block_user_questions]) + ], + "PostToolUse": [ + HookMatcher(matcher="Edit|Write", hooks=[test_hook]) + ] + } + + +# Pre-configured hook sets for common use cases +QUALITY_HOOKS = { + "PreToolUse": [ + # Block AskUserQuestion in autonomous mode (belt+suspenders) + HookMatcher(matcher="AskUserQuestion", hooks=[block_user_questions]) + ], + "PostToolUse": [ + # Run tests after code changes + HookMatcher(matcher="Edit|Write", hooks=[run_tests_after_edit]) + ] +} + +AUTONOMOUS_HOOKS = { + "PreToolUse": [ + # Block AskUserQuestion in autonomous mode + HookMatcher(matcher="AskUserQuestion", hooks=[block_user_questions]) + ], +} + +DEBUG_HOOKS = { + "PostToolUse": [ + HookMatcher(hooks=[log_tool_usage]) + ] +} diff --git a/src/loops.py b/src/loops.py new file mode 100644 index 0000000..f552592 --- /dev/null +++ b/src/loops.py @@ -0,0 +1,435 @@ +""" +Execution implementations for fireteam. + +SINGLE_TURN: direct SDK call, no loop +MODERATE: execute → review loop until complete +FULL: plan → execute → parallel reviews loop until complete +""" + +import asyncio +import itertools +import logging +from pathlib import Path + +from claude_agent_sdk import query, ClaudeAgentOptions + +from . import config +from .models import ( + ExecutionMode, + ExecutionResult, + IterationState, + LoopConfig, + PhaseType, + ReviewResult, +) +from .prompts.builder import build_prompt + + +# Tool permission sets per phase +PLAN_TOOLS = ["Glob", "Grep", "Read"] +EXECUTE_TOOLS = ["Read", "Write", "Edit", "Bash", "Glob", "Grep"] +REVIEW_TOOLS = ["Read", "Glob", "Grep", "Bash"] + + +async def single_turn( + project_dir: Path, + goal: str, + context: str = "", + hooks: dict | None = None, + log: logging.Logger | None = None, +) -> ExecutionResult: + """ + SINGLE_TURN mode: direct SDK call, no loop. + + For trivial and simple tasks that don't need iteration. + """ + log = log or logging.getLogger("fireteam") + log.info("SINGLE_TURN: Direct SDK call") + + prompt = build_prompt( + phase=PhaseType.EXECUTE, + goal=goal, + context=context, + ) + + options = ClaudeAgentOptions( + allowed_tools=EXECUTE_TOOLS, + permission_mode=config.SDK_PERMISSION_MODE, + model=config.SDK_MODEL, + cwd=str(project_dir), + setting_sources=config.SDK_SETTING_SOURCES, + hooks=hooks, + max_turns=10, # Limit for trivial tasks + ) + + try: + result_text = "" + async for message in query(prompt=prompt, options=options): + if hasattr(message, "result"): + result_text = message.result + elif hasattr(message, "content"): + if isinstance(message.content, str): + result_text += message.content + elif isinstance(message.content, list): + for block in message.content: + if hasattr(block, "text"): + result_text += block.text + + return ExecutionResult( + success=True, + mode=ExecutionMode.SINGLE_TURN, + output=result_text, + completion_percentage=100, + iterations=1, + ) + except Exception as e: + log.error(f"Single turn failed: {e}") + return ExecutionResult(success=False, mode=ExecutionMode.SINGLE_TURN, error=str(e)) + + +async def run_phase( + phase: PhaseType, + prompt: str, + project_dir: Path, + hooks: dict | None = None, +) -> str: + """ + Run a single SDK query for a phase. 
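+
+    Called by the looping modes; run_single_review below, for instance, calls
+    run_phase(PhaseType.REVIEW, prompt, project_dir) and parses the output.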
+ + Each phase gets appropriate tool permissions: + - PLAN: read-only (Glob, Grep, Read) + - EXECUTE: full access + hooks + - REVIEW: read-only + Bash for tests + """ + if phase == PhaseType.PLAN: + tools = PLAN_TOOLS + permission_mode = "plan" + phase_hooks = None + elif phase == PhaseType.EXECUTE: + tools = EXECUTE_TOOLS + permission_mode = config.SDK_PERMISSION_MODE + phase_hooks = hooks + elif phase == PhaseType.REVIEW: + tools = REVIEW_TOOLS + permission_mode = "plan" + phase_hooks = None + else: + raise ValueError(f"Unknown phase: {phase}") + + options = ClaudeAgentOptions( + allowed_tools=tools, + permission_mode=permission_mode, + model=config.SDK_MODEL, + cwd=str(project_dir), + setting_sources=config.SDK_SETTING_SOURCES, + hooks=phase_hooks, + ) + + result_text = "" + async for message in query(prompt=prompt, options=options): + if hasattr(message, "result"): + result_text = message.result + elif hasattr(message, "content"): + if isinstance(message.content, str): + result_text += message.content + elif isinstance(message.content, list): + for block in message.content: + if hasattr(block, "text"): + result_text += block.text + + return result_text + + +async def run_single_review( + goal: str, + state: IterationState, + project_dir: Path, + reviewer_id: int = 1, + threshold: int = 95, +) -> ReviewResult: + """Run a single reviewer and return structured result.""" + prompt = build_prompt( + phase=PhaseType.REVIEW, + goal=goal, + execution_output=state.execution_output, + plan=state.plan, + previous_feedback=state.accumulated_feedback if state.iteration > 1 else None, + reviewer_id=reviewer_id, + iteration=state.iteration, + ) + + output = await run_phase(PhaseType.REVIEW, prompt, project_dir) + return ReviewResult.from_output(output, threshold=threshold) + + +async def run_parallel_reviews( + goal: str, + state: IterationState, + project_dir: Path, + num_reviewers: int = 3, + threshold: int = 95, + log: logging.Logger | None = None, +) -> list[ReviewResult]: + """ + Run multiple reviewers in parallel using asyncio.gather(). + + Returns list of ReviewResults, handling any exceptions gracefully. + """ + log = log or logging.getLogger("fireteam") + + tasks = [ + run_single_review(goal, state, project_dir, reviewer_id=i + 1, threshold=threshold) + for i in range(num_reviewers) + ] + + results = await asyncio.gather(*tasks, return_exceptions=True) + + processed: list[ReviewResult] = [] + for i, result in enumerate(results): + if isinstance(result, Exception): + log.warning(f"Reviewer {i + 1} failed: {result}") + processed.append( + ReviewResult( + completion_percentage=0, + feedback=f"Review failed: {result}", + issues=["Reviewer encountered an error"], + passed=False, + ) + ) + else: + processed.append(result) + + return processed + + +def check_completion(reviews: list[ReviewResult], cfg: LoopConfig) -> bool: + """Check if completion criteria is met (majority must pass).""" + passing = sum(1 for r in reviews if r.passed) + return passing >= cfg.majority_required + + +async def moderate_loop( + project_dir: Path, + goal: str, + context: str = "", + hooks: dict | None = None, + cfg: LoopConfig | None = None, + log: logging.Logger | None = None, +) -> ExecutionResult: + """ + MODERATE mode: execute → review loop until complete. + + Loop continues until: + 1. Single reviewer says >= threshold, OR + 2. Max iterations reached (if set) + + Feedback from each review flows to the next execution. 
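+
+    A sketch of the flow, assuming the usual 95% threshold: a review scoring
+    97% ends the loop on that iteration, while an 88% review only adds its
+    feedback to the next execute pass (exact pass/fail is decided by
+    ReviewResult.from_output).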
+ """ + cfg = cfg or LoopConfig(parallel_reviewers=1, majority_required=1) + log = log or logging.getLogger("fireteam") + state = IterationState() + + # Use infinite counter if max_iterations is None, otherwise bounded range + counter = itertools.count(1) if cfg.max_iterations is None else range(1, cfg.max_iterations + 1) + max_display = "∞" if cfg.max_iterations is None else cfg.max_iterations + + for iteration in counter: + state.iteration = iteration + log.info(f"MODERATE iteration {iteration}/{max_display}") + + # === EXECUTE === + exec_prompt = build_prompt( + phase=PhaseType.EXECUTE, + goal=goal, + context=context, + previous_feedback=state.accumulated_feedback if iteration > 1 else None, + ) + + try: + state.execution_output = await run_phase( + PhaseType.EXECUTE, exec_prompt, project_dir, hooks=hooks + ) + log.info(f"Execution complete (iteration {iteration})") + except Exception as e: + log.error(f"Execution failed: {e}") + return ExecutionResult( + success=False, + mode=ExecutionMode.MODERATE, + error=f"Execution failed on iteration {iteration}: {e}", + iterations=iteration, + ) + + # === REVIEW === + try: + review = await run_single_review( + goal, state, project_dir, threshold=cfg.completion_threshold + ) + state.add_review([review]) + log.info(f"Review: {review.completion_percentage}% {'PASS' if review.passed else 'FAIL'}") + except Exception as e: + log.warning(f"Review failed: {e}") + continue + + # === CHECK COMPLETION === + if check_completion([review], cfg): + log.info(f"Completion threshold met at iteration {iteration}") + return ExecutionResult( + success=True, + mode=ExecutionMode.MODERATE, + output=state.execution_output, + completion_percentage=review.completion_percentage, + iterations=iteration, + metadata={"review_history": state.review_history}, + ) + + # Max iterations reached (only reachable if max_iterations is set) + last_completion = 0 + if state.review_history: + last_reviews = state.review_history[-1].get("reviews", []) + if last_reviews: + last_completion = last_reviews[0].get("completion", 0) + + return ExecutionResult( + success=False, + mode=ExecutionMode.MODERATE, + output=state.execution_output, + error=f"Did not reach {cfg.completion_threshold}% after {cfg.max_iterations} iterations", + completion_percentage=last_completion, + iterations=cfg.max_iterations or state.iteration, + metadata={"review_history": state.review_history}, + ) + + +async def full_loop( + project_dir: Path, + goal: str, + context: str = "", + hooks: dict | None = None, + cfg: LoopConfig | None = None, + log: logging.Logger | None = None, +) -> ExecutionResult: + """ + FULL mode: plan → execute → parallel reviews loop until complete. + + Loop continues until: + 1. Majority (2 of 3) reviewers say >= threshold, OR + 2. Max iterations reached (if set) + + Plan is created once, then execute-review loops with feedback. 
+ """ + cfg = cfg or LoopConfig(parallel_reviewers=3, majority_required=2) + log = log or logging.getLogger("fireteam") + state = IterationState() + + # === PLAN (once at start) === + log.info("FULL mode: Planning phase") + plan_prompt = build_prompt( + phase=PhaseType.PLAN, + goal=goal, + context=context, + ) + + try: + state.plan = await run_phase(PhaseType.PLAN, plan_prompt, project_dir) + log.info("Planning complete") + except Exception as e: + log.error(f"Planning failed: {e}") + return ExecutionResult( + success=False, + mode=ExecutionMode.FULL, + error=f"Planning failed: {e}", + ) + + # === EXECUTE-REVIEW LOOP === + # Use infinite counter if max_iterations is None, otherwise bounded range + counter = itertools.count(1) if cfg.max_iterations is None else range(1, cfg.max_iterations + 1) + max_display = "∞" if cfg.max_iterations is None else cfg.max_iterations + + for iteration in counter: + state.iteration = iteration + log.info(f"FULL iteration {iteration}/{max_display}") + + # === EXECUTE === + exec_prompt = build_prompt( + phase=PhaseType.EXECUTE, + goal=goal, + context=context, + plan=state.plan, + previous_feedback=state.accumulated_feedback if iteration > 1 else None, + ) + + try: + state.execution_output = await run_phase( + PhaseType.EXECUTE, exec_prompt, project_dir, hooks=hooks + ) + log.info(f"Execution complete (iteration {iteration})") + except Exception as e: + log.error(f"Execution failed: {e}") + return ExecutionResult( + success=False, + mode=ExecutionMode.FULL, + error=f"Execution failed on iteration {iteration}: {e}", + iterations=iteration, + metadata={"plan": state.plan}, + ) + + # === PARALLEL REVIEWS === + log.info(f"Running {cfg.parallel_reviewers} parallel reviewers") + try: + reviews = await run_parallel_reviews( + goal, + state, + project_dir, + num_reviewers=cfg.parallel_reviewers, + threshold=cfg.completion_threshold, + log=log, + ) + state.add_review(reviews) + + for i, r in enumerate(reviews, 1): + log.info(f" Reviewer {i}: {r.completion_percentage}% {'PASS' if r.passed else 'FAIL'}") + except Exception as e: + log.warning(f"Review phase failed: {e}") + continue + + # === CHECK MAJORITY COMPLETION === + passing = sum(1 for r in reviews if r.passed) + avg_completion = sum(r.completion_percentage for r in reviews) // len(reviews) + + if check_completion(reviews, cfg): + log.info(f"Majority completion ({passing}/{len(reviews)}) at iteration {iteration}") + return ExecutionResult( + success=True, + mode=ExecutionMode.FULL, + output=state.execution_output, + completion_percentage=avg_completion, + iterations=iteration, + metadata={ + "plan": state.plan, + "review_history": state.review_history, + "final_reviews": [ + {"reviewer": i + 1, "completion": r.completion_percentage, "passed": r.passed} + for i, r in enumerate(reviews) + ], + }, + ) + + # Max iterations reached (only reachable if max_iterations is set) + avg_completion = 0 + if state.review_history: + last_reviews = state.review_history[-1].get("reviews", []) + if last_reviews: + avg_completion = sum(r.get("completion", 0) for r in last_reviews) // len(last_reviews) + + return ExecutionResult( + success=False, + mode=ExecutionMode.FULL, + output=state.execution_output, + error=f"Did not achieve majority completion after {cfg.max_iterations} iterations", + completion_percentage=avg_completion, + iterations=cfg.max_iterations or state.iteration, + metadata={ + "plan": state.plan, + "review_history": state.review_history, + }, + ) diff --git a/src/memory/__init__.py b/src/memory/__init__.py deleted file 
mode 100644 index 7878ee4..0000000 --- a/src/memory/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Memory management module for Fireteam.""" - -from .manager import MemoryManager - -__all__ = ["MemoryManager"] - diff --git a/src/memory/manager.py b/src/memory/manager.py deleted file mode 100644 index f2bf424..0000000 --- a/src/memory/manager.py +++ /dev/null @@ -1,245 +0,0 @@ -"""Memory manager with semantic search and observability.""" - -import chromadb -from transformers import AutoModel, AutoTokenizer -from sentence_transformers import SentenceTransformer -import torch -import hashlib -import logging -import time -import uuid -from typing import Any, Optional -from functools import lru_cache - - -class MemoryManager: - """Manages trace memory with automatic semantic search and observability.""" - - def __init__(self, memory_dir: str = None, logger: logging.Logger = None, - embedding_model: str = None): - """Initialize with embeddings and Chroma storage. - - Args: - memory_dir: Directory for memory storage - logger: Logger instance - embedding_model: HuggingFace model name for embeddings - (defaults to config.MEMORY_EMBEDDING_MODEL) - """ - self.logger = logger or logging.getLogger("memory") - - if memory_dir is None: - import config - memory_dir = config.MEMORY_DIR - - self.logger.info("[MEMORY] Initializing MemoryManager...") - - # Initialize Chroma with persistent storage - self.chroma_client = chromadb.PersistentClient(path=memory_dir) - self.logger.info(f"[MEMORY] Chroma initialized at {memory_dir}") - - # Load embedding model - if embedding_model is None: - import config - embedding_model = config.MEMORY_EMBEDDING_MODEL - - self.embedding_model_name = embedding_model - self.logger.info(f"[MEMORY] Loading model {embedding_model}...") - start_time = time.time() - - # Use sentence-transformers for lightweight models, - # otherwise use transformers library for Qwen3 - if 'sentence-transformers' in embedding_model or 'all-MiniLM' in embedding_model: - # Lightweight model - use sentence-transformers API - self.model = SentenceTransformer(embedding_model) - self.tokenizer = self.model.tokenizer - self.use_sentence_transformers = True - else: - # Qwen3 or other transformers model - self.tokenizer = AutoTokenizer.from_pretrained(embedding_model) - self.model = AutoModel.from_pretrained(embedding_model) - self.use_sentence_transformers = False - - # Use Metal/MPS acceleration on Mac (with CPU fallback) - if torch.backends.mps.is_available(): - self.model = self.model.to("mps") - self.logger.info("[MEMORY] Using Metal/MPS acceleration") - else: - self.logger.info("[MEMORY] Using CPU (MPS not available)") - - load_time = time.time() - start_time - self.logger.info(f"[MEMORY] Model loaded in {load_time:.2f}s") - - self.current_collection = None - - @lru_cache(maxsize=100) - def _get_embeddings_cached(self, text_tuple: tuple) -> tuple: - """Cached embedding generation (uses tuple for hashability).""" - texts = list(text_tuple) - return tuple(self._get_embeddings_impl(texts)) - - def _get_embeddings_impl(self, texts: list[str]) -> list[list[float]]: - """Generate embeddings using configured model.""" - if self.use_sentence_transformers: - # Use sentence-transformers API (simpler) - embeddings = self.model.encode(texts, convert_to_numpy=True) - return embeddings.tolist() - else: - # Use transformers API for Qwen3 - # Tokenize - inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt") - - # Move to MPS if available - if torch.backends.mps.is_available(): - inputs = {k: 
v.to("mps") for k, v in inputs.items()} - - # Generate embeddings - with torch.no_grad(): - outputs = self.model(**inputs) - - # Mean pooling - embeddings = outputs.last_hidden_state.mean(dim=1) - - # Normalize - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - return embeddings.cpu().tolist() - - def _get_embeddings(self, texts: list[str]) -> list[list[float]]: - """Get embeddings with caching.""" - # Use cache for single text queries (common case) - if len(texts) == 1: - return list(self._get_embeddings_cached((texts[0],))) - # Batch queries don't use cache - return self._get_embeddings_impl(texts) - - def _get_collection_name(self, project_dir: str) -> str: - """Generate collection name from project directory.""" - return hashlib.md5(project_dir.encode()).hexdigest()[:16] - - def initialize_project(self, project_dir: str, goal: str): - """Initialize memory for a new project.""" - collection_name = self._get_collection_name(project_dir) - self.logger.info(f"[MEMORY] Initializing project collection: {collection_name}") - - # Get or create collection - self.current_collection = self.chroma_client.get_or_create_collection( - name=collection_name, - metadata={"project_dir": project_dir, "goal": goal} - ) - - # Log existing memory count - count = self.current_collection.count() - self.logger.info(f"[MEMORY] Project initialized with {count} existing memories") - - def add_memory(self, content: str, memory_type: str, cycle: int, metadata: dict = None): - """ - Add a memory (unified method for all types). - - Args: - content: The memory content (text) - memory_type: Type (trace, failed_approach, decision, learning, code_location) - cycle: Cycle number when this was recorded - metadata: Optional additional metadata - """ - if not self.current_collection: - raise ValueError("Project not initialized. Call initialize_project first.") - - self.logger.debug(f"[MEMORY] Adding {memory_type} from cycle {cycle}: {content[:80]}...") - - start_time = time.time() - - # Generate embedding - embedding = self._get_embeddings([content])[0] - - # Prepare metadata - mem_metadata = { - "type": memory_type, - "cycle": cycle, - **(metadata or {}) - } - - # Generate ID - mem_id = str(uuid.uuid4()) - - # Add to collection - self.current_collection.add( - ids=[mem_id], - embeddings=[embedding], - documents=[content], - metadatas=[mem_metadata] - ) - - elapsed = time.time() - start_time - self.logger.info(f"[MEMORY] Added {memory_type} in {elapsed:.2f}s") - - def search(self, query: str, limit: int = 10, memory_types: list[str] = None) -> list[dict]: - """ - Semantic search for relevant memories. - - Args: - query: Search query (will be embedded) - limit: Maximum results to return - memory_types: Filter by memory types (optional) - - Returns: - List of memory dicts with 'content', 'type', 'cycle', etc. 
- """ - if not self.current_collection: - return [] - - self.logger.info(f"[MEMORY] Searching: {query[:100]}...") - start_time = time.time() - - # Generate query embedding (cached) - query_embedding = self._get_embeddings([query])[0] - - # Build where clause for type filtering - where = None - if memory_types: - where = {"type": {"$in": memory_types}} - self.logger.debug(f"[MEMORY] Filtering by types: {memory_types}") - - # Search - results = self.current_collection.query( - query_embeddings=[query_embedding], - n_results=limit, - where=where - ) - - # Format results - memories = [] - if results['documents'] and results['documents'][0]: - for i, doc in enumerate(results['documents'][0]): - memories.append({ - "content": doc, - "type": results['metadatas'][0][i].get('type', 'unknown'), - "cycle": results['metadatas'][0][i].get('cycle', 0), - "distance": results['distances'][0][i] if 'distances' in results else None - }) - - elapsed = time.time() - start_time - self.logger.info(f"[MEMORY] Found {len(memories)} memories in {elapsed:.2f}s") - - # Log top results if debug enabled - if self.logger.level <= logging.DEBUG: - for i, mem in enumerate(memories[:3]): # Top 3 - self.logger.debug(f"[MEMORY] {i+1}. [{mem['type']}] {mem['content'][:60]}...") - - return memories - - def clear_project_memory(self, project_dir: str): - """Clear all memory for a project (with confirmation logging).""" - collection_name = self._get_collection_name(project_dir) - - try: - # Get count before deleting - collection = self.chroma_client.get_collection(name=collection_name) - count = collection.count() - - self.logger.info(f"[MEMORY] Deleting collection {collection_name} ({count} memories)...") - self.chroma_client.delete_collection(name=collection_name) - self.logger.info(f"[MEMORY] Successfully deleted {count} memories") - - except Exception as e: - self.logger.warning(f"[MEMORY] Could not delete collection: {e}") - diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..9aac9ba --- /dev/null +++ b/src/models.py @@ -0,0 +1,137 @@ +""" +Data models for fireteam execution. 
+""" + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any + + +class ExecutionMode(Enum): + """Execution modes for fireteam tasks.""" + SINGLE_TURN = "single_turn" # Direct Opus call, no loop + MODERATE = "moderate" # Execute + review loop + FULL = "full" # Plan + execute + parallel reviews loop + + +class PhaseType(Enum): + """Phase types within execution.""" + PLAN = "plan" + EXECUTE = "execute" + REVIEW = "review" + + +@dataclass +class ReviewResult: + """Result from a single reviewer.""" + completion_percentage: int + feedback: str + issues: list[str] = field(default_factory=list) + passed: bool = False + + @classmethod + def from_output(cls, output: str, threshold: int = 95) -> "ReviewResult": + """Parse a ReviewResult from reviewer output.""" + completion = _extract_completion(output) + issues = _extract_issues(output) + return cls( + completion_percentage=completion, + feedback=output, + issues=issues, + passed=completion >= threshold, + ) + + +@dataclass +class IterationState: + """State tracked across loop iterations.""" + iteration: int = 0 + plan: str | None = None + execution_output: str | None = None + review_history: list[dict[str, Any]] = field(default_factory=list) + accumulated_feedback: str = "" + + def add_review(self, reviews: list[ReviewResult]) -> None: + """Add reviews from an iteration and update accumulated feedback.""" + self.review_history.append({ + "iteration": self.iteration, + "reviews": [ + { + "completion": r.completion_percentage, + "passed": r.passed, + "issues": r.issues, + } + for r in reviews + ], + }) + self.accumulated_feedback = self._aggregate_feedback(reviews) + + def _aggregate_feedback(self, reviews: list[ReviewResult]) -> str: + """Aggregate feedback from reviewers into actionable format.""" + if not reviews: + return "" + + parts = [] + for i, review in enumerate(reviews, 1): + if len(reviews) > 1: + parts.append(f"Reviewer {i} ({review.completion_percentage}%):") + parts.append(review.feedback[:1500]) # Truncate long feedback + if review.issues: + parts.append("Issues found:") + for issue in review.issues[:5]: # Limit issues + parts.append(f" - {issue}") + parts.append("") + + return "\n".join(parts) + + +@dataclass +class LoopConfig: + """Configuration for execution loops.""" + max_iterations: int | None = None # None = infinite (default) + completion_threshold: int = 95 + parallel_reviewers: int = 1 # 1 for MODERATE, 3 for FULL + majority_required: int = 1 # 1 for MODERATE, 2 for FULL + + +@dataclass +class ExecutionResult: + """Result of a fireteam execution.""" + success: bool + mode: ExecutionMode + output: str | None = None + error: str | None = None + completion_percentage: int = 0 + iterations: int = 0 + metadata: dict[str, Any] = field(default_factory=dict) + + +def _extract_completion(text: str) -> int: + """Extract completion percentage from review output.""" + import re + # Look for COMPLETION: XX% pattern + match = re.search(r'COMPLETION:\s*(\d+)%', text, re.IGNORECASE) + if match: + return int(match.group(1)) + # Fallback: find any percentage + match = re.search(r'(\d+)%', text) + return int(match.group(1)) if match else 50 + + +def _extract_issues(text: str) -> list[str]: + """Extract issues list from review output.""" + import re + issues = [] + + # Look for ISSUES: section + issues_match = re.search(r'ISSUES:\s*\n((?:[-*]\s*.+\n?)+)', text, re.IGNORECASE) + if issues_match: + issues_text = issues_match.group(1) + for line in issues_text.split('\n'): + line = line.strip() + if 
line.startswith(('-', '*')): + issue = line.lstrip('-* ').strip() + if issue: + issues.append(issue) + + return issues diff --git a/src/orchestrator.py b/src/orchestrator.py deleted file mode 100755 index 38234c3..0000000 --- a/src/orchestrator.py +++ /dev/null @@ -1,467 +0,0 @@ -#!/usr/bin/env python3 -""" -Main orchestrator for Fireteam. -Manages infinite cycles of planning → execution → review. -""" - -import os -import sys -import logging -import subprocess -import signal -from datetime import datetime -from pathlib import Path - -# Add system directory to path -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import config -from state.manager import StateManager -from memory.manager import MemoryManager -from agents import PlannerAgent, ExecutorAgent, ReviewerAgent - - -class Orchestrator: - """Main orchestrator managing the agent system lifecycle.""" - - def __init__(self, project_dir: str, goal: str, debug: bool = False, keep_memory: bool = False): - self.project_dir = os.path.abspath(project_dir) - self.goal = goal - self.debug = debug - self.keep_memory = keep_memory # Flag to preserve memory/state after completion - self.state_manager = StateManager() - - # Set up logging - self.setup_logging() - - # Initialize memory (pass logger for observability) - self.memory = MemoryManager(logger=self.logger) - - # Initialize agents WITH memory manager - self.planner = PlannerAgent(self.logger, memory_manager=self.memory) - self.executor = ExecutorAgent(self.logger, memory_manager=self.memory) - self.reviewer = ReviewerAgent(self.logger, memory_manager=self.memory) - - # Signal handling for graceful shutdown - signal.signal(signal.SIGINT, self._signal_handler) - signal.signal(signal.SIGTERM, self._signal_handler) - - self.running = True - - def setup_logging(self): - """Set up logging to file and console.""" - # Ensure logs directory exists - os.makedirs(config.LOGS_DIR, exist_ok=True) - - log_file = os.path.join( - config.LOGS_DIR, - f"orchestrator_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" - ) - - # Override log level if debug flag is set - log_level = "DEBUG" if self.debug else config.LOG_LEVEL - - logging.basicConfig( - level=getattr(logging, log_level), - format=config.LOG_FORMAT, - handlers=[ - logging.FileHandler(log_file), - logging.StreamHandler(sys.stdout) - ] - ) - - self.logger = logging.getLogger("orchestrator") - self.logger.info("=" * 80) - self.logger.info("Fireteam Starting") - self.logger.info(f"Project: {self.project_dir}") - self.logger.info(f"Goal: {self.goal}") - self.logger.info("=" * 80) - - def _signal_handler(self, signum, frame): - """Handle shutdown signals gracefully.""" - self.logger.info(f"Received signal {signum}, shutting down gracefully...") - self.running = False - - def initialize_git_repo(self) -> str: - """ - Initialize git repo if needed and create a new branch. - Works with both new and existing repositories. - Returns the branch name. 
- """ - try: - # Ensure project directory exists - os.makedirs(self.project_dir, exist_ok=True) - - # Check if .git exists - git_dir = os.path.join(self.project_dir, ".git") - repo_exists = os.path.exists(git_dir) - - if not repo_exists: - self.logger.info("Initializing new git repository") - subprocess.run( - ["git", "init"], - cwd=self.project_dir, - check=True, - capture_output=True - ) - else: - self.logger.info("Using existing git repository") - - # Set git config only if not already configured - try: - result = subprocess.run( - ["git", "config", "user.name"], - cwd=self.project_dir, - capture_output=True, - text=True - ) - if result.returncode != 0 or not result.stdout.strip(): - self.logger.info("Configuring git user.name") - subprocess.run( - ["git", "config", "user.name", config.GIT_USER_NAME], - cwd=self.project_dir, - check=True, - capture_output=True - ) - - result = subprocess.run( - ["git", "config", "user.email"], - cwd=self.project_dir, - capture_output=True, - text=True - ) - if result.returncode != 0 or not result.stdout.strip(): - self.logger.info("Configuring git user.email") - subprocess.run( - ["git", "config", "user.email", config.GIT_USER_EMAIL], - cwd=self.project_dir, - check=True, - capture_output=True - ) - except subprocess.CalledProcessError as e: - self.logger.warning(f"Could not configure git user: {e}") - # Continue anyway - git might work with global config - - # For new repos, create initial commit if no commits exist - if not repo_exists: - try: - # Check if there are any commits - subprocess.run( - ["git", "rev-parse", "HEAD"], - cwd=self.project_dir, - check=True, - capture_output=True - ) - except subprocess.CalledProcessError: - # No commits yet, create initial commit - self.logger.info("Creating initial commit") - subprocess.run( - ["git", "add", "."], - cwd=self.project_dir, - capture_output=True - ) - subprocess.run( - ["git", "commit", "-m", "Initial commit", "--allow-empty"], - cwd=self.project_dir, - check=True, - capture_output=True - ) - - # Create new branch with timestamp from current HEAD - branch_name = f"fireteam-{datetime.now().strftime('%Y%m%d-%H%M%S')}" - self.logger.info(f"Creating branch: {branch_name}") - - subprocess.run( - ["git", "checkout", "-b", branch_name], - cwd=self.project_dir, - check=True, - capture_output=True - ) - - # Initialize memory for project - self.memory.initialize_project(self.project_dir, self.goal) - - return branch_name - - except subprocess.CalledProcessError as e: - self.logger.error(f"Git initialization error: {e}") - raise - - def commit_changes(self, cycle_number: int, message_suffix: str = ""): - """Commit changes after each cycle.""" - try: - # Check if there are changes to commit - result = subprocess.run( - ["git", "status", "--porcelain"], - cwd=self.project_dir, - capture_output=True, - text=True, - check=True - ) - - if not result.stdout.strip(): - self.logger.info("No changes to commit") - return - - # Add all changes - subprocess.run( - ["git", "add", "."], - cwd=self.project_dir, - check=True, - capture_output=True - ) - - # Commit with descriptive message - commit_msg = f"Cycle {cycle_number}: {message_suffix}" - subprocess.run( - ["git", "commit", "-m", commit_msg], - cwd=self.project_dir, - check=True, - capture_output=True - ) - - self.logger.info(f"Committed changes: {commit_msg}") - - # Push to remote if it exists - self.push_to_remote() - - except subprocess.CalledProcessError as e: - self.logger.error(f"Git commit error: {e}") - # Don't raise - continue even if commit fails - - def 
push_to_remote(self): - """Push to remote origin if it exists.""" - try: - # Check if remote exists - result = subprocess.run( - ["git", "remote", "get-url", "origin"], - cwd=self.project_dir, - capture_output=True, - text=True - ) - - if result.returncode == 0: - self.logger.info("Pushing to remote origin") - subprocess.run( - ["git", "push", "-u", "origin", "HEAD"], - cwd=self.project_dir, - check=True, - capture_output=True - ) - self.logger.info("Successfully pushed to remote") - - except subprocess.CalledProcessError as e: - self.logger.warning(f"Could not push to remote: {e}") - # Don't raise - pushing is optional - - def run_cycle(self, state: dict) -> dict: - """ - Run a single plan → execute → review cycle. - Returns updated state. - """ - cycle_num = state.get("cycle_number", 0) - self.logger.info(f"\n{'=' * 80}") - self.logger.info(f"CYCLE {cycle_num} - Starting") - self.logger.info(f"{'=' * 80}\n") - - # Goal alignment check every 3 cycles - if cycle_num > 0 and cycle_num % 3 == 0: - self.logger.info(f"{'='*60}") - self.logger.info(f"GOAL ALIGNMENT CHECK (Cycle {cycle_num})") - self.logger.info(f"{'='*60}") - self.logger.info(f"Original Goal: {self.goal}") - self.logger.info(f"\n⚠️ Reminder: Ensure all work aligns with original goal!") - self.logger.info(f"{'='*60}\n") - - # PHASE 1: Planning - self.logger.info("PHASE 1: Planning") - self.state_manager.update_state({"status": "planning"}) - - planner_result = self.planner.execute( - project_dir=self.project_dir, - goal=self.goal, - cycle_number=cycle_num, - previous_plan=state.get("current_plan"), - last_execution_result=state.get("last_execution_result"), - last_review=state.get("last_review") - ) - - if not planner_result["success"]: - self.logger.error(f"Planning failed: {planner_result.get('error')}") - return state - - current_plan = planner_result["plan"] - self.logger.info("Planning completed") - - # PHASE 2: Execution - self.logger.info("\nPHASE 2: Execution") - self.state_manager.update_state({ - "status": "executing", - "current_plan": current_plan - }) - - executor_result = self.executor.execute( - project_dir=self.project_dir, - goal=self.goal, - plan=current_plan, - cycle_number=cycle_num - ) - - if not executor_result["success"]: - self.logger.error(f"Execution failed: {executor_result.get('error')}") - return state - - execution_result = executor_result["execution_result"] - self.logger.info("Execution completed") - - # Record execution trace in memory - self.memory.add_memory( - content=execution_result, - memory_type="trace", - cycle=cycle_num - ) - - # PHASE 3: Review - self.logger.info("\nPHASE 3: Review") - self.state_manager.update_state({ - "status": "reviewing", - "last_execution_result": execution_result - }) - - is_validation = state.get("completion_percentage", 0) >= config.COMPLETION_THRESHOLD - - reviewer_result = self.reviewer.execute( - project_dir=self.project_dir, - goal=self.goal, - plan=current_plan, - execution_result=execution_result, - cycle_number=cycle_num, - is_validation=is_validation - ) - - if not reviewer_result["success"]: - self.logger.error(f"Review failed: {reviewer_result.get('error')}") - return state - - review = reviewer_result["review"] - parsed_completion = reviewer_result["completion_percentage"] - - # Use StateManager's parse failure handling - completion_pct = self.state_manager.update_completion_percentage( - parsed_completion, - logger=self.logger - ) - - self.logger.info(f"Review completed - Completion: {completion_pct}%") - - # Extract and store learnings from 
reviewer - if "learnings" in reviewer_result: - for learning in reviewer_result["learnings"]: - self.memory.add_memory( - content=learning["content"], - memory_type=learning["type"], - cycle=cycle_num - ) - - # Update state (completion_percentage already set by update_completion_percentage) - updated_state = self.state_manager.update_state({ - "current_plan": current_plan, - "last_execution_result": execution_result, - "last_review": review - }) - - # Commit changes - self.commit_changes(cycle_num, f"{completion_pct}% complete") - - # Increment cycle counter - self.state_manager.increment_cycle() - - return updated_state - - def check_completion(self, state: dict) -> bool: - """ - Check if project is complete based on validation logic. - Requires 3 consecutive reviews with >95% completion. - """ - completion_pct = state.get("completion_percentage", 0) - validation_checks = state.get("validation_checks", 0) - - if completion_pct >= config.COMPLETION_THRESHOLD: - validation_checks += 1 - self.state_manager.update_state({"validation_checks": validation_checks}) - - self.logger.info(f"Validation check {validation_checks}/{config.VALIDATION_CHECKS_REQUIRED}") - - if validation_checks >= config.VALIDATION_CHECKS_REQUIRED: - self.logger.info("Project completed! All validation checks passed.") - return True - else: - # Reset validation checks if percentage drops - if validation_checks > 0: - self.logger.info("Completion percentage dropped, resetting validation checks") - self.state_manager.update_state({"validation_checks": 0}) - - return False - - def run(self): - """Main execution loop.""" - try: - # Initialize state - state = self.state_manager.initialize_project(self.project_dir, self.goal) - - # Initialize git - branch_name = self.initialize_git_repo() - self.state_manager.update_state({"git_branch": branch_name}) - - # Infinite loop until completion - while self.running: - state = self.run_cycle(state) - - if self.check_completion(state): - self.state_manager.mark_completed() - self.logger.info("\n" + "=" * 80) - self.logger.info("PROJECT COMPLETED SUCCESSFULLY") - self.logger.info("=" * 80) - - # Automatic cleanup (unless --keep-memory flag set) - if not self.keep_memory: - self.logger.info("Cleaning up project data...") - self.memory.clear_project_memory(self.project_dir) - self.state_manager.clear_state() - self.logger.info("Cleanup complete") - else: - self.logger.info("Debug mode: Memory and state preserved for analysis") - - break - - return 0 - - except Exception as e: - self.logger.error(f"Orchestrator error: {e}", exc_info=True) - return 1 - - -def main(): - """Entry point for orchestrator.""" - import argparse - - parser = argparse.ArgumentParser(description="Fireteam Orchestrator") - parser.add_argument("--project-dir", required=True, help="Project directory") - parser.add_argument("--goal", required=True, help="Project goal/prompt") - parser.add_argument("--debug", action="store_true", help="Enable debug logging") - parser.add_argument("--keep-memory", action="store_true", - help="Preserve memory and state after completion (for debugging)") - - args = parser.parse_args() - - orchestrator = Orchestrator( - args.project_dir, - args.goal, - debug=args.debug, - keep_memory=args.keep_memory - ) - sys.exit(orchestrator.run()) - - -if __name__ == "__main__": - main() diff --git a/src/prompts/__init__.py b/src/prompts/__init__.py new file mode 100644 index 0000000..a8a4cfd --- /dev/null +++ b/src/prompts/__init__.py @@ -0,0 +1,30 @@ +""" +Prompt loading utilities. 
+ +Loads prompts from markdown files in this directory. +""" + +from pathlib import Path + +_PROMPTS_DIR = Path(__file__).parent + + +def load_prompt(name: str) -> str: + """ + Load a prompt from a markdown file. + + Args: + name: Prompt name (without .md extension) + + Returns: + Prompt content as string + """ + prompt_file = _PROMPTS_DIR / f"{name}.md" + return prompt_file.read_text().strip() + + +# Pre-load prompts for convenience +EXECUTOR_PROMPT = load_prompt("executor") +REVIEWER_PROMPT = load_prompt("reviewer") +PLANNER_PROMPT = load_prompt("planner") +COMPLEXITY_PROMPT = load_prompt("complexity") diff --git a/src/prompts/builder.py b/src/prompts/builder.py new file mode 100644 index 0000000..ae95bad --- /dev/null +++ b/src/prompts/builder.py @@ -0,0 +1,122 @@ +""" +Prompt builder for fireteam phases. + +Builds prompts by combining base templates with: +- Goal and context +- Plan (for execute phase) +- Previous feedback (for iteration loops) +""" + +from ..models import PhaseType +from . import EXECUTOR_PROMPT, REVIEWER_PROMPT, PLANNER_PROMPT + + +def build_prompt( + phase: PhaseType, + goal: str, + context: str = "", + plan: str | None = None, + execution_output: str | None = None, + previous_feedback: str | None = None, + reviewer_id: int | None = None, + iteration: int | None = None, +) -> str: + """ + Build a phase-specific prompt with accumulated context. + + Args: + phase: The execution phase (PLAN, EXECUTE, REVIEW) + goal: The task goal + context: Additional context (crash logs, etc.) + plan: Implementation plan (for EXECUTE phase) + execution_output: Output from execution (for REVIEW phase) + previous_feedback: Feedback from previous iteration + reviewer_id: Reviewer number (for parallel reviews) + iteration: Current iteration number + + Returns: + Complete prompt string + """ + if phase == PhaseType.PLAN: + return _build_plan_prompt(goal, context) + elif phase == PhaseType.EXECUTE: + return _build_execute_prompt(goal, context, plan, previous_feedback) + elif phase == PhaseType.REVIEW: + return _build_review_prompt( + goal, execution_output, plan, previous_feedback, reviewer_id, iteration + ) + else: + raise ValueError(f"Unknown phase: {phase}") + + +def _build_plan_prompt(goal: str, context: str) -> str: + """Build planning phase prompt.""" + parts = [PLANNER_PROMPT, "", f"Goal: {goal}"] + + if context: + parts.extend(["", f"Context:\n{context}"]) + + return "\n".join(parts) + + +def _build_execute_prompt( + goal: str, + context: str, + plan: str | None, + previous_feedback: str | None, +) -> str: + """Build execution phase prompt with optional plan and feedback.""" + parts = [EXECUTOR_PROMPT, "", f"Goal: {goal}"] + + if context: + parts.extend(["", f"Context:\n{context}"]) + + if plan: + parts.extend(["", f"Plan:\n{plan}"]) + + if previous_feedback: + parts.extend([ + "", + "IMPORTANT - Address this feedback from the previous iteration:", + previous_feedback, + ]) + + return "\n".join(parts) + + +def _build_review_prompt( + goal: str, + execution_output: str | None, + plan: str | None, + previous_feedback: str | None, + reviewer_id: int | None, + iteration: int | None, +) -> str: + """Build review phase prompt.""" + parts = [REVIEWER_PROMPT] + + # Add context about which reviewer this is + if reviewer_id is not None or iteration is not None: + context_parts = [] + if reviewer_id: + context_parts.append(f"Reviewer #{reviewer_id}") + if iteration: + context_parts.append(f"Iteration {iteration}") + parts.extend(["", f"[{' - '.join(context_parts)}]"]) + + parts.extend(["", 
f"Goal: {goal}"]) + + if plan: + # Truncate long plans + plan_text = plan[:1500] + "..." if len(plan) > 1500 else plan + parts.extend(["", f"Implementation plan:\n{plan_text}"]) + + if execution_output: + # Truncate long outputs + output_text = execution_output[:2000] + "..." if len(execution_output) > 2000 else execution_output + parts.extend(["", f"Execution output:\n{output_text}"]) + + if previous_feedback: + parts.extend(["", f"Previous review feedback:\n{previous_feedback}"]) + + return "\n".join(parts) diff --git a/src/prompts/complexity.md b/src/prompts/complexity.md new file mode 100644 index 0000000..39ddbc7 --- /dev/null +++ b/src/prompts/complexity.md @@ -0,0 +1,44 @@ +You are a task complexity estimator. Analyze the following task and estimate its complexity. + +## Task + +{goal} + +## Additional Context + +{context} + +## Codebase Exploration + +You have access to read-only tools (Glob, Grep, Read) to explore the codebase. Use them to understand: +- Project structure and size +- Relevant files that would need changes +- Existing patterns and architecture +- Test coverage and conventions + +Explore as needed to make an accurate estimate. Don't over-explore - just enough to understand the scope. + +## Complexity Levels + +- **TRIVIAL**: Can be done in a single response. Examples: typo fix, simple rename, answer a question. +- **SIMPLE**: Requires a few focused changes but is straightforward. Examples: fix a simple bug, add a small feature, update config. +- **MODERATE**: Requires multiple changes across files and would benefit from iterative execution with review. Examples: add a feature with tests, refactor a module, fix a complex bug. +- **COMPLEX**: Requires planning, architectural decisions, and thorough review by multiple reviewers. Examples: new system design, major refactor, multi-component feature. + +## What Happens Next + +- TRIVIAL/SIMPLE: Single execution pass +- MODERATE: Execute -> review loop until complete +- COMPLEX: Plan once, then execute -> parallel reviews loop until complete + +## Instructions + +Consider: + +1. How many files will likely need changes? +2. Is there ambiguity in the requirements? +3. Will this require understanding existing architecture? +4. Is there risk of breaking existing functionality? +5. Would iterative review add value? + +After your exploration, respond with your final answer on its own line: TRIVIAL, SIMPLE, MODERATE, or COMPLEX diff --git a/src/prompts/executor.md b/src/prompts/executor.md new file mode 100644 index 0000000..5955fa0 --- /dev/null +++ b/src/prompts/executor.md @@ -0,0 +1,8 @@ +You are executing a task. Work systematically: + +1. Understand the goal and context +2. Make the necessary changes +3. Test your changes work +4. Summarize what you did + +Focus on quality and correctness. Leave the codebase in a working state. diff --git a/src/prompts/planner.md b/src/prompts/planner.md new file mode 100644 index 0000000..f3b9194 --- /dev/null +++ b/src/prompts/planner.md @@ -0,0 +1,10 @@ +You are analyzing a task to create an implementation plan. + +Explore the codebase to understand: + +1. What needs to change +2. Which files are involved +3. What approach to take +4. What risks exist + +Provide a clear, actionable plan. Be specific about files and changes. diff --git a/src/prompts/reviewer.md b/src/prompts/reviewer.md new file mode 100644 index 0000000..1979855 --- /dev/null +++ b/src/prompts/reviewer.md @@ -0,0 +1,19 @@ +You are reviewing completed work. Assess thoroughly: + +1. Does the implementation match the goal? 
+2. Are there bugs or edge cases? +3. Do tests pass? (Run them if you can) +4. Is it production-ready? + +Be critical but fair. Your feedback will be used to guide the next iteration if the task is incomplete. + +End your review with this format: + +ISSUES: +- List each issue found (one per line) +- Be specific about what needs to change +- Skip this section if no issues found + +COMPLETION: XX% + +Where XX is your honest assessment (0-100). Use 95%+ only when the task is truly complete with no remaining issues. diff --git a/src/state/__init__.py b/src/state/__init__.py deleted file mode 100644 index 7aa8b9b..0000000 --- a/src/state/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""State management for Fireteam.""" - diff --git a/src/state/manager.py b/src/state/manager.py deleted file mode 100644 index 973ec4a..0000000 --- a/src/state/manager.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -State management for Fireteam. -Handles persistence and isolation of project state to prevent cross-project contamination. -""" - -import json -import os -import fcntl -from datetime import datetime -from typing import Any -from pathlib import Path - - -class StateManager: - """Manages agent system state with project isolation.""" - - def __init__(self, state_dir: str | None = None): - # Use provided state_dir, or fall back to config, or use default - if state_dir is None: - import config - state_dir = config.STATE_DIR - - self.state_dir = Path(state_dir) - self.state_dir.mkdir(parents=True, exist_ok=True) - self.state_file = self.state_dir / "current.json" - self.lock_file = self.state_dir / "state.lock" - - def _acquire_lock(self): - """Acquire exclusive lock on state file.""" - self.lock_fd = open(self.lock_file, 'w') - fcntl.flock(self.lock_fd, fcntl.LOCK_EX) - - def _release_lock(self): - """Release lock on state file.""" - if hasattr(self, 'lock_fd'): - fcntl.flock(self.lock_fd, fcntl.LOCK_UN) - self.lock_fd.close() - - def initialize_project(self, project_dir: str, goal: str) -> dict[str, Any]: - """ - Initialize fresh state for a new project. - CRITICAL: Completely clears previous state to avoid cross-project contamination. - """ - self._acquire_lock() - try: - state = { - "project_dir": os.path.abspath(project_dir), - "goal": goal, - "status": "planning", - "cycle_number": 0, - "completion_percentage": 0, - "last_known_completion": 0, # For parse failure fallback - "consecutive_parse_failures": 0, # Safety counter - "validation_checks": 0, - "git_branch": None, - "current_plan": None, - "last_execution_result": None, - "last_review": None, - "started_at": datetime.now().isoformat(), - "updated_at": datetime.now().isoformat(), - "completed": False - } - - with open(self.state_file, 'w') as f: - json.dump(state, f, indent=2) - - return state - finally: - self._release_lock() - - def load_state(self) -> dict[str, Any] | None: - """Load current state from disk.""" - self._acquire_lock() - try: - if not self.state_file.exists(): - return None - - with open(self.state_file, 'r') as f: - return json.load(f) - finally: - self._release_lock() - - def update_state(self, updates: dict[str, Any]) -> dict[str, Any]: - """ - Update state with new values. - Always updates the 'updated_at' timestamp. 
- """ - self._acquire_lock() - try: - # Load state without nested locking - if self.state_file.exists(): - with open(self.state_file, 'r') as f: - state = json.load(f) - else: - state = {} - - state.update(updates) - state['updated_at'] = datetime.now().isoformat() - - with open(self.state_file, 'w') as f: - json.dump(state, f, indent=2) - - return state - finally: - self._release_lock() - - def get_status(self) -> dict[str, Any]: - """Get current status for CLI display.""" - state = self.load_state() - if not state: - return { - "status": "idle", - "message": "No active project" - } - - return { - "status": state.get("status", "unknown"), - "project_dir": state.get("project_dir"), - "goal": state.get("goal"), - "cycle_number": state.get("cycle_number", 0), - "completion_percentage": state.get("completion_percentage", 0), - "last_updated": state.get("updated_at"), - "completed": state.get("completed", False) - } - - def mark_completed(self): - """Mark current project as completed.""" - self.update_state({ - "status": "completed", - "completed": True, - "completed_at": datetime.now().isoformat() - }) - - def clear_state(self): - """Completely clear state - used when project finishes.""" - self._acquire_lock() - try: - if self.state_file.exists(): - self.state_file.unlink() - finally: - self._release_lock() - - def increment_cycle(self): - """Increment the cycle counter.""" - self._acquire_lock() - try: - if self.state_file.exists(): - with open(self.state_file, 'r') as f: - state = json.load(f) - - state["cycle_number"] = state.get("cycle_number", 0) + 1 - state['updated_at'] = datetime.now().isoformat() - - with open(self.state_file, 'w') as f: - json.dump(state, f, indent=2) - finally: - self._release_lock() - - def update_completion_percentage(self, parsed_percentage: int | None, logger=None) -> int: - """ - Update completion percentage with fallback to last known value on parse failure. - - Args: - parsed_percentage: Result from parser (may be None if parsing failed) - logger: Optional logger for warnings - - Returns: - int: Completion percentage to use - """ - self._acquire_lock() - try: - if self.state_file.exists(): - with open(self.state_file, 'r') as f: - state = json.load(f) - else: - state = {} - - if parsed_percentage is not None: - # Successful parse - reset failure counter - state["consecutive_parse_failures"] = 0 - state["last_known_completion"] = parsed_percentage - state["completion_percentage"] = parsed_percentage - if logger: - logger.info(f"Completion: {parsed_percentage}%") - result = parsed_percentage - else: - # Parse failure - use last known value - state["consecutive_parse_failures"] = state.get("consecutive_parse_failures", 0) + 1 - last_known = state.get("last_known_completion", 0) - - if logger: - logger.warning( - f"Could not parse completion percentage " - f"(failure #{state['consecutive_parse_failures']}). " - f"Using last known: {last_known}%" - ) - - # Safety valve: stop after 3 consecutive failures - if state["consecutive_parse_failures"] >= 3: - if logger: - logger.error( - "3 consecutive parse failures - parser may be broken. " - "Defaulting to 0% to force investigation." 
- ) - state["completion_percentage"] = 0 - result = 0 - else: - state["completion_percentage"] = last_known - result = last_known - - state['updated_at'] = datetime.now().isoformat() - - with open(self.state_file, 'w') as f: - json.dump(state, f, indent=2) - - return result - finally: - self._release_lock() diff --git a/tests/README.md b/tests/README.md deleted file mode 100644 index e573306..0000000 --- a/tests/README.md +++ /dev/null @@ -1,413 +0,0 @@ -# Fireteam Tests - -This directory contains comprehensive tests for the entire Fireteam codebase, including unit tests and integration tests for all components. - -## Test Summary - -**Total Tests: 161** - -- ✅ **Configuration Tests** (15 tests) - test_config.py -- ✅ **State Manager Tests** (20 tests) - test_state_manager.py -- ✅ **Agent Tests** (38 tests) - test_agents.py -- ✅ **Orchestrator Tests** (28 tests) - test_orchestrator.py -- ✅ **CLI Tools Tests** (24 tests) - test_cli_tools.py -- ✅ **Memory System Tests** (36 tests) - test_memory_*.py - -## Running Tests - -### Run All Tests - -```bash -cd /Users/osprey/repos/dark/fireteam -source .venv/bin/activate -pytest tests/ -v -``` - -### Run Specific Test Categories - -```bash -# Configuration tests -pytest tests/test_config.py -v - -# State manager tests -pytest tests/test_state_manager.py -v - -# Agent tests (BaseAgent, Planner, Executor, Reviewer) -pytest tests/test_agents.py -v - -# Orchestrator integration tests -pytest tests/test_orchestrator.py -v - -# CLI tools tests -pytest tests/test_cli_tools.py -v - -# Memory system tests -pytest tests/test_memory_*.py -v -``` - -### Run with Coverage - -```bash -pytest tests/ --cov=src --cov-report=html -``` - -### Run Specific Test - -```bash -pytest tests/test_config.py::TestConfig::test_agent_timeouts -v -``` - -## Test Structure - -### 1. Configuration Tests (`test_config.py`) - -Tests for configuration module and environment variable handling: -- System directory configuration -- API key validation and lazy loading -- SDK configuration (tools, permissions, model) -- Agent configuration (retries, timeouts) -- Completion thresholds -- Git configuration -- Logging configuration -- Sudo configuration -- Memory system configuration -- Environment variable overrides -- Type validation - -### 2. State Manager Tests (`test_state_manager.py`) - -Tests for project state management: -- Initialization and file structure -- Project state initialization -- State loading and persistence -- State updates and timestamps -- Status reporting -- Completion tracking -- State clearing -- Cycle counting -- Completion percentage updates with fallbacks -- Parse failure handling -- State isolation between projects -- File locking mechanism -- Concurrent updates -- JSON format validation - -### 3. 
Agent Tests (`test_agents.py`) - -Tests for all agent classes: - -**BaseAgent:** -- Initialization and configuration -- Abstract method enforcement -- Execution context storage -- Memory manager integration -- Memory retrieval with/without manager -- Timeout configuration - -**PlannerAgent:** -- Initialization and system prompts -- Initial plan generation -- Plan updates based on feedback -- Memory context building -- Relevant memory type filtering -- Success and failure handling - -**ExecutorAgent:** -- Initialization and system prompts -- Execution prompt building -- Memory context building -- Relevant memory type filtering -- Success and failure handling - -**ReviewerAgent:** -- Initialization and system prompts -- Review prompt building -- Validation mode -- Completion percentage extraction (multiple formats) -- Learning extraction from reviews -- Memory context building -- Relevant memory type filtering -- Success and failure handling - -### 4. Orchestrator Tests (`test_orchestrator.py`) - -Integration tests for the main orchestrator: -- Initialization with various flags -- Logging setup -- Git repository initialization (new and existing) -- Git commit changes -- Remote push handling -- Completion checking and validation -- Cycle execution structure -- Agent failure handling (planner, executor, reviewer) -- Learning extraction and storage -- Goal alignment checks -- Memory manager injection -- State manager integration -- Signal handling -- Validation mode triggering -- CLI interface and argument parsing - -### 5. CLI Tools Tests (`test_cli_tools.py`) - -Tests for command-line utilities: -- Fireteam status command functionality -- Process monitoring -- State file parsing -- Timestamp formatting -- Script existence and structure -- Argument parsing -- System resource monitoring (memory, CPU, disk) -- PID file handling -- Log file handling -- Error handling -- Output formatting - -### 6. Memory System Tests (`test_memory_*.py`) - -Comprehensive tests for the memory system: - -**test_memory_manager.py:** -- Initialization and model loading -- Project initialization -- Adding memories -- Semantic search -- Memory type filtering -- Embedding caching -- Cleanup functionality -- Edge cases - -**test_base_agent_memory.py:** -- Execution context storage -- Template method pattern -- Automatic memory retrieval -- Memory injection into prompts -- Graceful degradation without memory - -**test_memory_integration.py:** -- Full cycle memory flow -- Reviewer learning extraction -- Memory persistence across cycles -- Realistic multi-cycle scenarios - -**test_memory_isolation.py:** -- Separate collections per project -- No memory leakage between projects -- Cleanup isolation -- Hash collision resistance - -## Requirements - -Install test dependencies using uv: - -```bash -cd /Users/osprey/repos/dark/fireteam -source .venv/bin/activate -uv pip install -r requirements.txt -``` - -Key dependencies: -- pytest>=7.0.0 -- chromadb>=1.0.0 -- transformers>=4.50.0 -- torch>=2.5.0 - -## First Run - -**Note:** The first test run will download the Qwen3-Embedding-0.6B model (~1.2GB) from Hugging Face for memory tests. This is cached locally, so subsequent runs are faster. 
- -## Troubleshooting - -### Model Download Issues - -If model download fails: -```bash -# Clear Hugging Face cache -rm -rf ~/.cache/huggingface/ - -# Re-run tests -pytest tests/ -v -``` - -### Chroma Database Lock Issues - -If tests fail with database lock errors: -```bash -# Clear test artifacts -rm -rf /tmp/test-* -rm -rf /tmp/*-project-* - -# Re-run tests -pytest tests/ -v -``` - -### MPS/Metal Issues on Mac - -If you see MPS-related warnings, this is normal. Tests will fall back to CPU automatically. - -## Test Coverage - -✅ **Comprehensive Coverage** across all components: - -### Core Components -- ✅ Configuration management -- ✅ State management and persistence -- ✅ File locking and concurrency -- ✅ Project isolation -- ✅ Completion tracking - -### Agents -- ✅ BaseAgent template pattern -- ✅ PlannerAgent logic -- ✅ ExecutorAgent logic -- ✅ ReviewerAgent logic -- ✅ Memory integration -- ✅ Timeout configuration - -### Orchestrator -- ✅ Full cycle execution -- ✅ Git integration -- ✅ Agent coordination -- ✅ Error handling -- ✅ Validation mode -- ✅ Learning extraction - -### Memory System -- ✅ MemoryManager CRUD operations -- ✅ Embedding generation and caching -- ✅ Semantic search functionality -- ✅ Project isolation -- ✅ Automatic retrieval -- ✅ Learning extraction -- ✅ Cleanup functionality - -### CLI Tools -- ✅ Status monitoring -- ✅ Process management -- ✅ Log handling -- ✅ Error handling -- ✅ Output formatting - -## Test Quality - -All tests follow best practices: -- **Isolated**: Each test is independent -- **Deterministic**: Tests produce consistent results -- **Fast**: Most tests run in milliseconds -- **Comprehensive**: Test both success and failure paths -- **Intent-focused**: Test functionality, not implementation details -- **Well-documented**: Clear test names and docstrings - -## New Test Categories - -### Lightweight Tests (2 tests) - -Fast tests using small embedding models (`sentence-transformers/all-MiniLM-L6-v2`). -Verify HuggingFace integration without heavy downloads. - -**What they test:** -- HuggingFace model loading pipeline -- Embedding generation works -- Save/retrieve memories with semantic search - -**Run with:** -```bash -pytest tests/ -m "lightweight" -v -``` - -**Performance:** ~5-10 seconds (first run downloads ~80MB model) - -### End-to-End Tests (1 test) - -Real subprocess tests that spawn Fireteam and complete actual tasks. -Uses real Claude API - costs money and takes time. - -**What they test:** -- Complete Fireteam workflow from start to finish -- Real subprocess spawning -- File creation and git commits -- Task completion with 95%+ accuracy - -**Run with:** -```bash -pytest tests/ -m "e2e" -v --keep-artifacts -``` - -**Performance:** ~3-5 minutes per test -**Cost:** ~$0.10-0.50 per run (uses Claude API) - -### Integration Tests (1 test) - -Tests with external systems (terminal-bench). -Requires `tb` command to be installed. 
- -**What they test:** -- Terminal-bench adapter works correctly -- 100% accuracy on hello-world task -- Installation script works -- Container environment setup - -**Run with:** -```bash -pytest tests/ -m "integration" -v -``` - -**Performance:** ~10 minutes per test -**Cost:** ~$0.20-1.00 per run (uses Claude API) - -## Running Tests Selectively - -```bash -# Fast tests only (skip API calls and slow tests) - for CI -pytest tests/ -m "not slow and not e2e and not integration" -v - -# All unit tests including lightweight embedding tests -pytest tests/ -m "not slow" -v - -# Only slow/expensive tests -pytest tests/ -m "slow" -v - -# Parallel execution (safe with isolated fixtures) -pytest tests/ -n auto - -# Keep artifacts on failure for debugging -pytest tests/ --keep-artifacts -v -``` - -## Dependencies - -### Core test dependencies (always needed): -- pytest>=7.0.0 -- All src/ dependencies (chromadb, transformers, torch, etc.) - -### Lightweight embedding tests: -- sentence-transformers>=2.2.0 (already in requirements.txt) - -### Integration tests: -- terminal-bench: `uv tool install terminal-bench` -- Docker (for terminal-bench containers) - -## API Costs & CI Considerations - -E2E and integration tests use real Claude API: -- **Hello world test:** ~$0.10-0.50 per run -- **Terminal-bench test:** ~$0.20-1.00 per run - -**Recommendation for CI:** -- Run fast tests (unit + lightweight) on all PRs (~2 minutes, no cost) -- Run e2e/integration tests only on main branch (saves ~$1-2 per PR) - -## Test Summary - -**Total: 165 tests** - -- Configuration: 15 tests -- State Manager: 20 tests -- Agents: 38 tests -- Orchestrator: 28 tests -- CLI Tools: 24 tests -- Memory System: 36 tests -- **Lightweight Embeddings: 2 tests** ⚡ NEW -- **E2E Hello World: 1 test** 🚀 NEW -- **Terminal-bench Integration: 1 test** 🎯 NEW - diff --git a/tests/batch_test_runner.sh b/tests/batch_test_runner.sh deleted file mode 100755 index 790b68d..0000000 --- a/tests/batch_test_runner.sh +++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/bash -# Batch test runner - runs all tests sequentially with auto-progression -# Runs in background, survives session disconnections - -SYSTEM_DIR="/home/claude/fireteam" -LOG_FILE="$SYSTEM_DIR/logs/batch_test_runner.log" -COMPLETION_THRESHOLD=90 # Stop test when it reaches this percentage - -# Test definitions: "project_dir|prompt" -declare -a TESTS=( - "/home/claude/csv-analyzer-v2|Build a Python CLI tool that analyzes CSV files and generates statistics. Requirements: Read CSV files using pandas or csv module, calculate statistics (mean, median, standard deviation, min, max) for numeric columns, generate a summary report in both terminal output and JSON format, handle missing data gracefully, include a sample CSV file with test data, support filtering by column, and make it production-ready with proper error handling and documentation." - - "/home/claude/json-log-parser|Build a Python CLI tool that parses JSON-formatted application logs and extracts insights. Requirements: Parse JSON log files line-by-line, filter by log level (ERROR, WARN, INFO, DEBUG), count occurrences by type, generate summary statistics (total logs, errors per hour/day, top error messages), support date range filtering, create sample log file with realistic data, export results to JSON/CSV, and make it production-ready with proper error handling." - - "/home/claude/web-scraper|Build a Python web scraper that extracts top headlines from Hacker News front page. 
Requirements: Use requests + BeautifulSoup, extract titles/scores/authors, handle pagination (top 10 stories), export to JSON/CSV, respect robots.txt, implement rate limiting, handle network errors gracefully, and make it production-ready with proper error handling." - - "/home/claude/task-manager-cli|Build a Task Manager CLI with SQLite persistence. Requirements: SQLite database for storage, CRUD operations (Create, Read, Update, Delete tasks), task properties (id, title, description, status, due_date), commands (add, list, complete, delete), filter by status (pending/completed), and make it production-ready with proper error handling and documentation." - - "/home/claude/rest-api-server|Build a REST API server for a note-taking application using Flask or FastAPI. Requirements: Endpoints for GET, POST, PUT, DELETE notes, SQLite or in-memory storage, JSON request/response, input validation, error handling (404, 400, 500), API documentation (Swagger for FastAPI), basic tests using pytest, and make it production-ready." - - "/home/claude/github-analyzer|Build a CLI tool that analyzes GitHub repositories using GitHub API. Requirements: Fetch repo info (stars, forks, languages, contributors), analyze commit history (last 30 days), generate markdown report, handle API rate limits, pretty terminal output with colors, and make it production-ready with proper error handling." -) - -log_message() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" -} - -get_completion_percentage() { - local state_file="$SYSTEM_DIR/state/current.json" - if [ -f "$state_file" ]; then - python3 -c "import json; f=open('$state_file'); data=json.load(f); print(data.get('completion_percentage', 0))" 2>/dev/null || echo "0" - else - echo "0" - fi -} - -wait_for_completion() { - local project_name="$1" - local max_wait_minutes=90 # Max 90 minutes per test - local check_interval=60 # Check every minute - local elapsed=0 - - log_message "Waiting for $project_name to reach $COMPLETION_THRESHOLD%..." - - while [ $elapsed -lt $((max_wait_minutes * 60)) ]; do - sleep $check_interval - elapsed=$((elapsed + check_interval)) - - completion=$(get_completion_percentage) - log_message "$project_name: ${completion}% complete (${elapsed}s elapsed)" - - # Check if threshold reached - if [ "$completion" -ge "$COMPLETION_THRESHOLD" ]; then - log_message "$project_name: Reached $completion% - SUCCESS!" - return 0 - fi - - # Check if process still running - if ! pgrep -f "orchestrator.py" > /dev/null; then - log_message "$project_name: Process died unexpectedly" - return 1 - fi - done - - log_message "$project_name: Timeout after $max_wait_minutes minutes" - return 2 -} - -run_test() { - local test_def="$1" - local project_dir=$(echo "$test_def" | cut -d'|' -f1) - local prompt=$(echo "$test_def" | cut -d'|' -f2-) - local project_name=$(basename "$project_dir") - - log_message "========================================" - log_message "Starting Test: $project_name" - log_message "Project Dir: $project_dir" - log_message "========================================" - - # Start the test - $SYSTEM_DIR/cli/start-agent --project-dir "$project_dir" --prompt "$prompt" - - if [ $? -ne 0 ]; then - log_message "ERROR: Failed to start $project_name" - return 1 - fi - - # Wait for completion or threshold - wait_for_completion "$project_name" - local result=$? 
- - # Stop the agent - $SYSTEM_DIR/cli/stop-agent - - # Record result - if [ $result -eq 0 ]; then - log_message "$project_name: COMPLETED SUCCESSFULLY" - echo "$project_name" >> "$SYSTEM_DIR/logs/completed_tests.txt" - elif [ $result -eq 1 ]; then - log_message "$project_name: FAILED (process died)" - echo "$project_name" >> "$SYSTEM_DIR/logs/failed_tests.txt" - else - log_message "$project_name: TIMEOUT" - echo "$project_name" >> "$SYSTEM_DIR/logs/timeout_tests.txt" - fi - - # Brief pause between tests - sleep 10 - - return $result -} - -# Main execution -main() { - log_message "==========================================" - log_message "Batch Test Runner Started" - log_message "Tests to run: ${#TESTS[@]}" - log_message "Completion threshold: $COMPLETION_THRESHOLD%" - log_message "==========================================" - - # Clean up old result files - rm -f "$SYSTEM_DIR/logs/completed_tests.txt" - rm -f "$SYSTEM_DIR/logs/failed_tests.txt" - rm -f "$SYSTEM_DIR/logs/timeout_tests.txt" - - local test_num=1 - local total_tests=${#TESTS[@]} - - for test_def in "${TESTS[@]}"; do - log_message "Running test $test_num of $total_tests" - run_test "$test_def" - test_num=$((test_num + 1)) - done - - log_message "==========================================" - log_message "Batch Test Runner Completed" - log_message "==========================================" - - # Summary - local completed=$(wc -l < "$SYSTEM_DIR/logs/completed_tests.txt" 2>/dev/null || echo "0") - local failed=$(wc -l < "$SYSTEM_DIR/logs/failed_tests.txt" 2>/dev/null || echo "0") - local timeout=$(wc -l < "$SYSTEM_DIR/logs/timeout_tests.txt" 2>/dev/null || echo "0") - - log_message "Results:" - log_message " Completed: $completed" - log_message " Failed: $failed" - log_message " Timeout: $timeout" -} - -# Run in background if --background flag provided -if [ "$1" == "--background" ]; then - nohup bash "$0" >> "$LOG_FILE" 2>&1 & - echo "Batch test runner started in background (PID: $!)" - echo "Monitor progress: tail -f $LOG_FILE" - exit 0 -fi - -# Run normally -main diff --git a/tests/conftest.py b/tests/conftest.py index c11b0c2..5b4a212 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,33 @@ -"""Shared pytest fixtures for all tests.""" +"""Shared pytest fixtures for fireteam tests.""" import pytest import tempfile import shutil import os +import sys from pathlib import Path +from unittest.mock import AsyncMock, MagicMock + +# Try to import real SDK; mock only if not available +# This allows integration tests to use the real SDK while unit tests use mocks +try: + import claude_agent_sdk + _SDK_AVAILABLE = True +except ImportError: + _SDK_AVAILABLE = False + + # Create a mock ClaudeAgentOptions class that stores kwargs as attributes + class MockClaudeAgentOptions: + """Mock class that stores constructor kwargs as attributes.""" + def __init__(self, **kwargs): + for key, value in kwargs.items(): + setattr(self, key, value) + + mock_sdk = MagicMock() + mock_sdk.query = AsyncMock() + mock_sdk.ClaudeAgentOptions = MockClaudeAgentOptions + mock_sdk.HookMatcher = MagicMock() + sys.modules["claude_agent_sdk"] = mock_sdk @pytest.fixture @@ -19,25 +42,48 @@ def isolated_tmp_dir(request): @pytest.fixture -def isolated_system_dirs(isolated_tmp_dir): - """Create isolated state/logs/memory dirs.""" - system_dir = isolated_tmp_dir / "system" - (system_dir / "state").mkdir(parents=True) - (system_dir / "logs").mkdir(parents=True) - (system_dir / "memory").mkdir(parents=True) - return system_dir +def 
project_dir(isolated_tmp_dir): + """Create a mock project directory with basic structure.""" + project = isolated_tmp_dir / "project" + project.mkdir() + + # Create basic Python project structure + (project / "src").mkdir() + (project / "tests").mkdir() + (project / "pyproject.toml").write_text(""" +[project] +name = "test-project" +version = "0.1.0" +""") + (project / "src" / "main.py").write_text(""" +def hello(): + return "Hello, World!" +""") + + return project @pytest.fixture -def lightweight_memory_manager(isolated_system_dirs): - """MemoryManager with lightweight embedding model.""" - import sys - sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - from memory.manager import MemoryManager - - return MemoryManager( - memory_dir=str(isolated_system_dirs / "memory"), - embedding_model='sentence-transformers/all-MiniLM-L6-v2' +def mock_sdk_query(): + """Mock the claude_agent_sdk.query function.""" + async def mock_query(*args, **kwargs): + # Yield a mock message with result + class MockMessage: + result = "Task completed successfully." + yield MockMessage() + + return mock_query + + +@pytest.fixture +def mock_execution_result(): + """Create a mock ExecutionResult for testing.""" + from fireteam.api import ExecutionResult, ExecutionMode + return ExecutionResult( + success=True, + mode=ExecutionMode.SINGLE_TURN, + output="Task completed.", + completion_percentage=100, ) @@ -48,12 +94,24 @@ def pytest_addoption(parser): action="store_true", help="Keep test artifacts on failure for debugging" ) + parser.addoption( + "--run-integration", + action="store_true", + help="Run integration tests that require API keys" + ) def pytest_configure(config): """Register custom markers.""" - config.addinivalue_line("markers", "lightweight: Lightweight tests with small models") - config.addinivalue_line("markers", "e2e: End-to-end tests with real subprocesses") + config.addinivalue_line("markers", "unit: Unit tests (fast, no external deps)") + config.addinivalue_line("markers", "integration: Integration tests (require API key)") config.addinivalue_line("markers", "slow: Slow running tests") - config.addinivalue_line("markers", "integration: Integration tests with external systems") + +def pytest_collection_modifyitems(config, items): + """Skip integration tests unless --run-integration is passed.""" + if not config.getoption("--run-integration"): + skip_integration = pytest.mark.skip(reason="need --run-integration option to run") + for item in items: + if "integration" in item.keywords: + item.add_marker(skip_integration) diff --git a/tests/helpers.py b/tests/helpers.py deleted file mode 100644 index be625da..0000000 --- a/tests/helpers.py +++ /dev/null @@ -1,298 +0,0 @@ -"""Test helpers for Fireteam tests.""" - -import subprocess -import sys -import os -import re -import time -import threading -from pathlib import Path -from dataclasses import dataclass -from typing import List, Optional - - -@dataclass -class TestResult: - """Result from running a Fireteam test.""" - success: bool - returncode: int - project_dir: Path - logs: str - duration: float - git_commits: int - files_created: List[str] - cycle_count: int - final_completion: int - - def __str__(self): - """Human-readable summary.""" - status = "✅ SUCCESS" if self.success else "❌ FAILED" - return ( - f"{status}\n" - f" Duration: {self.duration:.1f}s\n" - f" Cycles: {self.cycle_count}\n" - f" Completion: {self.final_completion}%\n" - f" Commits: {self.git_commits}\n" - f" Files: {len(self.files_created)}" - ) - - -class LogParser: - """Parse 
Fireteam logs to extract metrics.""" - - @staticmethod - def extract_cycle_count(logs: str) -> int: - """Extract final cycle count from logs.""" - cycles = re.findall(r'CYCLE (\d+)', logs) - return max(map(int, cycles)) if cycles else 0 - - @staticmethod - def extract_final_completion(logs: str) -> int: - """Extract final completion percentage from logs.""" - completions = re.findall(r'(?:Completion|completion):\s*(\d+)%', logs) - return int(completions[-1]) if completions else 0 - - -class StreamingOutputHandler: - """Handle real-time output streaming with progress updates.""" - - def __init__(self, process: subprocess.Popen, show_progress: bool = True): - self.process = process - self.show_progress = show_progress - self.stdout_lines = [] - self.stderr_lines = [] - - def collect_output(self) -> tuple[str, str]: - """Collect output while showing progress.""" - stdout_thread = threading.Thread( - target=self._stream_output, - args=(self.process.stdout, self.stdout_lines, True) - ) - stderr_thread = threading.Thread( - target=self._stream_output, - args=(self.process.stderr, self.stderr_lines, False) - ) - - stdout_thread.start() - stderr_thread.start() - stdout_thread.join() - stderr_thread.join() - - return '\n'.join(self.stdout_lines), '\n'.join(self.stderr_lines) - - def _stream_output(self, pipe, lines: List[str], is_stdout: bool): - """Stream output from pipe, showing progress.""" - for line in iter(pipe.readline, ''): - if not line: - break - line = line.rstrip() - lines.append(line) - - if is_stdout and self.show_progress: - # Update progress indicator - if 'CYCLE' in line: - cycle = re.search(r'CYCLE (\d+)', line) - if cycle: - print(f"\r🔄 Cycle {cycle.group(1)} ", end='', flush=True) - elif 'PHASE' in line: - phase = re.search(r'PHASE \d+: (\w+)', line) - if phase: - print(f"\r → {phase.group(1)}...", end='', flush=True) - elif 'Completion:' in line: - completion = re.search(r'(\d+)%', line) - if completion: - print(f"\r ✓ {completion.group(1)}%", flush=True) - pipe.close() - - -class FireteamTestRunner: - """Helper for spawning and testing Fireteam processes.""" - - def __init__(self, project_dir: Path, system_dir: Path): - self.project_dir = project_dir - self.system_dir = system_dir - self.process = None - self.start_time = None - - def run(self, goal: str, timeout: int = 300, keep_memory: bool = False, - show_progress: bool = True) -> TestResult: - """Spawn Fireteam and wait for completion with real-time output.""" - self.start_time = time.time() - - print(f"\n{'='*60}") - print(f"🚀 Starting Fireteam") - print(f"{'='*60}") - print(f"Goal: {goal}") - print(f"Timeout: {timeout}s\n") - - self._ensure_git_repo() - - env = os.environ.copy() - env['FIRETEAM_DIR'] = str(self.system_dir) - env['PYTHONUNBUFFERED'] = '1' - - cmd = [ - sys.executable, 'src/orchestrator.py', - '--project-dir', str(self.project_dir), - '--goal', goal - ] - if keep_memory: - cmd.append('--keep-memory') - - try: - self.process = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True, bufsize=1, env=env - ) - except FileNotFoundError as e: - raise RuntimeError(f"Failed to start Fireteam: {e}") - - handler = StreamingOutputHandler(self.process, show_progress) - - try: - stdout, stderr = handler.collect_output() - self.process.wait(timeout=timeout) - duration = time.time() - self.start_time - - print(f"\n{'='*60}") - print(f"⏱️ Completed in {duration:.1f}s") - print(f"{'='*60}\n") - - cycle_count = LogParser.extract_cycle_count(stdout) - final_completion = 
LogParser.extract_final_completion(stdout) - - return TestResult( - success=(self.process.returncode == 0), - returncode=self.process.returncode, - project_dir=self.project_dir, - logs=stdout + "\n" + stderr, - duration=duration, - git_commits=self._count_commits(), - files_created=self._list_files(), - cycle_count=cycle_count, - final_completion=final_completion - ) - except subprocess.TimeoutExpired: - self.process.kill() - self.process.wait() - duration = time.time() - self.start_time - raise TimeoutError( - f"⏱️ Fireteam timed out after {timeout}s (ran for {duration:.1f}s)" - ) - - def _ensure_git_repo(self): - """Ensure project directory is a git repo.""" - git_dir = self.project_dir / ".git" - if not git_dir.exists(): - subprocess.run(['git', 'init'], cwd=self.project_dir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.name', 'Fireteam Test'], - cwd=self.project_dir, check=True, capture_output=True) - subprocess.run(['git', 'config', 'user.email', 'test@fireteam.ai'], - cwd=self.project_dir, check=True, capture_output=True) - - def _count_commits(self) -> int: - """Count git commits in project.""" - try: - result = subprocess.run(['git', 'rev-list', '--count', 'HEAD'], - cwd=self.project_dir, capture_output=True, - text=True, check=True) - return int(result.stdout.strip()) - except (subprocess.CalledProcessError, ValueError): - return 0 - - def _list_files(self) -> List[str]: - """List non-git files in project directory.""" - files = [] - for item in self.project_dir.rglob('*'): - if '.git' in item.parts or not item.is_file(): - continue - files.append(item.relative_to(self.project_dir).as_posix()) - return sorted(files) - - -@dataclass -class TerminalBenchResult: - """Parsed result from terminal-bench run.""" - task_id: str - success: bool - passed: bool - accuracy: float - duration: Optional[float] - error: Optional[str] - - def __str__(self): - """Human-readable summary.""" - status = "✅ PASSED" if self.passed else "❌ FAILED" - lines = [ - f"\n{'='*60}", - f"Terminal-bench Result: {status}", - f"{'='*60}", - f"Task: {self.task_id}", - f"Success: {'Yes' if self.success else 'No'}", - f"Accuracy: {self.accuracy * 100:.1f}%", - ] - if self.duration: - lines.append(f"Duration: {self.duration:.1f}s") - if self.error: - lines.append(f"Error: {self.error}") - lines.append(f"{'='*60}\n") - return '\n'.join(lines) - - -class TerminalBenchParser: - """Parse terminal-bench stdout output.""" - - @staticmethod - def parse_output(stdout: str, task_id: str) -> TerminalBenchResult: - """Parse terminal-bench stdout for task results.""" - # Look for success/failure indicators - success_found = any(keyword in stdout.lower() for keyword in [ - 'passed', 'success', '✓', '✅' - ]) - - failure_found = any(keyword in stdout.lower() for keyword in [ - 'failed', 'error', '✗', '❌' - ]) - - # Extract accuracy/score - accuracy = 0.0 - accuracy_patterns = [ - r'accuracy[:\s]+(\d+\.?\d*)', - r'score[:\s]+(\d+\.?\d*)', - r'(\d+)%\s+correct', - ] - - for pattern in accuracy_patterns: - match = re.search(pattern, stdout.lower()) - if match: - val = float(match.group(1)) - accuracy = val if val <= 1.0 else val / 100.0 - break - - passed = success_found and not failure_found - - # Extract duration if available - duration = None - duration_match = re.search( - r'(?:took|duration|time)[:\s]+(\d+\.?\d*)\s*(?:s|sec|seconds)', - stdout.lower() - ) - if duration_match: - duration = float(duration_match.group(1)) - - # Extract error message if failed - error = None - if not passed: - error_match = 
re.search(r'error[:\s]+(.+?)(?:\n|$)', stdout, re.IGNORECASE) - if error_match: - error = error_match.group(1).strip() - - return TerminalBenchResult( - task_id=task_id, - success=success_found, - passed=passed, - accuracy=accuracy, - duration=duration, - error=error - ) - diff --git a/tests/pytest.ini b/tests/pytest.ini deleted file mode 100644 index 22a6a99..0000000 --- a/tests/pytest.ini +++ /dev/null @@ -1,13 +0,0 @@ -[pytest] -testpaths = tests -python_files = test_*.py -python_classes = Test* -python_functions = test_* -addopts = -v --tb=short - -markers = - lightweight: Lightweight tests with small models (fast for CI) - e2e: End-to-end tests with real subprocesses (slow, uses API) - slow: Slow running tests (multi-minute) - integration: Integration tests with external systems (terminal-bench) - diff --git a/tests/run_memory_tests.sh b/tests/run_memory_tests.sh deleted file mode 100755 index a5c3ca1..0000000 --- a/tests/run_memory_tests.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# Run memory system tests - -set -e - -cd "$(dirname "$0")/.." - -echo "==================================" -echo "Running Memory System Tests" -echo "==================================" -echo "" - -# Run memory-specific tests -echo "1. Testing MemoryManager..." -python -m pytest tests/test_memory_manager.py -v - -echo "" -echo "2. Testing BaseAgent Memory Integration..." -python -m pytest tests/test_base_agent_memory.py -v - -echo "" -echo "3. Testing Memory Integration..." -python -m pytest tests/test_memory_integration.py -v - -echo "" -echo "4. Testing Project Isolation..." -python -m pytest tests/test_memory_isolation.py -v - -echo "" -echo "==================================" -echo "All Memory Tests Complete!" -echo "==================================" - diff --git a/tests/test_agents.py b/tests/test_agents.py deleted file mode 100644 index e63bc75..0000000 --- a/tests/test_agents.py +++ /dev/null @@ -1,599 +0,0 @@ -""" -Unit tests for agent classes. -Tests BaseAgent, PlannerAgent, ExecutorAgent, and ReviewerAgent functionality. 
-""" - -import pytest -import tempfile -import shutil -import logging -import sys -from pathlib import Path -from unittest.mock import Mock, patch, AsyncMock, MagicMock - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from agents.base import BaseAgent -from agents.planner import PlannerAgent -from agents.executor import ExecutorAgent -from agents.reviewer import ReviewerAgent - - -class TestBaseAgent: - """Test BaseAgent functionality.""" - - @pytest.fixture - def logger(self): - """Create test logger.""" - return logging.getLogger("test") - - @pytest.fixture - def mock_memory_manager(self): - """Create mock memory manager.""" - memory = Mock() - memory.search = Mock(return_value=[]) - return memory - - def test_initialization(self, logger): - """Test BaseAgent initialization.""" - # Need to create a concrete subclass - class TestAgent(BaseAgent): - def get_system_prompt(self): - return "Test prompt" - - def _do_execute(self, **kwargs): - return {"success": True} - - agent = TestAgent("test", logger) - - assert agent.agent_type == "test" - assert agent.logger == logger - assert agent.max_retries > 0 - assert agent.retry_delay > 0 - assert agent.timeout > 0 - - def test_get_system_prompt_not_implemented(self, logger): - """Test that BaseAgent requires get_system_prompt implementation.""" - agent = BaseAgent("test", logger) - - with pytest.raises(NotImplementedError): - agent.get_system_prompt() - - def test_do_execute_not_implemented(self, logger): - """Test that BaseAgent requires _do_execute implementation.""" - agent = BaseAgent("test", logger) - - with pytest.raises(NotImplementedError): - agent._do_execute() - - def test_execution_context_storage(self, logger): - """Test that execute() stores execution context.""" - class TestAgent(BaseAgent): - def get_system_prompt(self): - return "Test prompt" - - def _do_execute(self, **kwargs): - # Check that context is available - assert self._execution_context == kwargs - return {"success": True} - - agent = TestAgent("test", logger) - agent.execute(project_dir="/tmp/test", goal="Test goal") - - # Context should be stored - assert agent._execution_context["project_dir"] == "/tmp/test" - assert agent._execution_context["goal"] == "Test goal" - - def test_memory_integration(self, logger, mock_memory_manager): - """Test memory manager integration.""" - class TestAgent(BaseAgent): - def get_system_prompt(self): - return "Test prompt" - - def _do_execute(self, **kwargs): - return {"success": True} - - agent = TestAgent("test", logger, memory_manager=mock_memory_manager) - - assert agent.memory == mock_memory_manager - - def test_retrieve_memories_without_manager(self, logger): - """Test memory retrieval when no manager is set.""" - class TestAgent(BaseAgent): - def get_system_prompt(self): - return "Test prompt" - - def _build_memory_context_query(self): - return "test query" - - def _do_execute(self, **kwargs): - return {"success": True} - - agent = TestAgent("test", logger, memory_manager=None) - - # Should return empty string gracefully - result = agent._retrieve_and_format_memories() - assert result == "" - - def test_retrieve_memories_with_results(self, logger, mock_memory_manager): - """Test memory retrieval with results.""" - # Mock memories - mock_memory_manager.search.return_value = [ - {"content": "Learning 1", "type": "learning", "cycle": 1}, - {"content": "Decision 1", "type": "decision", "cycle": 2} - ] - - class TestAgent(BaseAgent): - def get_system_prompt(self): - return "Test prompt" - - def 
_build_memory_context_query(self): - return "test query" - - def _get_relevant_memory_types(self): - return ["learning", "decision"] - - def _do_execute(self, **kwargs): - return {"success": True} - - agent = TestAgent("test", logger, memory_manager=mock_memory_manager) - - # Retrieve memories - result = agent._retrieve_and_format_memories() - - # Should have formatted memories - assert result != "" - assert "Learning 1" in result - assert "Decision 1" in result - assert "BACKGROUND KNOWLEDGE" in result - - def test_timeout_configuration(self, logger): - """Test that agent timeout is configured correctly.""" - import config - - # Planner should have planner timeout - planner = PlannerAgent(logger) - assert planner.timeout == config.AGENT_TIMEOUTS["planner"] - - # Executor should have executor timeout - executor = ExecutorAgent(logger) - assert executor.timeout == config.AGENT_TIMEOUTS["executor"] - - # Reviewer should have reviewer timeout - reviewer = ReviewerAgent(logger) - assert reviewer.timeout == config.AGENT_TIMEOUTS["reviewer"] - - -class TestPlannerAgent: - """Test PlannerAgent functionality.""" - - @pytest.fixture - def logger(self): - """Create test logger.""" - return logging.getLogger("test-planner") - - @pytest.fixture - def planner(self, logger): - """Create PlannerAgent instance.""" - return PlannerAgent(logger) - - def test_initialization(self, planner): - """Test PlannerAgent initialization.""" - assert planner.agent_type == "planner" - assert planner.logger is not None - - def test_get_system_prompt(self, planner): - """Test that planner has proper system prompt.""" - prompt = planner.get_system_prompt() - - assert isinstance(prompt, str) - assert len(prompt) > 0 - - # Should mention key responsibilities - assert "plan" in prompt.lower() or "planner" in prompt.lower() - assert "task" in prompt.lower() - - def test_build_initial_plan_prompt(self, planner): - """Test initial plan prompt building.""" - goal = "Build a web application" - - prompt = planner._build_initial_plan_prompt(goal) - - assert isinstance(prompt, str) - assert goal in prompt - assert "plan" in prompt.lower() - - def test_build_update_plan_prompt(self, planner): - """Test plan update prompt building.""" - goal = "Build a web application" - previous_plan = "Step 1: Create files" - execution_result = "Created files successfully" - review = "Good progress, 50% complete" - cycle = 2 - - prompt = planner._build_update_plan_prompt( - goal, previous_plan, execution_result, review, cycle - ) - - assert isinstance(prompt, str) - assert goal in prompt - assert str(cycle) in prompt - - def test_extract_plan(self, planner): - """Test plan extraction from output.""" - output = """ -# Project Plan - -## Tasks -1. Setup environment -2. Write code -3. Test - -This is the plan. 
-""" - - plan = planner._extract_plan(output) - - assert isinstance(plan, str) - assert "Tasks" in plan - assert "Setup environment" in plan - - def test_relevant_memory_types(self, planner): - """Test that planner requests relevant memory types.""" - types = planner._get_relevant_memory_types() - - assert isinstance(types, list) - # Planner should care about decisions and failed approaches - assert "decision" in types - assert "failed_approach" in types - - def test_build_memory_context_query(self, planner): - """Test memory context query building.""" - # Set execution context - planner._execution_context = { - "goal": "Build app", - "last_review": "Good progress" - } - - query = planner._build_memory_context_query() - - assert isinstance(query, str) - assert "Build app" in query - - @patch.object(PlannerAgent, '_execute_command') - def test_do_execute_success(self, mock_execute, planner): - """Test successful plan execution.""" - # Mock successful execution - mock_execute.return_value = { - "success": True, - "output": "# Plan\n\n1. Task 1\n2. Task 2" - } - - result = planner._do_execute( - project_dir="/tmp/test", - goal="Test goal", - cycle_number=0 - ) - - assert result["success"] is True - assert "plan" in result - assert "Task 1" in result["plan"] - - @patch.object(PlannerAgent, '_execute_command') - def test_do_execute_failure(self, mock_execute, planner): - """Test failed plan execution.""" - # Mock failed execution - mock_execute.return_value = { - "success": False, - "error": "Test error" - } - - result = planner._do_execute( - project_dir="/tmp/test", - goal="Test goal", - cycle_number=0 - ) - - assert result["success"] is False - assert "error" in result - - -class TestExecutorAgent: - """Test ExecutorAgent functionality.""" - - @pytest.fixture - def logger(self): - """Create test logger.""" - return logging.getLogger("test-executor") - - @pytest.fixture - def executor(self, logger): - """Create ExecutorAgent instance.""" - return ExecutorAgent(logger) - - def test_initialization(self, executor): - """Test ExecutorAgent initialization.""" - assert executor.agent_type == "executor" - assert executor.logger is not None - - def test_get_system_prompt(self, executor): - """Test that executor has proper system prompt.""" - prompt = executor.get_system_prompt() - - assert isinstance(prompt, str) - assert len(prompt) > 0 - - # Should mention execution responsibilities - assert "execut" in prompt.lower() - assert "code" in prompt.lower() or "implement" in prompt.lower() - - def test_build_execution_prompt(self, executor): - """Test execution prompt building.""" - goal = "Build a web application" - plan = "1. Create files\n2. 
Write code" - cycle = 1 - - prompt = executor._build_execution_prompt(goal, plan, cycle) - - assert isinstance(prompt, str) - assert goal in prompt - assert plan in prompt - assert str(cycle) in prompt - - def test_relevant_memory_types(self, executor): - """Test that executor requests relevant memory types.""" - types = executor._get_relevant_memory_types() - - assert isinstance(types, list) - # Executor should care about failed approaches and traces - assert "failed_approach" in types - assert "trace" in types - - def test_build_memory_context_query(self, executor): - """Test memory context query building.""" - # Set execution context - executor._execution_context = { - "plan": "Create files", - "goal": "Build app" - } - - query = executor._build_memory_context_query() - - assert isinstance(query, str) - assert "Create files" in query - - @patch.object(ExecutorAgent, '_execute_command') - def test_do_execute_success(self, mock_execute, executor): - """Test successful execution.""" - # Mock successful execution - mock_execute.return_value = { - "success": True, - "output": "Created files and wrote code successfully" - } - - result = executor._do_execute( - project_dir="/tmp/test", - goal="Test goal", - plan="Create files", - cycle_number=1 - ) - - assert result["success"] is True - assert "execution_result" in result - assert "successfully" in result["execution_result"] - - @patch.object(ExecutorAgent, '_execute_command') - def test_do_execute_failure(self, mock_execute, executor): - """Test failed execution.""" - # Mock failed execution - mock_execute.return_value = { - "success": False, - "error": "Test error" - } - - result = executor._do_execute( - project_dir="/tmp/test", - goal="Test goal", - plan="Create files", - cycle_number=1 - ) - - assert result["success"] is False - assert "error" in result - - -class TestReviewerAgent: - """Test ReviewerAgent functionality.""" - - @pytest.fixture - def logger(self): - """Create test logger.""" - return logging.getLogger("test-reviewer") - - @pytest.fixture - def reviewer(self, logger): - """Create ReviewerAgent instance.""" - return ReviewerAgent(logger) - - def test_initialization(self, reviewer): - """Test ReviewerAgent initialization.""" - assert reviewer.agent_type == "reviewer" - assert reviewer.logger is not None - - def test_get_system_prompt(self, reviewer): - """Test that reviewer has proper system prompt.""" - prompt = reviewer.get_system_prompt() - - assert isinstance(prompt, str) - assert len(prompt) > 0 - - # Should mention review responsibilities - assert "review" in prompt.lower() - assert "completion" in prompt.lower() or "progress" in prompt.lower() - - def test_build_review_prompt(self, reviewer): - """Test review prompt building.""" - goal = "Build a web application" - plan = "1. Create files\n2. 
Write code" - execution_result = "Created files" - cycle = 1 - - prompt = reviewer._build_review_prompt( - goal, plan, execution_result, cycle, is_validation=False - ) - - assert isinstance(prompt, str) - assert goal in prompt - assert plan in prompt - assert execution_result in prompt - - def test_build_review_prompt_validation_mode(self, reviewer): - """Test review prompt in validation mode.""" - prompt = reviewer._build_review_prompt( - "Test goal", "Test plan", "Test result", 5, is_validation=True - ) - - # Should include validation instructions - assert "VALIDATION" in prompt - assert "critical" in prompt.lower() or "thorough" in prompt.lower() - - def test_extract_completion_percentage_exact_format(self, reviewer): - """Test completion percentage extraction with exact format.""" - output = """ -Review Summary: -Project is progressing well. - -COMPLETION: 75% - -Next steps: Continue implementation. -""" - - percentage = reviewer._extract_completion_percentage(output) - assert percentage == 75 - - def test_extract_completion_percentage_case_insensitive(self, reviewer): - """Test completion percentage extraction is case insensitive.""" - output = "completion: 80%" - percentage = reviewer._extract_completion_percentage(output) - assert percentage == 80 - - def test_extract_completion_percentage_fallback(self, reviewer): - """Test completion percentage extraction fallback.""" - output = "The project is about 60% complete overall." - percentage = reviewer._extract_completion_percentage(output) - assert percentage == 60 - - def test_extract_completion_percentage_none(self, reviewer): - """Test completion percentage extraction when not found.""" - output = "Review: Looking good!" - percentage = reviewer._extract_completion_percentage(output) - assert percentage == 0 - - def test_extract_learnings(self, reviewer): - """Test learning extraction from review.""" - review = """ -Review summary: -Progress is good. - -LEARNING[pattern]: All API calls use async/await -LEARNING[decision]: Using SQLite for simpler deployment -LEARNING[failed_approach]: Tried bcrypt but had Node 18 issues -LEARNING[code_location]: Auth middleware in src/auth/jwt.js - -That's all. -""" - - learnings = reviewer._extract_learnings(review) - - assert len(learnings) == 4 - - # Check each learning - types = [l["type"] for l in learnings] - assert "pattern" in types - assert "decision" in types - assert "failed_approach" in types - assert "code_location" in types - - # Check content - contents = [l["content"] for l in learnings] - assert any("async/await" in c for c in contents) - assert any("SQLite" in c for c in contents) - - def test_extract_learnings_no_learnings(self, reviewer): - """Test learning extraction with no learnings.""" - review = "Just a simple review with no structured learnings." 
- - learnings = reviewer._extract_learnings(review) - - assert len(learnings) == 0 - - def test_relevant_memory_types(self, reviewer): - """Test that reviewer requests relevant memory types.""" - types = reviewer._get_relevant_memory_types() - - assert isinstance(types, list) - # Reviewer should care about patterns, decisions, learnings - assert "learning" in types - assert "decision" in types - assert "pattern" in types - - def test_build_memory_context_query(self, reviewer): - """Test memory context query building.""" - # Set execution context - reviewer._execution_context = { - "execution_result": "Files created", - "plan": "Create files" - } - - query = reviewer._build_memory_context_query() - - assert isinstance(query, str) - assert "Files created" in query - - @patch.object(ReviewerAgent, '_execute_command') - def test_do_execute_success(self, mock_execute, reviewer): - """Test successful review.""" - # Mock successful review - mock_execute.return_value = { - "success": True, - "output": "COMPLETION: 85%\nGood progress!\nLEARNING[pattern]: Using MVC" - } - - result = reviewer._do_execute( - project_dir="/tmp/test", - goal="Test goal", - plan="Test plan", - execution_result="Test result", - cycle_number=1 - ) - - assert result["success"] is True - assert "review" in result - assert "completion_percentage" in result - assert result["completion_percentage"] == 85 - assert "learnings" in result - assert len(result["learnings"]) == 1 - - @patch.object(ReviewerAgent, '_execute_command') - def test_do_execute_failure(self, mock_execute, reviewer): - """Test failed review.""" - # Mock failed review - mock_execute.return_value = { - "success": False, - "error": "Test error" - } - - result = reviewer._do_execute( - project_dir="/tmp/test", - goal="Test goal", - plan="Test plan", - execution_result="Test result", - cycle_number=1 - ) - - assert result["success"] is False - assert "error" in result - assert result["completion_percentage"] == 0 - assert len(result["learnings"]) == 0 - diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..bf0b76e --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,294 @@ +"""Unit tests for the fireteam API.""" + +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock, AsyncMock + +from fireteam.api import execute, COMPLEXITY_TO_MODE +from fireteam.models import ExecutionMode, ExecutionResult, _extract_completion, _extract_issues +from fireteam.prompts import EXECUTOR_PROMPT, REVIEWER_PROMPT, PLANNER_PROMPT +from fireteam.complexity import ComplexityLevel + + +class TestExecutionMode: + """Tests for ExecutionMode enum.""" + + def test_modes_exist(self): + """All expected execution modes exist.""" + assert ExecutionMode.SINGLE_TURN.value == "single_turn" + assert ExecutionMode.MODERATE.value == "moderate" + assert ExecutionMode.FULL.value == "full" + + def test_modes_count(self): + """Exactly 3 execution modes exist (SIMPLE removed).""" + assert len(ExecutionMode) == 3 + + +class TestExecutionResult: + """Tests for ExecutionResult dataclass.""" + + def test_success_result(self): + """Can create successful result.""" + result = ExecutionResult( + success=True, + mode=ExecutionMode.MODERATE, + output="Done", + completion_percentage=100, + ) + assert result.success is True + assert result.mode == ExecutionMode.MODERATE + assert result.output == "Done" + assert result.completion_percentage == 100 + + def test_failure_result(self): + """Can create failure result.""" + result = ExecutionResult( + success=False, + 
mode=ExecutionMode.FULL, + error="Something went wrong", + ) + assert result.success is False + assert result.error == "Something went wrong" + + def test_default_values(self): + """Default values are correct.""" + result = ExecutionResult(success=True, mode=ExecutionMode.SINGLE_TURN) + assert result.output is None + assert result.error is None + assert result.completion_percentage == 0 + assert result.metadata == {} + assert result.iterations == 0 + + +class TestComplexityToMode: + """Tests for complexity to mode mapping.""" + + def test_trivial_maps_to_single_turn(self): + """TRIVIAL maps to SINGLE_TURN.""" + assert COMPLEXITY_TO_MODE[ComplexityLevel.TRIVIAL] == ExecutionMode.SINGLE_TURN + + def test_simple_maps_to_single_turn(self): + """SIMPLE now maps to SINGLE_TURN (merged).""" + assert COMPLEXITY_TO_MODE[ComplexityLevel.SIMPLE] == ExecutionMode.SINGLE_TURN + + def test_moderate_maps_to_moderate(self): + """MODERATE maps to MODERATE.""" + assert COMPLEXITY_TO_MODE[ComplexityLevel.MODERATE] == ExecutionMode.MODERATE + + def test_complex_maps_to_full(self): + """COMPLEX maps to FULL.""" + assert COMPLEXITY_TO_MODE[ComplexityLevel.COMPLEX] == ExecutionMode.FULL + + def test_all_complexity_levels_mapped(self): + """All complexity levels have a mode mapping.""" + for level in ComplexityLevel: + assert level in COMPLEXITY_TO_MODE + + +class TestPrompts: + """Tests for system prompts.""" + + def test_executor_prompt_exists(self): + """Executor prompt exists and contains key guidance.""" + assert len(EXECUTOR_PROMPT) > 0 + assert "quality" in EXECUTOR_PROMPT.lower() or "work" in EXECUTOR_PROMPT.lower() + + def test_reviewer_prompt_exists(self): + """Reviewer prompt exists and asks for completion percentage.""" + assert len(REVIEWER_PROMPT) > 0 + assert "COMPLETION" in REVIEWER_PROMPT + + def test_reviewer_prompt_has_issues_format(self): + """Reviewer prompt includes ISSUES section format.""" + assert "ISSUES" in REVIEWER_PROMPT + + def test_planner_prompt_exists(self): + """Planner prompt exists and asks for analysis.""" + assert len(PLANNER_PROMPT) > 0 + assert "plan" in PLANNER_PROMPT.lower() + + +class TestExtractCompletion: + """Tests for completion percentage extraction.""" + + def test_extracts_completion_format(self): + """Extracts COMPLETION: XX% format.""" + text = "Review complete.\nCOMPLETION: 95%\nGood work." + assert _extract_completion(text) == 95 + + def test_handles_lowercase(self): + """Handles lowercase completion.""" + text = "completion: 80%" + assert _extract_completion(text) == 80 + + def test_handles_mixed_case(self): + """Handles mixed case.""" + text = "Completion: 75%" + assert _extract_completion(text) == 75 + + def test_fallback_to_any_percentage(self): + """Falls back to any percentage in text.""" + text = "I'd say this is about 60% done." + assert _extract_completion(text) == 60 + + def test_defaults_to_50(self): + """Defaults to 50 when no percentage found.""" + text = "I can't really say how done this is." + assert _extract_completion(text) == 50 + + +class TestExtractIssues: + """Tests for issues extraction.""" + + def test_extracts_issues_list(self): + """Extracts issues from ISSUES section.""" + text = """Review complete. 
+ +ISSUES: +- Missing error handling +- Tests not added +- Documentation incomplete + +COMPLETION: 70%""" + issues = _extract_issues(text) + assert len(issues) == 3 + assert "Missing error handling" in issues + assert "Tests not added" in issues + + def test_handles_asterisk_bullets(self): + """Handles * bullets in issues.""" + text = """ISSUES: +* Issue one +* Issue two + +COMPLETION: 80%""" + issues = _extract_issues(text) + assert len(issues) == 2 + + def test_empty_when_no_issues_section(self): + """Returns empty list when no ISSUES section.""" + text = "COMPLETION: 100%" + issues = _extract_issues(text) + assert issues == [] + + +class TestExecute: + """Tests for main execute function.""" + + @pytest.mark.asyncio + async def test_auto_detects_complexity(self, project_dir): + """Auto-detects complexity when mode is None.""" + mock_message = MagicMock() + mock_message.result = "Task completed." + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.api.estimate_complexity", return_value=ComplexityLevel.TRIVIAL): + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="Fix the typo", + mode=None, + run_tests=False, + ) + assert result.mode == ExecutionMode.SINGLE_TURN + + @pytest.mark.asyncio + async def test_uses_specified_mode(self, project_dir): + """Uses specified mode when provided.""" + mock_message = MagicMock() + mock_message.result = "Task completed." + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="Fix the bug", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + assert result.mode == ExecutionMode.SINGLE_TURN + + @pytest.mark.asyncio + async def test_single_turn_mode(self, project_dir): + """SINGLE_TURN mode makes single SDK call.""" + mock_message = MagicMock() + mock_message.result = "Done in one turn." + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="Fix typo", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + assert result.success is True + assert result.completion_percentage == 100 + assert result.iterations == 1 + + @pytest.mark.asyncio + async def test_handles_execution_error(self, project_dir): + """Handles execution errors gracefully.""" + async def mock_query(*args, **kwargs): + raise Exception("SDK error") + yield # Never reached + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="Do something", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + assert result.success is False + assert result.error is not None + + @pytest.mark.asyncio + async def test_includes_context_in_prompt(self, project_dir): + """Includes context in the prompt when provided.""" + captured_prompt = None + + async def mock_query(prompt, options): + nonlocal captured_prompt + captured_prompt = prompt + mock_message = MagicMock() + mock_message.result = "Done." 
+ yield mock_message + + with patch("fireteam.loops.query", mock_query): + await execute( + project_dir=project_dir, + goal="Fix bug", + context="Error: NullPointer at line 42", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + assert "NullPointer" in captured_prompt + + @pytest.mark.asyncio + async def test_resolves_path(self, project_dir): + """Resolves project_dir to absolute path.""" + mock_message = MagicMock() + mock_message.result = "Done." + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + yield mock_message + + with patch("fireteam.loops.query", mock_query): + await execute( + project_dir=str(project_dir), + goal="Task", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + # Should be absolute path + assert Path(captured_options.cwd).is_absolute() diff --git a/tests/test_base_agent_memory.py b/tests/test_base_agent_memory.py deleted file mode 100644 index 9105c5e..0000000 --- a/tests/test_base_agent_memory.py +++ /dev/null @@ -1,238 +0,0 @@ -""" -Unit tests for BaseAgent memory integration. -Tests execution context storage, automatic retrieval, and memory injection. -""" - -import pytest -import tempfile -import shutil -from pathlib import Path -import sys -from unittest.mock import Mock, MagicMock, patch - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from agents.base import BaseAgent -from memory.manager import MemoryManager - - -class ConcreteAgent(BaseAgent): - """Concrete implementation for testing.""" - - def get_system_prompt(self) -> str: - return "Test agent system prompt" - - def _do_execute(self, **kwargs): - """Simple implementation for testing.""" - return { - "success": True, - "test_result": "completed", - "kwargs_received": kwargs - } - - def _build_memory_context_query(self) -> str: - """Build context query from stored execution context.""" - goal = self._execution_context.get('goal', '') - plan = self._execution_context.get('plan', '') - return f"Working on: {goal}. 
Plan: {plan}" - - def _get_relevant_memory_types(self) -> list[str]: - return ["learning", "decision"] - - -@pytest.mark.slow -class TestBaseAgentMemoryIntegration: - """Test BaseAgent memory features (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def memory_manager(self, temp_memory_dir): - """Create MemoryManager instance.""" - return MemoryManager(memory_dir=temp_memory_dir) - - @pytest.fixture - def agent_with_memory(self, memory_manager): - """Create agent with memory manager.""" - return ConcreteAgent("test", memory_manager=memory_manager) - - @pytest.fixture - def agent_without_memory(self): - """Create agent without memory manager.""" - return ConcreteAgent("test", memory_manager=None) - - def test_execution_context_storage(self, agent_without_memory): - """Test that execute() stores kwargs in _execution_context.""" - kwargs = { - "project_dir": "/tmp/test", - "goal": "Test goal", - "plan": "Test plan", - "cycle_number": 5 - } - - agent_without_memory.execute(**kwargs) - - # Check context was stored - assert agent_without_memory._execution_context == kwargs - assert agent_without_memory._execution_context["goal"] == "Test goal" - assert agent_without_memory._execution_context["cycle_number"] == 5 - - def test_execute_calls_do_execute(self, agent_without_memory): - """Test that execute() properly calls _do_execute().""" - result = agent_without_memory.execute( - project_dir="/tmp/test", - goal="Test goal", - plan="Test plan" - ) - - # Should return result from _do_execute - assert result["success"] is True - assert result["test_result"] == "completed" - assert "kwargs_received" in result - - def test_memory_context_query_building(self, agent_with_memory): - """Test that agents can build context queries from execution context.""" - agent_with_memory._execution_context = { - "goal": "Build auth system", - "plan": "Implement JWT tokens" - } - - query = agent_with_memory._build_memory_context_query() - - assert "Build auth system" in query - assert "Implement JWT tokens" in query - - def test_retrieve_memories_without_memory_manager(self, agent_without_memory): - """Test that retrieval works gracefully without memory manager.""" - agent_without_memory._execution_context = {"goal": "Test"} - - memories = agent_without_memory._retrieve_and_format_memories() - - # Should return empty string - assert memories == "" - - def test_retrieve_memories_with_empty_query(self, agent_with_memory): - """Test retrieval with empty context query.""" - # Agent returns empty query - agent_with_memory._execution_context = {} - - memories = agent_with_memory._retrieve_and_format_memories() - - # Should return empty string - assert memories == "" - - def test_retrieve_and_format_memories(self, agent_with_memory, memory_manager): - """Test automatic memory retrieval and formatting.""" - project_dir = "/tmp/test-project" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add some memories - memory_manager.add_memory( - content="Authentication uses JWT tokens", - memory_type="decision", - cycle=1 - ) - memory_manager.add_memory( - content="All API calls use async/await pattern", - memory_type="learning", - cycle=2 - ) - - # Set execution context - agent_with_memory._execution_context = { - "goal": "Build authentication", - "plan": "Implement JWT middleware" - } - - # Retrieve memories - formatted = 
agent_with_memory._retrieve_and_format_memories() - - # Should contain formatted memories - assert "BACKGROUND KNOWLEDGE" in formatted - assert "JWT tokens" in formatted - assert "Cycle 1" in formatted or "Cycle 2" in formatted - - def test_memory_type_filtering(self, agent_with_memory, memory_manager): - """Test that agents retrieve only relevant memory types.""" - project_dir = "/tmp/test-project-types" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add different types - memory_manager.add_memory("Learning 1", "learning", 1) - memory_manager.add_memory("Decision 1", "decision", 1) - memory_manager.add_memory("Trace 1", "trace", 1) - memory_manager.add_memory("Failed 1", "failed_approach", 1) - - # Agent only wants learning and decision - agent_with_memory._execution_context = {"goal": "Test"} - - # Mock search to verify it's called with correct types - original_search = memory_manager.search - - def mock_search(query, limit=10, memory_types=None): - # Verify types passed - assert memory_types is not None - assert set(memory_types) == {"learning", "decision"} - return original_search(query, limit, memory_types) - - memory_manager.search = mock_search - - # Trigger retrieval - agent_with_memory._retrieve_and_format_memories() - - -@pytest.mark.slow -class TestMemoryInjection: - """Test memory injection into agent execution (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - def test_memory_injection_into_system_prompt(self, temp_memory_dir): - """Test that memories are injected into system prompt.""" - memory_manager = MemoryManager(memory_dir=temp_memory_dir) - agent = ConcreteAgent("test", memory_manager=memory_manager) - - # Initialize project and add memory - memory_manager.initialize_project("/tmp/test", "Test goal") - memory_manager.add_memory("Important context", "learning", 1) - - # Set execution context - agent._execution_context = {"goal": "Important context test"} - - # Mock _execute_with_sdk to capture enhanced prompt - captured_prompt = None - - async def mock_execute(prompt, project_dir): - nonlocal captured_prompt - # Get the enhanced system prompt from options - # This would be called inside _execute_with_sdk - memory_context = agent._retrieve_and_format_memories() - base_prompt = agent.get_system_prompt() - captured_prompt = base_prompt + "\n" + memory_context if memory_context else base_prompt - - return {"success": True, "output": "Test output", "error": None} - - with patch.object(agent, '_execute_with_sdk', side_effect=mock_execute): - with patch.object(agent, '_execute_command', return_value={"success": True, "output": "Test"}): - agent.execute(goal="Test") - - # Verify memory was retrieved and formatted - formatted = agent._retrieve_and_format_memories() - assert "Important context" in formatted - assert "BACKGROUND KNOWLEDGE" in formatted - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) - diff --git a/tests/test_cli_tools.py b/tests/test_cli_tools.py deleted file mode 100644 index 4c175ae..0000000 --- a/tests/test_cli_tools.py +++ /dev/null @@ -1,465 +0,0 @@ -""" -Tests for CLI tools. -Tests fireteam-status and other CLI utilities. 
-""" - -import pytest -import tempfile -import shutil -import json -import os -import sys -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock -from io import StringIO - -# Add CLI directory to path -sys.path.insert(0, str(Path(__file__).parent.parent / "cli")) - - -class TestFireteamStatus: - """Test fireteam-status CLI tool.""" - - @pytest.fixture - def temp_system_dir(self): - """Create temporary system directory.""" - temp_dir = Path(tempfile.mkdtemp(prefix="test-system-")) - - # Create subdirectories - (temp_dir / "state").mkdir() - (temp_dir / "logs").mkdir() - - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def mock_state_file(self, temp_system_dir): - """Create mock state file.""" - state_file = temp_system_dir / "state" / "current.json" - state_data = { - "project_dir": "/tmp/test-project", - "goal": "Build a test application", - "status": "executing", - "cycle_number": 5, - "completion_percentage": 75, - "git_branch": "fireteam-20250101-120000", - "started_at": "2025-01-01T12:00:00", - "updated_at": "2025-01-01T12:30:00", - "completed": False - } - - with open(state_file, 'w') as f: - json.dump(state_data, f) - - return state_file - - def test_import_fireteam_status(self): - """Test that fireteam-status can be imported.""" - # This is a sanity check - try: - # Can't easily import because of SYSTEM_DIR hardcoded path - # But we can read the file - status_file = Path(__file__).parent.parent / "cli" / "fireteam-status" - assert status_file.exists() - - content = status_file.read_text() - assert "def show_status" in content - assert "def load_state" in content - except Exception as e: - pytest.skip(f"Could not read fireteam-status: {e}") - - @patch('sys.argv', ['fireteam-status', '--help']) - def test_fireteam_status_help(self): - """Test fireteam-status help output.""" - # Import the module (this will be tricky due to hardcoded paths) - # For now, just verify file structure - status_file = Path(__file__).parent.parent / "cli" / "fireteam-status" - assert status_file.exists() - - content = status_file.read_text() - # Check for key functions - assert "def main()" in content - assert "argparse" in content - assert "--watch" in content - assert "--logs" in content - - def test_check_process_running(self): - """Test check_process_running function.""" - # We'll test the logic, not the actual function - # since it has hardcoded paths - - # Current process should be running - current_pid = os.getpid() - - # Verify process exists - try: - os.kill(current_pid, 0) - is_running = True - except (OSError, ProcessLookupError): - is_running = False - - assert is_running is True - - # Invalid PID should not be running - fake_pid = 999999 - try: - os.kill(fake_pid, 0) - is_running = True - except (OSError, ProcessLookupError): - is_running = False - - assert is_running is False - - def test_format_timestamp(self): - """Test timestamp formatting logic.""" - from datetime import datetime - - # Test ISO format parsing - iso_timestamp = "2025-01-01T12:30:45" - dt = datetime.fromisoformat(iso_timestamp) - formatted = dt.strftime("%Y-%m-%d %H:%M:%S") - - assert formatted == "2025-01-01 12:30:45" - - def test_state_file_format(self, mock_state_file): - """Test state file can be parsed.""" - # Read and parse state file - with open(mock_state_file, 'r') as f: - state = json.load(f) - - # Verify required fields - assert "project_dir" in state - assert "goal" in state - assert "status" in state - assert "cycle_number" in state - assert 
"completion_percentage" in state - assert "started_at" in state - assert "updated_at" in state - - # Verify values - assert state["project_dir"] == "/tmp/test-project" - assert state["status"] == "executing" - assert state["cycle_number"] == 5 - assert state["completion_percentage"] == 75 - - -class TestCLIScripts: - """Test CLI shell scripts.""" - - def test_start_agent_script_exists(self): - """Test that start-agent script exists.""" - script_file = Path(__file__).parent.parent / "cli" / "start-agent" - assert script_file.exists() - - content = script_file.read_text() - # Check for key elements - assert "#!/bin/bash" in content - assert "--project-dir" in content - assert "--prompt" in content or "--goal" in content - - def test_stop_agent_script_exists(self): - """Test that stop-agent script exists.""" - script_file = Path(__file__).parent.parent / "cli" / "stop-agent" - assert script_file.exists() - - content = script_file.read_text() - # Check for key elements - assert "#!/bin/bash" in content - assert "PID" in content - assert "kill" in content - - def test_agent_progress_script_exists(self): - """Test that agent-progress script exists.""" - script_file = Path(__file__).parent.parent / "cli" / "agent-progress" - if script_file.exists(): - content = script_file.read_text() - assert len(content) > 0 - - -class TestCLIArgumentParsing: - """Test CLI argument parsing logic.""" - - def test_status_arguments(self): - """Test status command argument parsing.""" - import argparse - - # Simulate argument parsing for status command - parser = argparse.ArgumentParser() - parser.add_argument("--watch", action="store_true") - parser.add_argument("--interval", type=int, default=5) - parser.add_argument("--logs", action="store_true") - parser.add_argument("--follow", action="store_true") - parser.add_argument("--lines", type=int, default=20) - - # Test default - args = parser.parse_args([]) - assert args.watch is False - assert args.interval == 5 - assert args.logs is False - - # Test watch mode - args = parser.parse_args(["--watch"]) - assert args.watch is True - - # Test custom interval - args = parser.parse_args(["--watch", "--interval", "10"]) - assert args.watch is True - assert args.interval == 10 - - # Test logs - args = parser.parse_args(["--logs"]) - assert args.logs is True - - # Test follow - args = parser.parse_args(["--logs", "--follow"]) - assert args.logs is True - assert args.follow is True - - -class TestSystemResourceMonitoring: - """Test system resource monitoring functions.""" - - @patch('subprocess.check_output') - def test_memory_info_parsing(self, mock_subprocess): - """Test memory information parsing.""" - # Mock free -h output - mock_subprocess.return_value = """ total used free shared buff/cache available -Mem: 15Gi 8.0Gi 2.0Gi 500Mi 5.0Gi 10Gi -Swap: 2.0Gi 0.0Gi 2.0Gi""" - - output = mock_subprocess() - lines = output.strip().split('\n') - mem_data = lines[1].split() - - assert mem_data[1] == "15Gi" # total - assert mem_data[2] == "8.0Gi" # used - - @patch('subprocess.check_output') - def test_cpu_load_parsing(self, mock_subprocess): - """Test CPU load information parsing.""" - # Mock uptime output - mock_subprocess.return_value = " 12:30:45 up 10 days, 3:45, 2 users, load average: 1.23, 1.45, 1.67" - - output = mock_subprocess() - load = output.split('load average:')[1].strip() - - assert load == "1.23, 1.45, 1.67" - - @patch('subprocess.check_output') - def test_disk_usage_parsing(self, mock_subprocess): - """Test disk usage information parsing.""" - # Mock df -h output 
- mock_subprocess.return_value = """Filesystem Size Used Avail Use% Mounted on -/dev/sda1 100G 60G 40G 60% /""" - - output = mock_subprocess() - disk_line = output.strip().split('\n')[1] - disk_usage = disk_line.split()[4] - - assert disk_usage == "60%" - - -class TestPIDFileHandling: - """Test PID file handling.""" - - @pytest.fixture - def temp_pid_file(self): - """Create temporary PID file.""" - temp_file = Path(tempfile.mktemp(suffix=".pid")) - yield temp_file - if temp_file.exists(): - temp_file.unlink() - - def test_write_pid_file(self, temp_pid_file): - """Test writing PID to file.""" - pid = 12345 - temp_pid_file.write_text(str(pid)) - - # Read back - read_pid = int(temp_pid_file.read_text().strip()) - assert read_pid == pid - - def test_read_pid_file(self, temp_pid_file): - """Test reading PID from file.""" - pid = 67890 - temp_pid_file.write_text(f"{pid}\n") - - # Read back - read_pid = int(temp_pid_file.read_text().strip()) - assert read_pid == pid - - def test_pid_file_cleanup(self, temp_pid_file): - """Test PID file cleanup.""" - temp_pid_file.write_text("12345") - assert temp_pid_file.exists() - - # Cleanup - temp_pid_file.unlink() - assert not temp_pid_file.exists() - - -class TestLogFileHandling: - """Test log file handling.""" - - @pytest.fixture - def temp_log_dir(self): - """Create temporary log directory.""" - temp_dir = Path(tempfile.mkdtemp(prefix="test-logs-")) - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - def test_log_file_creation(self, temp_log_dir): - """Test log file creation.""" - log_file = temp_log_dir / "orchestrator_20250101_120000.log" - - # Write log content - log_content = "2025-01-01 12:00:00 - INFO - Starting system\n" - log_file.write_text(log_content) - - # Verify - assert log_file.exists() - assert log_file.read_text() == log_content - - def test_find_latest_log(self, temp_log_dir): - """Test finding latest log file.""" - # Create multiple log files - log1 = temp_log_dir / "orchestrator_20250101_120000.log" - log2 = temp_log_dir / "orchestrator_20250101_130000.log" - log3 = temp_log_dir / "orchestrator_20250101_140000.log" - - log1.write_text("Log 1") - log2.write_text("Log 2") - log3.write_text("Log 3") - - # Find latest - log_files = sorted(temp_log_dir.glob("orchestrator_*.log")) - latest_log = log_files[-1] - - assert latest_log == log3 - - def test_read_log_lines(self, temp_log_dir): - """Test reading specific number of log lines.""" - log_file = temp_log_dir / "test.log" - - # Write multiple lines - lines = [f"Line {i}\n" for i in range(50)] - log_file.write_text("".join(lines)) - - # Read last N lines - content = log_file.read_text().split('\n') - last_20 = content[-21:-1] # -1 excludes empty line at end - - assert len(last_20) == 20 - assert last_20[-1] == "Line 49" - - -class TestCLIErrorHandling: - """Test CLI error handling.""" - - def test_missing_state_file(self): - """Test handling of missing state file.""" - fake_path = Path("/tmp/nonexistent-state-file.json") - - # Should not crash when file doesn't exist - exists = fake_path.exists() - assert exists is False - - # Handling logic should check existence first - if not exists: - state = None - else: - with open(fake_path, 'r') as f: - state = json.load(f) - - assert state is None - - def test_invalid_json_state(self): - """Test handling of invalid JSON in state file.""" - temp_file = Path(tempfile.mktemp(suffix=".json")) - - try: - # Write invalid JSON - temp_file.write_text("{ invalid json }") - - # Try to parse - try: - with open(temp_file, 'r') as f: - state = 
json.load(f) - except json.JSONDecodeError: - state = None - - assert state is None - finally: - if temp_file.exists(): - temp_file.unlink() - - def test_missing_pid_file(self): - """Test handling of missing PID file.""" - fake_path = Path("/tmp/nonexistent.pid") - - # Should handle gracefully - if not fake_path.exists(): - running = False - else: - pid = int(fake_path.read_text().strip()) - # Check if process is running - try: - os.kill(pid, 0) - running = True - except (OSError, ProcessLookupError): - running = False - - assert running is False - - -class TestCLIOutputFormatting: - """Test CLI output formatting.""" - - def test_status_display_format(self): - """Test status display formatting.""" - # Test the format structure (without actually calling the function) - status_lines = [ - "=" * 60, - "🔥 FIRETEAM STATUS", - "=" * 60, - "", - "Status: ✅ RUNNING (PID: 12345)", - "", - "📁 Project State:", - "-" * 60, - " Project: /tmp/test-project", - " Goal: Build application", - " Status: EXECUTING", - " Cycle: 5", - " Completion: 75%", - ] - - # Verify formatting - assert len(status_lines) > 0 - assert "FIRETEAM STATUS" in status_lines[1] - - def test_goal_truncation(self): - """Test long goal string truncation.""" - long_goal = "A" * 100 - - # Truncate if too long - if len(long_goal) > 80: - truncated = long_goal[:77] + "..." - else: - truncated = long_goal - - assert len(truncated) == 80 - assert truncated.endswith("...") - - def test_timestamp_formatting(self): - """Test timestamp formatting.""" - from datetime import datetime - - iso_timestamp = "2025-01-01T12:30:45" - dt = datetime.fromisoformat(iso_timestamp) - formatted = dt.strftime("%Y-%m-%d %H:%M:%S") - - assert " " in formatted - assert ":" in formatted - assert "-" in formatted - diff --git a/tests/test_complexity.py b/tests/test_complexity.py new file mode 100644 index 0000000..c1ba490 --- /dev/null +++ b/tests/test_complexity.py @@ -0,0 +1,255 @@ +"""Unit tests for complexity estimation.""" + +import pytest +from unittest.mock import patch, AsyncMock, MagicMock + +from fireteam.complexity import ComplexityLevel, estimate_complexity, COMPLEXITY_PROMPT + + +class TestComplexityLevel: + """Tests for ComplexityLevel enum.""" + + def test_complexity_levels_exist(self): + """All expected complexity levels exist.""" + assert ComplexityLevel.TRIVIAL.value == "trivial" + assert ComplexityLevel.SIMPLE.value == "simple" + assert ComplexityLevel.MODERATE.value == "moderate" + assert ComplexityLevel.COMPLEX.value == "complex" + + def test_complexity_levels_count(self): + """Exactly 4 complexity levels exist.""" + assert len(ComplexityLevel) == 4 + + +class TestComplexityPrompt: + """Tests for the complexity prompt template.""" + + def test_prompt_has_placeholders(self): + """Prompt contains required placeholders.""" + assert "{goal}" in COMPLEXITY_PROMPT + assert "{context}" in COMPLEXITY_PROMPT + + def test_prompt_describes_levels(self): + """Prompt describes all complexity levels.""" + assert "TRIVIAL" in COMPLEXITY_PROMPT + assert "SIMPLE" in COMPLEXITY_PROMPT + assert "MODERATE" in COMPLEXITY_PROMPT + assert "COMPLEX" in COMPLEXITY_PROMPT + + def test_prompt_format_works(self): + """Prompt can be formatted with goal and context.""" + formatted = COMPLEXITY_PROMPT.format(goal="Fix a bug", context="Error logs") + assert "Fix a bug" in formatted + assert "Error logs" in formatted + + +class TestEstimateComplexity: + """Tests for estimate_complexity function.""" + + @pytest.mark.asyncio + async def test_returns_trivial(self): + """Returns 
TRIVIAL when model responds with TRIVIAL.""" + mock_message = MagicMock() + mock_message.result = "TRIVIAL" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("fix typo") + assert result == ComplexityLevel.TRIVIAL + + @pytest.mark.asyncio + async def test_returns_simple(self): + """Returns SIMPLE when model responds with SIMPLE.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("add logging") + assert result == ComplexityLevel.SIMPLE + + @pytest.mark.asyncio + async def test_returns_moderate(self): + """Returns MODERATE when model responds with MODERATE.""" + mock_message = MagicMock() + mock_message.result = "MODERATE" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("refactor auth module") + assert result == ComplexityLevel.MODERATE + + @pytest.mark.asyncio + async def test_returns_complex(self): + """Returns COMPLEX when model responds with COMPLEX.""" + mock_message = MagicMock() + mock_message.result = "COMPLEX" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("redesign the architecture") + assert result == ComplexityLevel.COMPLEX + + @pytest.mark.asyncio + async def test_handles_lowercase_response(self): + """Handles lowercase response.""" + mock_message = MagicMock() + mock_message.result = "moderate" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("some task") + assert result == ComplexityLevel.MODERATE + + @pytest.mark.asyncio + async def test_handles_response_with_extra_text(self): + """Handles response with extra text around the level.""" + mock_message = MagicMock() + mock_message.result = "I think this is COMPLEX because it involves many files." + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("big task") + assert result == ComplexityLevel.COMPLEX + + @pytest.mark.asyncio + async def test_defaults_to_simple_on_unclear_response(self): + """Defaults to SIMPLE when response is unclear.""" + mock_message = MagicMock() + mock_message.result = "I'm not sure how to classify this." 
+ + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("ambiguous task") + assert result == ComplexityLevel.SIMPLE + + @pytest.mark.asyncio + async def test_defaults_to_simple_on_empty_response(self): + """Defaults to SIMPLE when response is empty.""" + mock_message = MagicMock() + mock_message.result = "" + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + result = await estimate_complexity("task") + assert result == ComplexityLevel.SIMPLE + + @pytest.mark.asyncio + async def test_context_is_included_in_prompt(self): + """Context is included when provided.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + captured_prompt = None + + async def mock_query(prompt, **kwargs): + nonlocal captured_prompt + captured_prompt = prompt + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("fix bug", context="Error: NullPointer") + + assert "Error: NullPointer" in captured_prompt + + @pytest.mark.asyncio + async def test_no_context_shows_none_provided(self): + """Shows 'None provided' when no context given.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + captured_prompt = None + + async def mock_query(prompt, **kwargs): + nonlocal captured_prompt + captured_prompt = prompt + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("fix bug") + + assert "None provided" in captured_prompt + + @pytest.mark.asyncio + async def test_uses_no_tools_without_project_dir(self): + """Without project_dir, estimation uses no tools.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("task") + + assert captured_options.allowed_tools == [] + + @pytest.mark.asyncio + async def test_uses_single_turn_without_project_dir(self): + """Without project_dir, estimation uses max_turns=1.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("task") + + assert captured_options.max_turns == 1 + + @pytest.mark.asyncio + async def test_uses_exploration_tools_with_project_dir(self, project_dir): + """With project_dir, estimation uses read-only exploration tools.""" + mock_message = MagicMock() + mock_message.result = "MODERATE" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("refactor auth", project_dir=project_dir) + + assert set(captured_options.allowed_tools) == {"Glob", "Grep", "Read"} + assert captured_options.permission_mode == "plan" + + @pytest.mark.asyncio + async def test_sets_cwd_with_project_dir(self, project_dir): + """With project_dir, estimation sets cwd for tool access.""" + mock_message = MagicMock() + mock_message.result = "SIMPLE" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = 
options + yield mock_message + + with patch("fireteam.complexity.query", mock_query): + await estimate_complexity("task", project_dir=project_dir) + + from pathlib import Path + assert Path(captured_options.cwd).is_absolute() diff --git a/tests/test_config.py b/tests/test_config.py deleted file mode 100644 index 7dea6b1..0000000 --- a/tests/test_config.py +++ /dev/null @@ -1,254 +0,0 @@ -""" -Unit tests for configuration module. -Tests environment variable loading, validation, and configuration values. -""" - -import pytest -import os -from unittest.mock import patch -import sys -from pathlib import Path - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - - -class TestConfig: - """Test configuration module.""" - - def test_system_directories(self): - """Test that system directories are configured.""" - import config - - # System directory should be set - assert config.SYSTEM_DIR is not None - assert isinstance(config.SYSTEM_DIR, str) - - # Derived directories should be set - assert config.STATE_DIR is not None - assert config.LOGS_DIR is not None - assert config.CLI_DIR is not None - assert config.MEMORY_DIR is not None - - # Paths should be properly constructed - assert config.SYSTEM_DIR in config.STATE_DIR - assert config.SYSTEM_DIR in config.LOGS_DIR - assert config.SYSTEM_DIR in config.CLI_DIR - assert config.SYSTEM_DIR in config.MEMORY_DIR - - @patch.dict(os.environ, {"FIRETEAM_DIR": "/custom/path"}, clear=False) - def test_custom_system_dir(self): - """Test FIRETEAM_DIR environment variable override.""" - # Need to reimport to pick up env var - import importlib - import config as config_module - importlib.reload(config_module) - - # Should use custom path - assert "/custom/path" in config_module.SYSTEM_DIR or config_module.SYSTEM_DIR == "/custom/path" - - def test_anthropic_api_key_function(self): - """Test Anthropic API key lazy loading.""" - import config - - # Should have the function - assert hasattr(config, 'get_anthropic_api_key') - assert callable(config.get_anthropic_api_key) - - # If ANTHROPIC_API_KEY is set, should return it - if os.getenv("ANTHROPIC_API_KEY"): - api_key = config.get_anthropic_api_key() - assert api_key is not None - assert isinstance(api_key, str) - assert len(api_key) > 0 - - @patch.dict(os.environ, {}, clear=False) - @patch("os.getenv", side_effect=lambda key, default=None: default if key == "ANTHROPIC_API_KEY" else os.environ.get(key, default)) - def test_anthropic_api_key_missing(self, mock_getenv): - """Test that missing API key raises error when accessed.""" - import importlib - import config as config_module - importlib.reload(config_module) - - # Should raise ValueError when accessed - with pytest.raises(ValueError, match="ANTHROPIC_API_KEY"): - config_module.get_anthropic_api_key() - - def test_sdk_configuration(self): - """Test Claude SDK configuration values.""" - import config - - # SDK tools should be defined - assert hasattr(config, 'SDK_ALLOWED_TOOLS') - assert isinstance(config.SDK_ALLOWED_TOOLS, list) - assert len(config.SDK_ALLOWED_TOOLS) > 0 - - # Should include essential tools - assert "Read" in config.SDK_ALLOWED_TOOLS - assert "Write" in config.SDK_ALLOWED_TOOLS - assert "Bash" in config.SDK_ALLOWED_TOOLS - - # Permission mode should be set - assert hasattr(config, 'SDK_PERMISSION_MODE') - assert config.SDK_PERMISSION_MODE == "bypassPermissions" - - # Model should be set - assert hasattr(config, 'SDK_MODEL') - assert isinstance(config.SDK_MODEL, str) - assert "claude" in config.SDK_MODEL.lower() - - 
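Every unit test in the new `tests/test_complexity.py` hand-rolls the same stub: an async generator patched over `fireteam.complexity.query` that yields a single message whose `.result` holds the model's reply. A small factory could fold that boilerplate into one place. The sketch below is hypothetical (`make_mock_query` is not part of this diff) and assumes only the call shapes visible in these tests.

```python
"""Sketch of a reusable stub for the SDK `query` call mocked throughout
tests/test_complexity.py. `make_mock_query` is a hypothetical helper, not
part of this diff; it mirrors the inline async generators above."""
from unittest.mock import MagicMock, patch

import pytest

from fireteam.complexity import ComplexityLevel, estimate_complexity


def make_mock_query(result_text: str):
    """Build an async-generator stand-in for `query` that yields one message
    whose `.result` is `result_text`, capturing the prompt and options."""
    captured = {}

    async def mock_query(prompt, options=None, **kwargs):
        captured["prompt"] = prompt
        captured["options"] = options
        message = MagicMock()
        message.result = result_text
        yield message

    mock_query.captured = captured  # expose captures for later assertions
    return mock_query


@pytest.mark.asyncio
async def test_moderate_with_helper():
    mock_query = make_mock_query("MODERATE")
    with patch("fireteam.complexity.query", mock_query):
        level = await estimate_complexity("refactor auth module")
    assert level == ComplexityLevel.MODERATE
    # Captured values stay available for extra assertions on prompt/options.
    assert mock_query.captured["prompt"]
```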
def test_agent_configuration(self): - """Test agent-related configuration.""" - import config - - # Retry configuration - assert hasattr(config, 'MAX_RETRIES') - assert isinstance(config.MAX_RETRIES, int) - assert config.MAX_RETRIES > 0 - - assert hasattr(config, 'RETRY_DELAY') - assert isinstance(config.RETRY_DELAY, (int, float)) - assert config.RETRY_DELAY > 0 - - def test_agent_timeouts(self): - """Test agent timeout configurations.""" - import config - - # Timeouts dictionary should exist - assert hasattr(config, 'AGENT_TIMEOUTS') - assert isinstance(config.AGENT_TIMEOUTS, dict) - - # Should have timeouts for each agent type - assert "planner" in config.AGENT_TIMEOUTS - assert "executor" in config.AGENT_TIMEOUTS - assert "reviewer" in config.AGENT_TIMEOUTS - - # All timeouts should be positive integers - for agent_type, timeout in config.AGENT_TIMEOUTS.items(): - assert isinstance(timeout, int) - assert timeout > 0 - - # Executor should have longest timeout (builds, tests, etc.) - assert config.AGENT_TIMEOUTS["executor"] >= config.AGENT_TIMEOUTS["planner"] - assert config.AGENT_TIMEOUTS["executor"] >= config.AGENT_TIMEOUTS["reviewer"] - - def test_completion_thresholds(self): - """Test completion threshold configurations.""" - import config - - # Completion threshold - assert hasattr(config, 'COMPLETION_THRESHOLD') - assert isinstance(config.COMPLETION_THRESHOLD, int) - assert 0 <= config.COMPLETION_THRESHOLD <= 100 - - # Validation checks - assert hasattr(config, 'VALIDATION_CHECKS_REQUIRED') - assert isinstance(config.VALIDATION_CHECKS_REQUIRED, int) - assert config.VALIDATION_CHECKS_REQUIRED > 0 - - def test_git_configuration(self): - """Test git-related configuration.""" - import config - - # Git user configuration - assert hasattr(config, 'GIT_USER_NAME') - assert isinstance(config.GIT_USER_NAME, str) - assert len(config.GIT_USER_NAME) > 0 - - assert hasattr(config, 'GIT_USER_EMAIL') - assert isinstance(config.GIT_USER_EMAIL, str) - assert "@" in config.GIT_USER_EMAIL - - def test_logging_configuration(self): - """Test logging configuration.""" - import config - - # Log level should be set - assert hasattr(config, 'LOG_LEVEL') - assert isinstance(config.LOG_LEVEL, str) - assert config.LOG_LEVEL in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] - - # Log format should be set - assert hasattr(config, 'LOG_FORMAT') - assert isinstance(config.LOG_FORMAT, str) - assert len(config.LOG_FORMAT) > 0 - - def test_sudo_configuration(self): - """Test sudo password configuration.""" - import config - - # Should have sudo password attribute - assert hasattr(config, 'SUDO_PASSWORD') - - # has_sudo_access function should exist - assert hasattr(config, 'has_sudo_access') - assert callable(config.has_sudo_access) - - # Function should return boolean - result = config.has_sudo_access() - assert isinstance(result, bool) - - def test_memory_configuration(self): - """Test memory system configuration.""" - import config - - # Memory directory should be set - assert hasattr(config, 'MEMORY_DIR') - assert isinstance(config.MEMORY_DIR, str) - - # Embedding model should be configured - assert hasattr(config, 'MEMORY_EMBEDDING_MODEL') - assert isinstance(config.MEMORY_EMBEDDING_MODEL, str) - assert len(config.MEMORY_EMBEDDING_MODEL) > 0 - - # Search limit should be set - assert hasattr(config, 'MEMORY_SEARCH_LIMIT') - assert isinstance(config.MEMORY_SEARCH_LIMIT, int) - assert config.MEMORY_SEARCH_LIMIT > 0 - - @patch.dict(os.environ, {"ANTHROPIC_MODEL": "claude-opus-4-20250514"}, clear=False) - def 
test_model_override(self): - """Test that model can be overridden via environment variable.""" - import importlib - import config as config_module - importlib.reload(config_module) - - # Should use overridden model - assert config_module.SDK_MODEL == "claude-opus-4-20250514" - - @patch.dict(os.environ, {"LOG_LEVEL": "DEBUG"}, clear=False) - def test_log_level_override(self): - """Test that log level can be overridden via environment variable.""" - import importlib - import config as config_module - importlib.reload(config_module) - - # Should use overridden log level - assert config_module.LOG_LEVEL == "DEBUG" - - def test_configuration_types(self): - """Test that all configuration values have correct types.""" - import config - - # String configurations - assert isinstance(config.SYSTEM_DIR, str) - assert isinstance(config.SDK_PERMISSION_MODE, str) - assert isinstance(config.SDK_MODEL, str) - assert isinstance(config.GIT_USER_NAME, str) - assert isinstance(config.GIT_USER_EMAIL, str) - assert isinstance(config.LOG_LEVEL, str) - assert isinstance(config.LOG_FORMAT, str) - assert isinstance(config.MEMORY_EMBEDDING_MODEL, str) - - # Integer configurations - assert isinstance(config.MAX_RETRIES, int) - assert isinstance(config.COMPLETION_THRESHOLD, int) - assert isinstance(config.VALIDATION_CHECKS_REQUIRED, int) - assert isinstance(config.MEMORY_SEARCH_LIMIT, int) - - # List configurations - assert isinstance(config.SDK_ALLOWED_TOOLS, list) - - # Dict configurations - assert isinstance(config.AGENT_TIMEOUTS, dict) - diff --git a/tests/test_e2e_hello_world.py b/tests/test_e2e_hello_world.py deleted file mode 100644 index 9e7de46..0000000 --- a/tests/test_e2e_hello_world.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -End-to-end test for Fireteam completing a real task. -Spawns actual Fireteam subprocess and validates task completion. -""" - -import pytest -import subprocess -import sys -from pathlib import Path - -# Add parent to path for helpers -sys.path.insert(0, str(Path(__file__).parent)) -from helpers import FireteamTestRunner - - -@pytest.mark.e2e -@pytest.mark.slow -class TestHelloWorldEndToEnd: - """End-to-end test of Fireteam completing a simple task.""" - - def test_hello_world_completion(self, isolated_tmp_dir, isolated_system_dirs): - """Test Fireteam completes hello world task.""" - project_dir = isolated_tmp_dir / "project" - project_dir.mkdir() - - runner = FireteamTestRunner(project_dir, isolated_system_dirs) - - result = runner.run( - goal="Create a file called hello_world.py that prints 'Hello, World!' when run", - timeout=300, - keep_memory=True # Keep for debugging on failure - ) - - # Print result summary for observability - print(f"\n{result}") - - # Use structured assertions with helpful error messages - assert result.success, ( - f"Fireteam failed to complete task.\n" - f"Return code: {result.returncode}\n" - f"Last 30 log lines:\n" + "\n".join(result.logs.splitlines()[-30:]) - ) - - # Verify file was created - hello_file = project_dir / "hello_world.py" - assert hello_file.exists(), ( - f"hello_world.py not found in {project_dir}\n" - f"Files created: {result.files_created}" - ) - - # Verify output - output = subprocess.run( - [sys.executable, "hello_world.py"], - cwd=project_dir, - capture_output=True, - text=True - ) - assert "Hello, World!" 
in output.stdout, ( - f"Unexpected output: {output.stdout}\n" - f"stderr: {output.stderr}" - ) - - # Verify git history - assert result.git_commits > 0, "No git commits found" - - # Verify reasonable metrics - assert result.cycle_count >= 1, "No cycles detected" - assert result.final_completion >= 95, f"Completion only {result.final_completion}%" - diff --git a/tests/test_hooks.py b/tests/test_hooks.py new file mode 100644 index 0000000..d530e92 --- /dev/null +++ b/tests/test_hooks.py @@ -0,0 +1,373 @@ +"""Unit tests for SDK hooks.""" + +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock +import subprocess + +from fireteam.hooks import ( + detect_test_command, + run_tests_sync, + run_tests_after_edit, + block_user_questions, + log_tool_usage, + create_test_hooks, + QUALITY_HOOKS, + AUTONOMOUS_HOOKS, + DEBUG_HOOKS, + DEFAULT_TEST_COMMANDS, +) + + +class TestDetectTestCommand: + """Tests for test command detection.""" + + def test_detects_pytest_ini(self, isolated_tmp_dir): + """Detects Python project with pytest.ini.""" + (isolated_tmp_dir / "pytest.ini").write_text("[pytest]") + result = detect_test_command(isolated_tmp_dir) + assert result == ["pytest", "-x", "--tb=short"] + + def test_detects_pyproject_toml(self, isolated_tmp_dir): + """Detects Python project with pyproject.toml.""" + (isolated_tmp_dir / "pyproject.toml").write_text("[project]") + result = detect_test_command(isolated_tmp_dir) + assert result == ["pytest", "-x", "--tb=short"] + + def test_detects_setup_py(self, isolated_tmp_dir): + """Detects Python project with setup.py.""" + (isolated_tmp_dir / "setup.py").write_text("from setuptools import setup") + result = detect_test_command(isolated_tmp_dir) + assert result == ["pytest", "-x", "--tb=short"] + + def test_detects_tests_directory(self, isolated_tmp_dir): + """Detects Python project with tests/ directory.""" + (isolated_tmp_dir / "tests").mkdir() + result = detect_test_command(isolated_tmp_dir) + assert result == ["pytest", "-x", "--tb=short"] + + def test_detects_nodejs(self, isolated_tmp_dir): + """Detects Node.js project with package.json.""" + (isolated_tmp_dir / "package.json").write_text('{"name": "test"}') + result = detect_test_command(isolated_tmp_dir) + assert result == ["npm", "test"] + + def test_detects_rust(self, isolated_tmp_dir): + """Detects Rust project with Cargo.toml.""" + (isolated_tmp_dir / "Cargo.toml").write_text("[package]") + result = detect_test_command(isolated_tmp_dir) + assert result == ["cargo", "test"] + + def test_detects_go(self, isolated_tmp_dir): + """Detects Go project with go.mod.""" + (isolated_tmp_dir / "go.mod").write_text("module test") + result = detect_test_command(isolated_tmp_dir) + assert result == ["go", "test", "./..."] + + def test_detects_makefile_with_test(self, isolated_tmp_dir): + """Detects Makefile with test target.""" + (isolated_tmp_dir / "Makefile").write_text("test:\n\techo 'testing'") + result = detect_test_command(isolated_tmp_dir) + assert result == ["make", "test"] + + def test_ignores_makefile_without_test(self, isolated_tmp_dir): + """Ignores Makefile without test target.""" + (isolated_tmp_dir / "Makefile").write_text("build:\n\techo 'building'") + result = detect_test_command(isolated_tmp_dir) + assert result is None + + def test_returns_none_for_unknown_project(self, isolated_tmp_dir): + """Returns None for unknown project type.""" + result = detect_test_command(isolated_tmp_dir) + assert result is None + + def test_python_takes_priority(self, isolated_tmp_dir): + 
"""Python detection takes priority over other frameworks.""" + # Create both Python and Node.js markers + (isolated_tmp_dir / "pyproject.toml").write_text("[project]") + (isolated_tmp_dir / "package.json").write_text('{"name": "test"}') + result = detect_test_command(isolated_tmp_dir) + assert result == ["pytest", "-x", "--tb=short"] + + +class TestRunTestsSync: + """Tests for synchronous test execution.""" + + def test_returns_success_on_zero_exit(self, isolated_tmp_dir): + """Returns success=True when command exits with 0.""" + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="All tests passed", + stderr="" + ) + success, output = run_tests_sync(isolated_tmp_dir, ["pytest"]) + assert success is True + assert "All tests passed" in output + + def test_returns_failure_on_nonzero_exit(self, isolated_tmp_dir): + """Returns success=False when command exits with non-zero.""" + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=1, + stdout="", + stderr="1 test failed" + ) + success, output = run_tests_sync(isolated_tmp_dir, ["pytest"]) + assert success is False + assert "1 test failed" in output + + def test_handles_timeout(self, isolated_tmp_dir): + """Handles test timeout gracefully.""" + with patch("subprocess.run") as mock_run: + mock_run.side_effect = subprocess.TimeoutExpired(cmd="pytest", timeout=120) + success, output = run_tests_sync(isolated_tmp_dir, ["pytest"], timeout=120) + assert success is False + assert "timed out" in output + + def test_handles_command_not_found(self, isolated_tmp_dir): + """Handles missing command gracefully.""" + with patch("subprocess.run") as mock_run: + mock_run.side_effect = FileNotFoundError() + success, output = run_tests_sync(isolated_tmp_dir, ["nonexistent"]) + assert success is False + assert "not found" in output + + def test_handles_generic_error(self, isolated_tmp_dir): + """Handles generic errors gracefully.""" + with patch("subprocess.run") as mock_run: + mock_run.side_effect = Exception("Something went wrong") + success, output = run_tests_sync(isolated_tmp_dir, ["pytest"]) + assert success is False + assert "Error" in output + + def test_combines_stdout_and_stderr(self, isolated_tmp_dir): + """Combines stdout and stderr in output.""" + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock( + returncode=0, + stdout="stdout content", + stderr="stderr content" + ) + success, output = run_tests_sync(isolated_tmp_dir, ["pytest"]) + assert "stdout content" in output + assert "stderr content" in output + + +class TestRunTestsAfterEdit: + """Tests for PostToolUse test running hook.""" + + @pytest.mark.asyncio + async def test_ignores_non_post_tool_use(self): + """Ignores events that aren't PostToolUse.""" + result = await run_tests_after_edit( + {"hook_event_name": "PreToolUse"}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_ignores_non_edit_write_tools(self): + """Ignores tools other than Edit/Write.""" + result = await run_tests_after_edit( + {"hook_event_name": "PostToolUse", "tool_name": "Read"}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_ignores_missing_cwd(self): + """Ignores when cwd is not provided.""" + result = await run_tests_after_edit( + {"hook_event_name": "PostToolUse", "tool_name": "Edit", "cwd": ""}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_ignores_no_test_framework(self, isolated_tmp_dir): + 
"""Ignores when no test framework is detected.""" + result = await run_tests_after_edit( + { + "hook_event_name": "PostToolUse", + "tool_name": "Edit", + "cwd": str(isolated_tmp_dir), + }, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_returns_empty_on_success(self, isolated_tmp_dir): + """Returns empty dict when tests pass.""" + (isolated_tmp_dir / "pyproject.toml").write_text("[project]") + + with patch("fireteam.hooks.run_tests_sync", return_value=(True, "All passed")): + result = await run_tests_after_edit( + { + "hook_event_name": "PostToolUse", + "tool_name": "Edit", + "cwd": str(isolated_tmp_dir), + "tool_input": {"file_path": "test.py"}, + }, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_returns_feedback_on_failure(self, isolated_tmp_dir): + """Returns feedback when tests fail.""" + (isolated_tmp_dir / "pyproject.toml").write_text("[project]") + + with patch("fireteam.hooks.run_tests_sync", return_value=(False, "1 test failed")): + result = await run_tests_after_edit( + { + "hook_event_name": "PostToolUse", + "tool_name": "Edit", + "cwd": str(isolated_tmp_dir), + "tool_input": {"file_path": "test.py"}, + }, + None, + None + ) + assert "hookSpecificOutput" in result + assert "Tests failed" in result["hookSpecificOutput"]["additionalContext"] + + @pytest.mark.asyncio + async def test_truncates_long_output(self, isolated_tmp_dir): + """Truncates output longer than 2000 chars.""" + (isolated_tmp_dir / "pyproject.toml").write_text("[project]") + long_output = "x" * 3000 + + with patch("fireteam.hooks.run_tests_sync", return_value=(False, long_output)): + result = await run_tests_after_edit( + { + "hook_event_name": "PostToolUse", + "tool_name": "Edit", + "cwd": str(isolated_tmp_dir), + "tool_input": {"file_path": "test.py"}, + }, + None, + None + ) + context = result["hookSpecificOutput"]["additionalContext"] + assert "truncated" in context + assert len(context) < 3000 + + +class TestBlockUserQuestions: + """Tests for PreToolUse AskUserQuestion blocking hook.""" + + @pytest.mark.asyncio + async def test_ignores_non_pre_tool_use(self): + """Ignores events that aren't PreToolUse.""" + result = await block_user_questions( + {"hook_event_name": "PostToolUse"}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_ignores_other_tools(self): + """Ignores tools other than AskUserQuestion.""" + result = await block_user_questions( + {"hook_event_name": "PreToolUse", "tool_name": "Edit"}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_blocks_ask_user_question(self): + """Blocks AskUserQuestion with deny decision.""" + result = await block_user_questions( + {"hook_event_name": "PreToolUse", "tool_name": "AskUserQuestion"}, + None, + None + ) + assert "hookSpecificOutput" in result + output = result["hookSpecificOutput"] + assert output["permissionDecision"] == "deny" + assert "autonomous" in output["permissionDecisionReason"].lower() + + +class TestLogToolUsage: + """Tests for debug logging hook.""" + + @pytest.mark.asyncio + async def test_ignores_non_post_tool_use(self): + """Ignores events that aren't PostToolUse.""" + result = await log_tool_usage( + {"hook_event_name": "PreToolUse"}, + None, + None + ) + assert result == {} + + @pytest.mark.asyncio + async def test_returns_empty_dict(self): + """Always returns empty dict (just logs).""" + result = await log_tool_usage( + {"hook_event_name": "PostToolUse", "tool_name": "Edit", "tool_input": {}}, + None, + 
None + ) + assert result == {} + + +class TestCreateTestHooks: + """Tests for hook configuration factory.""" + + def test_returns_dict_with_pre_and_post(self): + """Returns dict with PreToolUse and PostToolUse keys.""" + hooks = create_test_hooks() + assert "PreToolUse" in hooks + assert "PostToolUse" in hooks + + def test_pre_tool_use_blocks_questions(self): + """PreToolUse contains AskUserQuestion blocker.""" + hooks = create_test_hooks() + pre_hooks = hooks["PreToolUse"] + assert len(pre_hooks) > 0 + + def test_post_tool_use_runs_tests(self): + """PostToolUse contains test runner.""" + hooks = create_test_hooks() + post_hooks = hooks["PostToolUse"] + assert len(post_hooks) > 0 + + +class TestPreConfiguredHooks: + """Tests for pre-configured hook sets.""" + + def test_quality_hooks_has_pre_and_post(self): + """QUALITY_HOOKS has both PreToolUse and PostToolUse.""" + assert "PreToolUse" in QUALITY_HOOKS + assert "PostToolUse" in QUALITY_HOOKS + + def test_autonomous_hooks_has_pre(self): + """AUTONOMOUS_HOOKS has PreToolUse.""" + assert "PreToolUse" in AUTONOMOUS_HOOKS + + def test_debug_hooks_has_post(self): + """DEBUG_HOOKS has PostToolUse.""" + assert "PostToolUse" in DEBUG_HOOKS + + +class TestDefaultTestCommands: + """Tests for default test commands list.""" + + def test_includes_common_frameworks(self): + """Includes commands for common test frameworks.""" + commands_flat = [cmd[0] for cmd in DEFAULT_TEST_COMMANDS] + assert "pytest" in commands_flat + assert "npm" in commands_flat + assert "cargo" in commands_flat + assert "go" in commands_flat + assert "make" in commands_flat diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..4d1a4e7 --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,400 @@ +"""Integration tests for fireteam. + +These tests verify the full execution flow with mocked SDK calls. +Run with --run-integration for tests that require API keys. +""" + +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock + +from fireteam.api import execute +from fireteam.models import ExecutionMode, ExecutionResult +from fireteam.complexity import ComplexityLevel, estimate_complexity + + +class TestComplexityToExecutionFlow: + """Tests for complexity estimation to execution mode flow.""" + + @pytest.mark.asyncio + async def test_trivial_task_uses_single_turn(self, project_dir): + """Trivial tasks use SINGLE_TURN mode.""" + # Mock complexity estimation to return TRIVIAL + with patch("fireteam.api.estimate_complexity", return_value=ComplexityLevel.TRIVIAL): + # Mock SDK query + mock_message = MagicMock() + mock_message.result = "Fixed the typo." + + async def mock_query(*args, **kwargs): + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="fix typo in readme", + run_tests=False, + ) + + assert result.success is True + assert result.mode == ExecutionMode.SINGLE_TURN + + @pytest.mark.asyncio + async def test_complex_task_uses_full_mode(self, project_dir): + """Complex tasks use FULL mode with planning and review.""" + call_prompts = [] + + async def mock_query(prompt, options): + call_prompts.append(prompt) + mock_message = MagicMock() + # Return different responses based on call + if len(call_prompts) == 1: # Planning + mock_message.result = "Plan: 1. Analyze 2. Implement 3. Test" + elif len(call_prompts) == 2: # Execution + mock_message.result = "Implemented the feature." 
+ else: # Reviews (3 parallel) + mock_message.result = "COMPLETION: 98%" + yield mock_message + + with patch("fireteam.api.estimate_complexity", return_value=ComplexityLevel.COMPLEX): + with patch("fireteam.loops.query", mock_query): + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="redesign the authentication system", + run_tests=False, + ) + + # Should have at least 3 calls: plan, execute, reviews + assert len(call_prompts) >= 3 + assert result.mode == ExecutionMode.FULL + + +class TestExecutionWithContext: + """Tests for execution with additional context.""" + + @pytest.mark.asyncio + async def test_context_flows_to_execution(self, project_dir): + """Context is included in execution prompt.""" + captured_prompts = [] + + async def mock_query(prompt, options): + captured_prompts.append(prompt) + mock_message = MagicMock() + mock_message.result = "Fixed based on crash logs." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + await execute( + project_dir=project_dir, + goal="fix the crash", + context="Error: NullPointerException at auth.py:42", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + + # Context should be in the prompt + assert any("NullPointerException" in p for p in captured_prompts) + + +class TestHooksIntegration: + """Tests for hooks integration with execution.""" + + @pytest.mark.asyncio + async def test_quality_hooks_enabled_by_default(self, project_dir): + """Quality hooks are enabled when run_tests=True.""" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + mock_message = MagicMock() + mock_message.result = "Done." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + await execute( + project_dir=project_dir, + goal="add feature", + mode=ExecutionMode.SINGLE_TURN, + run_tests=True, # Default + ) + + # Hooks should be configured + assert captured_options.hooks is not None + + @pytest.mark.asyncio + async def test_hooks_disabled_when_run_tests_false(self, project_dir): + """No hooks when run_tests=False.""" + captured_options = None + + async def mock_query(prompt, options): + nonlocal captured_options + captured_options = options + mock_message = MagicMock() + mock_message.result = "Done." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + await execute( + project_dir=project_dir, + goal="add feature", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + + # Hooks should be None + assert captured_options.hooks is None + + +class TestModerateModeLoop: + """Tests for MODERATE mode execute-review loop.""" + + @pytest.mark.asyncio + async def test_moderate_mode_loops_until_complete(self, project_dir): + """MODERATE mode loops execute->review until >95%.""" + call_count = 0 + + async def mock_query(prompt, options): + nonlocal call_count + call_count += 1 + mock_message = MagicMock() + # First iteration: execute, review (70%) + # Second iteration: execute, review (96%) + if call_count == 1: + mock_message.result = "First implementation attempt." + elif call_count == 2: + mock_message.result = "Looks incomplete. COMPLETION: 70%" + elif call_count == 3: + mock_message.result = "Fixed based on feedback." + else: + mock_message.result = "Now complete. 
COMPLETION: 96%" + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="refactor auth", + mode=ExecutionMode.MODERATE, + run_tests=False, + ) + + # Should have looped: 2 execute + 2 review = 4 calls + assert call_count == 4 + assert result.success is True + assert result.completion_percentage >= 95 + assert result.iterations == 2 + + @pytest.mark.asyncio + async def test_moderate_mode_stops_at_max_iterations(self, project_dir): + """MODERATE mode stops after max iterations.""" + call_count = 0 + + async def mock_query(prompt, options): + nonlocal call_count + call_count += 1 + mock_message = MagicMock() + if call_count % 2 == 1: + mock_message.result = "Still working..." + else: + mock_message.result = "Not quite there. COMPLETION: 70%" + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="endless task", + mode=ExecutionMode.MODERATE, + max_iterations=3, + run_tests=False, + ) + + # Should stop after 3 iterations (6 calls: 3 execute + 3 review) + assert call_count == 6 + assert result.success is False + assert result.iterations == 3 + + +class TestFullModeLoop: + """Tests for FULL mode plan-execute-review loop.""" + + @pytest.mark.asyncio + async def test_full_mode_uses_parallel_reviews(self, project_dir): + """FULL mode runs 3 parallel reviewers.""" + call_count = 0 + review_count = 0 + + async def mock_query(prompt, options): + nonlocal call_count, review_count + call_count += 1 + mock_message = MagicMock() + + # Match actual prompt patterns from prompts/*.md + if "analyzing" in prompt.lower(): # Planner: "You are analyzing..." + mock_message.result = "Plan: Step 1, Step 2, Step 3" + elif "executing" in prompt.lower(): # Executor: "You are executing..." + mock_message.result = "Executed all steps." + elif "reviewing" in prompt.lower(): # Reviewer: "You are reviewing..." + review_count += 1 + mock_message.result = f"Reviewer check. COMPLETION: 96%" + else: + mock_message.result = "Done." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="big refactor", + mode=ExecutionMode.FULL, + run_tests=False, + ) + + # Should have 3 parallel reviews (need 2/3 majority) + assert review_count == 3 + assert result.success is True + assert "final_reviews" in result.metadata + + @pytest.mark.asyncio + async def test_full_mode_majority_required(self, project_dir): + """FULL mode requires 2/3 majority to complete.""" + review_index = 0 + + async def mock_query(prompt, options): + nonlocal review_index + mock_message = MagicMock() + + if "analyzing" in prompt.lower(): # Planner + mock_message.result = "Plan: Do the thing" + elif "executing" in prompt.lower(): # Executor + mock_message.result = "Did the thing." + elif "reviewing" in prompt.lower(): # Reviewer + review_index += 1 + # Only 1 of 3 passes - not majority + if review_index % 3 == 1: + mock_message.result = "COMPLETION: 96%" + else: + mock_message.result = "COMPLETION: 70%" + else: + mock_message.result = "Done." 
+ yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="task needing consensus", + mode=ExecutionMode.FULL, + max_iterations=2, + run_tests=False, + ) + + # Should fail - only 1/3 pass, need 2/3 + assert result.success is False + + @pytest.mark.asyncio + async def test_full_mode_feedback_flows_to_next_iteration(self, project_dir): + """Review feedback flows to next execution iteration.""" + review_count = 0 + captured_exec_prompts = [] + + async def mock_query(prompt, options): + nonlocal review_count + mock_message = MagicMock() + + if "analyzing" in prompt.lower(): # Planner + mock_message.result = "Plan: Fix the bug" + elif "executing" in prompt.lower(): # Executor + captured_exec_prompts.append(prompt) + mock_message.result = "Attempted fix." + elif "reviewing" in prompt.lower(): # Reviewer + review_count += 1 + if review_count <= 3: + # First iteration reviews say incomplete + mock_message.result = "Missing error handling. COMPLETION: 70%" + else: + mock_message.result = "COMPLETION: 96%" + else: + mock_message.result = "Done." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="fix bug", + mode=ExecutionMode.FULL, + run_tests=False, + ) + + # Second execution should include feedback from first review + assert len(captured_exec_prompts) >= 2 + # Check for feedback indicators in second execution prompt + second_prompt = captured_exec_prompts[1].lower() + assert "feedback" in second_prompt or "previous" in second_prompt or "iteration" in second_prompt + + +class TestErrorHandling: + """Tests for error handling in execution flow.""" + + @pytest.mark.asyncio + async def test_handles_sdk_exception(self, project_dir): + """Handles SDK exceptions gracefully.""" + async def mock_query(*args, **kwargs): + raise Exception("API rate limit exceeded") + yield # Never reached + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="do something", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + + assert result.success is False + assert "rate limit" in result.error.lower() or "error" in result.error.lower() + + @pytest.mark.asyncio + async def test_handles_planning_failure(self, project_dir): + """Handles planning phase failure in FULL mode.""" + async def mock_query(prompt, options): + if "analyzing" in prompt.lower(): # Planner + raise Exception("Planning failed") + mock_message = MagicMock() + mock_message.result = "Done." + yield mock_message + + with patch("fireteam.loops.query", mock_query): + result = await execute( + project_dir=project_dir, + goal="complex task", + mode=ExecutionMode.FULL, + run_tests=False, + ) + + assert result.success is False + assert "planning" in result.error.lower() or "failed" in result.error.lower() + + +@pytest.mark.integration +class TestRealExecution: + """Integration tests that require real API calls. 
+ + Run with: pytest --run-integration + """ + + @pytest.mark.asyncio + async def test_trivial_task_real_execution(self, project_dir): + """Test real execution of a trivial task.""" + # This test requires ANTHROPIC_API_KEY + import os + if not os.environ.get("ANTHROPIC_API_KEY"): + pytest.skip("ANTHROPIC_API_KEY not set") + + result = await execute( + project_dir=project_dir, + goal="What is 2 + 2?", + mode=ExecutionMode.SINGLE_TURN, + run_tests=False, + ) + + assert result.success is True + assert result.output is not None + assert "4" in result.output diff --git a/tests/test_memory_integration.py b/tests/test_memory_integration.py deleted file mode 100644 index c29be0f..0000000 --- a/tests/test_memory_integration.py +++ /dev/null @@ -1,333 +0,0 @@ -""" -Integration tests for memory system with full orchestrator cycle. -Tests memory recording, retrieval, and cleanup in realistic scenarios. -""" - -import pytest -import tempfile -import shutil -import os -from pathlib import Path -import sys -from unittest.mock import Mock, patch, MagicMock - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from memory.manager import MemoryManager -from state.manager import StateManager -from agents import PlannerAgent, ExecutorAgent, ReviewerAgent -from test_base_agent_memory import ConcreteAgent - - -@pytest.mark.slow -class TestMemoryIntegration: - """Test memory integration across full cycles (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_dirs(self): - """Create temporary directories for testing.""" - memory_dir = tempfile.mkdtemp() - state_dir = tempfile.mkdtemp() - project_dir = tempfile.mkdtemp() - - yield { - "memory": memory_dir, - "state": state_dir, - "project": project_dir - } - - shutil.rmtree(memory_dir, ignore_errors=True) - shutil.rmtree(state_dir, ignore_errors=True) - shutil.rmtree(project_dir, ignore_errors=True) - - @pytest.fixture - def memory_manager(self, temp_dirs): - """Create MemoryManager instance.""" - return MemoryManager(memory_dir=temp_dirs["memory"]) - - @pytest.fixture - def agents_with_memory(self, memory_manager): - """Create agents with memory manager.""" - return { - "planner": PlannerAgent(memory_manager=memory_manager), - "executor": ExecutorAgent(memory_manager=memory_manager), - "reviewer": ReviewerAgent(memory_manager=memory_manager) - } - - def test_memory_flows_through_cycle(self, memory_manager, agents_with_memory, temp_dirs): - """Test that memory is recorded and retrieved across a cycle.""" - project_dir = temp_dirs["project"] - goal = "Build a simple calculator" - - # Initialize memory for project - memory_manager.initialize_project(project_dir, goal) - - # Cycle 1: Add some learnings manually - memory_manager.add_memory( - content="User wants command-line interface", - memory_type="decision", - cycle=0 - ) - memory_manager.add_memory( - content="Python 3.12+ required", - memory_type="learning", - cycle=0 - ) - - # Simulate Cycle 2: Planner should retrieve these memories - planner = agents_with_memory["planner"] - - # Set execution context (what planner.execute would do) - planner._execution_context = { - "goal": goal, - "last_review": "Need to implement basic operations" - } - - # Retrieve memories - memories_text = planner._retrieve_and_format_memories() - - # Should contain previous learnings - assert "command-line interface" in memories_text or "Python 3.12" in memories_text - assert "BACKGROUND KNOWLEDGE" in memories_text - - def test_reviewer_extracts_learnings(self, agents_with_memory): - """Test that 
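The new test modules rely on `project_dir` and `isolated_tmp_dir` fixtures and a `--run-integration` flag that are defined outside the hunks shown here. A `conftest.py` along the following lines would satisfy them; this is a hedged sketch of plausible definitions built on pytest's `tmp_path`, not the repository's actual conftest.

```python
"""Hedged sketch of a conftest.py that would satisfy the fixtures used above.
The real conftest.py is not shown in this diff; fixture names and the
--run-integration flag are taken from how the tests use them, everything
else is an assumption."""
import pytest


def pytest_addoption(parser):
    # The TestRealExecution docstring says: run with `pytest --run-integration`.
    parser.addoption(
        "--run-integration",
        action="store_true",
        default=False,
        help="Run tests marked @pytest.mark.integration (require API keys).",
    )


def pytest_collection_modifyitems(config, items):
    if config.getoption("--run-integration"):
        return
    skip = pytest.mark.skip(reason="needs --run-integration")
    for item in items:
        if "integration" in item.keywords:
            item.add_marker(skip)


@pytest.fixture
def isolated_tmp_dir(tmp_path):
    """A throwaway directory; pytest's tmp_path already provides isolation."""
    return tmp_path


@pytest.fixture
def project_dir(tmp_path):
    """A minimal project directory for execution tests."""
    (tmp_path / "README.md").write_text("# Test project\n")
    return tmp_path
```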
reviewer can extract learnings from its output.""" - reviewer = agents_with_memory["reviewer"] - - # Sample review text with learnings - review_text = """ - Project is progressing well. COMPLETION: 50% - - LEARNING[pattern]: All database operations use async/await - LEARNING[decision]: Chose SQLite for simplicity - LEARNING[failed_approach]: Tried Redis but had connection issues - LEARNING[code_location]: Main calculator logic in src/calc.py - - Overall the code looks good but needs more testing. - """ - - learnings = reviewer._extract_learnings(review_text) - - # Should extract all 4 learnings - assert len(learnings) == 4 - - # Verify types - types = [l["type"] for l in learnings] - assert "pattern" in types - assert "decision" in types - assert "failed_approach" in types - assert "code_location" in types - - # Verify content - contents = [l["content"] for l in learnings] - assert any("async/await" in c for c in contents) - assert any("SQLite" in c for c in contents) - - def test_different_agents_retrieve_different_memory_types(self, memory_manager, agents_with_memory, temp_dirs): - """Test that different agents retrieve different types of memories.""" - project_dir = temp_dirs["project"] - memory_manager.initialize_project(project_dir, "Test goal") - - # Add various memory types - memory_manager.add_memory("Pattern: Use async", "pattern", 1) - memory_manager.add_memory("Decision: Use SQLite", "decision", 1) - memory_manager.add_memory("Failed: Tried Redis", "failed_approach", 1) - memory_manager.add_memory("Trace: npm install failed", "trace", 1) - memory_manager.add_memory("Location: auth in src/auth.js", "code_location", 1) - - # Planner retrieves decisions, failed approaches, learnings - planner = agents_with_memory["planner"] - assert set(planner._get_relevant_memory_types()) == {"decision", "failed_approach", "learning"} - - # Executor retrieves failed approaches, traces, code locations - executor = agents_with_memory["executor"] - assert set(executor._get_relevant_memory_types()) == {"failed_approach", "trace", "code_location"} - - # Reviewer retrieves learnings, decisions, patterns - reviewer = agents_with_memory["reviewer"] - assert set(reviewer._get_relevant_memory_types()) == {"learning", "decision", "pattern"} - - def test_memory_persists_across_cycles(self, memory_manager, temp_dirs): - """Test that memories persist and accumulate across cycles.""" - project_dir = temp_dirs["project"] - memory_manager.initialize_project(project_dir, "Test goal") - - # Cycle 1: Add memories - memory_manager.add_memory("Cycle 1 learning", "learning", 1) - assert memory_manager.current_collection.count() == 1 - - # Cycle 2: Add more memories - memory_manager.add_memory("Cycle 2 learning", "learning", 2) - assert memory_manager.current_collection.count() == 2 - - # Cycle 3: Add more memories - memory_manager.add_memory("Cycle 3 learning", "learning", 3) - assert memory_manager.current_collection.count() == 3 - - # Search should find all relevant - results = memory_manager.search("learning", limit=10) - assert len(results) == 3 - - def test_agent_without_memory_works_normally(self, agents_with_memory): - """Test that agents work fine when memory manager is None.""" - agent_no_memory = ConcreteAgent("test", memory_manager=None) - - # Execute should work - result = agent_no_memory.execute( - project_dir="/tmp/test", - goal="Test" - ) - - assert result["success"] is True - - # Memory retrieval should return empty - agent_no_memory._execution_context = {"goal": "Test"} - memories = 
agent_no_memory._retrieve_and_format_memories() - assert memories == "" - - -@pytest.mark.slow -class TestMemoryCleanup: - """Test cleanup functionality (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - def test_cleanup_removes_all_memories(self, temp_memory_dir): - """Test that cleanup removes all project memories.""" - memory_manager = MemoryManager(memory_dir=temp_memory_dir) - project_dir = "/tmp/test-cleanup" - - # Initialize and add memories - memory_manager.initialize_project(project_dir, "Test goal") - memory_manager.add_memory("Memory 1", "learning", 1) - memory_manager.add_memory("Memory 2", "decision", 2) - memory_manager.add_memory("Memory 3", "trace", 3) - - assert memory_manager.current_collection.count() == 3 - - # Clear memories - memory_manager.clear_project_memory(project_dir) - - # Reinitialize and check - should be empty - memory_manager.initialize_project(project_dir, "Test goal") - assert memory_manager.current_collection.count() == 0 - - def test_cleanup_only_affects_target_project(self, temp_memory_dir): - """Test that cleanup only removes memories for specified project.""" - memory_manager = MemoryManager(memory_dir=temp_memory_dir) - - project1 = "/tmp/test-project-a" - project2 = "/tmp/test-project-b" - - # Add memories to project 1 - memory_manager.initialize_project(project1, "Goal 1") - memory_manager.add_memory("Project 1 memory", "learning", 1) - - # Add memories to project 2 - memory_manager.initialize_project(project2, "Goal 2") - memory_manager.add_memory("Project 2 memory", "learning", 1) - - # Clear project 1 - memory_manager.clear_project_memory(project1) - - # Project 2 should still have memories - memory_manager.initialize_project(project2, "Goal 2") - assert memory_manager.current_collection.count() == 1 - - results = memory_manager.search("memory", limit=10) - assert "Project 2" in results[0]["content"] - - -@pytest.mark.slow -class TestEndToEndScenario: - """Test realistic end-to-end scenarios.""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.mark.slow - def test_realistic_multi_cycle_scenario(self, temp_memory_dir): - """Test a realistic scenario across multiple cycles (uses heavy Qwen3 model).""" - memory_manager = MemoryManager(memory_dir=temp_memory_dir) - project_dir = "/tmp/realistic-project" - goal = "Build REST API with authentication" - - # Initialize - memory_manager.initialize_project(project_dir, goal) - - # Cycle 1: Initial implementation - memory_manager.add_memory( - content="Decided to use FastAPI framework", - memory_type="decision", - cycle=1 - ) - memory_manager.add_memory( - content="Implemented basic user registration endpoint", - memory_type="trace", - cycle=1 - ) - - # Cycle 2: Hit an issue - memory_manager.add_memory( - content="Tried using bcrypt for password hashing but had installation issues on M1 Mac", - memory_type="failed_approach", - cycle=2 - ) - memory_manager.add_memory( - content="Switched to passlib with argon2 - works perfectly", - memory_type="decision", - cycle=2 - ) - - # Cycle 3: Continuing implementation - memory_manager.add_memory( - content="All authentication logic in src/api/auth.py", - memory_type="code_location", - cycle=3 - ) - memory_manager.add_memory( - content="API uses JWT 
tokens with 24h expiry, stored in httpOnly cookies", - memory_type="pattern", - cycle=3 - ) - - # Cycle 4: Search for authentication context - results = memory_manager.search( - "authentication implementation approach", - limit=10 - ) - - # Should find relevant memories - assert len(results) > 0 - - # Should include the passlib decision - contents = [r["content"] for r in results] - assert any("passlib" in c or "argon2" in c for c in contents) - - # Should include the bcrypt failure (to avoid repeating) - assert any("bcrypt" in c for c in contents) - - # Search for code location - results = memory_manager.search( - "where is authentication code", - limit=5, - memory_types=["code_location"] - ) - - assert len(results) > 0 - assert any("src/api/auth.py" in r["content"] for r in results) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) - diff --git a/tests/test_memory_isolation.py b/tests/test_memory_isolation.py deleted file mode 100644 index 7be6a06..0000000 --- a/tests/test_memory_isolation.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -Isolation tests for memory system. -Verifies that different projects have completely isolated memories. -""" - -import pytest -import tempfile -import shutil -from pathlib import Path -import sys - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from memory.manager import MemoryManager - - -@pytest.mark.slow -class TestProjectIsolation: - """Test that different projects have isolated memories (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def memory_manager(self, temp_memory_dir): - """Create MemoryManager instance.""" - return MemoryManager(memory_dir=temp_memory_dir) - - def test_two_projects_have_separate_collections(self, memory_manager): - """Test that two projects create separate Chroma collections.""" - project1 = "/tmp/isolated-project-1" - project2 = "/tmp/isolated-project-2" - - # Get collection names - collection1 = memory_manager._get_collection_name(project1) - collection2 = memory_manager._get_collection_name(project2) - - # Should be different - assert collection1 != collection2 - - # Should be deterministic (same input = same hash) - assert collection1 == memory_manager._get_collection_name(project1) - assert collection2 == memory_manager._get_collection_name(project2) - - def test_memories_dont_leak_between_projects(self, memory_manager): - """Test that memories from one project don't appear in another.""" - project1 = "/tmp/isolated-project-alpha" - project2 = "/tmp/isolated-project-beta" - - # Project 1: Add memories about authentication - memory_manager.initialize_project(project1, "Build auth system") - memory_manager.add_memory("Using JWT tokens for auth", "decision", 1) - memory_manager.add_memory("Password hashing with bcrypt", "pattern", 1) - memory_manager.add_memory("Auth middleware in src/auth/", "code_location", 2) - - assert memory_manager.current_collection.count() == 3 - - # Project 2: Add memories about e-commerce - memory_manager.initialize_project(project2, "Build e-commerce site") - memory_manager.add_memory("Using Stripe for payments", "decision", 1) - memory_manager.add_memory("Product catalog in MongoDB", "pattern", 1) - - # Project 2 should only have 2 memories - assert memory_manager.current_collection.count() == 2 - - # Search in project 2 for auth-related content - results = 
memory_manager.search("authentication JWT", limit=10) - - # Should NOT find any auth memories from project 1 - for result in results: - assert "JWT" not in result["content"] - assert "bcrypt" not in result["content"] - assert "auth" not in result["content"].lower() - - # Should find e-commerce memories - results = memory_manager.search("payment", limit=10) - assert len(results) > 0 - assert any("Stripe" in r["content"] for r in results) - - def test_switching_between_projects(self, memory_manager): - """Test switching between projects maintains isolation.""" - project_a = "/tmp/project-a" - project_b = "/tmp/project-b" - - # Initialize project A - memory_manager.initialize_project(project_a, "Project A") - memory_manager.add_memory("Project A memory 1", "learning", 1) - memory_manager.add_memory("Project A memory 2", "decision", 2) - - # Switch to project B - memory_manager.initialize_project(project_b, "Project B") - memory_manager.add_memory("Project B memory 1", "learning", 1) - - # Switch back to project A - memory_manager.initialize_project(project_a, "Project A") - - # Should still have 2 memories - assert memory_manager.current_collection.count() == 2 - - # Search should only return project A memories - results = memory_manager.search("memory", limit=10) - assert len(results) == 2 - assert all("Project A" in r["content"] for r in results) - - def test_concurrent_projects_in_same_memory_dir(self, temp_memory_dir): - """Test that multiple MemoryManager instances can work with different projects.""" - # Create two separate memory managers (simulating concurrent processes) - manager1 = MemoryManager(memory_dir=temp_memory_dir) - manager2 = MemoryManager(memory_dir=temp_memory_dir) - - project1 = "/tmp/concurrent-project-1" - project2 = "/tmp/concurrent-project-2" - - # Initialize different projects - manager1.initialize_project(project1, "Goal 1") - manager2.initialize_project(project2, "Goal 2") - - # Add memories - manager1.add_memory("Manager 1 memory", "learning", 1) - manager2.add_memory("Manager 2 memory", "learning", 1) - - # Each should have 1 memory - assert manager1.current_collection.count() == 1 - assert manager2.current_collection.count() == 1 - - # Verify isolation - results1 = manager1.search("memory", limit=10) - results2 = manager2.search("memory", limit=10) - - assert len(results1) == 1 - assert len(results2) == 1 - assert "Manager 1" in results1[0]["content"] - assert "Manager 2" in results2[0]["content"] - - def test_cleanup_only_affects_target_project(self, memory_manager): - """Test that cleanup doesn't affect other projects.""" - project1 = "/tmp/cleanup-project-1" - project2 = "/tmp/cleanup-project-2" - project3 = "/tmp/cleanup-project-3" - - # Create memories in all projects - for project in [project1, project2, project3]: - memory_manager.initialize_project(project, f"Goal for {project}") - memory_manager.add_memory(f"Memory for {project}", "learning", 1) - - # Clear project 2 - memory_manager.clear_project_memory(project2) - - # Project 1 should still have memories - memory_manager.initialize_project(project1, "Goal") - assert memory_manager.current_collection.count() == 1 - - # Project 2 should be empty - memory_manager.initialize_project(project2, "Goal") - assert memory_manager.current_collection.count() == 0 - - # Project 3 should still have memories - memory_manager.initialize_project(project3, "Goal") - assert memory_manager.current_collection.count() == 1 - - def test_hash_collision_resistance(self, memory_manager): - """Test that similar project paths 
generate different hashes.""" - project_paths = [ - "/tmp/project", - "/tmp/project1", - "/tmp/project2", - "/tmp/projects", - "/tmp/my-project" - ] - - hashes = [memory_manager._get_collection_name(p) for p in project_paths] - - # All hashes should be unique - assert len(hashes) == len(set(hashes)) - - # Each hash should be 16 characters (MD5 truncated) - assert all(len(h) == 16 for h in hashes) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) - diff --git a/tests/test_memory_lightweight.py b/tests/test_memory_lightweight.py deleted file mode 100644 index 2ac726e..0000000 --- a/tests/test_memory_lightweight.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Lightweight embedding tests using sentence-transformers. -Fast tests for CI that verify HuggingFace integration without heavy model downloads. -""" - -import pytest -import sys -from pathlib import Path - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - - -@pytest.mark.lightweight -class TestLightweightEmbeddings: - """Fast embedding tests using lightweight model.""" - - def test_huggingface_pipeline_works(self, lightweight_memory_manager): - """Verify HuggingFace model loading and embedding generation.""" - # Test embedding generation - embeddings = lightweight_memory_manager._get_embeddings(["test text"]) - - assert len(embeddings) == 1 - assert isinstance(embeddings[0], list) - assert len(embeddings[0]) == 384 # MiniLM-L6-v2 dimension - - def test_save_and_retrieve_memories(self, lightweight_memory_manager, isolated_tmp_dir): - """Test full save/retrieve cycle with semantic search.""" - project_dir = isolated_tmp_dir / "project" - project_dir.mkdir() - - # Initialize and add memories - lightweight_memory_manager.initialize_project(str(project_dir), "Test goal") - - lightweight_memory_manager.add_memory( - "Using FastAPI for REST API", - "decision", 1 - ) - lightweight_memory_manager.add_memory( - "JWT authentication with 24h expiry", - "pattern", 2 - ) - - # Semantic search should work - results = lightweight_memory_manager.search("API framework", limit=5) - - assert len(results) > 0 - assert any("FastAPI" in r["content"] for r in results) - diff --git a/tests/test_memory_manager.py b/tests/test_memory_manager.py deleted file mode 100644 index 0cdc49f..0000000 --- a/tests/test_memory_manager.py +++ /dev/null @@ -1,287 +0,0 @@ -""" -Unit tests for MemoryManager. -Tests CRUD operations, embeddings, search, and project isolation. 
-""" - -import pytest -import tempfile -import shutil -import os -from pathlib import Path -import sys - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from memory.manager import MemoryManager - - -@pytest.mark.slow -class TestMemoryManager: - """Test MemoryManager functionality (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def memory_manager(self, temp_memory_dir): - """Create MemoryManager instance.""" - return MemoryManager(memory_dir=temp_memory_dir) - - def test_initialization(self, memory_manager): - """Test MemoryManager initializes correctly.""" - assert memory_manager is not None - assert memory_manager.chroma_client is not None - assert memory_manager.model is not None - assert memory_manager.tokenizer is not None - assert memory_manager.current_collection is None - - def test_model_loading(self, memory_manager): - """Test Qwen3 model loads successfully.""" - # Model should be loaded - assert memory_manager.model is not None - assert memory_manager.tokenizer is not None - - # Test embedding generation - embeddings = memory_manager._get_embeddings(["test text"]) - assert len(embeddings) == 1 - assert isinstance(embeddings[0], list) - assert len(embeddings[0]) > 0 # Should have dimensions - - def test_project_initialization(self, memory_manager, temp_memory_dir): - """Test project memory initialization.""" - project_dir = "/tmp/test-project-1" - goal = "Build a test project" - - memory_manager.initialize_project(project_dir, goal) - - # Should have current collection - assert memory_manager.current_collection is not None - - # Collection should be empty for new project - count = memory_manager.current_collection.count() - assert count == 0 - - def test_add_memory(self, memory_manager): - """Test adding memories.""" - project_dir = "/tmp/test-project-2" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add a memory - memory_manager.add_memory( - content="This is a test learning", - memory_type="learning", - cycle=1 - ) - - # Should have 1 memory - count = memory_manager.current_collection.count() - assert count == 1 - - # Add more memories - memory_manager.add_memory( - content="Failed approach: tried X", - memory_type="failed_approach", - cycle=2 - ) - memory_manager.add_memory( - content="Decision: chose Y", - memory_type="decision", - cycle=2 - ) - - count = memory_manager.current_collection.count() - assert count == 3 - - def test_semantic_search(self, memory_manager): - """Test semantic search functionality.""" - project_dir = "/tmp/test-project-3" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add some memories - memory_manager.add_memory( - content="Authentication uses JWT tokens with 24h expiry", - memory_type="decision", - cycle=1 - ) - memory_manager.add_memory( - content="Database uses PostgreSQL with connection pooling", - memory_type="pattern", - cycle=2 - ) - memory_manager.add_memory( - content="Tried bcrypt but had Node 18 compatibility issues", - memory_type="failed_approach", - cycle=3 - ) - - # Search for authentication - results = memory_manager.search("authentication approach", limit=5) - - # Should find the JWT decision - assert len(results) > 0 - assert any("JWT" in r["content"] for r in results) - - # Top result should be about auth - assert "auth" in results[0]["content"].lower() or "JWT" in 
results[0]["content"] - - def test_memory_type_filtering(self, memory_manager): - """Test filtering by memory type.""" - project_dir = "/tmp/test-project-4" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add different types - memory_manager.add_memory("Pattern 1", "pattern", 1) - memory_manager.add_memory("Decision 1", "decision", 1) - memory_manager.add_memory("Failed approach 1", "failed_approach", 2) - - # Search with type filter - results = memory_manager.search( - "approach", - limit=10, - memory_types=["failed_approach"] - ) - - # Should only return failed_approach type - assert len(results) > 0 - assert all(r["type"] == "failed_approach" for r in results) - - def test_project_isolation(self, memory_manager): - """Test that different projects have isolated memories.""" - project1 = "/tmp/test-project-isolation-1" - project2 = "/tmp/test-project-isolation-2" - - # Initialize project 1 and add memory - memory_manager.initialize_project(project1, "Goal 1") - memory_manager.add_memory("Project 1 memory", "learning", 1) - - count1 = memory_manager.current_collection.count() - assert count1 == 1 - - # Switch to project 2 - memory_manager.initialize_project(project2, "Goal 2") - - # Should be empty (different project) - count2 = memory_manager.current_collection.count() - assert count2 == 0 - - # Add memory to project 2 - memory_manager.add_memory("Project 2 memory", "learning", 1) - count2 = memory_manager.current_collection.count() - assert count2 == 1 - - # Switch back to project 1 - memory_manager.initialize_project(project1, "Goal 1") - - # Should still have 1 memory (isolated) - count1 = memory_manager.current_collection.count() - assert count1 == 1 - - # Search should only return project 1 memory - results = memory_manager.search("memory", limit=10) - assert len(results) == 1 - assert "Project 1" in results[0]["content"] - - def test_embedding_caching(self, memory_manager): - """Test that embeddings are cached for repeated queries.""" - project_dir = "/tmp/test-project-5" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add a memory - memory_manager.add_memory("Test content", "learning", 1) - - # Clear cache info - cache_info_before = memory_manager._get_embeddings_cached.cache_info() - - # Search multiple times with same query - memory_manager.search("test query") - memory_manager.search("test query") - memory_manager.search("test query") - - # Cache should have hits - cache_info_after = memory_manager._get_embeddings_cached.cache_info() - assert cache_info_after.hits > cache_info_before.hits - - def test_clear_project_memory(self, memory_manager): - """Test clearing project memory.""" - project_dir = "/tmp/test-project-6" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add memories - memory_manager.add_memory("Memory 1", "learning", 1) - memory_manager.add_memory("Memory 2", "decision", 2) - - assert memory_manager.current_collection.count() == 2 - - # Clear memories - memory_manager.clear_project_memory(project_dir) - - # Collection should be deleted - reinitialize to check - memory_manager.initialize_project(project_dir, "Test goal") - assert memory_manager.current_collection.count() == 0 - - def test_memory_metadata(self, memory_manager): - """Test that metadata is stored correctly.""" - project_dir = "/tmp/test-project-7" - memory_manager.initialize_project(project_dir, "Test goal") - - # Add memory with custom metadata - memory_manager.add_memory( - content="Test content", - memory_type="decision", - cycle=5, - 
metadata={"custom_field": "custom_value"} - ) - - # Search and verify metadata - results = memory_manager.search("test", limit=1) - assert len(results) == 1 - assert results[0]["type"] == "decision" - assert results[0]["cycle"] == 5 - - -@pytest.mark.slow -class TestMemoryManagerEdgeCases: - """Test edge cases and error handling (uses heavy Qwen3 model).""" - - @pytest.fixture - def temp_memory_dir(self): - """Create temporary memory directory.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def memory_manager(self, temp_memory_dir): - """Create MemoryManager instance.""" - return MemoryManager(memory_dir=temp_memory_dir) - - def test_add_memory_without_initialization(self, memory_manager): - """Test that adding memory without project initialization raises error.""" - with pytest.raises(ValueError, match="Project not initialized"): - memory_manager.add_memory("Test", "learning", 1) - - def test_search_without_initialization(self, memory_manager): - """Test search without initialization returns empty list.""" - results = memory_manager.search("test") - assert results == [] - - def test_empty_search_query(self, memory_manager): - """Test search with empty query.""" - memory_manager.initialize_project("/tmp/test", "Goal") - results = memory_manager.search("") - assert isinstance(results, list) - - def test_clear_nonexistent_project(self, memory_manager): - """Test clearing memory for project that doesn't exist.""" - # Should not raise error - memory_manager.clear_project_memory("/tmp/nonexistent-project") - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) - diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py deleted file mode 100644 index 1de7f20..0000000 --- a/tests/test_orchestrator.py +++ /dev/null @@ -1,603 +0,0 @@ -""" -Integration tests for Orchestrator. -Tests full cycle execution, git integration, and completion checking. 
-""" - -import pytest -import tempfile -import shutil -import os -import subprocess -import json -import logging -import sys -from pathlib import Path -from unittest.mock import Mock, patch, MagicMock - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from orchestrator import Orchestrator -import config - - -class TestOrchestrator: - """Test Orchestrator functionality.""" - - @pytest.fixture - def temp_project_dir(self): - """Create temporary project directory.""" - temp_dir = tempfile.mkdtemp(prefix="test-project-") - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def temp_system_dir(self): - """Create temporary system directory for config.""" - temp_dir = tempfile.mkdtemp(prefix="test-system-") - # Create subdirectories - os.makedirs(os.path.join(temp_dir, "state"), exist_ok=True) - os.makedirs(os.path.join(temp_dir, "logs"), exist_ok=True) - os.makedirs(os.path.join(temp_dir, "memory"), exist_ok=True) - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture(autouse=True) - def patch_config(self, temp_system_dir): - """Patch config to use temp directories.""" - with patch.dict('os.environ', {'FIRETEAM_DIR': temp_system_dir}): - # Reload config to pick up new env var - import importlib - import config as config_module - importlib.reload(config_module) - yield - # Reload again to restore - importlib.reload(config_module) - - def test_initialization(self, temp_project_dir): - """Test Orchestrator initialization.""" - goal = "Build a test application" - - orch = Orchestrator(temp_project_dir, goal, debug=False) - - assert orch.project_dir == os.path.abspath(temp_project_dir) - assert orch.goal == goal - assert orch.debug is False - assert orch.keep_memory is False - assert orch.state_manager is not None - assert orch.memory is not None - assert orch.planner is not None - assert orch.executor is not None - assert orch.reviewer is not None - assert orch.running is True - - def test_initialization_with_debug(self, temp_project_dir): - """Test Orchestrator initialization with debug mode.""" - orch = Orchestrator(temp_project_dir, "Test goal", debug=True) - assert orch.debug is True - - def test_initialization_with_keep_memory(self, temp_project_dir): - """Test Orchestrator initialization with keep_memory flag.""" - orch = Orchestrator(temp_project_dir, "Test goal", keep_memory=True) - assert orch.keep_memory is True - - def test_setup_logging(self, temp_project_dir): - """Test logging setup.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - assert orch.logger is not None - assert isinstance(orch.logger, logging.Logger) - assert orch.logger.name == "orchestrator" - - def test_initialize_git_repo_new(self, temp_project_dir): - """Test git repository initialization for new project.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - branch_name = orch.initialize_git_repo() - - # Should return branch name - assert branch_name is not None - assert "fireteam-" in branch_name - - # .git directory should exist - assert os.path.exists(os.path.join(temp_project_dir, ".git")) - - # Should be on the created branch - result = subprocess.run( - ["git", "branch", "--show-current"], - cwd=temp_project_dir, - capture_output=True, - text=True - ) - assert result.returncode == 0 - assert branch_name in result.stdout - - def test_initialize_git_repo_existing(self, temp_project_dir): - """Test git repository initialization for existing repo.""" - # Initialize git repo first - subprocess.run(["git", 
"init"], cwd=temp_project_dir, check=True) - subprocess.run( - ["git", "config", "user.name", "Test User"], - cwd=temp_project_dir, - check=True - ) - subprocess.run( - ["git", "config", "user.email", "test@test.com"], - cwd=temp_project_dir, - check=True - ) - - # Create initial commit - with open(os.path.join(temp_project_dir, "README.md"), "w") as f: - f.write("# Test") - subprocess.run(["git", "add", "."], cwd=temp_project_dir, check=True) - subprocess.run( - ["git", "commit", "-m", "Initial"], - cwd=temp_project_dir, - check=True - ) - - # Now initialize orchestrator - orch = Orchestrator(temp_project_dir, "Test goal") - branch_name = orch.initialize_git_repo() - - # Should create new branch - assert branch_name is not None - assert "fireteam-" in branch_name - - def test_commit_changes(self, temp_project_dir): - """Test committing changes.""" - orch = Orchestrator(temp_project_dir, "Test goal") - orch.initialize_git_repo() - - # Make some changes - test_file = os.path.join(temp_project_dir, "test.txt") - with open(test_file, "w") as f: - f.write("Test content") - - # Commit changes - orch.commit_changes(1, "Test changes") - - # Check commit exists - result = subprocess.run( - ["git", "log", "--oneline"], - cwd=temp_project_dir, - capture_output=True, - text=True - ) - assert "Cycle 1" in result.stdout - assert "Test changes" in result.stdout - - def test_commit_changes_no_changes(self, temp_project_dir): - """Test committing when there are no changes.""" - orch = Orchestrator(temp_project_dir, "Test goal") - orch.initialize_git_repo() - - # Try to commit without changes - should handle gracefully - orch.commit_changes(1, "No changes") - - # Should not crash - - @patch('subprocess.run') - def test_push_to_remote_exists(self, mock_run, temp_project_dir): - """Test pushing to remote when remote exists.""" - # Mock successful remote check and push - mock_run.side_effect = [ - MagicMock(returncode=0, stdout="https://github.com/test/repo.git"), - MagicMock(returncode=0) - ] - - orch = Orchestrator(temp_project_dir, "Test goal") - orch.push_to_remote() - - # Should have called git remote and git push - assert mock_run.call_count == 2 - - @patch('subprocess.run') - def test_push_to_remote_no_remote(self, mock_run, temp_project_dir): - """Test pushing when no remote exists.""" - # Mock failed remote check - mock_run.return_value = MagicMock(returncode=1) - - orch = Orchestrator(temp_project_dir, "Test goal") - orch.push_to_remote() - - # Should handle gracefully - - def test_check_completion_not_complete(self, temp_project_dir): - """Test completion check when not complete.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - state = { - "completion_percentage": 50, - "validation_checks": 0 - } - - is_complete = orch.check_completion(state) - assert is_complete is False - - def test_check_completion_single_validation(self, temp_project_dir): - """Test completion check with single validation.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - state = { - "completion_percentage": 96, - "validation_checks": 0 - } - - is_complete = orch.check_completion(state) - assert is_complete is False - - def test_check_completion_multiple_validations(self, temp_project_dir): - """Test completion check with multiple validations.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # First validation - state = {"completion_percentage": 96, "validation_checks": 0} - orch.check_completion(state) - - # Second validation - state = orch.state_manager.load_state() - 
state["completion_percentage"] = 97 - orch.state_manager.update_state(state) - orch.check_completion(state) - - # Third validation - should complete - state = orch.state_manager.load_state() - state["completion_percentage"] = 98 - orch.state_manager.update_state(state) - is_complete = orch.check_completion(state) - - assert is_complete is True - - def test_check_completion_reset_on_drop(self, temp_project_dir): - """Test validation checks reset when percentage drops.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # First validation - state = {"completion_percentage": 96, "validation_checks": 0} - orch.check_completion(state) - - state = orch.state_manager.load_state() - assert state["validation_checks"] == 1 - - # Drop below threshold - state["completion_percentage"] = 90 - orch.state_manager.update_state(state) - orch.check_completion(state) - - # Should reset - state = orch.state_manager.load_state() - assert state["validation_checks"] == 0 - - @patch.object(Orchestrator, 'commit_changes') - def test_run_cycle_structure(self, mock_commit, temp_project_dir): - """Test that run_cycle follows proper structure.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # Initialize memory for project - orch.memory.initialize_project(temp_project_dir, "Test goal") - - # Mock agent responses - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor, \ - patch.object(orch.reviewer, 'execute') as mock_reviewer: - - # Setup mocks - mock_planner.return_value = { - "success": True, - "plan": "Test plan" - } - mock_executor.return_value = { - "success": True, - "execution_result": "Test execution" - } - mock_reviewer.return_value = { - "success": True, - "review": "Test review", - "completion_percentage": 50, - "learnings": [] - } - - # Run cycle - state = { - "cycle_number": 1, - "completion_percentage": 0 - } - - result = orch.run_cycle(state) - - # All agents should have been called - assert mock_planner.called - assert mock_executor.called - assert mock_reviewer.called - - # State should be updated - assert "current_plan" in result - assert "last_execution_result" in result - assert "last_review" in result - - @patch.object(Orchestrator, 'commit_changes') - def test_run_cycle_planner_failure(self, mock_commit, temp_project_dir): - """Test run_cycle when planner fails.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - with patch.object(orch.planner, 'execute') as mock_planner: - mock_planner.return_value = { - "success": False, - "error": "Planner error" - } - - state = {"cycle_number": 1} - result = orch.run_cycle(state) - - # Should return original state - assert result == state - - @patch.object(Orchestrator, 'commit_changes') - def test_run_cycle_executor_failure(self, mock_commit, temp_project_dir): - """Test run_cycle when executor fails.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor: - - mock_planner.return_value = { - "success": True, - "plan": "Test plan" - } - mock_executor.return_value = { - "success": False, - "error": "Executor error" - } - - state = {"cycle_number": 1} - result = orch.run_cycle(state) - - # Should return original state - assert result == state - - @patch.object(Orchestrator, 'commit_changes') - def test_run_cycle_reviewer_failure(self, mock_commit, temp_project_dir): - """Test run_cycle when reviewer fails.""" - orch = 
Orchestrator(temp_project_dir, "Test goal") - - # Initialize memory for project - orch.memory.initialize_project(temp_project_dir, "Test goal") - - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor, \ - patch.object(orch.reviewer, 'execute') as mock_reviewer: - - mock_planner.return_value = { - "success": True, - "plan": "Test plan" - } - mock_executor.return_value = { - "success": True, - "execution_result": "Test execution" - } - mock_reviewer.return_value = { - "success": False, - "error": "Reviewer error" - } - - state = {"cycle_number": 1} - result = orch.run_cycle(state) - - # Should return original state - assert result == state - - @patch.object(Orchestrator, 'commit_changes') - def test_run_cycle_learning_extraction(self, mock_commit, temp_project_dir): - """Test that learnings are extracted and stored.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor, \ - patch.object(orch.reviewer, 'execute') as mock_reviewer, \ - patch.object(orch.memory, 'add_memory') as mock_add_memory: - - mock_planner.return_value = { - "success": True, - "plan": "Test plan" - } - mock_executor.return_value = { - "success": True, - "execution_result": "Test execution" - } - mock_reviewer.return_value = { - "success": True, - "review": "Test review", - "completion_percentage": 50, - "learnings": [ - {"type": "pattern", "content": "Using MVC"}, - {"type": "decision", "content": "Chose SQLite"} - ] - } - - state = {"cycle_number": 1} - orch.run_cycle(state) - - # Memory should have been called for learnings - assert mock_add_memory.call_count >= 2 - - def test_goal_alignment_check(self, temp_project_dir): - """Test that goal alignment check happens at proper intervals.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # Initialize memory for project - orch.memory.initialize_project(temp_project_dir, "Test goal") - - # Mock agents - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor, \ - patch.object(orch.reviewer, 'execute') as mock_reviewer, \ - patch.object(orch, 'commit_changes'): - - mock_planner.return_value = {"success": True, "plan": "Test"} - mock_executor.return_value = {"success": True, "execution_result": "Test"} - mock_reviewer.return_value = { - "success": True, - "review": "Test", - "completion_percentage": 50, - "learnings": [] - } - - # Run cycle 3 - should trigger alignment check - state = {"cycle_number": 3, "completion_percentage": 50} - orch.run_cycle(state) - - # Check that logger logged alignment check - # (We'd need to capture logs to verify, but at least it shouldn't crash) - - def test_memory_manager_injection(self, temp_project_dir): - """Test that memory manager is injected into agents.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # All agents should have memory manager - assert orch.planner.memory == orch.memory - assert orch.executor.memory == orch.memory - assert orch.reviewer.memory == orch.memory - - def test_state_manager_integration(self, temp_project_dir): - """Test state manager integration.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # Initialize state - state = orch.state_manager.initialize_project(temp_project_dir, "Test goal") - - assert state is not None - assert state["project_dir"] == os.path.abspath(temp_project_dir) - assert state["goal"] == "Test goal" - - def 
test_signal_handler(self, temp_project_dir): - """Test signal handler sets running flag.""" - import signal - - orch = Orchestrator(temp_project_dir, "Test goal") - - assert orch.running is True - - # Simulate signal - orch._signal_handler(signal.SIGINT, None) - - assert orch.running is False - - def test_validation_mode_trigger(self, temp_project_dir): - """Test that validation mode is triggered at high completion.""" - orch = Orchestrator(temp_project_dir, "Test goal") - - # Initialize memory for project - orch.memory.initialize_project(temp_project_dir, "Test goal") - - with patch.object(orch.planner, 'execute') as mock_planner, \ - patch.object(orch.executor, 'execute') as mock_executor, \ - patch.object(orch.reviewer, 'execute') as mock_reviewer, \ - patch.object(orch, 'commit_changes'): - - mock_planner.return_value = {"success": True, "plan": "Test"} - mock_executor.return_value = {"success": True, "execution_result": "Test"} - mock_reviewer.return_value = { - "success": True, - "review": "Test", - "completion_percentage": 96, - "learnings": [] - } - - # Run cycle with high completion - state = {"cycle_number": 1, "completion_percentage": 96} - orch.run_cycle(state) - - # Reviewer should have been called with is_validation=True - call_args = mock_reviewer.call_args - assert call_args is not None - assert call_args[1].get("is_validation") is True - - -class TestOrchestratorCLI: - """Test Orchestrator CLI interface.""" - - def test_main_missing_arguments(self): - """Test that CLI requires arguments.""" - from orchestrator import main - - with pytest.raises(SystemExit): - with patch('sys.argv', ['orchestrator.py']): - main() - - @patch('orchestrator.Orchestrator') - def test_main_with_arguments(self, mock_orch_class): - """Test CLI with proper arguments.""" - from orchestrator import main - - # Mock orchestrator instance - mock_instance = Mock() - mock_instance.run.return_value = 0 - mock_orch_class.return_value = mock_instance - - with patch('sys.argv', [ - 'orchestrator.py', - '--project-dir', '/tmp/test', - '--goal', 'Test goal' - ]): - # Expect SystemExit - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 0 - - # Should create orchestrator and run - assert mock_orch_class.called - assert mock_instance.run.called - - @patch('orchestrator.Orchestrator') - def test_main_with_debug_flag(self, mock_orch_class): - """Test CLI with debug flag.""" - from orchestrator import main - - mock_instance = Mock() - mock_instance.run.return_value = 0 - mock_orch_class.return_value = mock_instance - - with patch('sys.argv', [ - 'orchestrator.py', - '--project-dir', '/tmp/test', - '--goal', 'Test goal', - '--debug' - ]): - # Expect SystemExit - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 0 - - # Should pass debug flag - call_args = mock_orch_class.call_args - assert call_args[1]['debug'] is True - - @patch('orchestrator.Orchestrator') - def test_main_with_keep_memory_flag(self, mock_orch_class): - """Test CLI with keep-memory flag.""" - from orchestrator import main - - mock_instance = Mock() - mock_instance.run.return_value = 0 - mock_orch_class.return_value = mock_instance - - with patch('sys.argv', [ - 'orchestrator.py', - '--project-dir', '/tmp/test', - '--goal', 'Test goal', - '--keep-memory' - ]): - # Expect SystemExit - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 0 - - # Should pass keep_memory flag - call_args = mock_orch_class.call_args - assert 
call_args[1]['keep_memory'] is True - diff --git a/tests/test_state_manager.py b/tests/test_state_manager.py deleted file mode 100644 index ca5dae7..0000000 --- a/tests/test_state_manager.py +++ /dev/null @@ -1,426 +0,0 @@ -""" -Unit tests for StateManager. -Tests state initialization, persistence, locking, and completion tracking. -""" - -import pytest -import tempfile -import shutil -import json -import time -import os -from pathlib import Path -import sys -from threading import Thread - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from state.manager import StateManager - - -class TestStateManager: - """Test StateManager functionality.""" - - @pytest.fixture - def temp_state_dir(self): - """Create temporary state directory.""" - temp_dir = tempfile.mkdtemp(prefix="test-state-") - yield temp_dir - shutil.rmtree(temp_dir, ignore_errors=True) - - @pytest.fixture - def state_manager(self, temp_state_dir): - """Create StateManager instance.""" - return StateManager(state_dir=temp_state_dir) - - def test_initialization(self, state_manager, temp_state_dir): - """Test StateManager initializes correctly.""" - assert state_manager is not None - assert state_manager.state_dir == Path(temp_state_dir) - assert state_manager.state_file == Path(temp_state_dir) / "current.json" - assert state_manager.lock_file == Path(temp_state_dir) / "state.lock" - - # State directory should exist - assert state_manager.state_dir.exists() - - def test_initialize_project(self, state_manager): - """Test project initialization creates proper state.""" - project_dir = "/tmp/test-project" - goal = "Build a web application" - - state = state_manager.initialize_project(project_dir, goal) - - # Check state structure - assert state is not None - assert isinstance(state, dict) - - # Required fields - assert "project_dir" in state - assert "goal" in state - assert "status" in state - assert "cycle_number" in state - assert "completion_percentage" in state - assert "validation_checks" in state - assert "started_at" in state - assert "updated_at" in state - assert "completed" in state - - # Field values - assert os.path.abspath(project_dir) == state["project_dir"] - assert state["goal"] == goal - assert state["status"] == "planning" - assert state["cycle_number"] == 0 - assert state["completion_percentage"] == 0 - assert state["validation_checks"] == 0 - assert state["completed"] is False - - # State file should exist - assert state_manager.state_file.exists() - - def test_load_state(self, state_manager): - """Test loading state from disk.""" - # Initially, no state should exist - state = state_manager.load_state() - assert state is None - - # Initialize project - project_dir = "/tmp/test-project" - goal = "Test goal" - initialized_state = state_manager.initialize_project(project_dir, goal) - - # Now load state should return data - loaded_state = state_manager.load_state() - assert loaded_state is not None - assert loaded_state["project_dir"] == os.path.abspath(project_dir) - assert loaded_state["goal"] == goal - - def test_update_state(self, state_manager): - """Test updating state.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Update state - updates = { - "status": "executing", - "cycle_number": 5, - "completion_percentage": 75 - } - updated_state = state_manager.update_state(updates) - - # Check updates applied - assert updated_state["status"] == "executing" - assert updated_state["cycle_number"] == 5 - assert 
updated_state["completion_percentage"] == 75 - - # Original fields should still exist - assert "project_dir" in updated_state - assert "goal" in updated_state - - # updated_at should be refreshed - assert "updated_at" in updated_state - - def test_get_status(self, state_manager): - """Test getting status for CLI display.""" - # No state initially - status = state_manager.get_status() - assert status["status"] == "idle" - assert "No active project" in status["message"] - - # Initialize project - project_dir = "/tmp/test-project" - goal = "Test goal" - state_manager.initialize_project(project_dir, goal) - - # Get status - status = state_manager.get_status() - assert status["status"] == "planning" - assert status["project_dir"] == os.path.abspath(project_dir) - assert status["goal"] == goal - assert status["cycle_number"] == 0 - assert status["completion_percentage"] == 0 - assert "last_updated" in status - assert status["completed"] is False - - def test_mark_completed(self, state_manager): - """Test marking project as completed.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Mark completed - state_manager.mark_completed() - - # Load state and check - state = state_manager.load_state() - assert state["status"] == "completed" - assert state["completed"] is True - assert "completed_at" in state - - def test_clear_state(self, state_manager): - """Test clearing state.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - assert state_manager.state_file.exists() - - # Clear state - state_manager.clear_state() - - # State file should not exist - assert not state_manager.state_file.exists() - - # Load state should return None - state = state_manager.load_state() - assert state is None - - def test_increment_cycle(self, state_manager): - """Test incrementing cycle counter.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - initial_state = state_manager.load_state() - assert initial_state["cycle_number"] == 0 - - # Increment cycle - state_manager.increment_cycle() - - # Check cycle incremented - state = state_manager.load_state() - assert state["cycle_number"] == 1 - - # Increment again - state_manager.increment_cycle() - state = state_manager.load_state() - assert state["cycle_number"] == 2 - - def test_update_completion_percentage_success(self, state_manager): - """Test successful completion percentage update.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Update with valid percentage - result = state_manager.update_completion_percentage(50, logger=None) - - assert result == 50 - - # Check state updated - state = state_manager.load_state() - assert state["completion_percentage"] == 50 - assert state["last_known_completion"] == 50 - assert state["consecutive_parse_failures"] == 0 - - def test_update_completion_percentage_parse_failure(self, state_manager): - """Test completion percentage update with parse failure.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Set initial percentage - state_manager.update_completion_percentage(60) - - # Simulate parse failure (None) - result = state_manager.update_completion_percentage(None) - - # Should fall back to last known - assert result == 60 - - # Check state - state = state_manager.load_state() - assert state["completion_percentage"] == 60 - assert state["consecutive_parse_failures"] == 1 - - def 
test_update_completion_percentage_multiple_failures(self, state_manager): - """Test completion percentage with multiple consecutive failures.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Set initial percentage - state_manager.update_completion_percentage(70) - - # First failure - result1 = state_manager.update_completion_percentage(None) - assert result1 == 70 - - # Second failure - result2 = state_manager.update_completion_percentage(None) - assert result2 == 70 - - # Third failure - should reset to 0 - result3 = state_manager.update_completion_percentage(None) - assert result3 == 0 - - # Check state - state = state_manager.load_state() - assert state["completion_percentage"] == 0 - assert state["consecutive_parse_failures"] == 3 - - def test_update_completion_percentage_reset_counter(self, state_manager): - """Test that successful parse resets failure counter.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Set initial percentage - state_manager.update_completion_percentage(50) - - # Fail once - state_manager.update_completion_percentage(None) - state = state_manager.load_state() - assert state["consecutive_parse_failures"] == 1 - - # Success should reset counter - state_manager.update_completion_percentage(75) - state = state_manager.load_state() - assert state["consecutive_parse_failures"] == 0 - assert state["completion_percentage"] == 75 - - def test_state_persistence(self, state_manager): - """Test that state persists across manager instances.""" - # Initialize project - project_dir = "/tmp/test-project" - goal = "Test goal" - state_manager.initialize_project(project_dir, goal) - - # Update state - state_manager.update_state({ - "status": "executing", - "cycle_number": 3, - "completion_percentage": 60 - }) - - # Create new manager instance with same directory - new_manager = StateManager(state_dir=state_manager.state_dir) - - # Load state with new manager - state = new_manager.load_state() - assert state is not None - assert state["project_dir"] == os.path.abspath(project_dir) - assert state["goal"] == goal - assert state["status"] == "executing" - assert state["cycle_number"] == 3 - assert state["completion_percentage"] == 60 - - def test_state_isolation(self, temp_state_dir): - """Test that different state directories are isolated.""" - # Create two managers with different directories - temp_dir1 = tempfile.mkdtemp(prefix="test-state-1-") - temp_dir2 = tempfile.mkdtemp(prefix="test-state-2-") - - try: - manager1 = StateManager(state_dir=temp_dir1) - manager2 = StateManager(state_dir=temp_dir2) - - # Initialize different projects - manager1.initialize_project("/tmp/project-1", "Goal 1") - manager2.initialize_project("/tmp/project-2", "Goal 2") - - # States should be independent - state1 = manager1.load_state() - state2 = manager2.load_state() - - assert state1["goal"] == "Goal 1" - assert state2["goal"] == "Goal 2" - assert state1["project_dir"] != state2["project_dir"] - finally: - shutil.rmtree(temp_dir1, ignore_errors=True) - shutil.rmtree(temp_dir2, ignore_errors=True) - - def test_file_locking(self, state_manager, temp_state_dir): - """Test that file locking prevents concurrent access issues.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Test that we can acquire and release locks - state_manager._acquire_lock() - assert hasattr(state_manager, 'lock_fd') - state_manager._release_lock() - - # Lock file should exist - 
assert state_manager.lock_file.exists() - - def test_concurrent_updates(self, state_manager): - """Test concurrent state updates with locking.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Test that file locking mechanism exists and is functional - # We don't actually test concurrent updates due to threading complexity - # Instead, test sequential updates work - state_manager.update_state({"cycle_number": 1}) - state1 = state_manager.load_state() - assert state1["cycle_number"] == 1 - - state_manager.update_state({"cycle_number": 2}) - state2 = state_manager.load_state() - assert state2["cycle_number"] == 2 - - state_manager.update_state({"cycle_number": 3}) - state3 = state_manager.load_state() - assert state3["cycle_number"] == 3 - - # Final state should exist and be valid - assert state3 is not None - assert state3["cycle_number"] == 3 - - def test_updated_at_timestamp(self, state_manager): - """Test that updated_at timestamp is maintained.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - initial_state = state_manager.load_state() - initial_updated_at = initial_state["updated_at"] - - # Wait a bit - time.sleep(0.1) - - # Update state - state_manager.update_state({"status": "executing"}) - - # updated_at should be different - updated_state = state_manager.load_state() - assert updated_state["updated_at"] != initial_updated_at - - def test_project_reinitialize_clears_old_state(self, state_manager): - """Test that reinitializing a project clears previous state.""" - # Initialize first project - state_manager.initialize_project("/tmp/project-1", "Goal 1") - state_manager.update_state({ - "cycle_number": 5, - "completion_percentage": 80 - }) - - # Reinitialize with different project - state_manager.initialize_project("/tmp/project-2", "Goal 2") - - # State should be reset - state = state_manager.load_state() - assert state["project_dir"] == os.path.abspath("/tmp/project-2") - assert state["goal"] == "Goal 2" - assert state["cycle_number"] == 0 - assert state["completion_percentage"] == 0 - - def test_state_json_format(self, state_manager): - """Test that state file is valid JSON.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Read file directly - with open(state_manager.state_file, 'r') as f: - data = json.load(f) - - # Should be valid dict - assert isinstance(data, dict) - assert "project_dir" in data - assert "goal" in data - - def test_validation_checks_tracking(self, state_manager): - """Test validation checks tracking.""" - # Initialize project - state_manager.initialize_project("/tmp/test-project", "Test goal") - - # Update validation checks - state_manager.update_state({"validation_checks": 1}) - state = state_manager.load_state() - assert state["validation_checks"] == 1 - - state_manager.update_state({"validation_checks": 2}) - state = state_manager.load_state() - assert state["validation_checks"] == 2 - diff --git a/tests/test_terminal_bench_integration.py b/tests/test_terminal_bench_integration.py deleted file mode 100644 index afe858c..0000000 --- a/tests/test_terminal_bench_integration.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Integration test with terminal-bench. -Verifies Fireteam achieves 100% accuracy on terminal-bench hello-world task. 
-""" - -import pytest -import subprocess -import shutil -import sys -from pathlib import Path - -# Add parent to path for helpers -sys.path.insert(0, str(Path(__file__).parent)) -from helpers import TerminalBenchParser - - -@pytest.mark.integration -@pytest.mark.slow -class TestTerminalBenchIntegration: - """Integration test with terminal-bench.""" - - def test_hello_world_task(self): - """Test Fireteam achieves 100% on terminal-bench hello-world.""" - - # Check if tb is installed - if not shutil.which('tb'): - pytest.skip("terminal-bench (tb) not installed") - - # Run terminal-bench via subprocess - cmd = [ - 'tb', 'run', - '--agent-import-path', 'benchmark.adapters.fireteam_adapter:FireteamAdapter', - '--dataset', 'terminal-bench-core==0.1.1', - '--task-id', 'hello-world', - '--global-agent-timeout-sec', '600', - '--log-level', 'debug', - '--livestream' # Enable real-time output - ] - - print("\n🚀 Running terminal-bench hello-world task...") - print(f"Command: {' '.join(cmd)}\n") - print("="*60) - print("Note: Terminal-bench output will stream below in real-time\n") - sys.stdout.flush() - - # Run terminal-bench with real-time output (--livestream makes it stream to console) - # subprocess.call() lets output go directly to stdout/stderr for real-time viewing - try: - return_code = subprocess.call(cmd, timeout=700) - - print("\n" + "="*60) - print(f"Terminal-bench completed with return code: {return_code}") - print("="*60) - sys.stdout.flush() - - except subprocess.TimeoutExpired: - pytest.fail("Terminal-bench timed out after 700s") - except FileNotFoundError: - pytest.skip("terminal-bench (tb) command not found") - - # Assert on return code (0 = success) - assert return_code == 0, ( - f"Terminal-bench failed with return code {return_code}.\n" - f"Check the output above for details." - ) - - print(f"\n✅ Terminal-bench hello-world task completed successfully!") - print(" Task passed with 100% accuracy (verified by terminal-bench)") - - # Note: With --livestream and direct output, we rely on terminal-bench's - # own success/failure reporting rather than parsing output ourselves. - # Return code 0 means the task passed all checks. -