Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ This code was designed for NVIDIA GPUs with CUDA using PyTorch. Support for othe

### Installation

For Linux:
```bash
conda create -n ai_scientist python=3.11
conda activate ai_scientist
Expand All @@ -60,9 +61,18 @@ sudo apt-get install texlive-full
# Install pypi requirements
pip install -r requirements.txt
```

When installing `texlive-full`, you may need to [hold Enter](https://askubuntu.com/questions/956006/pregenerating-context-markiv-format-this-may-take-some-time-takes-forever).

For Mac:

Install [MacTeX](https://mirror.ctan.org/systems/mac/mactex/MacTeX.pkg), then set up the environment:
```bash
conda create -n ai_scientist python=3.11
conda activate ai_scientist
pip install -r requirements.txt
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```

### Supported Models and API Keys

We support a wide variety of models including open-weight and API-only models. In general, we recommend only using frontier models above the capability of the original GPT-4.
Expand All @@ -71,10 +81,18 @@ We support a wide variety of models including open-weight and API-only models. I

By default, this uses the `OPENAI_API_KEY` environment variable.

```bash
export OPENAI_API_KEY="YOUR KEY HERE"
```

#### Anthropic API (Claude Sonnet 3.5)

By default, this uses the `ANTHROPIC_API_KEY` environment variable.

```bash
export ANTHROPIC_API_KEY="YOUR KEY HERE"
```

##### Claude models via Bedrock

For Claude models provided by [Amazon Bedrock](https://aws.amazon.com/bedrock/), please install these additional packages:
Expand Down Expand Up @@ -138,7 +156,13 @@ python data/text8/prepare.py
```bash
# Set up NanoGPT baseline run
# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py && cd ../..
```

For Mac users, add `-is_mac` as an argument:
```bash
# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
cd templates/nanoGPT && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
```

#### Create NanoGPT_lite baseline run. We use this for sanity-checking
Expand All @@ -147,6 +171,13 @@ cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 && python plot.py
```

For Mac users, add `-is_mac` as an argument:

```bash
# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
```

### Setup 2D Diffusion

```bash
Expand All @@ -157,7 +188,7 @@ pip install .
pip install scikit-learn

# Set up 2D Diffusion baseline run
cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py
cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py && cd ../..
```

### Setup Grokking
Expand All @@ -167,7 +198,7 @@ cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot
pip install einops

# Set up Grokking baseline run
cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py
cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py && cd ../..
```


Expand Down
14 changes: 10 additions & 4 deletions templates/nanoGPT/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):


# --- END model.py ---
def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
def train(dataset="shakespeare_char", out_dir="run_0", is_mac=True, seed_offset=0):
# -----------------------------------------------------------------------------
# default config values designed to train a gpt2 (124M) on OpenWebText
# data
Expand Down Expand Up @@ -348,13 +348,14 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
# DDP settings
backend = "nccl" # 'nccl', 'gloo', etc.
# system
device = "cuda" # Always use CUDA
device = "mps" if is_mac else "cuda" # Always use CUDA
# device = "cuda" # Always use CUDA
dtype = (
"bfloat16"
if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
else "float16"
) # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
compile = True # do not torch compile the model on macbooks
compile = not is_mac # do not torch compile the model on macbooks

# various inits, derived attributes, I/O setup
# if not ddp, we are running on a single gpu, and one process
Expand Down Expand Up @@ -679,6 +680,8 @@ def get_lr(it):

parser = argparse.ArgumentParser(description="Run experiment")
parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
parser.add_argument('-is_mac', action='store_true', help="Indicate that you are using a Mac")

args = parser.parse_args()

if __name__ == "__main__":
Expand All @@ -689,12 +692,15 @@ def get_lr(it):
}

out_dir = args.out_dir

is_mac = args.is_mac

all_results = {}
final_infos = {}
for dataset in ["shakespeare_char", "enwik8", "text8"]:
final_info_list = []
for seed_offset in range(num_seeds[dataset]):
final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
all_results[f"{dataset}_{seed_offset}_val_info"] = val_info
Expand Down
13 changes: 9 additions & 4 deletions templates/nanoGPT_lite/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):


# --- END model.py ---
def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
def train(dataset="shakespeare_char", out_dir="run_0", is_mac=False, seed_offset=0):
# -----------------------------------------------------------------------------
# default config values designed to train a gpt2 (124M) on OpenWebText
# data
Expand Down Expand Up @@ -348,13 +348,15 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
# DDP settings
backend = "nccl" # 'nccl', 'gloo', etc.
# system
device = "cuda" # Always use CUDA
# device = "cuda" # Always use CUDA
device = "mps" if is_mac else "cuda"
dtype = (
"bfloat16"
if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
else "float16"
) # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
compile = True # do not torch compile the model on macbooks
# compile = True # do not torch compile the model on macbooks
compile = not is_mac # do not torch compile the model on macbooks

# various inits, derived attributes, I/O setup
# if not ddp, we are running on a single gpu, and one process
Expand Down Expand Up @@ -677,6 +679,7 @@ def get_lr(it):

parser = argparse.ArgumentParser(description="Run experiment")
parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
parser.add_argument("-is_mac", action="store_true", help="Indicate that you are using a Mac")
args = parser.parse_args()

if __name__ == "__main__":
Expand All @@ -685,12 +688,14 @@ def get_lr(it):
}

out_dir = args.out_dir
is_mac = args.is_mac

all_results = {}
final_infos = {}
for dataset in num_seeds.keys():
final_info_list = []
for seed_offset in range(num_seeds[dataset]):
final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
all_results[f"{dataset}_{seed_offset}_val_info"] = val_info
Expand Down