diff --git a/README.md b/README.md
index b04fa6cc..7514d188 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ This code was designed for NVIDIA GPUs with CUDA using PyTorch. Support for othe
 
 ### Installation
 
+For Linux:
 ```bash
 conda create -n ai_scientist python=3.11
 conda activate ai_scientist
@@ -60,9 +61,18 @@ sudo apt-get install texlive-full
 
 # Install pypi requirements
 pip install -r requirements.txt
 ```
-
 When installing `texlive-full`, you may need to [hold Enter](https://askubuntu.com/questions/956006/pregenerating-context-markiv-format-this-may-take-some-time-takes-forever).
 
+For Mac:
+
+Install `MacTeX`: [Link](https://mirror.ctan.org/systems/mac/mactex/MacTeX.pkg)
+```bash
+conda create -n ai_scientist python=3.11
+conda activate ai_scientist
+pip install -r requirements.txt
+pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+```
+
 ### Supported Models and API Keys
 
 We support a wide variety of models including open-weight and API-only models. In general, we recommend only using frontier models above the capability of the original GPT-4.
@@ -71,10 +81,18 @@ We support a wide variety of models including open-weight and API-only models. I
 
 By default, this uses the `OPENAI_API_KEY` environment variable.
 
+```bash
+export OPENAI_API_KEY="YOUR KEY HERE"
+```
+
 #### Anthropic API (Claude Sonnet 3.5)
 
 By default, this uses the `ANTHROPIC_API_KEY` environment variable.
 
+```bash
+export ANTHROPIC_API_KEY="YOUR KEY HERE"
+```
+
 ##### Claude models via Bedrock
 
 For Claude models provided by [Amazon Bedrock](https://aws.amazon.com/bedrock/), please install these additional packages:
@@ -138,7 +156,13 @@ python data/text8/prepare.py
 ```bash
 # Set up NanoGPT baseline run
 # NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
-cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
+cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py && cd ../..
 ```
+
+For Mac users, add `-is_mac` as an argument:
+```bash
+# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
+cd templates/nanoGPT && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
+```
 
 #### Create NanoGPT_lite baseline run. We use this for sanity-checking
@@ -147,6 +171,13 @@ cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
 cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 && python plot.py
 ```
 
+For Mac users, add `-is_mac` as an argument:
+
+```bash
+# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
+cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
+```
+
 ### Setup 2D Diffusion
 
 ```bash
@@ -157,7 +188,7 @@ pip install .
 pip install scikit-learn
 
 # Set up 2D Diffusion baseline run
-cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py
+cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py && cd ../..
 ```
 
 ### Setup Grokking
 
 ```bash
@@ -167,7 +198,7 @@ pip install einops
 
 # Set up Grokking baseline run
-cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py
+cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py && cd ../..
 ```
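Note (not part of the diff): before running any template on a Mac, it is worth confirming that the nightly wheel installed above actually exposes the MPS backend. A minimal check, assuming PyTorch >= 1.12 (where `torch.backends.mps` first appeared); `check_mps.py` is not a file in this PR:

```python
# check_mps.py -- standalone sanity check, not part of this PR
import torch

if torch.backends.mps.is_available():
    print("MPS is available; run the templates with -is_mac")
elif torch.backends.mps.is_built():
    print("PyTorch was built with MPS support, but no usable MPS device was found")
else:
    print("This PyTorch build has no MPS support; -is_mac will not work")
```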
diff --git a/templates/nanoGPT/experiment.py b/templates/nanoGPT/experiment.py
index 303aebde..2eab08dd 100644
--- a/templates/nanoGPT/experiment.py
+++ b/templates/nanoGPT/experiment.py
@@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 # --- END model.py ---
 
-def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
+def train(dataset="shakespeare_char", out_dir="run_0", is_mac=False, seed_offset=0):
     # -----------------------------------------------------------------------------
     # default config values designed to train a gpt2 (124M) on OpenWebText
     # data
@@ -348,13 +348,13 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
     # DDP settings
     backend = "nccl"  # 'nccl', 'gloo', etc.
     # system
-    device = "cuda"  # Always use CUDA
+    device = "mps" if is_mac else "cuda"  # use MPS on Mac, otherwise CUDA
     dtype = (
         "bfloat16"
         if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
         else "float16"
     )  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
-    compile = True
+    compile = not is_mac  # torch.compile is not supported on MPS, so skip it on Macs
 
     # various inits, derived attributes, I/O setup
     # if not ddp, we are running on a single gpu, and one process
@@ -679,6 +680,8 @@ def get_lr(it):
 
 parser = argparse.ArgumentParser(description="Run experiment")
 parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
+parser.add_argument("-is_mac", action="store_true", help="Indicate that you are using a Mac")
+
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -689,12 +692,15 @@ def get_lr(it):
     }
 
     out_dir = args.out_dir
+
+    is_mac = args.is_mac
+
     all_results = {}
     final_infos = {}
     for dataset in ["shakespeare_char", "enwik8", "text8"]:
         final_info_list = []
         for seed_offset in range(num_seeds[dataset]):
-            final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
+            final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
             all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
             all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
             all_results[f"{dataset}_{seed_offset}_val_info"] = val_info
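The three changes above (device, dtype, compile) are the whole of the Mac port: MPS replaces CUDA as the device, the existing dtype expression already degrades to `float16` because `torch.cuda.is_available()` is `False` on Apple Silicon, and `torch.compile` is skipped since it does not support the MPS backend in this setup. A standalone sketch of the same selection logic; `pick_device` is a hypothetical helper for illustration, not code from this PR:

```python
# Hypothetical helper mirroring the selection logic this diff adds to train().
import torch

def pick_device(is_mac: bool):
    device = "mps" if is_mac else "cuda"
    # On MPS, torch.cuda.is_available() is False, so dtype resolves to float16
    dtype = (
        "bfloat16"
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else "float16"
    )
    use_compile = not is_mac  # torch.compile targets CUDA here; skip on MPS
    return device, dtype, use_compile

print(pick_device(is_mac=True))  # ('mps', 'float16', False)
```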
diff --git a/templates/nanoGPT_lite/experiment.py b/templates/nanoGPT_lite/experiment.py
index 892ba72a..55183dac 100644
--- a/templates/nanoGPT_lite/experiment.py
+++ b/templates/nanoGPT_lite/experiment.py
@@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 # --- END model.py ---
 
-def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
+def train(dataset="shakespeare_char", out_dir="run_0", is_mac=False, seed_offset=0):
     # -----------------------------------------------------------------------------
     # default config values designed to train a gpt2 (124M) on OpenWebText
     # data
@@ -348,13 +348,13 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
     # DDP settings
     backend = "nccl"  # 'nccl', 'gloo', etc.
     # system
-    device = "cuda"  # Always use CUDA
+    device = "mps" if is_mac else "cuda"  # use MPS on Mac, otherwise CUDA
     dtype = (
         "bfloat16"
         if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
         else "float16"
     )  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
-    compile = True
+    compile = not is_mac  # torch.compile is not supported on MPS, so skip it on Macs
 
     # various inits, derived attributes, I/O setup
     # if not ddp, we are running on a single gpu, and one process
@@ -677,6 +679,7 @@ def get_lr(it):
 
 parser = argparse.ArgumentParser(description="Run experiment")
 parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
+parser.add_argument("-is_mac", action="store_true", help="Indicate that you are using a Mac")
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -685,12 +688,14 @@ def get_lr(it):
     }
 
     out_dir = args.out_dir
+    is_mac = args.is_mac
+
     all_results = {}
     final_infos = {}
     for dataset in num_seeds.keys():
         final_info_list = []
         for seed_offset in range(num_seeds[dataset]):
-            final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
+            final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
             all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
             all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
             all_results[f"{dataset}_{seed_offset}_val_info"] = val_info
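For reference, the `-is_mac` flag both files add is a single-dash long option; argparse still stores it as `args.is_mac`, and `store_true` makes it default to `False` until the flag is passed. A minimal sketch of the wiring, runnable on its own (the argv list simply simulates a command line):

```python
import argparse

parser = argparse.ArgumentParser(description="Run experiment")
parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
# Single-dash long options are legal in argparse; the dest is still "is_mac".
parser.add_argument("-is_mac", action="store_true", help="Indicate that you are using a Mac")

args = parser.parse_args(["--out_dir", "run_1", "-is_mac"])  # simulated CLI input
print(args.out_dir, args.is_mac)  # -> run_1 True
```

A double-dash `--is_mac` would be the more conventional spelling, but the single-dash form is kept here to match the commands the README documents.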