SakanaAI · junhua · Aug 22, 2024 · Aug 22, 2024 · Aug 28, 2024 · Aug 28, 2024
diff --git a/README.md b/README.md
@@ -51,6 +51,7 @@ This code was designed for NVIDIA GPUs with CUDA using PyTorch. Support for othe
 
 ### Installation
 
+For Linux:
 ```bash
 conda create -n ai_scientist python=3.11
 conda activate ai_scientist
@@ -60,9 +61,18 @@ sudo apt-get install texlive-full
 # Install pypi requirements
 pip install -r requirements.txt
 ```
-
 When installing `texlive-full`, you may need to [hold Enter](https://askubuntu.com/questions/956006/pregenerating-context-markiv-format-this-may-take-some-time-takes-forever).
 
+For Mac:
+
+Install `MacTeX`: [Link](https://mirror.ctan.org/systems/mac/mactex/MacTeX.pkg)
+```bash
+conda create -n ai_scientist python=3.11
+conda activate ai_scientist
+pip install -r requirements.txt
+pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+```
+
 ### Supported Models and API Keys
 
 We support a wide variety of models including open-weight and API-only models. In general, we recommend only using frontier models above the capability of the original GPT-4.
@@ -71,10 +81,18 @@ We support a wide variety of models including open-weight and API-only models. I
 
 By default, this uses the `OPENAI_API_KEY` environment variable.
 
+```bash
+export OPENAI_API_KEY="YOUR KEY HERE"
+```
+
 #### Anthropic API (Claude Sonnet 3.5)
 
 By default, this uses the `ANTHROPIC_API_KEY` environment variable.
 
+```bash
+export ANTHROPIC_API_KEY="YOUR KEY HERE"
+```
+
 ##### Claude models via Bedrock
 
 For Claude models provided by [Amazon Bedrock](https://aws.amazon.com/bedrock/), please install these additional packages:
@@ -138,7 +156,13 @@ python data/text8/prepare.py
 ```bash
 # Set up NanoGPT baseline run
 # NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
-cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
+cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py && cd ../..
+```
+
+For Mac users, add `-is_mac` as an argument:
+```bash
+# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
+cd templates/nanoGPT && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
 ```
 
 #### Create NanoGPT_lite baseline run. We use this for sanity-checking
@@ -147,6 +171,13 @@ cd templates/nanoGPT && python experiment.py --out_dir run_0 && python plot.py
 cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 && python plot.py
 ```
 
+For Mac users, add `-is_mac` as an argument:
+
+```bash
+# NOTE: YOU MUST FIRST RUN THE PREPARE SCRIPTS ABOVE!
+cd templates/nanoGPT_lite && python experiment.py --out_dir run_0 -is_mac && python plot.py && cd ../..
+```
+
 ### Setup 2D Diffusion
 
 ```bash
@@ -157,7 +188,7 @@ pip install .
 pip install scikit-learn
 
 # Set up 2D Diffusion baseline run
-cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py
+cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot.py && cd ../..
 ```
 
 ### Setup Grokking
@@ -167,7 +198,7 @@ cd templates/2d_diffusion && python experiment.py --out_dir run_0 && python plot
 pip install einops
 
 # Set up Grokking baseline run
-cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py
+cd templates/grokking && python experiment.py --out_dir run_0 && python plot.py && cd ../..
 ```
 
 

diff --git a/templates/nanoGPT/experiment.py b/templates/nanoGPT/experiment.py
@@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 
 # --- END model.py ---
-def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
+def train(dataset="shakespeare_char", out_dir="run_0", is_mac=True, seed_offset=0):
     # -----------------------------------------------------------------------------
     # default config values designed to train a gpt2 (124M) on OpenWebText
     # data
@@ -348,13 +348,14 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
     # DDP settings
     backend = "nccl"  # 'nccl', 'gloo', etc.
     # system
-    device = "cuda"  # Always use CUDA
+    device =  "mps" if is_mac else "cuda"  # Use Apple MPS on Mac, CUDA otherwise
+    # device = "cuda"  # Always use CUDA
     dtype = (
         "bfloat16"
         if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
         else "float16"
     )  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
-    compile = True  # do not torch compile the model on macbooks
+    compile = not is_mac  # do not torch compile the model on macbooks
 
     # various inits, derived attributes, I/O setup
     # if not ddp, we are running on a single gpu, and one process
@@ -679,6 +680,8 @@ def get_lr(it):
 
 parser = argparse.ArgumentParser(description="Run experiment")
 parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
+parser.add_argument('-is_mac', action='store_true', help="Indicate that you are using a Mac")
+
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -689,12 +692,15 @@ def get_lr(it):
     }
 
     out_dir = args.out_dir
+
+    is_mac = args.is_mac
+
     all_results = {}
     final_infos = {}
     for dataset in ["shakespeare_char", "enwik8", "text8"]:
         final_info_list = []
         for seed_offset in range(num_seeds[dataset]):
-            final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
+            final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
             all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
             all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
             all_results[f"{dataset}_{seed_offset}_val_info"] = val_info

diff --git a/templates/nanoGPT_lite/experiment.py b/templates/nanoGPT_lite/experiment.py
@@ -313,7 +313,7 @@ def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
 
 
 # --- END model.py ---
-def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
+def train(dataset="shakespeare_char", out_dir="run_0", is_mac=False, seed_offset=0):
     # -----------------------------------------------------------------------------
     # default config values designed to train a gpt2 (124M) on OpenWebText
     # data
@@ -348,13 +348,15 @@ def train(dataset="shakespeare_char", out_dir="run_0", seed_offset=0):
     # DDP settings
     backend = "nccl"  # 'nccl', 'gloo', etc.
     # system
-    device = "cuda"  # Always use CUDA
+    # device = "cuda"  # Always use CUDA
+    device = "mps" if is_mac else "cuda"
     dtype = (
         "bfloat16"
         if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
         else "float16"
     )  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
-    compile = True  # do not torch compile the model on macbooks
+    # compile = True  # do not torch compile the model on macbooks
+    compile = not is_mac  # do not torch compile the model on macbooks
 
     # various inits, derived attributes, I/O setup
     # if not ddp, we are running on a single gpu, and one process
@@ -677,6 +679,7 @@ def get_lr(it):
 
 parser = argparse.ArgumentParser(description="Run experiment")
 parser.add_argument("--out_dir", type=str, default="run_0", help="Output directory")
+parser.add_argument("-is_mac", action="store_true",  help="Indicate that you are using a Mac")
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -685,12 +688,14 @@ def get_lr(it):
     }
 
     out_dir = args.out_dir
+    is_mac = args.is_mac
+
     all_results = {}
     final_infos = {}
     for dataset in num_seeds.keys():
         final_info_list = []
         for seed_offset in range(num_seeds[dataset]):
-            final_info, train_info, val_info = train(dataset, out_dir, seed_offset)
+            final_info, train_info, val_info = train(dataset, out_dir, is_mac, seed_offset)
             all_results[f"{dataset}_{seed_offset}_final_info"] = final_info
             all_results[f"{dataset}_{seed_offset}_train_info"] = train_info
             all_results[f"{dataset}_{seed_offset}_val_info"] = val_info