sparklerz · sparklerz · Jun 10, 2025
diff --git a/llm-foundry-finetune/configs/finetune_mpt7b.yaml b/llm-foundry-finetune/configs/finetune_mpt7b.yaml
@@ -32,13 +32,13 @@ tokenizer:
 train_loader:
   name: finetuning
   dataset:
-    hf_name: text
+    hf_name: json
     split: train
-    # Use Hugging Face's text loader which expects plain-text files.
-    # Point to the split's .txt file generated by ``prepare_dolly.py``.
+    # Load the ``prompt``/``response`` JSONL written by ``prepare_dolly.py``.
+    # MPT-7B is decoder-only, so ``decoder_only_format`` must stay true.
     data_files:
-      train: data/dolly_15k_txt/train/train.txt
+      train: data/dolly_15k_txt/train.jsonl
     decoder_only_format: true
     shuffle: true
     max_seq_len: 1024
   drop_last: false
@@ -52,11 +52,12 @@ train_loader:
 eval_loader:
   name: finetuning
   dataset:
-    hf_name: text
+    hf_name: json
     split: validation
     data_files:
-      validation: data/dolly_15k_txt/validation/validation.txt
+      validation: data/dolly_15k_txt/validation.jsonl
+    # MPT-7B is decoder-only, so ``decoder_only_format`` must stay true.
     decoder_only_format: true
     shuffle: false
     max_seq_len: 1024
   drop_last: false