diff --git a/README.md b/README.md
index d0b153e762..9f031ca497 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@ NOTE: AutoTrain is free! You only pay for the resources you use in case you deci
| Extractive Question Answering | ✅ | Coming Soon | [extractive_qa.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/extractive_question_answering) |
| Image Classification | ✅ | Coming Soon | [image_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_classification) |
| Image Scoring/Regression | ✅ | Coming Soon | [image_regression.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_scoring) |
+| Audio Classification | ✅ | Coming Soon | [audio_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_classification) |
+| Audio Detection | ✅ | Coming Soon | [audio_detection.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_detection) |
+| Audio Segmentation | ✅ | Coming Soon | [audio_segmentation.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_segmentation) |
| VLM | 🟥 | Coming Soon | [vlm.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/vlm) |
diff --git a/colabs/audio_classification.ipynb b/colabs/audio_classification.ipynb
new file mode 100644
index 0000000000..99f37cc1f6
--- /dev/null
+++ b/colabs/audio_classification.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: audio_classification # do not change\n",
+ "base_model: facebook/wav2vec2-base # the model to be used from hugging face hub\n",
+ "project_name: autotrain-audio-classification-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " audio_column: audio\n",
+ " target_column: labels\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/audio_detection.ipynb b/colabs/audio_detection.ipynb
new file mode 100644
index 0000000000..8e4215e3fd
--- /dev/null
+++ b/colabs/audio_detection.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: audio_detection # do not change\n",
+ "base_model: facebook/wav2vec2-base # the model to be used from hugging face hub\n",
+ "project_name: autotrain-audio-detection-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " audio_column: audio\n",
+ " target_column: labels\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/audio_segmentation.ipynb b/colabs/audio_segmentation.ipynb
new file mode 100644
index 0000000000..97950e4dea
--- /dev/null
+++ b/colabs/audio_segmentation.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: audio_segmentation # do not change\n",
+ "base_model: facebook/wav2vec2-base # the model to be used from hugging face hub\n",
+ "project_name: autotrain-audio-segmentation-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " audio_column: audio\n",
+ " target_column: segments\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/clm.ipynb b/colabs/clm.ipynb
new file mode 100644
index 0000000000..bab8c59b3e
--- /dev/null
+++ b/colabs/clm.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: lm_training # do not change\n",
+ "base_model: gpt2 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-clm-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " text_column: text\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 4\n",
+ " block_size: 512\n",
+ " model_max_length: 1024\n",
+ " lr: 2e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: cosine\n",
+ " gradient_accumulation: 4\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/extractive_question_answering.ipynb b/colabs/extractive_question_answering.ipynb
new file mode 100644
index 0000000000..0e3f667741
--- /dev/null
+++ b/colabs/extractive_question_answering.ipynb
@@ -0,0 +1,64 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: extractive_question_answering # do not change\n",
+ "base_model: google-bert/bert-base-uncased # the model to be used from hugging face hub\n",
+ "project_name: autotrain-extractive-qa-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " context_column: context\n",
+ " question_column: question\n",
+ " answer_column: answers\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " max_seq_length: 384\n",
+ " lr: 2e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/generic.ipynb b/colabs/generic.ipynb
new file mode 100644
index 0000000000..22240c0cb1
--- /dev/null
+++ b/colabs/generic.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: generic # do not change\n",
+ "base_model: your_model_name # the model to be used from hugging face hub\n",
+ "project_name: autotrain-generic-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # adjust based on your data\n",
+ " text_column: text\n",
+ " target_column: labels\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " lr: 2e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/image_instance_segmentation.ipynb b/colabs/image_instance_segmentation.ipynb
new file mode 100644
index 0000000000..fb090ad366
--- /dev/null
+++ b/colabs/image_instance_segmentation.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: image_instance_segmentation # do not change\n",
+ "base_model: facebook/mask2former-swin-large-coco-instance # the model to be used from hugging face hub\n",
+ "project_name: autotrain-image-instance-segmentation-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " image_column: image\n",
+ " objects_column: objects\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 4\n",
+ " lr: 5e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 2\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/image_regression.ipynb b/colabs/image_regression.ipynb
new file mode 100644
index 0000000000..d878822302
--- /dev/null
+++ b/colabs/image_regression.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: image_regression # do not change\n",
+ "base_model: google/vit-base-patch16-224 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-image-regression-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " image_column: image\n",
+ " target_column: target\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " lr: 5e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/image_semantic_segmentation.ipynb b/colabs/image_semantic_segmentation.ipynb
new file mode 100644
index 0000000000..53a1fa4f09
--- /dev/null
+++ b/colabs/image_semantic_segmentation.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: image_semantic_segmentation # do not change\n",
+ "base_model: nvidia/segformer-b0-finetuned-ade-512-512 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-image-semantic-segmentation-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " image_column: image\n",
+ " target_column: annotation\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 4\n",
+ " lr: 6e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 2\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/object_detection.ipynb b/colabs/object_detection.ipynb
new file mode 100644
index 0000000000..31be3568a5
--- /dev/null
+++ b/colabs/object_detection.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: object_detection # do not change\n",
+ "base_model: facebook/detr-resnet-50 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-object-detection-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " image_column: image\n",
+ " objects_column: objects\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 4\n",
+ " lr: 1e-4\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 2\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/sent_transformers.ipynb b/colabs/sent_transformers.ipynb
new file mode 100644
index 0000000000..21e75922d1
--- /dev/null
+++ b/colabs/sent_transformers.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: sentence_transformers # do not change\n",
+ "base_model: sentence-transformers/all-MiniLM-L6-v2 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-sentence-transformers-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " sentence1_column: sentence1\n",
+ " sentence2_column: sentence2\n",
+ " target_column: score\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 16\n",
+ " lr: 2e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/seq2seq.ipynb b/colabs/seq2seq.ipynb
new file mode 100644
index 0000000000..68467069f1
--- /dev/null
+++ b/colabs/seq2seq.ipynb
@@ -0,0 +1,64 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: seq2seq # do not change\n",
+ "base_model: google-t5/t5-small # the model to be used from hugging face hub\n",
+ "project_name: autotrain-seq2seq-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " text_column: article\n",
+ " target_column: highlights\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " max_seq_length: 512\n",
+ " max_target_length: 128\n",
+ " lr: 3e-4\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/tabular.ipynb b/colabs/tabular.ipynb
new file mode 100644
index 0000000000..a5b994b8cc
--- /dev/null
+++ b/colabs/tabular.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: tabular # do not change\n",
+ "base_model: xgboost # can be xgboost, lightgbm, catboost, randomforest, etc.\n",
+ "project_name: autotrain-tabular-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # adjust based on your data\n",
+ " id_column: id\n",
+ " target_columns:\n",
+ " - target\n",
+ "\n",
+ "params:\n",
+ " task: classification # can be \"classification\" or \"regression\"\n",
+ " num_trials: 10 # number of hyperparameter optimization trials\n",
+ " time_limit: 600 # time limit in seconds\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/text_classification.ipynb b/colabs/text_classification.ipynb
new file mode 100644
index 0000000000..862a42ed4f
--- /dev/null
+++ b/colabs/text_classification.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: text_classification # do not change\n",
+ "base_model: google-bert/bert-base-uncased # the model to be used from hugging face hub\n",
+ "project_name: autotrain-text-classification-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " text_column: text\n",
+ " target_column: labels\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " max_seq_length: 512\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/text_regression.ipynb b/colabs/text_regression.ipynb
new file mode 100644
index 0000000000..de8f3a0427
--- /dev/null
+++ b/colabs/text_regression.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: text_regression # do not change\n",
+ "base_model: google-bert/bert-base-uncased # the model to be used from hugging face hub\n",
+ "project_name: autotrain-text-regression-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " text_column: text\n",
+ " target_column: target\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " max_seq_length: 512\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/token_classification.ipynb b/colabs/token_classification.ipynb
new file mode 100644
index 0000000000..b6cbdd7987
--- /dev/null
+++ b/colabs/token_classification.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: token_classification # do not change\n",
+ "base_model: google-bert/bert-base-uncased # the model to be used from hugging face hub\n",
+ "project_name: autotrain-token-classification-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " tokens_column: tokens\n",
+ " tags_column: ner_tags\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 8\n",
+ " max_seq_length: 512\n",
+ " lr: 2e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 1\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/colabs/vlm.ipynb b/colabs/vlm.ipynb
new file mode 100644
index 0000000000..206a45f31a
--- /dev/null
+++ b/colabs/vlm.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile config.yml\n",
+ "task: vlm # do not change\n",
+ "base_model: microsoft/kosmos-2-patch14-224 # the model to be used from hugging face hub\n",
+ "project_name: autotrain-vlm-model # the name of the project, must be unique\n",
+ "log: tensorboard # do not change\n",
+ "backend: local # do not change\n",
+ "\n",
+ "data:\n",
+ " path: data/ # the path to the data folder\n",
+ " train_split: train # this folder inside data/ will be used for training\n",
+ " valid_split: null # this folder inside data/ will be used for validation. If not available, set it to null\n",
+ " column_mapping: # do not change\n",
+ " image_column: image\n",
+ " text_column: text\n",
+ "\n",
+ "params:\n",
+ " epochs: 3\n",
+ " batch_size: 4\n",
+ " lr: 1e-5\n",
+ " optimizer: adamw_torch\n",
+ " scheduler: linear\n",
+ " gradient_accumulation: 4\n",
+ " mixed_precision: fp16\n",
+ "\n",
+ "hub:\n",
+ " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n",
+ " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n",
+ " push_to_hub: true # set to true if you want to push the model to the hub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "HF_USERNAME = userdata.get('HF_USERNAME')\n",
+ "HF_TOKEN = userdata.get('HF_TOKEN')\n",
+ "os.environ['HF_USERNAME'] = HF_USERNAME\n",
+ "os.environ['HF_TOKEN'] = HF_TOKEN\n",
+ "!autotrain --config config.yml"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/configs/audio_classification/hub_dataset.yml b/configs/audio_classification/hub_dataset.yml
new file mode 100644
index 0000000000..9e7efd132c
--- /dev/null
+++ b/configs/audio_classification/hub_dataset.yml
@@ -0,0 +1,33 @@
+task: audio-classification
+base_model: facebook/wav2vec2-base
+project_name: my-autotrain-audio-clf
+log: tensorboard
+backend: local
+
+data_path: superb
+train_split: train
+valid_split: validation
+
+column_mapping:
+ audio_column: audio
+ target_column: label
+
+parameters:
+ learning_rate: 3e-5
+ epochs: 5
+ batch_size: 8
+ warmup_ratio: 0.1
+ weight_decay: 0.01
+ mixed_precision: fp16
+ gradient_accumulation: 1
+ auto_find_batch_size: false
+ push_to_hub: false
+ logging_steps: -1
+ eval_strategy: epoch
+ save_total_limit: 1
+ early_stopping_patience: 5
+ early_stopping_threshold: 0.01
+ max_length: 480000 # 30 seconds at 16kHz
+ sampling_rate: 16000
+ feature_extractor_normalize: true
+ feature_extractor_return_attention_mask: true
\ No newline at end of file
diff --git a/configs/audio_classification/local.yml b/configs/audio_classification/local.yml
new file mode 100644
index 0000000000..0200a4fc0d
--- /dev/null
+++ b/configs/audio_classification/local.yml
@@ -0,0 +1,43 @@
+task: audio-classification
+base_model: facebook/wav2vec2-base
+project_name: my-autotrain-audio-clf-local
+log: tensorboard
+backend: local
+
+# Local data path - should contain audio files and CSV with labels
+data_path: /path/to/audio/dataset.csv
+train_split: train
+valid_split: validation
+
+column_mapping:
+ audio_column: audio_path
+ target_column: label
+
+parameters:
+ learning_rate: 3e-5
+ epochs: 5
+ batch_size: 8
+ warmup_ratio: 0.1
+ weight_decay: 0.01
+ mixed_precision: fp16
+ gradient_accumulation: 1
+ auto_find_batch_size: false
+ push_to_hub: false
+ logging_steps: -1
+ eval_strategy: epoch
+ save_total_limit: 1
+ early_stopping_patience: 5
+ early_stopping_threshold: 0.01
+ max_length: 480000 # 30 seconds at 16kHz
+ sampling_rate: 16000
+ feature_extractor_normalize: true
+ feature_extractor_return_attention_mask: true
+
+# Note: For local audio classification:
+# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
+# - label column should contain class labels (strings or integers)
+# - CSV format: audio_path,label
+# Example:
+# /path/to/audio1.wav,speech
+# /path/to/audio2.wav,music
+# /path/to/audio3.wav,noise
\ No newline at end of file
diff --git a/configs/audio_detection/hub_dataset.yml b/configs/audio_detection/hub_dataset.yml
new file mode 100644
index 0000000000..e2040dbced
--- /dev/null
+++ b/configs/audio_detection/hub_dataset.yml
@@ -0,0 +1,47 @@
+task: audio-detection
+base_model: facebook/wav2vec2-base
+project_name: my-autotrain-audio-detection-hub
+log: tensorboard
+backend: local
+
+# Hub dataset configuration
+data_path: audiofolder/audio_detection_dataset
+train_split: train
+valid_split: validation
+
+column_mapping:
+ audio_column: audio
+ events_column: events
+
+parameters:
+ learning_rate: 3e-5
+ epochs: 3
+ batch_size: 8
+ warmup_ratio: 0.1
+ weight_decay: 0.01
+ mixed_precision: fp16
+ gradient_accumulation: 1
+ auto_find_batch_size: false
+ push_to_hub: false
+ logging_steps: -1
+ eval_strategy: epoch
+ save_total_limit: 1
+ early_stopping_patience: 5
+ early_stopping_threshold: 0.01
+ max_length: 480000 # 30 seconds at 16kHz
+ sampling_rate: 16000
+ event_overlap_threshold: 0.5 # IoU threshold for overlapping events
+ confidence_threshold: 0.1 # Minimum confidence threshold for event detection
+
+# Hub settings
+hub:
+ username: ${HF_USERNAME}
+ token: ${HF_TOKEN}
+ push_to_hub: true
+
+# Note: For hub audio detection datasets:
+# - The dataset should have 'audio' and 'events' columns
+# - Events should be formatted as a list of dictionaries:
+# [{"start": 0.0, "end": 2.5, "label": "speech"}, {"start": 2.5, "end": 3.0, "label": "silence"}]
+# - Audio column should contain audio data (array or file paths)
+# - Similar to object detection but for temporal events in audio
\ No newline at end of file
diff --git a/configs/audio_detection/local.yml b/configs/audio_detection/local.yml
new file mode 100644
index 0000000000..8db2f96d8d
--- /dev/null
+++ b/configs/audio_detection/local.yml
@@ -0,0 +1,43 @@
+task: audio-detection
+base_model: facebook/wav2vec2-base
+project_name: my-autotrain-audio-detection-local
+log: tensorboard
+backend: local
+
+# Local data path - should contain audio files and CSV with event annotations
+data_path: /path/to/audio/dataset.csv
+train_split: train
+valid_split: validation
+
+column_mapping:
+ audio_column: audio_path
+ events_column: events
+
+parameters:
+ learning_rate: 3e-5
+ epochs: 3
+ batch_size: 8
+ warmup_ratio: 0.1
+ weight_decay: 0.01
+ mixed_precision: fp16
+ gradient_accumulation: 1
+ auto_find_batch_size: false
+ push_to_hub: false
+ logging_steps: -1
+ eval_strategy: epoch
+ save_total_limit: 1
+ early_stopping_patience: 5
+ early_stopping_threshold: 0.01
+ max_length: 480000 # 30 seconds at 16kHz
+ sampling_rate: 16000
+ event_overlap_threshold: 0.5 # IoU threshold for overlapping events
+ confidence_threshold: 0.1 # Minimum confidence threshold for event detection
+
+# Note: For local audio detection:
+# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
+# - events column should contain event annotations as JSON list
+# - CSV format: audio_path,events
+# Example:
+# /path/to/audio1.wav,"[{""start"": 0.0, ""end"": 2.5, ""label"": ""speech""}, {""start"": 2.5, ""end"": 3.0, ""label"": ""silence""}]"
+# /path/to/audio2.wav,"[{""start"": 1.0, ""end"": 4.0, ""label"": ""music""}, {""start"": 4.0, ""end"": 5.0, ""label"": ""noise""}]"
+# /path/to/audio3.wav,"[{""start"": 0.5, ""end"": 3.5, ""label"": ""car_crash""}]"
\ No newline at end of file
diff --git a/configs/audio_segmentation/hub_dataset.yml b/configs/audio_segmentation/hub_dataset.yml
new file mode 100644
index 0000000000..b6f2c2c968
--- /dev/null
+++ b/configs/audio_segmentation/hub_dataset.yml
@@ -0,0 +1,49 @@
+task: audio_segmentation
+base_model: microsoft/speecht5_vc
+project_name: autotrain-audio-segmentation-hub
+log: tensorboard
+backend: spaces-a10g-large
+
+# Hub dataset configuration
+data_path: audiofolder/audio_segmentation_dataset
+train_split: train
+valid_split: validation
+audio_column: audio
+target_column: segments
+
+# Training parameters
+epochs: 10
+batch_size: 16
+lr: 2e-5
+scheduler: cosine
+optimizer: adamw_torch
+weight_decay: 0.01
+warmup_ratio: 0.05
+gradient_accumulation: 2
+mixed_precision: fp16
+logging_steps: 25
+save_total_limit: 5
+eval_strategy: steps
+early_stopping_patience: 5
+early_stopping_threshold: 0.005
+
+# Audio specific parameters
+max_length: 320000 # 20 seconds at 16kHz (shorter for better memory usage)
+sampling_rate: 16000
+feature_extractor_normalize: true
+feature_extractor_return_attention_mask: true
+
+# Segmentation specific parameters
+segment_length: 3.0 # seconds (shorter segments for better granularity)
+overlap_length: 0.3 # seconds
+min_segment_length: 0.5 # seconds
+
+# Model parameters
+seed: 42
+max_grad_norm: 1.0
+auto_find_batch_size: true
+push_to_hub: true
+
+# Hub settings
+token: ${HF_TOKEN}
+username: ${HF_USERNAME}
\ No newline at end of file
diff --git a/configs/audio_segmentation/local.yml b/configs/audio_segmentation/local.yml
new file mode 100644
index 0000000000..ca9e78f124
--- /dev/null
+++ b/configs/audio_segmentation/local.yml
@@ -0,0 +1,44 @@
+task: audio_segmentation
+base_model: microsoft/speecht5_vc
+project_name: autotrain-audio-segmentation-local
+log: tensorboard
+backend: local
+
+data_path: data/
+train_split: train
+valid_split: validation
+audio_column: audio_path
+target_column: segments
+
+# Training parameters
+epochs: 5
+batch_size: 8
+lr: 3e-5
+scheduler: linear
+optimizer: adamw_torch
+weight_decay: 0.01
+warmup_ratio: 0.1
+gradient_accumulation: 1
+mixed_precision: fp16
+logging_steps: 50
+save_total_limit: 3
+eval_strategy: epoch
+early_stopping_patience: 3
+early_stopping_threshold: 0.01
+
+# Audio specific parameters
+max_length: 480000 # 30 seconds at 16kHz
+sampling_rate: 16000
+feature_extractor_normalize: true
+feature_extractor_return_attention_mask: true
+
+# Segmentation specific parameters
+segment_length: 5.0 # seconds
+overlap_length: 0.5 # seconds
+min_segment_length: 1.0 # seconds
+
+# Model parameters
+seed: 42
+max_grad_norm: 1.0
+auto_find_batch_size: false
+push_to_hub: false
\ No newline at end of file
diff --git a/configs/image_instance_segmentation/hub_dataset.yml b/configs/image_instance_segmentation/hub_dataset.yml
new file mode 100644
index 0000000000..fa0856b252
--- /dev/null
+++ b/configs/image_instance_segmentation/hub_dataset.yml
@@ -0,0 +1,32 @@
+task: image_instance_segmentation
+base_model: facebook/detr-resnet-50-panoptic
+data_path: username/dataset_name
+train_split: train
+valid_split: validation
+image_column: image
+target_column: instance_mask
+bbox_column: bbox
+category_column: category
+epochs: 3
+batch_size: 2
+lr: 5e-5
+seed: 42
+gradient_accumulation: 1
+eval_strategy: epoch
+save_total_limit: 1
+auto_find_batch_size: false
+mixed_precision: null
+warmup_ratio: 0.1
+weight_decay: 0.0
+optimizer: adamw_torch
+scheduler: linear
+project_name: instance-segmentation-model
+log: none
+early_stopping_patience: 5
+early_stopping_threshold: 0.01
+push_to_hub: false
+repo_id: username/repo_name
+token: your_hf_token_here
+ignore_mismatched_sizes: true
+reduce_labels: false
+max_instances: 100
\ No newline at end of file
diff --git a/configs/image_instance_segmentation/local.yml b/configs/image_instance_segmentation/local.yml
new file mode 100644
index 0000000000..cc8fb924a9
--- /dev/null
+++ b/configs/image_instance_segmentation/local.yml
@@ -0,0 +1,32 @@
+task: image_instance_segmentation
+base_model: facebook/detr-resnet-50-panoptic
+data_path: data/
+train_split: train
+valid_split: validation
+image_column: image
+target_column: instance_mask
+bbox_column: bbox
+category_column: category
+epochs: 3
+batch_size: 2
+lr: 5e-5
+seed: 42
+gradient_accumulation: 1
+eval_strategy: epoch
+save_total_limit: 1
+auto_find_batch_size: false
+mixed_precision: null
+warmup_ratio: 0.1
+weight_decay: 0.0
+optimizer: adamw_torch
+scheduler: linear
+project_name: instance-segmentation-model
+log: none
+early_stopping_patience: 5
+early_stopping_threshold: 0.01
+push_to_hub: false
+repo_id: username/repo_name
+token: your_hf_token_here
+ignore_mismatched_sizes: true
+reduce_labels: false
+max_instances: 100
\ No newline at end of file
diff --git a/configs/image_semantic_segmentation/hub_dataset.yml b/configs/image_semantic_segmentation/hub_dataset.yml
new file mode 100644
index 0000000000..28360e2c41
--- /dev/null
+++ b/configs/image_semantic_segmentation/hub_dataset.yml
@@ -0,0 +1,36 @@
+task: image_semantic_segmentation
+base_model: nvidia/mit-b0
+project_name: autotrain-image-semantic-segmentation-hub
+log: tensorboard
+backend: spaces-a10g-large
+
+# Hub dataset configuration
+data_path: your_username/your_segmentation_dataset
+train_split: train
+valid_split: validation
+column_mapping:
+ image_column: image
+ target_column: segmentation_mask
+
+# Training parameters
+params:
+ epochs: 10
+ batch_size: 4
+ lr: 3e-5
+ optimizer: adamw_torch
+ scheduler: linear
+ gradient_accumulation: 2
+ mixed_precision: fp16
+ ignore_mismatched_sizes: true
+ reduce_labels: false
+ logging_steps: 50
+ save_total_limit: 3
+ eval_strategy: epoch
+ early_stopping_patience: 3
+ early_stopping_threshold: 0.01
+
+# Hub settings
+hub:
+ username: ${HF_USERNAME}
+ token: ${HF_TOKEN}
+ push_to_hub: true
\ No newline at end of file
diff --git a/configs/image_semantic_segmentation/local.yml b/configs/image_semantic_segmentation/local.yml
new file mode 100644
index 0000000000..057b588414
--- /dev/null
+++ b/configs/image_semantic_segmentation/local.yml
@@ -0,0 +1,29 @@
+task: image_semantic_segmentation
+base_model: nvidia/mit-b0
+project_name: autotrain-image-semantic-segmentation-model
+log: tensorboard
+backend: local
+
+data:
+ path: data/
+ train_split: train # this folder inside data/ will be used for training, it contains the images and masks.
+ valid_split: null
+ column_mapping:
+ image_column: image
+ target_column: segmentation_mask
+
+params:
+ epochs: 3
+ batch_size: 1
+ lr: 5e-5
+ optimizer: adamw_torch
+ scheduler: linear
+ gradient_accumulation: 1
+ mixed_precision: fp16
+ ignore_mismatched_sizes: true
+ reduce_labels: false
+
+hub:
+ username: ${HF_USERNAME}
+ token: ${HF_TOKEN}
+ push_to_hub: true
\ No newline at end of file
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index fbb26c0f87..f1e56871da 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -31,6 +31,8 @@
title: Image Classification / Regression
- local: tasks/object_detection
title: Object Detection
+ - local: tasks/audio
+ title: Audio Tasks
- local: tasks/seq2seq
title: Seq2Seq
- local: tasks/token_classification
diff --git a/docs/source/tasks/audio.mdx b/docs/source/tasks/audio.mdx
new file mode 100644
index 0000000000..01c12365e6
--- /dev/null
+++ b/docs/source/tasks/audio.mdx
@@ -0,0 +1,193 @@
+# Audio Tasks with AutoTrain Advanced
+
+AutoTrain Advanced supports multiple audio-related machine learning tasks. This guide covers the supported tasks, the expected data formats, column mappings, and training configuration for audio models.
+
+## Supported Audio Tasks
+
+AutoTrain supports three main audio tasks:
+- **Audio Classification**: Classify audio files into categories
+- **Audio Segmentation**: Segment audio into different classes over time
+- **Audio Detection**: Detect and locate specific events in audio
+
+Config file task names:
+- `audio-classification` / `audio_classification`
+- `audio-segmentation` / `audio_segmentation`
+- `audio-detection` / `audio_detection`
+
+## Data Format
+
+All audio tasks support data in ZIP format containing audio files and metadata.
+
+### Audio Classification
+
+For audio classification, your data should be in ZIP format with the following structure:
+
+```
+dataset.zip
+├── audio1.wav
+├── audio2.wav
+├── audio3.mp3
+└── metadata.jsonl
+```
+
+The `metadata.jsonl` file should contain:
+
+```json
+{"file_name": "audio1.wav", "label": "speech"}
+{"file_name": "audio2.wav", "label": "music"}
+{"file_name": "audio3.mp3", "label": "noise"}
+```
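+
+If your clips live in a local folder, a zip in this layout can be built with a few lines of Python. This is a minimal sketch: the `clips/` folder name and the label mapping are hypothetical, and only the `file_name`/`label` keys come from the format above.
+
+```python
+import json
+import zipfile
+from pathlib import Path
+
+# Hypothetical file -> label mapping; replace with your own annotations
+labels = {"audio1.wav": "speech", "audio2.wav": "music", "audio3.mp3": "noise"}
+
+with zipfile.ZipFile("dataset.zip", "w") as zf:
+    for name in labels:
+        zf.write(Path("clips") / name, arcname=name)  # audio files at the zip root
+    # One JSON object per line, matching the metadata.jsonl example above
+    zf.writestr("metadata.jsonl", "\n".join(json.dumps({"file_name": n, "label": l}) for n, l in labels.items()))
+```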
+
+Example use cases:
+- Speech vs music classification
+- Emotion recognition from audio
+- Environmental sound classification
+
+### Audio Segmentation
+
+For audio segmentation, use the same ZIP structure but with temporal segment annotations:
+
+```json
+{"file_name": "audio1.wav", "segments": [{"start": 0.0, "end": 2.5, "label": "speech"}, {"start": 2.5, "end": 5.0, "label": "music"}]}
+{"file_name": "audio2.wav", "segments": [{"start": 0.0, "end": 1.0, "label": "silence"}, {"start": 1.0, "end": 3.0, "label": "speech"}]}
+```
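+
+Segment annotations are easy to get subtly wrong. A small standalone validator like the sketch below (not part of AutoTrain, and assuming segments should be ordered and non-overlapping) can catch inverted or overlapping spans before training:
+
+```python
+def validate_segments(segments):
+    """Check one file's segments: start < end, sorted, non-overlapping."""
+    prev_end = 0.0
+    for seg in sorted(segments, key=lambda s: s["start"]):
+        assert 0.0 <= seg["start"] < seg["end"], f"bad bounds: {seg}"
+        assert seg["start"] >= prev_end, f"overlaps previous segment: {seg}"
+        prev_end = seg["end"]
+
+validate_segments([{"start": 0.0, "end": 2.5, "label": "speech"},
+                   {"start": 2.5, "end": 5.0, "label": "music"}])
+```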
+
+Example use cases:
+- Speaker diarization
+- Music genre segmentation
+- Audio scene analysis
+
+### Audio Detection
+
+For audio detection, provide event annotations with precise timing:
+
+```json
+{"file_name": "audio1.wav", "events": [{"start": 1.23, "end": 1.87, "label": "car_horn"}, {"start": 3.45, "end": 4.12, "label": "siren"}]}
+{"file_name": "audio2.wav", "events": [{"start": 0.5, "end": 2.1, "label": "dog_bark"}]}
+```
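+
+The detection configs expose an `event_overlap_threshold` parameter described as an IoU threshold for overlapping events. For intuition, temporal IoU between two events can be computed as below; this is an illustrative sketch, not AutoTrain's internal implementation:
+
+```python
+def temporal_iou(a, b):
+    """Intersection-over-union of two {"start", "end"} events on the time axis."""
+    inter = max(0.0, min(a["end"], b["end"]) - max(a["start"], b["start"]))
+    union = (a["end"] - a["start"]) + (b["end"] - b["start"]) - inter
+    return inter / union if union > 0 else 0.0
+
+print(temporal_iou({"start": 1.0, "end": 3.0}, {"start": 2.0, "end": 4.0}))  # ~0.33
+```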
+
+Example use cases:
+- Sound event detection
+- Anomaly detection in audio
+- Audio surveillance systems
+
+## Column Mapping
+
+### Audio Classification
+Your dataset columns should map to:
+- `audio_column`: Path to audio files (default: "audio_path")
+- `target_column`: Classification labels (default: "intent")
+
+### Audio Segmentation
+Your dataset columns should map to:
+- `audio_column`: Path to audio files (default: "audio_path")
+- `target_column`: Segment annotations (default: "segments")
+
+### Audio Detection
+Your dataset columns should map to:
+- `audio_column`: Path to audio files (default: "audio_path")
+- `events_column`: Event annotations (default: "events")
+
+## Training
+
+### Local Training
+
+To train an audio model locally, use:
+
+```bash
+autotrain --config config.yaml
+```
+
+Example configuration for audio classification:
+
+```yaml
+task: audio-classification
+base_model: facebook/wav2vec2-base
+project_name: my-audio-classifier
+log: tensorboard
+backend: local
+
+data:
+ path: ./my-audio-dataset.zip
+ train_split: train
+ valid_split: validation
+ column_mapping:
+ audio_column: audio_path
+ target_column: intent
+
+params:
+ lr: 3e-5
+ epochs: 10
+ batch_size: 8
+ seed: 42
+ eval_strategy: epoch
+ save_total_limit: 1
+ auto_find_batch_size: true
+
+hub:
+ username: your-username
+ token: ${HF_TOKEN}
+ push_to_hub: true
+```
+
+### Recommended Models
+
+**Audio Classification:**
+- `facebook/wav2vec2-base`
+- `microsoft/unispeech-sat-base`
+- `facebook/hubert-base-ls960`
+
+**Audio Segmentation:**
+- `facebook/wav2vec2-base`
+- `microsoft/wavlm-base`
+
+**Audio Detection:**
+- `facebook/wav2vec2-base`
+- `microsoft/unispeech-sat-base`
+
+## Tips and Best Practices
+
+1. **Audio Format**: WAV, MP3, and FLAC formats are supported
+2. **Sample Rate**: Most models work best with 16kHz audio
+3. **Duration**: Keep audio clips under 30 seconds for best performance
+4. **Data Quality**: Ensure consistent audio quality across your dataset
+5. **Balanced Dataset**: Try to have balanced classes for classification tasks
+6. **Validation Split**: Reserve 10-20% of data for validation
+
+## Python API
+
+You can also launch training from Python. The sketch below uses the `AudioClassificationParams` and `AutoTrainProject` classes (the same imports used in the bundled notebooks); treat the exact field names as assumptions and check the params class for your installed version:
+
+```python
+from autotrain.project import AutoTrainProject
+from autotrain.trainers.audio_classification.params import AudioClassificationParams
+
+# Field names below mirror the YAML configs in configs/audio_classification/
+params = AudioClassificationParams(
+    model="facebook/wav2vec2-base",
+    data_path="./audio-dataset.zip",
+    project_name="my-audio-classifier",
+    lr=3e-5,
+    epochs=10,
+    batch_size=8,
+)
+
+project = AutoTrainProject(params=params, backend="local", process=True)
+project.create()
+```
+
+## Inference
+
+After training, use your model for inference:
+
+```python
+from transformers import pipeline
+
+# Load your trained model
+classifier = pipeline(
+ "audio-classification",
+ model="your-username/my-audio-classifier"
+)
+
+# Classify audio
+result = classifier("path/to/audio.wav")
+print(result)
+```
\ No newline at end of file
diff --git a/docs/source/tasks/image_semantic_segmentation.mdx b/docs/source/tasks/image_semantic_segmentation.mdx
new file mode 100644
index 0000000000..345a97da9e
--- /dev/null
+++ b/docs/source/tasks/image_semantic_segmentation.mdx
@@ -0,0 +1,180 @@
+# Image Semantic Segmentation
+
+Image semantic segmentation is a computer vision task that involves assigning each pixel in an image to a specific class or category. AutoTrain simplifies the process, enabling you to train a state-of-the-art semantic segmentation model from labeled images and their corresponding segmentation masks.
+
+## Preparing your data
+
+AutoTrain supports the **standard semantic segmentation format** used by popular annotation tools like CVAT, Supervisely, and Pascal VOC. You can provide your data as either a **zip file** or a **directory**.
+
+### Supported Input Formats
+
+**Option 1: Zip File** (Recommended)
+Upload a single zip file containing your entire dataset.
+
+**Option 2: Directory**
+Upload a directory with the proper structure.
+
+**Option 3: Hugging Face Dataset**
+Use a Hugging Face dataset with column mapping.
+
+### Directory Structure
+
+AutoTrain automatically detects various common naming conventions:
+
+```
+segmentation_data.zip # or segmentation_data/
+├── images/ # or: img/, image/, imgs/
+│ ├── image1.jpg
+│ ├── image2.png
+│ └── ...
+├── masks/ # or: mask/, annotations/, ann/, segmentations/, labels/
+│ ├── image1.png # Same base name as corresponding image
+│ ├── image2.png
+│ └── ...
+└── classes.txt # OPTIONAL: class names (one per line)
+```
+
+### Alternative: Flat Structure
+```
+segmentation_data/
+├── image1.jpg # Images and masks in same directory
+├── image1.png # (masks distinguished by .png extension)
+├── image2.jpg
+├── image2.png
+└── classes.txt # Optional
+```
+
+### File Requirements
+
+**Images:**
+- Formats: JPG, JPEG, PNG, BMP, WEBP, TIFF, TIF, JFIF, AVIF, HEIC, HEIF
+- Any resolution (will be automatically resized for training)
+
+**Masks:**
+- Format: PNG (grayscale)
+- Pixel values represent class IDs:
+ - 0 = background
+ - 1 = first class
+ - 2 = second class
+ - etc.
+- Same base filename as corresponding image
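+
+For reference, a mask that meets these requirements can be written with NumPy and Pillow. A minimal sketch, with a made-up class layout:
+
+```python
+import numpy as np
+from PIL import Image
+
+# Hypothetical classes: 0 = background, 1 = person, 2 = car
+mask = np.zeros((512, 512), dtype=np.uint8)
+mask[100:200, 100:300] = 1  # a "person" region
+mask[300:400, 50:250] = 2   # a "car" region
+
+# A 2-D uint8 array saves as a single-channel (grayscale) PNG;
+# use the same base name as the corresponding image (e.g. image1.jpg)
+Image.fromarray(mask).save("image1.png")
+```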
+
+**Classes File (Optional):**
+- Filename: `classes.txt`, `labelmap.txt`, or `labels.txt`
+- Format: One class name per line
+- If not provided, classes will be auto-named as `class_0`, `class_1`, etc.
+
+Example classes.txt:
+```
+background
+person
+car
+bicycle
+dog
+cat
+```
+
+### Compatible with Annotation Tools
+
+This format is directly compatible with exports from:
+- **CVAT** (Computer Vision Annotation Tool)
+- **Supervisely**
+- **Labelme**
+- **Pascal VOC segmentation format**
+- **VGG Image Annotator (VIA)**
+
+### Hugging Face Dataset Format
+
+For Hugging Face datasets, use the column mapping:
+- `image_column`: column containing the input images
+- `target_column`: column containing the segmentation masks
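+
+To find the names to map, it helps to inspect the dataset first with the `datasets` library. A sketch, where the dataset id and column names are placeholders:
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("username/dataset_name", split="train")
+print(ds.column_names)  # e.g. ['image', 'annotation'] -> image_column / target_column
+```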
+
+## Configuration Options
+
+### Key Parameters
+
+- `model`: Pre-trained model to use (default: "nvidia/mit-b0")
+- `batch_size`: Training batch size (default: 2, smaller due to memory requirements)
+- `learning_rate`: Learning rate for training (default: 5e-5)
+- `epochs`: Number of training epochs (default: 3)
+- `ignore_mismatched_sizes`: Ignore size mismatches when loading model (default: true)
+- `reduce_labels`: Whether to reduce all label IDs by 1, mapping the background class (0) to the ignore index; useful for datasets such as ADE20K where background is not annotated (default: false)
+
+### Example Configuration
+
+```yaml
+task: image_semantic_segmentation
+base_model: nvidia/mit-b0
+project_name: my-segmentation-model
+log: tensorboard
+backend: local
+
+data:
+ path: data/
+ train_split: train
+ valid_split: validation
+ column_mapping:
+ image_column: image
+ target_column: segmentation_mask
+
+params:
+ epochs: 10
+ batch_size: 4
+ lr: 3e-5
+ optimizer: adamw_torch
+ scheduler: linear
+ mixed_precision: fp16
+ ignore_mismatched_sizes: true
+ reduce_labels: false
+
+hub:
+ username: ${HF_USERNAME}
+ token: ${HF_TOKEN}
+ push_to_hub: true
+```
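+
+The same run can be launched from Python via the `AutoTrainProject` API used in this repository's notebooks; a minimal sketch mirroring the YAML above:
+
+```python
+from autotrain.project import AutoTrainProject
+from autotrain.trainers.image_semantic_segmentation.params import ImageSemanticSegmentationParams
+
+params = ImageSemanticSegmentationParams(
+    model="nvidia/mit-b0",
+    data_path="data/",
+    image_column="image",
+    target_column="segmentation_mask",
+    train_split="train",
+    valid_split="validation",
+    epochs=10,
+    batch_size=4,
+    lr=3e-5,
+    project_name="my-segmentation-model",
+)
+
+# this will train the model locally
+project = AutoTrainProject(params=params, backend="local", process=True)
+project.create()
+```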
+
+## Supported Models
+
+AutoTrain supports various pre-trained models for semantic segmentation, including:
+
+- MiT (nvidia/mit-b0, nvidia/mit-b1, etc.)
+- SegFormer models
+- Other transformer-based segmentation models available on Hugging Face Hub
+
+## CLI Usage
+
+Train a semantic segmentation model using the CLI:
+
+```bash
+autotrain image-semantic-segmentation \
+ --train \
+ --project-name my-segmentation-project \
+ --data-path /path/to/data \
+ --model nvidia/mit-b0 \
+ --epochs 10 \
+ --batch-size 4 \
+ --lr 3e-5 \
+ --push-to-hub \
+ --username your-hf-username \
+ --token your-hf-token
+```
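+
+## Inference
+
+After training, you can run your model with the `transformers` pipeline. A minimal sketch, assuming the model was pushed to the Hub under the project name used above:
+
+```python
+from transformers import pipeline
+
+# Load your trained model
+segmenter = pipeline(
+    "image-segmentation",
+    model="your-username/my-segmentation-model"
+)
+
+# One entry per predicted region, each with a label and a PIL mask
+results = segmenter("path/to/image.jpg")
+for r in results:
+    print(r["label"], r["mask"].size)
+```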
+
+## Use Cases
+
+Image semantic segmentation is useful for:
+
+- Medical image analysis (tumor detection, organ segmentation)
+- Autonomous driving (road, vehicle, pedestrian segmentation)
+- Satellite imagery analysis (land use classification)
+- Industrial quality control
+- Agricultural monitoring
+- Scene understanding and parsing
+
+## Tips for Better Results
+
+1. **Data Quality**: Ensure your segmentation masks are accurate and consistent
+2. **Class Balance**: Try to have balanced representation of different classes
+3. **Data Augmentation**: Use appropriate augmentation that preserves mask-image correspondence
+4. **Model Selection**: Choose models pre-trained on similar domains when possible
+5. **Batch Size**: Start with smaller batch sizes due to memory requirements
+6. **Learning Rate**: Use lower learning rates for fine-tuning pre-trained models
\ No newline at end of file
diff --git a/notebooks/audio_classification.ipynb b/notebooks/audio_classification.ipynb
new file mode 100644
index 0000000000..5644762535
--- /dev/null
+++ b/notebooks/audio_classification.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Audio Classification using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an audio classification model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.audio_classification.params import AudioClassificationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = AudioClassificationParams(\n",
+ " model=\"facebook/wav2vec2-base\",\n",
+ " data_path=\"speech_commands\", # path to the dataset on huggingface hub\n",
+ " audio_column=\"audio\", # the column in the dataset that contains the audio\n",
+ " target_column=\"label\", # the column in the dataset that contains the labels\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=1e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-audio-classification\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?AudioClassificationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = AudioClassificationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " audio_column=\"audio\", # this is the column name in the CSV/JSONL file which contains the audio file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/audio_detection.ipynb b/notebooks/audio_detection.ipynb
new file mode 100644
index 0000000000..ca73de41ff
--- /dev/null
+++ b/notebooks/audio_detection.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Audio Detection using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an audio detection model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.audio_detection.params import AudioDetectionParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = AudioDetectionParams(\n",
+ " model=\"facebook/wav2vec2-base\",\n",
+ " data_path=\"your_audio_detection_dataset\", # path to the dataset on huggingface hub\n",
+ " audio_column=\"audio\", # the column in the dataset that contains the audio\n",
+ " target_column=\"labels\", # the column in the dataset that contains the detection labels\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=1e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-audio-detection\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?AudioDetectionParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = AudioDetectionParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " audio_column=\"audio\", # this is the column name in the CSV/JSONL file which contains the audio file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/audio_segmentation.ipynb b/notebooks/audio_segmentation.ipynb
new file mode 100644
index 0000000000..873506e8c3
--- /dev/null
+++ b/notebooks/audio_segmentation.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Audio Segmentation using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an audio segmentation model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.audio_segmentation.params import AudioSegmentationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = AudioSegmentationParams(\n",
+ " model=\"facebook/wav2vec2-base\",\n",
+ " data_path=\"your_audio_segmentation_dataset\", # path to the dataset on huggingface hub\n",
+ " audio_column=\"audio\", # the column in the dataset that contains the audio\n",
+ " target_column=\"segments\", # the column in the dataset that contains the segmentation labels\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=1e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-audio-segmentation\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?AudioSegmentationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = AudioSegmentationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " audio_column=\"audio\", # this is the column name in the CSV/JSONL file which contains the audio file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/clm.ipynb b/notebooks/clm.ipynb
new file mode 100644
index 0000000000..b0731f3268
--- /dev/null
+++ b/notebooks/clm.ipynb
@@ -0,0 +1,117 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Causal Language Modeling using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a causal language model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.clm.params import LLMTrainingParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = LLMTrainingParams(\n",
+ " model=\"gpt2\",\n",
+ " data_path=\"wikitext-103-v1\", # path to the dataset on huggingface hub\n",
+ " text_column=\"text\", # the column in the dataset that contains the text\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=4,\n",
+ " block_size=512,\n",
+ " model_max_length=1024,\n",
+ " lr=2e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"cosine\",\n",
+ " gradient_accumulation=4,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-clm\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?LLMTrainingParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = LLMTrainingParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " text_column=\"text\", # this is the column name in the CSV/JSONL file which contains the text\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/extractive_question_answering.ipynb b/notebooks/extractive_question_answering.ipynb
new file mode 100644
index 0000000000..3069b99b9a
--- /dev/null
+++ b/notebooks/extractive_question_answering.ipynb
@@ -0,0 +1,119 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Extractive Question Answering using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an extractive question answering model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ExtractiveQuestionAnsweringParams(\n",
+ " model=\"google-bert/bert-base-uncased\",\n",
+ " data_path=\"squad\", # path to the dataset on huggingface hub\n",
+ " context_column=\"context\", # the column in the dataset that contains the context\n",
+ " question_column=\"question\", # the column in the dataset that contains the questions\n",
+ " answer_column=\"answers\", # the column in the dataset that contains the answers\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " max_seq_length=384,\n",
+ " lr=2e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-extractive-qa\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ExtractiveQuestionAnsweringParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ExtractiveQuestionAnsweringParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " context_column=\"context\", # this is the column name in the CSV/JSONL file which contains the context\n",
+ " question_column=\"question\", # this is the column name in the CSV/JSONL file which contains the questions\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/generic.ipynb b/notebooks/generic.ipynb
new file mode 100644
index 0000000000..a397edf746
--- /dev/null
+++ b/notebooks/generic.ipynb
@@ -0,0 +1,113 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Generic Training using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a generic model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.generic.params import GenericParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = GenericParams(\n",
+ " model=\"your_model_name\",\n",
+ " data_path=\"your_dataset\", # path to the dataset on huggingface hub\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=2e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-generic\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?GenericParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = GenericParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/image_classification.ipynb b/notebooks/image_classification.ipynb
new file mode 100644
index 0000000000..9140b33b42
--- /dev/null
+++ b/notebooks/image_classification.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Image Classification using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an image classification model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.image_classification.params import ImageClassificationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ImageClassificationParams(\n",
+ " model=\"google/vit-base-patch16-224\",\n",
+ " data_path=\"imagenet-1k\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " target_column=\"label\", # the column in the dataset that contains the labels\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=5e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-image-classification\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ImageClassificationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ImageClassificationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/image_instance_segmentation.ipynb b/notebooks/image_instance_segmentation.ipynb
new file mode 100644
index 0000000000..d58845c1fc
--- /dev/null
+++ b/notebooks/image_instance_segmentation.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Image Instance Segmentation using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an image instance segmentation model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.image_instance_segmentation.params import ImageInstanceSegmentationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ImageInstanceSegmentationParams(\n",
+ " model=\"facebook/mask2former-swin-large-coco-instance\",\n",
+ " data_path=\"your_instance_segmentation_dataset\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " objects_column=\"objects\", # the column in the dataset that contains the instance segmentation annotations\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=4,\n",
+ " lr=5e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=2,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-image-instance-segmentation\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ImageInstanceSegmentationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ImageInstanceSegmentationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/image_regression.ipynb b/notebooks/image_regression.ipynb
new file mode 100644
index 0000000000..5f6603f473
--- /dev/null
+++ b/notebooks/image_regression.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Image Regression using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an image regression model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.image_regression.params import ImageRegressionParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ImageRegressionParams(\n",
+ " model=\"google/vit-base-patch16-224\",\n",
+ " data_path=\"your_image_regression_dataset\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " target_column=\"target\", # the column in the dataset that contains the regression targets\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " lr=5e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-image-regression\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ImageRegressionParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ImageRegressionParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/image_semantic_segmentation.ipynb b/notebooks/image_semantic_segmentation.ipynb
new file mode 100644
index 0000000000..6b7b925e21
--- /dev/null
+++ b/notebooks/image_semantic_segmentation.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Image Semantic Segmentation using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an image semantic segmentation model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.image_semantic_segmentation.params import ImageSemanticSegmentationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ImageSemanticSegmentationParams(\n",
+ " model=\"nvidia/segformer-b0-finetuned-ade-512-512\",\n",
+ " data_path=\"scene_parse_150\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " target_column=\"annotation\", # the column in the dataset that contains the segmentation masks\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=4,\n",
+ " lr=6e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=2,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-image-semantic-segmentation\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ImageSemanticSegmentationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ImageSemanticSegmentationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/object_detection.ipynb b/notebooks/object_detection.ipynb
new file mode 100644
index 0000000000..465d8790a7
--- /dev/null
+++ b/notebooks/object_detection.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Object Detection using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train an object detection model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.object_detection.params import ObjectDetectionParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = ObjectDetectionParams(\n",
+ " model=\"facebook/detr-resnet-50\",\n",
+ " data_path=\"detection-datasets/coco\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " objects_column=\"objects\", # the column in the dataset that contains the object annotations\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=4,\n",
+ " lr=1e-4,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=2,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-object-detection\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?ObjectDetectionParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = ObjectDetectionParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/sent_transformers.ipynb b/notebooks/sent_transformers.ipynb
new file mode 100644
index 0000000000..4a3ed5d3b1
--- /dev/null
+++ b/notebooks/sent_transformers.ipynb
@@ -0,0 +1,118 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sentence Transformers using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a sentence transformer model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.sent_transformers.params import SentenceTransformersParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = SentenceTransformersParams(\n",
+ " model=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
+ " data_path=\"sentence-transformers/stsb\", # path to the dataset on huggingface hub\n",
+ " sentence1_column=\"sentence1\", # the column in the dataset that contains the first sentence\n",
+ " sentence2_column=\"sentence2\", # the column in the dataset that contains the second sentence\n",
+ " target_column=\"score\", # the column in the dataset that contains the similarity scores\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=16,\n",
+ " lr=2e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-sentence-transformers\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?SentenceTransformersParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = SentenceTransformersParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " sentence1_column=\"sentence1\", # this is the column name in the CSV/JSONL file which contains the first sentence\n",
+ " sentence2_column=\"sentence2\", # this is the column name in the CSV/JSONL file which contains the second sentence\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/seq2seq.ipynb b/notebooks/seq2seq.ipynb
new file mode 100644
index 0000000000..e72eb28c49
--- /dev/null
+++ b/notebooks/seq2seq.ipynb
@@ -0,0 +1,119 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sequence-to-Sequence using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a sequence-to-sequence model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.seq2seq.params import Seq2SeqParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = Seq2SeqParams(\n",
+ " model=\"google-t5/t5-small\",\n",
+ " data_path=\"cnn_dailymail\", # path to the dataset on huggingface hub\n",
+ " text_column=\"article\", # the column in the dataset that contains the input text\n",
+ " target_column=\"highlights\", # the column in the dataset that contains the target text\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " max_seq_length=512,\n",
+ " max_target_length=128,\n",
+ " lr=3e-4,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-seq2seq\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?Seq2SeqParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = Seq2SeqParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " text_column=\"article\", # this is the column name in the CSV/JSONL file which contains the input text\n",
+ " target_column=\"highlights\", # this is the column name in the CSV/JSONL file which contains the target text\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/tabular.ipynb b/notebooks/tabular.ipynb
new file mode 100644
index 0000000000..2f8d3be98a
--- /dev/null
+++ b/notebooks/tabular.ipynb
@@ -0,0 +1,113 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Tabular Data Training using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a model on tabular data using AutoTrain Advanced.\n",
+ "You can replace the model with any supported tabular model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.tabular.params import TabularParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = TabularParams(\n",
+ " model=\"xgboost\", # can be xgboost, lightgbm, catboost, randomforest, etc.\n",
+ " data_path=\"your_tabular_dataset\", # path to the dataset on huggingface hub or local path\n",
+ " target_columns=[\"target\"], # the column(s) in the dataset that contain the target values\n",
+ " id_column=\"id\", # the column that contains unique identifiers (optional)\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " task=\"classification\", # can be \"classification\" or \"regression\"\n",
+ " num_trials=10, # number of hyperparameter optimization trials\n",
+ " time_limit=600, # time limit in seconds\n",
+ " project_name=\"autotrain-tabular\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?TabularParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV format and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = TabularParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.csv is located\n",
+ " target_columns=[\"target\"], # the column name(s) in the CSV file which contains the target\n",
+ " categorical_columns=[\"cat_col1\", \"cat_col2\"], # list of categorical columns (optional)\n",
+ " numerical_columns=[\"num_col1\", \"num_col2\"], # list of numerical columns (optional)\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/token_classification.ipynb b/notebooks/token_classification.ipynb
new file mode 100644
index 0000000000..7dcd4dcacd
--- /dev/null
+++ b/notebooks/token_classification.ipynb
@@ -0,0 +1,118 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Token Classification using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a token classification model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.token_classification.params import TokenClassificationParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = TokenClassificationParams(\n",
+ " model=\"google-bert/bert-base-uncased\",\n",
+ " data_path=\"conll2003\", # path to the dataset on huggingface hub\n",
+ " tokens_column=\"tokens\", # the column in the dataset that contains the tokens\n",
+ " tags_column=\"ner_tags\", # the column in the dataset that contains the NER tags\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=8,\n",
+ " max_seq_length=512,\n",
+ " lr=2e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=1,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-token-classification\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?TokenClassificationParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = TokenClassificationParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " tokens_column=\"tokens\", # this is the column name in the CSV/JSONL file which contains the tokens\n",
+ " tags_column=\"ner_tags\", # this is the column name in the CSV/JSONL file which contains the tags\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
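
A hedged sketch of what `data/train.jsonl` could look like for the local token classification setup above: one object per line, with a `tokens` list and a parallel `ner_tags` list. The tag values here are illustrative string labels, not a schema fixed by this patch:

```python
import json
import os

os.makedirs("data", exist_ok=True)
rows = [
    {"tokens": ["Hugging", "Face", "is", "based", "in", "New", "York"],
     "ner_tags": ["B-ORG", "I-ORG", "O", "O", "O", "B-LOC", "I-LOC"]},
    {"tokens": ["AutoTrain", "simplifies", "training"],
     "ner_tags": ["B-ORG", "O", "O"]},
]
with open("data/train.jsonl", "w") as f:
    for row in rows:
        # One JSON object per line, as JSONL requires.
        f.write(json.dumps(row) + "\n")
```
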
diff --git a/notebooks/vlm.ipynb b/notebooks/vlm.ipynb
new file mode 100644
index 0000000000..1b896dd1d9
--- /dev/null
+++ b/notebooks/vlm.ipynb
@@ -0,0 +1,117 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Vision Language Model using AutoTrain Advanced\n",
+ "\n",
+ "In this notebook, we will train a vision language model using AutoTrain Advanced.\n",
+ "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n",
+ "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from autotrain.trainers.vlm.params import VLMTrainingParams\n",
+ "from autotrain.project import AutoTrainProject"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "HF_USERNAME = \"your_huggingface_username\"\n",
+ "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n",
+ "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n",
+ "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = VLMTrainingParams(\n",
+ " model=\"microsoft/kosmos-2-patch14-224\",\n",
+ " data_path=\"your_vlm_dataset\", # path to the dataset on huggingface hub\n",
+ " image_column=\"image\", # the column in the dataset that contains the image\n",
+ " text_column=\"text\", # the column in the dataset that contains the text\n",
+ " train_split=\"train\",\n",
+ " valid_split=\"validation\",\n",
+ " epochs=3,\n",
+ " batch_size=4,\n",
+ " lr=1e-5,\n",
+ " optimizer=\"adamw_torch\",\n",
+ " scheduler=\"linear\",\n",
+ " gradient_accumulation=4,\n",
+ " mixed_precision=\"fp16\",\n",
+ " project_name=\"autotrain-vlm\",\n",
+ " log=\"tensorboard\",\n",
+ " push_to_hub=True,\n",
+ " username=HF_USERNAME,\n",
+ " token=HF_TOKEN,\n",
+ ")\n",
+ "# tip: you can use `?VLMTrainingParams` to see the full list of allowed parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n",
+ "\n",
+ "```python\n",
+ "params = VLMTrainingParams(\n",
+ " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n",
+ " image_column=\"image\", # this is the column name in the CSV/JSONL file which contains the image file paths\n",
+ " text_column=\"text\", # this is the column name in the CSV/JSONL file which contains the text\n",
+ " train_split = \"train\" # this is the filename without extension\n",
+ " valid_split = \"valid\" # this is the filename without extension\n",
+ " .\n",
+ " .\n",
+ " .\n",
+ ")\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# this will train the model locally\n",
+ "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n",
+ "project.create()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "autotrain",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
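
Similarly, a hedged sketch of a local `data/train.jsonl` for the VLM notebook above: each line pairs an image file path with its caption/answer text. File names and texts are placeholders:

```python
import json
import os

os.makedirs("data", exist_ok=True)
rows = [
    {"image": "images/cat.jpg", "text": "A cat sleeping on a sofa."},
    {"image": "images/dog.jpg", "text": "A dog catching a frisbee in a park."},
]
with open("data/train.jsonl", "w") as f:
    for row in rows:
        # Each line maps an image path to its text, per the column_mapping above.
        f.write(json.dumps(row) + "\n")
```
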
diff --git a/requirements.txt b/requirements.txt
index 46cedd0ab5..94de8f947a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ ipadic==1.0.0
jiwer==3.0.5
joblib==1.4.2
loguru==0.7.3
-pandas==2.2.3
+pandas==2.3.0
nltk==3.9.1
optuna==4.1.0
Pillow==11.0.0
@@ -14,12 +14,12 @@ scikit-learn==1.6.0
sentencepiece==0.2.0
tqdm==4.67.1
werkzeug==3.1.3
-xgboost==2.1.3
-huggingface_hub==0.27.0
+xgboost==3.0.2
+huggingface_hub==0.33.1
requests==2.32.3
einops==0.8.0
packaging==24.2
-cryptography==44.0.0
+cryptography==44.0.1
nvitop==1.3.2
# latest versions
tensorboard==2.18.0
@@ -47,3 +47,5 @@ timm==1.0.12
torchmetrics==1.6.0
pycocotools==2.0.8
sentence-transformers==3.3.1
+# audio processing dependencies
+librosa==0.10.2
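
librosa enters the requirements for the new audio tasks. As a rough sketch of its role, it decodes an audio file into a mono float waveform and resamples it to the rate a feature extractor expects; 16 kHz here is an assumption matching wav2vec2-style models, not a value pinned by this patch:

```python
import librosa

# Decode and resample to 16 kHz mono; librosa.load returns (waveform, sampling_rate).
waveform, sampling_rate = librosa.load("example.wav", sr=16000)
print(waveform.shape, sampling_rate)
```
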
diff --git a/src/autotrain/app/api_routes.py b/src/autotrain/app/api_routes.py
index 8563ab15b8..8759245a1c 100644
--- a/src/autotrain/app/api_routes.py
+++ b/src/autotrain/app/api_routes.py
@@ -11,10 +11,15 @@
from autotrain.app.params import HIDDEN_PARAMS, PARAMS, AppParams
from autotrain.app.utils import token_verification
from autotrain.project import AutoTrainProject
+from autotrain.trainers.audio_classification.params import AudioClassificationParams
+from autotrain.trainers.audio_detection.params import AudioDetectionParams
+from autotrain.trainers.audio_segmentation.params import AudioSegmentationParams
from autotrain.trainers.clm.params import LLMTrainingParams
from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams
from autotrain.trainers.image_classification.params import ImageClassificationParams
from autotrain.trainers.image_regression.params import ImageRegressionParams
+from autotrain.trainers.image_semantic_segmentation.params import ImageSemanticSegmentationParams
+from autotrain.trainers.image_instance_segmentation.params import ImageInstanceSegmentationParams
from autotrain.trainers.object_detection.params import ObjectDetectionParams
from autotrain.trainers.sent_transformers.params import SentenceTransformersParams
from autotrain.trainers.seq2seq.params import Seq2SeqParams
@@ -25,7 +30,7 @@
from autotrain.trainers.vlm.params import VLMTrainingParams
-FIELDS_TO_EXCLUDE = HIDDEN_PARAMS + ["push_to_hub"]
+FIELDS_TO_EXCLUDE = HIDDEN_PARAMS
def create_api_base_model(base_class, class_name):
@@ -107,11 +112,14 @@ def create_api_base_model(base_class, class_name):
TokenClassificationParamsAPI = create_api_base_model(TokenClassificationParams, "TokenClassificationParamsAPI")
SentenceTransformersParamsAPI = create_api_base_model(SentenceTransformersParams, "SentenceTransformersParamsAPI")
ImageRegressionParamsAPI = create_api_base_model(ImageRegressionParams, "ImageRegressionParamsAPI")
+ImageSemanticSegmentationParamsAPI = create_api_base_model(ImageSemanticSegmentationParams, "ImageSemanticSegmentationParamsAPI")
+ImageInstanceSegmentationParamsAPI = create_api_base_model(ImageInstanceSegmentationParams, "ImageInstanceSegmentationParamsAPI")
VLMTrainingParamsAPI = create_api_base_model(VLMTrainingParams, "VLMTrainingParamsAPI")
-ExtractiveQuestionAnsweringParamsAPI = create_api_base_model(
- ExtractiveQuestionAnsweringParams, "ExtractiveQuestionAnsweringParamsAPI"
-)
+ExtractiveQuestionAnsweringParamsAPI = create_api_base_model(ExtractiveQuestionAnsweringParams, "ExtractiveQuestionAnsweringParamsAPI")
ObjectDetectionParamsAPI = create_api_base_model(ObjectDetectionParams, "ObjectDetectionParamsAPI")
+AudioClassificationParamsAPI = create_api_base_model(AudioClassificationParams, "AudioClassificationParamsAPI")
+AudioSegmentationParamsAPI = create_api_base_model(AudioSegmentationParams, "AudioSegmentationParamsAPI")
+AudioDetectionParamsAPI = create_api_base_model(AudioDetectionParams, "AudioDetectionParamsAPI")
class LLMSFTColumnMapping(BaseModel):
@@ -149,6 +157,16 @@ class ImageRegressionColumnMapping(BaseModel):
target_column: str
+class ImageSemanticSegmentationColumnMapping(BaseModel):
+ image_column: str
+ target_column: str
+
+
+class ImageInstanceSegmentationColumnMapping(BaseModel):
+ image_column: str
+ objects_column: str
+
+
class Seq2SeqColumnMapping(BaseModel):
text_column: str
target_column: str
@@ -224,6 +242,21 @@ class ObjectDetectionColumnMapping(BaseModel):
objects_column: str
+class AudioClassificationColumnMapping(BaseModel):
+ audio_column: str
+ target_column: str
+
+
+class AudioSegmentationColumnMapping(BaseModel):
+ audio_column: str
+ target_column: str
+
+
+class AudioDetectionColumnMapping(BaseModel):
+ audio_column: str
+ events_column: str
+
+
class APICreateProjectModel(BaseModel):
"""
APICreateProjectModel is a Pydantic model that defines the schema for creating a project.
@@ -271,10 +304,15 @@ class APICreateProjectModel(BaseModel):
"tabular-classification",
"tabular-regression",
"image-regression",
+ "image-semantic-segmentation",
+ "image-instance-segmentation",
"vlm:captioning",
"vlm:vqa",
"extractive-question-answering",
"image-object-detection",
+ "audio-classification",
+ "audio-segmentation",
+ "audio-detection",
]
base_model: str
hardware: Literal[
@@ -309,9 +347,14 @@ class APICreateProjectModel(BaseModel):
TextRegressionParamsAPI,
TokenClassificationParamsAPI,
ImageRegressionParamsAPI,
+ ImageSemanticSegmentationParamsAPI,
+ ImageInstanceSegmentationParamsAPI,
VLMTrainingParamsAPI,
ExtractiveQuestionAnsweringParamsAPI,
ObjectDetectionParamsAPI,
+ AudioClassificationParamsAPI,
+ AudioSegmentationParamsAPI,
+ AudioDetectionParamsAPI,
]
username: str
column_mapping: Optional[
@@ -334,9 +377,14 @@ class APICreateProjectModel(BaseModel):
STTripletColumnMapping,
STQAColumnMapping,
ImageRegressionColumnMapping,
+ ImageSemanticSegmentationColumnMapping,
+ ImageInstanceSegmentationColumnMapping,
VLMColumnMapping,
ExtractiveQuestionAnsweringColumnMapping,
ObjectDetectionColumnMapping,
+ AudioClassificationColumnMapping,
+ AudioSegmentationColumnMapping,
+ AudioDetectionColumnMapping,
]
] = None
hub_dataset: str
@@ -496,6 +544,22 @@ def validate_column_mapping(cls, values):
if not values.get("column_mapping").get("target_column"):
raise ValueError("target_column is required for image-regression")
values["column_mapping"] = ImageRegressionColumnMapping(**values["column_mapping"])
+ elif values.get("task") == "image-semantic-segmentation":
+ if not values.get("column_mapping"):
+ raise ValueError("column_mapping is required for image-semantic-segmentation")
+ if not values.get("column_mapping").get("image_column"):
+ raise ValueError("image_column is required for image-semantic-segmentation")
+ if not values.get("column_mapping").get("target_column"):
+ raise ValueError("target_column is required for image-semantic-segmentation")
+ values["column_mapping"] = ImageSemanticSegmentationColumnMapping(**values["column_mapping"])
+ elif values.get("task") == "image-instance-segmentation":
+ if not values.get("column_mapping"):
+ raise ValueError("column_mapping is required for image-instance-segmentation")
+ if not values.get("column_mapping").get("image_column"):
+ raise ValueError("image_column is required for image-instance-segmentation")
+ if not values.get("column_mapping").get("objects_column"):
+ raise ValueError("objects_column is required for image-instance-segmentation")
+ values["column_mapping"] = ImageInstanceSegmentationColumnMapping(**values["column_mapping"])
elif values.get("task") == "vlm:captioning":
if not values.get("column_mapping"):
raise ValueError("column_mapping is required for vlm:captioning")
@@ -534,6 +598,30 @@ def validate_column_mapping(cls, values):
if not values.get("column_mapping").get("objects_column"):
raise ValueError("objects_column is required for image-object-detection")
values["column_mapping"] = ObjectDetectionColumnMapping(**values["column_mapping"])
+ elif values.get("task") == "audio-classification":
+ if not values.get("column_mapping"):
+ raise ValueError("column_mapping is required for audio-classification")
+ if not values.get("column_mapping").get("audio_column"):
+ raise ValueError("audio_column is required for audio-classification")
+ if not values.get("column_mapping").get("target_column"):
+ raise ValueError("target_column is required for audio-classification")
+ values["column_mapping"] = AudioClassificationColumnMapping(**values["column_mapping"])
+ elif values.get("task") == "audio-segmentation":
+ if not values.get("column_mapping"):
+ raise ValueError("column_mapping is required for audio-segmentation")
+ if not values.get("column_mapping").get("audio_column"):
+ raise ValueError("audio_column is required for audio-segmentation")
+ if not values.get("column_mapping").get("target_column"):
+ raise ValueError("target_column is required for audio-segmentation")
+ values["column_mapping"] = AudioSegmentationColumnMapping(**values["column_mapping"])
+ elif values.get("task") == "audio-detection":
+ if not values.get("column_mapping"):
+ raise ValueError("column_mapping is required for audio-detection")
+ if not values.get("column_mapping").get("audio_column"):
+ raise ValueError("audio_column is required for audio-detection")
+ if not values.get("column_mapping").get("events_column"):
+ raise ValueError("events_column is required for audio-detection")
+ values["column_mapping"] = AudioDetectionColumnMapping(**values["column_mapping"])
return values
@model_validator(mode="before")
@@ -567,12 +655,22 @@ def validate_params(cls, values):
values["params"] = SentenceTransformersParamsAPI(**values["params"])
elif values.get("task") == "image-regression":
values["params"] = ImageRegressionParamsAPI(**values["params"])
+ elif values.get("task") == "image-semantic-segmentation":
+ values["params"] = ImageSemanticSegmentationParamsAPI(**values["params"])
+ elif values.get("task") == "image-instance-segmentation":
+ values["params"] = ImageInstanceSegmentationParamsAPI(**values["params"])
elif values.get("task").startswith("vlm:"):
values["params"] = VLMTrainingParamsAPI(**values["params"])
elif values.get("task") == "extractive-question-answering":
values["params"] = ExtractiveQuestionAnsweringParamsAPI(**values["params"])
elif values.get("task") == "image-object-detection":
values["params"] = ObjectDetectionParamsAPI(**values["params"])
+ elif values.get("task") == "audio-classification":
+ values["params"] = AudioClassificationParamsAPI(**values["params"])
+ elif values.get("task") == "audio-segmentation":
+ values["params"] = AudioSegmentationParamsAPI(**values["params"])
+ elif values.get("task") == "audio-detection":
+ values["params"] = AudioDetectionParamsAPI(**values["params"])
return values
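
For orientation, a hedged sketch of a request body that would satisfy the new audio-classification validators in `api_routes.py`. Fields outside this hunk (such as the hardware value) are assumptions from context rather than quotes from the code:

```python
payload = {
    "task": "audio-classification",
    "base_model": "facebook/wav2vec2-base",
    "hardware": "local",  # assumed to be a member of the hardware Literal
    "username": "your-username",
    "hub_dataset": "your-username/your-audio-dataset",
    # Validated against AudioClassificationColumnMapping:
    "column_mapping": {"audio_column": "audio", "target_column": "labels"},
    # Validated against AudioClassificationParamsAPI:
    "params": {"epochs": 3, "batch_size": 8, "lr": 1e-5},
}
```
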
diff --git a/src/autotrain/app/colab.py b/src/autotrain/app/colab.py
index 2193ba048f..4d1e0e857d 100644
--- a/src/autotrain/app/colab.py
+++ b/src/autotrain/app/colab.py
@@ -34,7 +34,12 @@ def colab_app():
"Token Classification",
"Image Classification",
"Image Regression",
+ "Image Semantic Segmentation",
+ "Image Instance Segmentation",
"Object Detection",
+ "Audio Classification",
+ "Audio Segmentation",
+ "Audio Detection",
"Tabular Classification",
"Tabular Regression",
"ST Pair",
@@ -56,7 +61,12 @@ def colab_app():
"Token Classification": "token-classification",
"Image Classification": "image-classification",
"Image Regression": "image-regression",
+ "Image Semantic Segmentation": "image-semantic-segmentation",
+ "Image Instance Segmentation": "image-instance-segmentation",
"Object Detection": "image-object-detection",
+ "Audio Classification": "audio-classification",
+ "Audio Segmentation": "audio-segmentation",
+ "Audio Detection": "audio-detection",
"Tabular Classification": "tabular:classification",
"Tabular Regression": "tabular:regression",
"ST Pair": "st:pair",
@@ -68,7 +78,7 @@ def colab_app():
def _get_params(task, param_type):
_p = get_task_params(task, param_type=param_type)
- _p["push_to_hub"] = True
+ _p["push_to_hub"] = False
_p = json.dumps(_p, indent=4)
return _p
@@ -266,10 +276,30 @@ def update_col_mapping(*args):
col_mapping.value = '{"image": "image", "label": "target"}'
dataset_source_dropdown.disabled = False
valid_split.disabled = False
+ elif task == "image-semantic-segmentation":
+ col_mapping.value = '{"image": "image", "target": "segmentation_mask"}'
+ dataset_source_dropdown.disabled = False
+ valid_split.disabled = False
+ elif task == "image-instance-segmentation":
+ col_mapping.value = '{"image": "image", "objects": "objects"}'
+ dataset_source_dropdown.disabled = False
+ valid_split.disabled = False
elif task == "image-object-detection":
col_mapping.value = '{"image": "image", "objects": "objects"}'
dataset_source_dropdown.disabled = False
valid_split.disabled = False
+ elif task == "audio-classification":
+ col_mapping.value = '{"audio": "audio_path", "label": "intent"}'
+ dataset_source_dropdown.disabled = False
+ valid_split.disabled = False
+ elif task == "audio-segmentation":
+ col_mapping.value = '{"audio": "audio_path", "label": "segments"}'
+ dataset_source_dropdown.disabled = False
+ valid_split.disabled = False
+ elif task == "audio-detection":
+ col_mapping.value = '{"audio": "audio_path", "events": "events"}'
+ dataset_source_dropdown.disabled = False
+ valid_split.disabled = False
elif task == "tabular:classification":
col_mapping.value = '{"id": "id", "label": ["target"]}'
dataset_source_dropdown.disabled = False
@@ -318,8 +348,18 @@ def update_base_model(*args):
base_model.value = MODEL_CHOICES["token-classification"][0]
elif TASK_MAP[task_dropdown.value] == "text-regression":
base_model.value = MODEL_CHOICES["text-regression"][0]
+ elif TASK_MAP[task_dropdown.value] == "image-semantic-segmentation":
+ base_model.value = MODEL_CHOICES["image-semantic-segmentation"][0]
+ elif TASK_MAP[task_dropdown.value] == "image-instance-segmentation":
+ base_model.value = MODEL_CHOICES["image-instance-segmentation"][0]
elif TASK_MAP[task_dropdown.value] == "image-object-detection":
base_model.value = MODEL_CHOICES["image-object-detection"][0]
+ elif TASK_MAP[task_dropdown.value] == "audio-classification":
+ base_model.value = MODEL_CHOICES["audio-classification"][0]
+ elif TASK_MAP[task_dropdown.value] == "audio-segmentation":
+ base_model.value = MODEL_CHOICES["audio-segmentation"][0]
+ elif TASK_MAP[task_dropdown.value] == "audio-detection":
+ base_model.value = MODEL_CHOICES["audio-detection"][0]
elif TASK_MAP[task_dropdown.value].startswith("st:"):
base_model.value = MODEL_CHOICES["sentence-transformers"][0]
else:
@@ -342,7 +382,7 @@ def start_training(b):
if chat_template is not None:
params_val = {k: v for k, v in params_val.items() if k != "chat_template"}
- push_to_hub = params_val.get("push_to_hub", True)
+ push_to_hub = params_val.get("push_to_hub", False)
if "push_to_hub" in params_val:
params_val = {k: v for k, v in params_val.items() if k != "push_to_hub"}
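
The audio-detection default mapping above points an `events` field at an `events` column, but this hunk does not pin down the event schema. A plausible, purely illustrative row shape (the key names are assumptions, not taken from the patch) might be:

```python
row = {
    "audio_path": "clips/alarm_001.wav",
    "events": [
        # start/end in seconds; key names are hypothetical
        {"start": 0.4, "end": 1.9, "label": "siren"},
        {"start": 3.2, "end": 4.0, "label": "horn"},
    ],
}
```
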
diff --git a/src/autotrain/app/models.py b/src/autotrain/app/models.py
index 1d1f658113..d08cbad6f9 100644
--- a/src/autotrain/app/models.py
+++ b/src/autotrain/app/models.py
@@ -133,6 +133,46 @@ def _fetch_image_classification_models():
return hub_models
+def _fetch_image_segmentation_models():
+ """
+ Fetches and sorts image segmentation models from the Hugging Face model hub.
+
+ This function retrieves transformers-library models tagged with the 'image-segmentation'
+ task, sorted by downloads, with trending models (by 7-day likes) placed first.
+
+ Returns:
+ list: A sorted list of model identifiers from the Hugging Face model hub.
+ """
+ hub_models = list(
+ list_models(
+ task="image-segmentation",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=100,
+ full=False,
+ )
+ )
+ hub_models = get_sorted_models(hub_models)
+
+ trending_models = list(
+ list_models(
+ task="image-segmentation",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=30,
+ full=False,
+ )
+ )
+ if len(trending_models) > 0:
+ trending_models = get_sorted_models(trending_models)
+ hub_models = [m for m in hub_models if m not in trending_models]
+ hub_models = trending_models + hub_models
+
+ return hub_models
+
+
def _fetch_image_object_detection_models():
hub_models = list(
list_models(
@@ -333,12 +373,197 @@ def _fetch_vlm_models():
return hub_models
+def _fetch_audio_classification_models():
+ """
+ Fetches and sorts audio classification models from the Hugging Face model hub.
+
+ This function retrieves models for the "audio-classification" task
+ from the Hugging Face model hub and sorts them by the number of downloads.
+ Additionally, it fetches trending models based on the number
+ of likes in the past 7 days, sorts them, and places them at the beginning of the list
+ if they are not already included.
+
+ Returns:
+ list: A sorted list of model identifiers from the Hugging Face model hub.
+ """
+ hub_models = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=100,
+ full=False,
+ )
+ )
+ hub_models = get_sorted_models(hub_models)
+
+ trending_models = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=30,
+ full=False,
+ )
+ )
+ if len(trending_models) > 0:
+ trending_models = get_sorted_models(trending_models)
+ hub_models = [m for m in hub_models if m not in trending_models]
+ hub_models = trending_models + hub_models
+
+ return hub_models
+
+
+def _fetch_audio_segmentation_models():
+ """
+ Fetches and sorts audio segmentation models from the Hugging Face model hub.
+
+ This function retrieves models suitable for audio segmentation tasks such as
+ speaker diarization, voice activity detection, and speech/music segmentation.
+ It combines audio classification and automatic speech recognition models that can be fine-tuned for segmentation.
+
+ Returns:
+ list: A sorted list of model identifiers from the Hugging Face model hub.
+ """
+ # Get audio classification models (can be used for segmentation)
+ hub_models1 = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=50,
+ full=False,
+ )
+ )
+
+ # Get automatic speech recognition models (useful for segmentation)
+ hub_models2 = list(
+ list_models(
+ task="automatic-speech-recognition",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=50,
+ full=False,
+ )
+ )
+
+ hub_models = list(hub_models1) + list(hub_models2)
+ hub_models = get_sorted_models(hub_models)
+
+ # Get trending models
+ trending_models1 = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=15,
+ full=False,
+ )
+ )
+
+ trending_models2 = list(
+ list_models(
+ task="automatic-speech-recognition",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=15,
+ full=False,
+ )
+ )
+
+ trending_models = list(trending_models1) + list(trending_models2)
+ if len(trending_models) > 0:
+ trending_models = get_sorted_models(trending_models)
+ hub_models = [m for m in hub_models if m not in trending_models]
+ hub_models = trending_models + hub_models
+
+ return hub_models
+
+
+def _fetch_audio_detection_models():
+ """
+ Fetches and sorts audio detection models from the Hugging Face model hub.
+
+ This function retrieves models suitable for audio detection tasks such as
+ sound event detection and temporal audio analysis. It combines audio
+ classification and automatic speech recognition models that can be fine-tuned for detection.
+
+ Returns:
+ list: A sorted list of model identifiers from the Hugging Face model hub.
+ """
+ # Get audio classification models (can be used for detection)
+ hub_models1 = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=50,
+ full=False,
+ )
+ )
+
+ # Get automatic speech recognition models (useful for audio analysis)
+ hub_models2 = list(
+ list_models(
+ task="automatic-speech-recognition",
+ library="transformers",
+ sort="downloads",
+ direction=-1,
+ limit=30,
+ full=False,
+ )
+ )
+
+ hub_models = list(hub_models1) + list(hub_models2)
+ hub_models = get_sorted_models(hub_models)
+
+ # Get trending models
+ trending_models1 = list(
+ list_models(
+ task="audio-classification",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=15,
+ full=False,
+ )
+ )
+
+ trending_models2 = list(
+ list_models(
+ task="automatic-speech-recognition",
+ library="transformers",
+ sort="likes7d",
+ direction=-1,
+ limit=10,
+ full=False,
+ )
+ )
+
+ trending_models = list(trending_models1) + list(trending_models2)
+ if len(trending_models) > 0:
+ trending_models = get_sorted_models(trending_models)
+ hub_models = [m for m in hub_models if m not in trending_models]
+ hub_models = trending_models + hub_models
+
+ return hub_models
+
+
def fetch_models():
_mc = collections.defaultdict(list)
_mc["text-classification"] = _fetch_text_classification_models()
_mc["llm"] = _fetch_llm_models()
_mc["image-classification"] = _fetch_image_classification_models()
_mc["image-regression"] = _fetch_image_classification_models()
+ _mc["image-semantic-segmentation"] = _fetch_image_segmentation_models()
+ _mc["image-instance-segmentation"] = _fetch_image_segmentation_models()
_mc["seq2seq"] = _fetch_seq2seq_models()
_mc["token-classification"] = _fetch_token_classification_models()
_mc["text-regression"] = _fetch_text_classification_models()
@@ -346,6 +571,9 @@ def fetch_models():
_mc["sentence-transformers"] = _fetch_st_models()
_mc["vlm"] = _fetch_vlm_models()
_mc["extractive-qa"] = _fetch_text_classification_models()
+ _mc["audio-classification"] = _fetch_audio_classification_models()
+ _mc["audio-segmentation"] = _fetch_audio_segmentation_models()
+ _mc["audio-detection"] = _fetch_audio_detection_models()
# tabular-classification
_mc["tabular-classification"] = [
diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py
index a6f4addbc5..72d25e02c6 100644
--- a/src/autotrain/app/params.py
+++ b/src/autotrain/app/params.py
@@ -2,10 +2,15 @@
from dataclasses import dataclass
from typing import Optional
+from autotrain.trainers.audio_classification.params import AudioClassificationParams
+from autotrain.trainers.audio_detection.params import AudioDetectionParams
+from autotrain.trainers.audio_segmentation.params import AudioSegmentationParams
from autotrain.trainers.clm.params import LLMTrainingParams
from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams
from autotrain.trainers.image_classification.params import ImageClassificationParams
from autotrain.trainers.image_regression.params import ImageRegressionParams
+from autotrain.trainers.image_semantic_segmentation.params import ImageSemanticSegmentationParams
+from autotrain.trainers.image_instance_segmentation.params import ImageInstanceSegmentationParams
from autotrain.trainers.object_detection.params import ObjectDetectionParams
from autotrain.trainers.sent_transformers.params import SentenceTransformersParams
from autotrain.trainers.seq2seq.params import Seq2SeqParams
@@ -67,7 +72,6 @@
"answer_column",
]
-
PARAMS = {}
PARAMS["llm"] = LLMTrainingParams(
target_modules="all-linear",
@@ -121,6 +125,14 @@
mixed_precision="fp16",
log="tensorboard",
).model_dump()
+PARAMS["image-semantic-segmentation"] = ImageSemanticSegmentationParams(
+ mixed_precision="fp16",
+ log="tensorboard",
+).model_dump()
+PARAMS["image-instance-segmentation"] = ImageInstanceSegmentationParams(
+ mixed_precision="fp16",
+ log="tensorboard",
+).model_dump()
PARAMS["vlm"] = VLMTrainingParams(
mixed_precision="fp16",
target_modules="all-linear",
@@ -135,6 +147,18 @@
max_seq_length=512,
max_doc_stride=128,
).model_dump()
+PARAMS["audio-classification"] = AudioClassificationParams(
+ mixed_precision="fp16",
+ log="tensorboard",
+).model_dump()
+PARAMS["audio-detection"] = AudioDetectionParams(
+ mixed_precision="fp16",
+ log="tensorboard",
+).model_dump()
+PARAMS["audio-segmentation"] = AudioSegmentationParams(
+ mixed_precision="fp16",
+ log="tensorboard",
+).model_dump()
@dataclass
@@ -212,10 +236,20 @@ def munge(self):
return self._munge_params_sent_transformers()
elif self.task == "image-regression":
return self._munge_params_img_reg()
+ elif self.task == "image-semantic-segmentation":
+ return self._munge_params_img_semantic_seg()
+ elif self.task == "image-instance-segmentation":
+ return self._munge_params_img_instance_seg()
elif self.task.startswith("vlm"):
return self._munge_params_vlm()
elif self.task == "extractive-qa":
return self._munge_params_extractive_qa()
+ elif self.task == "audio-classification":
+ return self._munge_params_audio_clf()
+ elif self.task == "audio-detection":
+ return self._munge_params_audio_det()
+ elif self.task == "audio-segmentation":
+ return self._munge_params_audio_seg()
else:
raise ValueError(f"Unknown task: {self.task}")
@@ -437,6 +471,40 @@ def _munge_params_img_reg(self):
return ImageRegressionParams(**_params)
+ def _munge_params_img_semantic_seg(self):
+ _params = self._munge_common_params()
+ _params["model"] = self.base_model
+ if "log" not in _params:
+ _params["log"] = "tensorboard"
+ if not self.using_hub_dataset:
+ _params["image_column"] = "autotrain_image"
+ _params["target_column"] = "autotrain_label"
+ _params["valid_split"] = "validation"
+ else:
+ _params["image_column"] = self.column_mapping.get("image" if not self.api else "image_column", "image")
+ _params["target_column"] = self.column_mapping.get("target" if not self.api else "target_column", "segmentation_mask")
+ _params["train_split"] = self.train_split
+ _params["valid_split"] = self.valid_split
+
+ return ImageSemanticSegmentationParams(**_params)
+
+ def _munge_params_img_instance_seg(self):
+ _params = self._munge_common_params()
+ _params["model"] = self.base_model
+ if "log" not in _params:
+ _params["log"] = "tensorboard"
+ if not self.using_hub_dataset:
+ _params["image_column"] = "autotrain_image"
+ _params["target_column"] = "autotrain_objects"
+ _params["valid_split"] = "validation"
+ else:
+ _params["image_column"] = self.column_mapping.get("image" if not self.api else "image_column", "image")
+ _params["target_column"] = self.column_mapping.get("objects" if not self.api else "target_column", "objects")
+ _params["train_split"] = self.train_split
+ _params["valid_split"] = self.valid_split
+
+ return ImageInstanceSegmentationParams(**_params)
+
def _munge_params_img_obj_det(self):
_params = self._munge_common_params()
_params["model"] = self.base_model
@@ -488,6 +556,54 @@ def _munge_params_tabular(self):
return TabularParams(**_params)
+ def _munge_params_audio_clf(self):
+ _params = self._munge_common_params()
+ _params["model"] = self.base_model
+ if "log" not in _params:
+ _params["log"] = "tensorboard"
+ if not self.using_hub_dataset:
+ _params["audio_column"] = "autotrain_audio"
+ _params["target_column"] = "autotrain_label"
+ _params["valid_split"] = "validation"
+ else:
+ _params["audio_column"] = self.column_mapping.get("audio" if not self.api else "audio_column", "audio")
+ _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label")
+ _params["train_split"] = self.train_split
+ _params["valid_split"] = self.valid_split
+ return AudioClassificationParams(**_params)
+
+ def _munge_params_audio_det(self):
+ _params = self._munge_common_params()
+ _params["model"] = self.base_model
+ if "log" not in _params:
+ _params["log"] = "tensorboard"
+ if not self.using_hub_dataset:
+ _params["audio_column"] = "autotrain_audio"
+ _params["events_column"] = "autotrain_events"
+ _params["valid_split"] = "validation"
+ else:
+ _params["audio_column"] = self.column_mapping.get("audio" if not self.api else "audio_column", "audio")
+ _params["events_column"] = self.column_mapping.get("events" if not self.api else "events_column", "events")
+ _params["train_split"] = self.train_split
+ _params["valid_split"] = self.valid_split
+ return AudioDetectionParams(**_params)
+
+ def _munge_params_audio_seg(self):
+ _params = self._munge_common_params()
+ _params["model"] = self.base_model
+ if "log" not in _params:
+ _params["log"] = "tensorboard"
+ if not self.using_hub_dataset:
+ _params["audio_column"] = "autotrain_audio"
+ _params["target_column"] = "autotrain_label"
+ _params["valid_split"] = "validation"
+ else:
+ _params["audio_column"] = self.column_mapping.get("audio" if not self.api else "audio_column", "audio")
+ _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label")
+ _params["train_split"] = self.train_split
+ _params["valid_split"] = self.valid_split
+ return AudioSegmentationParams(**_params)
+
def get_task_params(task, param_type):
"""
@@ -688,6 +804,22 @@ def get_task_params(task, param_type):
"early_stopping_threshold",
]
task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
+ if task == "image-semantic-segmentation" and param_type == "basic":
+ more_hidden_params = [
+ "warmup_ratio",
+ "weight_decay",
+ "max_grad_norm",
+ "seed",
+ "logging_steps",
+ "auto_find_batch_size",
+ "save_total_limit",
+ "eval_strategy",
+ "early_stopping_patience",
+ "early_stopping_threshold",
+ "ignore_mismatched_sizes",
+ "reduce_labels",
+ ]
+ task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
if task == "image-object-detection" and param_type == "basic":
more_hidden_params = [
"warmup_ratio",
@@ -735,5 +867,65 @@ def get_task_params(task, param_type):
"early_stopping_threshold",
]
task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
+ if task == "audio-classification" and param_type == "basic":
+ more_hidden_params = [
+ "warmup_ratio",
+ "weight_decay",
+ "max_grad_norm",
+ "seed",
+ "logging_steps",
+ "auto_find_batch_size",
+ "save_total_limit",
+ "eval_strategy",
+ "early_stopping_patience",
+ "early_stopping_threshold",
+ "feature_extractor_normalize",
+ "feature_extractor_return_attention_mask",
+ "gradient_accumulation",
+ "max_length",
+ "sampling_rate",
+ ]
+ task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
+ if task == "audio-segmentation" and param_type == "basic":
+ more_hidden_params = [
+ "warmup_ratio",
+ "weight_decay",
+ "max_grad_norm",
+ "seed",
+ "logging_steps",
+ "auto_find_batch_size",
+ "save_total_limit",
+ "eval_strategy",
+ "early_stopping_patience",
+ "early_stopping_threshold",
+ "feature_extractor_normalize",
+ "feature_extractor_return_attention_mask",
+ "gradient_accumulation",
+ "max_length",
+ "sampling_rate",
+ "segment_length",
+ "overlap_length",
+ "min_segment_length",
+ ]
+ task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
+ if task == "audio-detection" and param_type == "basic":
+ more_hidden_params = [
+ "warmup_ratio",
+ "weight_decay",
+ "max_grad_norm",
+ "seed",
+ "logging_steps",
+ "auto_find_batch_size",
+ "save_total_limit",
+ "eval_strategy",
+ "early_stopping_patience",
+ "early_stopping_threshold",
+ "gradient_accumulation",
+ "max_length",
+ "sampling_rate",
+ "event_overlap_threshold",
+ "confidence_threshold",
+ ]
+ task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params}
return task_params
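
A short usage sketch of the behavior these blocks implement: in "basic" mode the listed fields are stripped from the task defaults before they reach the UI. The assertion reflects the hidden-params list above, not a documented guarantee:

```python
from autotrain.app.params import get_task_params

basic = get_task_params("audio-classification", param_type="basic")
# sampling_rate is in the basic-mode hidden list above, so it should be absent:
assert "sampling_rate" not in basic
```
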
diff --git a/src/autotrain/app/templates/index.html b/src/autotrain/app/templates/index.html
index 0ee5226c9d..7c0ae4dcb2 100644
--- a/src/autotrain/app/templates/index.html
+++ b/src/autotrain/app/templates/index.html
@@ -84,6 +84,26 @@
fields = ['image', 'label'];
fieldNames = ['image', 'target'];
break;
+ case 'image-semantic-segmentation':
+ fields = ['image', 'label'];
+ fieldNames = ['image', 'segmentation_mask'];
+ break;
+ case 'image-instance-segmentation':
+ fields = ['image', 'objects'];
+ fieldNames = ['image', 'objects'];
+ break;
+ case 'audio-classification':
+ fields = ['audio', 'label'];
+ fieldNames = ['audio_path', 'intent'];
+ break;
+ case 'audio-segmentation':
+ fields = ['audio', 'label'];
+ fieldNames = ['audio_path', 'segments'];
+ break;
+ case 'audio-detection':
+ fields = ['audio', 'events'];
+ fieldNames = ['audio_path', 'events'];
+ break;
case 'image-object-detection':
fields = ['image', 'objects'];
fieldNames = ['image', 'objects'];
@@ -220,8 +240,15 @@
+