Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ NOTE: AutoTrain is free! You only pay for the resources you use in case you deci
| Extractive Question Answering | ✅ | Coming Soon | [extractive_qa.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/extractive_question_answering) |
| Image Classification | ✅ | Coming Soon | [image_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_classification) |
| Image Scoring/Regression | ✅ | Coming Soon | [image_regression.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_scoring) |
| Audio Classification | ✅ | Coming Soon | [audio_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_classification) |
| Audio Detection | ✅ | Coming Soon | [audio_detection.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_detection) |
| Audio Segmentation | ✅ | Coming Soon | [audio_segmentation.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_segmentation) |
| VLM | 🟥 | Coming Soon | [vlm.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/vlm) |


Expand Down
33 changes: 33 additions & 0 deletions configs/audio_classification/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# AutoTrain audio-classification config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-classification
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-clf
log: tensorboard
backend: local

# Hub dataset id. NOTE(review): `superb` ships multiple configs (ks, ic, ...);
# confirm AutoTrain resolves the intended subset from the bare id.
data_path: superb
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the dataset's column names.
column_mapping:
  audio_column: audio
  target_column: label

parameters:
  # NOTE(review): a plain `3e-5` is parsed as a *string* by YAML 1.1 loaders
  # (PyYAML) — confirm AutoTrain coerces it, or write 3.0e-5 to be safe.
  learning_rate: 3e-5
  epochs: 5
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  feature_extractor_normalize: true
  feature_extractor_return_attention_mask: true
43 changes: 43 additions & 0 deletions configs/audio_classification/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# AutoTrain audio-classification config: train on a local CSV dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-classification
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-clf-local
log: tensorboard
backend: local

# Local data path - should contain audio files and CSV with labels
data_path: /path/to/audio/dataset.csv
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the CSV's column names.
column_mapping:
  audio_column: audio_path
  target_column: label

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 5
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  feature_extractor_normalize: true
  feature_extractor_return_attention_mask: true

# Note: For local audio classification:
# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
# - label column should contain class labels (strings or integers)
# - CSV format: audio_path,label
# Example:
# /path/to/audio1.wav,speech
# /path/to/audio2.wav,music
# /path/to/audio3.wav,noise
47 changes: 47 additions & 0 deletions configs/audio_detection/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# AutoTrain audio-detection config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-detection
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-detection-hub
log: tensorboard
backend: local

# Hub dataset configuration
data_path: audiofolder/audio_detection_dataset
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the dataset's column names.
column_mapping:
  audio_column: audio
  events_column: events

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 3
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  # FIX: this key was `false` while `hub.push_to_hub` below said `true` —
  # conflicting duplicates, and which wins depends on the loader. The config
  # supplies hub credentials, so both now agree on pushing the trained model.
  push_to_hub: true
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  event_overlap_threshold: 0.5 # IoU threshold for overlapping events
  confidence_threshold: 0.1 # Minimum confidence threshold for event detection

# Hub settings (credentials are read from the environment, not stored here)
hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true

# Note: For hub audio detection datasets:
# - The dataset should have 'audio' and 'events' columns
# - Events should be formatted as a list of dictionaries:
#   [{"start": 0.0, "end": 2.5, "label": "speech"}, {"start": 2.5, "end": 3.0, "label": "silence"}]
# - Audio column should contain audio data (array or file paths)
# - Similar to object detection but for temporal events in audio
43 changes: 43 additions & 0 deletions configs/audio_detection/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# AutoTrain audio-detection config: train on a local CSV with event annotations.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-detection
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-detection-local
log: tensorboard
backend: local

# Local data path - should contain audio files and CSV with event annotations
data_path: /path/to/audio/dataset.csv
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the CSV's column names.
column_mapping:
  audio_column: audio_path
  events_column: events

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 3
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  event_overlap_threshold: 0.5 # IoU threshold for overlapping events
  confidence_threshold: 0.1 # Minimum confidence threshold for event detection

# Note: For local audio detection:
# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
# - events column should contain event annotations as JSON list
# - CSV format: audio_path,events
# Example:
# /path/to/audio1.wav,"[{""start"": 0.0, ""end"": 2.5, ""label"": ""speech""}, {""start"": 2.5, ""end"": 3.0, ""label"": ""silence""}]"
# /path/to/audio2.wav,"[{""start"": 1.0, ""end"": 4.0, ""label"": ""music""}, {""start"": 4.0, ""end"": 5.0, ""label"": ""noise""}]"
# /path/to/audio3.wav,"[{""start"": 0.5, ""end"": 3.5, ""label"": ""car_crash""}]"
49 changes: 49 additions & 0 deletions configs/audio_segmentation/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# AutoTrain audio-segmentation config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
# FIX: task id was snake_case `audio_segmentation` while every sibling config
# hyphenates its task id (`audio-classification`, `audio-detection`) —
# normalized to match; confirm against the id registered in the task router.
task: audio-segmentation
base_model: microsoft/speecht5_vc
project_name: autotrain-audio-segmentation-hub
log: tensorboard
backend: spaces-a10g-large

# Hub dataset configuration
# NOTE(review): unlike the classification/detection configs, this file keeps
# column names and training parameters flat (no `column_mapping:`/`parameters:`
# nesting) and uses `lr` instead of `learning_rate` — verify the segmentation
# trainer expects this flat schema before unifying.
data_path: audiofolder/audio_segmentation_dataset
train_split: train
valid_split: validation
audio_column: audio
target_column: segments

# Training parameters
epochs: 10
batch_size: 16
# NOTE(review): plain `2e-5` is a string under YAML 1.1 loaders (PyYAML);
# confirm AutoTrain coerces it, or write 2.0e-5.
lr: 2e-5
scheduler: cosine
optimizer: adamw_torch
weight_decay: 0.01
warmup_ratio: 0.05
gradient_accumulation: 2
mixed_precision: fp16
logging_steps: 25
save_total_limit: 5
eval_strategy: steps
early_stopping_patience: 5
early_stopping_threshold: 0.005

# Audio specific parameters
max_length: 320000 # 20 seconds at 16kHz (shorter for better memory usage)
sampling_rate: 16000
feature_extractor_normalize: true
feature_extractor_return_attention_mask: true

# Segmentation specific parameters
segment_length: 3.0 # seconds (shorter segments for better granularity)
overlap_length: 0.3 # seconds
min_segment_length: 0.5 # seconds

# Model parameters
seed: 42
max_grad_norm: 1.0
auto_find_batch_size: true
push_to_hub: true

# Hub settings (credentials are read from the environment, not stored here)
token: ${HF_TOKEN}
username: ${HF_USERNAME}
44 changes: 44 additions & 0 deletions configs/audio_segmentation/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# AutoTrain audio-segmentation config: train on a local dataset directory.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
# FIX: task id was snake_case `audio_segmentation` while every sibling config
# hyphenates its task id (`audio-classification`, `audio-detection`) —
# normalized to match; confirm against the id registered in the task router.
task: audio-segmentation
base_model: microsoft/speecht5_vc
project_name: autotrain-audio-segmentation-local
log: tensorboard
backend: local

# NOTE(review): unlike the classification/detection configs, this file keeps
# column names and training parameters flat (no `column_mapping:`/`parameters:`
# nesting) and uses `lr` instead of `learning_rate` — verify the segmentation
# trainer expects this flat schema before unifying.
data_path: data/
train_split: train
valid_split: validation
audio_column: audio_path
target_column: segments

# Training parameters
epochs: 5
batch_size: 8
# NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
# confirm AutoTrain coerces it, or write 3.0e-5.
lr: 3e-5
scheduler: linear
optimizer: adamw_torch
weight_decay: 0.01
warmup_ratio: 0.1
gradient_accumulation: 1
mixed_precision: fp16
logging_steps: 50
save_total_limit: 3
eval_strategy: epoch
early_stopping_patience: 3
early_stopping_threshold: 0.01

# Audio specific parameters
max_length: 480000 # 30 seconds at 16kHz
sampling_rate: 16000
feature_extractor_normalize: true
feature_extractor_return_attention_mask: true

# Segmentation specific parameters
segment_length: 5.0 # seconds
overlap_length: 0.5 # seconds
min_segment_length: 1.0 # seconds

# Model parameters
seed: 42
max_grad_norm: 1.0
auto_find_batch_size: false
push_to_hub: false
10 changes: 6 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ipadic==1.0.0
jiwer==3.0.5
joblib==1.4.2
loguru==0.7.3
pandas==2.2.3
pandas==2.3.0
nltk==3.9.1
optuna==4.1.0
Pillow==11.0.0
Expand All @@ -14,12 +14,12 @@ scikit-learn==1.6.0
sentencepiece==0.2.0
tqdm==4.67.1
werkzeug==3.1.3
xgboost==2.1.3
huggingface_hub==0.27.0
xgboost==3.0.2
huggingface_hub==0.33.1
requests==2.32.3
einops==0.8.0
packaging==24.2
cryptography==44.0.0
cryptography==44.0.1
nvitop==1.3.2
# latest versions
tensorboard==2.18.0
Expand Down Expand Up @@ -47,3 +47,5 @@ timm==1.0.12
torchmetrics==1.6.0
pycocotools==2.0.8
sentence-transformers==3.3.1
# audio processing dependencies
librosa==0.10.2
Loading