Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ NOTE: AutoTrain is free! You only pay for the resources you use in case you deci
| Extractive Question Answering | ✅ | Coming Soon | [extractive_qa.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/extractive_question_answering) |
| Image Classification | ✅ | Coming Soon | [image_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_classification) |
| Image Scoring/Regression | ✅ | Coming Soon | [image_regression.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_scoring) |
| Audio Classification | ✅ | Coming Soon | [audio_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_classification) |
| Audio Detection | ✅ | Coming Soon | [audio_detection.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_detection) |
| Audio Segmentation | ✅ | Coming Soon | [audio_segmentation.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/audio_segmentation) |
| VLM | 🟥 | Coming Soon | [vlm.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/vlm) |


Expand Down
33 changes: 33 additions & 0 deletions configs/audio_classification/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# AutoTrain audio-classification config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-classification
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-clf
log: tensorboard
backend: local

# Hub dataset id. NOTE(review): `superb` ships multiple configs (ks, ic, ...);
# confirm AutoTrain resolves the intended subset from the bare id.
data_path: superb
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the dataset's column names.
column_mapping:
  audio_column: audio
  target_column: label

parameters:
  # NOTE(review): a plain `3e-5` is parsed as a *string* by YAML 1.1 loaders
  # (PyYAML) — confirm AutoTrain coerces it, or write 3.0e-5 to be safe.
  learning_rate: 3e-5
  epochs: 5
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  feature_extractor_normalize: true
  feature_extractor_return_attention_mask: true
43 changes: 43 additions & 0 deletions configs/audio_classification/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# AutoTrain audio-classification config: train on a local CSV dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-classification
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-clf-local
log: tensorboard
backend: local

# Local data path - should contain audio files and CSV with labels
data_path: /path/to/audio/dataset.csv
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the CSV's column names.
column_mapping:
  audio_column: audio_path
  target_column: label

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 5
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  feature_extractor_normalize: true
  feature_extractor_return_attention_mask: true

# Note: For local audio classification:
# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
# - label column should contain class labels (strings or integers)
# - CSV format: audio_path,label
# Example:
# /path/to/audio1.wav,speech
# /path/to/audio2.wav,music
# /path/to/audio3.wav,noise
47 changes: 47 additions & 0 deletions configs/audio_detection/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# AutoTrain audio-detection config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-detection
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-detection-hub
log: tensorboard
backend: local

# Hub dataset configuration
data_path: audiofolder/audio_detection_dataset
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the dataset's column names.
column_mapping:
  audio_column: audio
  events_column: events

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 3
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  # FIX: this key was `false` while `hub.push_to_hub` below said `true` —
  # conflicting duplicates, and which wins depends on the loader. The config
  # supplies hub credentials, so both now agree on pushing the trained model.
  push_to_hub: true
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  event_overlap_threshold: 0.5 # IoU threshold for overlapping events
  confidence_threshold: 0.1 # Minimum confidence threshold for event detection

# Hub settings (credentials are read from the environment, not stored here)
hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: true

# Note: For hub audio detection datasets:
# - The dataset should have 'audio' and 'events' columns
# - Events should be formatted as a list of dictionaries:
#   [{"start": 0.0, "end": 2.5, "label": "speech"}, {"start": 2.5, "end": 3.0, "label": "silence"}]
# - Audio column should contain audio data (array or file paths)
# - Similar to object detection but for temporal events in audio
43 changes: 43 additions & 0 deletions configs/audio_detection/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# AutoTrain audio-detection config: train on a local CSV with event annotations.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
task: audio-detection
base_model: facebook/wav2vec2-base
project_name: my-autotrain-audio-detection-local
log: tensorboard
backend: local

# Local data path - should contain audio files and CSV with event annotations
data_path: /path/to/audio/dataset.csv
train_split: train
valid_split: validation

# Map AutoTrain's expected column roles onto the CSV's column names.
column_mapping:
  audio_column: audio_path
  events_column: events

parameters:
  # NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
  # confirm AutoTrain coerces it, or write 3.0e-5.
  learning_rate: 3e-5
  epochs: 3
  batch_size: 8
  warmup_ratio: 0.1
  weight_decay: 0.01
  mixed_precision: fp16
  gradient_accumulation: 1
  auto_find_batch_size: false
  push_to_hub: false
  logging_steps: -1  # -1 presumably means "use trainer default" — confirm
  eval_strategy: epoch
  save_total_limit: 1
  early_stopping_patience: 5
  early_stopping_threshold: 0.01
  max_length: 480000 # 30 seconds at 16kHz
  sampling_rate: 16000
  event_overlap_threshold: 0.5 # IoU threshold for overlapping events
  confidence_threshold: 0.1 # Minimum confidence threshold for event detection

# Note: For local audio detection:
# - audio_path column should contain paths to audio files (.wav, .mp3, .flac)
# - events column should contain event annotations as JSON list
# - CSV format: audio_path,events
# Example:
# /path/to/audio1.wav,"[{""start"": 0.0, ""end"": 2.5, ""label"": ""speech""}, {""start"": 2.5, ""end"": 3.0, ""label"": ""silence""}]"
# /path/to/audio2.wav,"[{""start"": 1.0, ""end"": 4.0, ""label"": ""music""}, {""start"": 4.0, ""end"": 5.0, ""label"": ""noise""}]"
# /path/to/audio3.wav,"[{""start"": 0.5, ""end"": 3.5, ""label"": ""car_crash""}]"
49 changes: 49 additions & 0 deletions configs/audio_segmentation/hub_dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# AutoTrain audio-segmentation config: train on a Hugging Face Hub dataset.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
# FIX: task id was snake_case `audio_segmentation` while every sibling config
# hyphenates its task id (`audio-classification`, `audio-detection`) —
# normalized to match; confirm against the id registered in the task router.
task: audio-segmentation
base_model: microsoft/speecht5_vc
project_name: autotrain-audio-segmentation-hub
log: tensorboard
backend: spaces-a10g-large

# Hub dataset configuration
# NOTE(review): unlike the classification/detection configs, this file keeps
# column names and training parameters flat (no `column_mapping:`/`parameters:`
# nesting) and uses `lr` instead of `learning_rate` — verify the segmentation
# trainer expects this flat schema before unifying.
data_path: audiofolder/audio_segmentation_dataset
train_split: train
valid_split: validation
audio_column: audio
target_column: segments

# Training parameters
epochs: 10
batch_size: 16
# NOTE(review): plain `2e-5` is a string under YAML 1.1 loaders (PyYAML);
# confirm AutoTrain coerces it, or write 2.0e-5.
lr: 2e-5
scheduler: cosine
optimizer: adamw_torch
weight_decay: 0.01
warmup_ratio: 0.05
gradient_accumulation: 2
mixed_precision: fp16
logging_steps: 25
save_total_limit: 5
eval_strategy: steps
early_stopping_patience: 5
early_stopping_threshold: 0.005

# Audio specific parameters
max_length: 320000 # 20 seconds at 16kHz (shorter for better memory usage)
sampling_rate: 16000
feature_extractor_normalize: true
feature_extractor_return_attention_mask: true

# Segmentation specific parameters
segment_length: 3.0 # seconds (shorter segments for better granularity)
overlap_length: 0.3 # seconds
min_segment_length: 0.5 # seconds

# Model parameters
seed: 42
max_grad_norm: 1.0
auto_find_batch_size: true
push_to_hub: true

# Hub settings (credentials are read from the environment, not stored here)
token: ${HF_TOKEN}
username: ${HF_USERNAME}
44 changes: 44 additions & 0 deletions configs/audio_segmentation/local.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# AutoTrain audio-segmentation config: train on a local dataset directory.
# NOTE(review): nested indentation was lost in extraction; restored to 2 spaces.
# FIX: task id was snake_case `audio_segmentation` while every sibling config
# hyphenates its task id (`audio-classification`, `audio-detection`) —
# normalized to match; confirm against the id registered in the task router.
task: audio-segmentation
base_model: microsoft/speecht5_vc
project_name: autotrain-audio-segmentation-local
log: tensorboard
backend: local

# NOTE(review): unlike the classification/detection configs, this file keeps
# column names and training parameters flat (no `column_mapping:`/`parameters:`
# nesting) and uses `lr` instead of `learning_rate` — verify the segmentation
# trainer expects this flat schema before unifying.
data_path: data/
train_split: train
valid_split: validation
audio_column: audio_path
target_column: segments

# Training parameters
epochs: 5
batch_size: 8
# NOTE(review): plain `3e-5` is a string under YAML 1.1 loaders (PyYAML);
# confirm AutoTrain coerces it, or write 3.0e-5.
lr: 3e-5
scheduler: linear
optimizer: adamw_torch
weight_decay: 0.01
warmup_ratio: 0.1
gradient_accumulation: 1
mixed_precision: fp16
logging_steps: 50
save_total_limit: 3
eval_strategy: epoch
early_stopping_patience: 3
early_stopping_threshold: 0.01

# Audio specific parameters
max_length: 480000 # 30 seconds at 16kHz
sampling_rate: 16000
feature_extractor_normalize: true
feature_extractor_return_attention_mask: true

# Segmentation specific parameters
segment_length: 5.0 # seconds
overlap_length: 0.5 # seconds
min_segment_length: 1.0 # seconds

# Model parameters
seed: 42
max_grad_norm: 1.0
auto_find_batch_size: false
push_to_hub: false
10 changes: 6 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ipadic==1.0.0
jiwer==3.0.5
joblib==1.4.2
loguru==0.7.3
pandas==2.2.3
pandas==2.3.0
nltk==3.9.1
optuna==4.1.0
Pillow==11.0.0
Expand All @@ -14,12 +14,12 @@ scikit-learn==1.6.0
sentencepiece==0.2.0
tqdm==4.67.1
werkzeug==3.1.3
xgboost==2.1.3
huggingface_hub==0.27.0
xgboost==3.0.2
huggingface_hub==0.33.1
requests==2.32.3
einops==0.8.0
packaging==24.2
cryptography==44.0.0
cryptography==44.0.1
nvitop==1.3.2
# latest versions
tensorboard==2.18.0
Expand Down Expand Up @@ -47,3 +47,5 @@ timm==1.0.12
torchmetrics==1.6.0
pycocotools==2.0.8
sentence-transformers==3.3.1
# audio processing dependencies
librosa==0.10.2
Loading