Audio Embeddings With Tabular Classification Model
In this example, we will build an audio classification model using PyTorch and Wav2Vec2, a pretrained model for processing audio data. This guide will walk you through each step of the process, including setting up the environment, loading and preprocessing data, defining and training a model, and evaluating its performance.
First, we will import the necessary libraries and set up the environment.
import os

import lightning.pytorch as pl
import modlee
import torch
import torchaudio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from transformers import Wav2Vec2Model

# Use the sox_io backend for loading audio files. On recent torchaudio
# releases this call is deprecated (backends are dispatched automatically)
# and can be removed.
torchaudio.set_audio_backend("sox_io")
Now we will set our Modlee API key and initialize the Modlee package. Make sure that you have a Modlee account and an API key from the dashboard. Replace replace-with-your-api-key with your API key.
os.environ['MODLEE_API_KEY'] = "replace-with-your-api-key"
modlee.init(api_key=os.environ['MODLEE_API_KEY'])
Now, we will prepare our data. For this example, we will manually download the Human Words Audio dataset from Kaggle and upload it to the environment. Visit the Human Words Audio dataset page on Kaggle and click the Download button to save the Animals directory to your local machine. Copy the path to the downloaded directory, which will be used later.

This snippet loads the Wav2Vec2 model. Wav2Vec2 is a model designed for speech processing. We'll use it to convert audio into embeddings.
# Set device to GPU if available, otherwise use CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pre-trained Wav2Vec2 model and move it to the specified device.
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)
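Since we only use Wav2Vec2 as a frozen feature extractor, it can help to switch it to evaluation mode so that dropout and other training-time behavior are disabled. This is an optional, minimal addition:

# Optional: put the feature extractor in evaluation mode, since we only
# run it for inference when computing embeddings.
wav2vec.eval()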
This function converts raw audio waveforms into embeddings using the Wav2Vec2 model.
def get_wav2vec_embeddings(waveforms):
    with torch.no_grad():
        # Move the batch of waveforms to the same device as the model;
        # torch.as_tensor avoids a copy-construct warning when the input
        # is already a tensor.
        inputs = torch.as_tensor(waveforms).to(device)
        # Average the hidden states over the time dimension to get one
        # fixed-size embedding per clip.
        embeddings = wav2vec(inputs).last_hidden_state.mean(dim=1)
    return embeddings
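As a quick sanity check, the function maps raw waveforms to 768-dimensional vectors, since facebook/wav2vec2-base has a hidden size of 768. A minimal sketch, assuming a batch of two random 1-second clips at 16 kHz:

# Hypothetical smoke test: two random "waveforms" of 16,000 samples each.
dummy_batch = torch.randn(2, 16000)
dummy_embeddings = get_wav2vec_embeddings(dummy_batch)
print(dummy_embeddings.shape)  # expected: torch.Size([2, 768])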
The AudioDataset class handles loading and preprocessing of audio files.
class AudioDataset(TensorDataset):
    def __init__(self, audio_paths, labels, target_length=16000):
        self.audio_paths = audio_paths
        self.labels = labels
        self.target_length = target_length

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]
        # Load the audio file with normalized (floating-point) samples
        waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
        # Collapse multi-channel audio to mono
        waveform = waveform.mean(dim=0)
        # Pad or truncate the waveform to the target length
        if waveform.size(0) < self.target_length:
            waveform = torch.cat([waveform, torch.zeros(self.target_length - waveform.size(0))])
        else:
            waveform = waveform[:self.target_length]
        return waveform, label
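To see the padding in action, here is a small sketch (assuming write access to the working directory; sine.wav is a hypothetical file name) that saves a half-second sine wave and reads it back through the dataset:

import math

# Write a 0.5 s, 440 Hz sine wave to disk as a mono .wav file.
sample_rate = 16000
t = torch.arange(sample_rate // 2) / sample_rate
sine = torch.sin(2 * math.pi * 440 * t).unsqueeze(0)  # shape: [1, 8000]
torchaudio.save("sine.wav", sine, sample_rate)

# Loading it through AudioDataset zero-pads it to 16,000 samples.
demo_dataset = AudioDataset(["sine.wav"], labels=[0], target_length=16000)
waveform, label = demo_dataset[0]
print(waveform.shape)  # expected: torch.Size([16000])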
This function loads audio files and their corresponding labels from a directory structure.
def load_dataset(data_dir):
    audio_paths = []
    labels = []
    # Loop through each subdirectory in the data directory
    for label_dir in os.listdir(data_dir):
        label_dir_path = os.path.join(data_dir, label_dir)
        if os.path.isdir(label_dir_path):
            # Loop through each file in the directory
            for file_name in os.listdir(label_dir_path):
                if file_name.endswith('.wav'):
                    audio_paths.append(os.path.join(label_dir_path, file_name))
                    labels.append(label_dir)
    return audio_paths, labels
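The function assumes one subdirectory per class, with the directory name used as the label. For the Kaggle download, the layout might look like this (the exact class folder names depend on the dataset version):

Animals/
├── cat/
│   ├── cat_01.wav
│   └── cat_02.wav
└── dog/
    ├── dog_01.wav
    └── dog_02.wav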
We define a simple Multi-Layer Perceptron (MLP) model for classification. This model takes the embeddings from Wav2Vec2 as input.
class MLP(modlee.model.TabularClassificationModleeModel):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes)
        )
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y_target = batch
        y_pred = self(x)
        loss = self.loss_fn(y_pred, y_target)
        return {"loss": loss}

    def validation_step(self, val_batch, batch_idx):
        x, y_target = val_batch
        y_pred = self(x)
        val_loss = self.loss_fn(y_pred, y_target)
        return {'val_loss': val_loss}

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        return optimizer
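A quick shape check, as a sketch: it assumes modlee.init has already been called (the model subclasses a Modlee base class) and uses a hypothetical 5-class setup. The 768-dimensional embeddings map to one logit per class:

# Hypothetical sanity check with 5 classes and a batch of 4 embeddings.
demo_model = MLP(input_size=768, num_classes=5)
logits = demo_model(torch.randn(4, 768))
print(logits.shape)  # expected: torch.Size([4, 5])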
Wav2Vec2 transforms raw audio data into numerical embeddings that a model can interpret. We preprocess the audio by normalizing and padding it to a fixed length. Then, Wav2Vec2 generates embeddings for each audio clip.
def precompute_embeddings(dataloader):
    embeddings_list = []
    labels_list = []
    # Run each batch through Wav2Vec2 and collect the results on the CPU
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        embeddings = get_wav2vec_embeddings(inputs)
        embeddings_list.append(embeddings.cpu())
        labels_list.append(labels)
    # Concatenate the per-batch results into single tensors
    embeddings_list = torch.cat(embeddings_list, dim=0)
    labels_list = torch.cat(labels_list, dim=0)
    return embeddings_list, labels_list
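The shape contract is easy to verify with synthetic data. A minimal sketch, using random tensors in place of real waveforms:

# Hypothetical check: 8 random "clips" with fake integer labels.
fake_loader = DataLoader(
    TensorDataset(torch.randn(8, 16000), torch.randint(0, 3, (8,))),
    batch_size=4,
)
emb, lab = precompute_embeddings(fake_loader)
print(emb.shape, lab.shape)  # expected: torch.Size([8, 768]) torch.Size([8])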
We create a function to train and validate our model. Wrapping the call to trainer.fit in modlee.start_run() lets Modlee track the experiment and save its artifacts.
def train_model(modlee_model, train_dataloader, val_dataloader, num_epochs=1):
    with modlee.start_run() as run:
        # Create a PyTorch Lightning trainer
        trainer = pl.Trainer(max_epochs=num_epochs)
        # Train the model using the training and validation data loaders
        trainer.fit(
            model=modlee_model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader
        )
Finally, we load the dataset, preprocess it, and train the model. Add your path to the dataset in data_dir.
# Path to dataset
data_dir = 'path-to-dataset'
# Load dataset
audio_paths, labels = load_dataset(data_dir)
# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
# Split dataset into training and validation sets
train_paths, val_paths, train_labels, val_labels = train_test_split(
    audio_paths, labels, test_size=0.2, random_state=42
)
# Create datasets and dataloaders
target_length = 16000
train_dataset = AudioDataset(train_paths, train_labels, target_length=target_length)
val_dataset = AudioDataset(val_paths, val_labels, target_length=target_length)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
# Precompute embeddings
print("Precomputing embeddings for training and validation data...")
train_embeddings, train_labels = precompute_embeddings(train_dataloader)
val_embeddings, val_labels = precompute_embeddings(val_dataloader)
# Create TensorDataset for precomputed embeddings and labels
train_embedding_dataset = TensorDataset(train_embeddings, train_labels)
val_embedding_dataset = TensorDataset(val_embeddings, val_labels)
# Create DataLoaders for the precomputed embeddings
train_embedding_loader = DataLoader(train_embedding_dataset, batch_size=4, shuffle=True)
val_embedding_loader = DataLoader(val_embedding_dataset, batch_size=4, shuffle=False)
# Define number of classes
num_classes = len(label_encoder.classes_)
mlp_audio = MLP(input_size=768, num_classes=num_classes).to(device)
# Train and evaluate the model
train_model(mlp_audio, train_embedding_loader, val_embedding_loader)
Once training completes, we can view the saved assets from the run. With Modlee, your training assets are automatically saved, preserving valuable insights for future reference and collaboration.
last_run_path = modlee.last_run_path()
print(f"Run path: {last_run_path}")
artifacts_path = os.path.join(last_run_path, 'artifacts')
artifacts = sorted(os.listdir(artifacts_path))
print(f"Saved artifacts: {artifacts}")