Automate Experiment Documentation
This example notebook uses the modlee
package to document a machine
learning experiment with a user-built model. We train a simple
convolutional classifier on the simple Fashion MNIST dataset. After
training, we can reuse the model from the auto-documented model class.
Prerequisites for this tutorial include familiarity with
PyTorch and
Lightning.
Here is a video explanation of this exercise.
# Boilerplate imports
import os, sys
import ssl
ssl._create_default_https_context = ssl._create_unverified_context # Disable SSL verification
import lightning.pytorch as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
Import modlee
and initialize with an API key.
# Set the API key to an environment variable
os.environ['MODLEE_API_KEY'] = "replace-with-your-api-key"
# Modlee-specific imports
import modlee
# Initialize Modlee with the API key
modlee.init(api_key=os.environ['MODLEE_API_KEY'])
Load the training data; we’ll use torch
’s Fashion MNIST dataset.
# Get Fashion MNIST, and convert from grayscale to RGB for compatibility with the model
train_dataloader, val_dataloader = modlee.utils.get_fashion_mnist(num_output_channels=3)
num_classes = len(train_dataloader.dataset.classes)
Next, we build the model from a pretrained torchvision ResNet
model.
To enable automatic documentation, wrap the model in the
modlee.model.ImageClassificationModleeModel
class.
ImageClassificationModleeModel
subclasses
lightning.pytorch.LightningModule
and uses the same structure for
the training_step
, validation_step
, and configure_optimizers
functions. Under the hood, ImageClassificationModleeModel
also
contains the callbacks to document the experiment metafeatures.
# Use a pretrained torchvision ResNet
classifier_model = torchvision.models.resnet18(num_classes=10)
# Subclass the ImageClassificationModleeModel class to enable automatic documentation
class ModleeClassifier(modlee.model.ImageClassificationModleeModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.model = classifier_model
self.loss_fn = F.cross_entropy
def forward(self, x):
return self.model(x)
# Define the training step
def training_step(self, batch, batch_idx):
x, y_target = batch
y_pred = self(x)
loss = self.loss_fn(y_pred, y_target)
return {"loss": loss}
# Define the validation step
def validation_step(self, val_batch, batch_idx):
x, y_target = val_batch
y_pred = self(x)
val_loss = self.loss_fn(y_pred, y_target)
return {'val_loss': val_loss}
# Set up the optimizer for training
def configure_optimizers(self):
optimizer = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
return optimizer
# Create an instance of the model wrapped in Modlee's documentation class
modlee_model = ModleeClassifier()
Run the training loop, just for one epoch.
with modlee.start_run() as run:
# Create a PyTorch Lightning trainer
trainer = pl.Trainer(max_epochs=1)
# Train the model using the training and validation data loaders
trainer.fit(
model=modlee_model,
train_dataloaders=train_dataloader,
val_dataloaders=val_dataloader
)
| Name | Type | Params
-------------------------------------
0 | model | Classifier | 44.4 K
-------------------------------------
44.4 K Trainable params
0 Non-trainable params
44.4 K Total params
0.178 Total estimated model params size (MB)
Epoch 0: 100%|██████████| 938/938 [00:16<00:00, 57.47it/s, v_num=0]
modlee
with mlflow
underneath will document the experiment in an
automatically generated assets
folder. With Modlee, your training
assets are automatically saved, preserving valuable insights for future
reference and collaboration.
# Get the path to the last run's saved data
last_run_path = modlee.last_run_path()
print(f"Run path: {last_run_path}")
# Get the path to the saved artifacts
artifacts_path = os.path.join(last_run_path, 'artifacts')
artifacts = os.listdir(artifacts_path)
print(f"Saved artifacts: {artifacts}")
# Set the artifacts path as an environment variable
os.environ['ARTIFACTS_PATH'] = artifacts_path
# Add the artifacts directory to the system path
sys.path.insert(0, artifacts_path)
Run path: /home/ubuntu/projects/modlee_pypi/examples/mlruns/0/7a47086681324d0e924f9076a1262de9/artifacts/model_graph.py
Saved artifacts: ['transforms.txt', 'model_graph.py', 'model_graph.txt', 'model_size', 'model', 'cached_vars', 'stats_rep', 'snapshot_1.npy', 'lightning_logs', 'snapshot_0.npy', 'model.py', 'loss_calls.txt', 'model_summary.txt']
# Print out the first few lines of the model
print("Model graph:")
!sed -n -e 1,15p $ARTIFACTS_PATH/model_graph.py
!echo " ..."
!sed -n -e 58,68p $ARTIFACTS_PATH/model_graph.py
!echo " ..."
Model graph:
import torch, onnx2torch
from torch import tensor
class Model(torch.nn.Module):
def __init__(self):
super().__init__()
setattr(self,'Conv', torch.nn.modules.conv.Conv2d(**{'in_channels':3,'out_channels':64,'kernel_size':(7, 7),'stride':(2, 2),'padding':(3, 3),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
setattr(self,'Relu', torch.nn.modules.activation.ReLU(**{'inplace':False}))
setattr(self,'MaxPool', torch.nn.modules.pooling.MaxPool2d(**{'kernel_size':[3, 3],'stride':[2, 2],'padding':[1, 1],'dilation':[1, 1],'return_indices':False,'ceil_mode':False}))
setattr(self,'Conv_1', torch.nn.modules.conv.Conv2d(**{'in_channels':64,'out_channels':64,'kernel_size':(3, 3),'stride':(1, 1),'padding':(1, 1),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
setattr(self,'Relu_1', torch.nn.modules.activation.ReLU(**{'inplace':False}))
setattr(self,'Conv_2', torch.nn.modules.conv.Conv2d(**{'in_channels':64,'out_channels':64,'kernel_size':(3, 3),'stride':(1, 1),'padding':(1, 1),'dilation':(1, 1),'groups':1,'padding_mode':'zeros'}))
setattr(self,'Add', onnx2torch.node_converters.binary_math_operations.OnnxBinaryMathOperation(**{'operation_type':'Add','broadcast':None,'axis':None}))
...
def forward(self, input_1):
conv = self.Conv(input_1); input_1 = None
relu = self.Relu(conv); conv = None
max_pool = self.MaxPool(relu); relu = None
conv_1 = self.Conv_1(max_pool)
relu_1 = self.Relu_1(conv_1); conv_1 = None
conv_2 = self.Conv_2(relu_1); relu_1 = None
add = self.Add(conv_2, max_pool); conv_2 = max_pool = None
relu_2 = self.Relu_2(add); add = None
conv_3 = self.Conv_3(relu_2)
...
# Print the first lines of the data metafeatures
print("Data metafeatures:")
!head -20 $ARTIFACTS_PATH/stats_rep
Data metafeatures:
{
"dataset_size": 60032,
"num_sample": 1000,
"batch_element_0": {
"raw": {
"feature_shape": [
960,
3,
28,
28
],
"stats": {
"kmeans": {
"2": {
"inertia": "155588.50824155417",
"silhouette_score": "0.19201575",
"calinski_harabasz_score": "248.3331975601121",
"davies_bouldin_score": "1.9090644142081366",
"time_taken": "0.6537415981292725"
},
We can build the model from the cached model_graph.Model
class and
confirm that we can pass an input through it. Note that this model’s
weights will be uninitialized.
# Rebuilding from the object
import model_graph
rebuilt_model = model_graph.Model()
# Set models to inference
modlee_model.eval(); rebuilt_model.eval()
Next, pass an input from the train dataloader through the rebuilt network and check that the output shape is equal to the original data.
# Get a batch from the training loader
x, y = next(iter(train_dataloader))
with torch.no_grad():
y_original = modlee_model(x)
y_rebuilt = rebuilt_model(x)
assert y_original.shape == y_rebuilt.shape
print(f"Original input and output shapes: {x.shape}, {y_original.shape}")
print(f"Output shape from module-rebuilt model: {y_rebuilt.shape}")
Alternatively, to load the model from the last checkpoint, we can load
it directly from the cached model.pth
.
# Reloading from the checkpoint
reloaded_model = torch.load(os.path.join(artifacts_path, 'model', 'data','model.pth'))
y_reloaded = reloaded_model(x)
#Ensure the output shapes match
assert y_original.shape == y_reloaded.shape
print(f"Output shape from checkpoint-reloaded model: {y_reloaded.shape}")
Original input and output shapes: torch.Size([64, 3, 28, 28]), torch.Size([64, 10])
Output shape from module-rebuilt model: torch.Size([64, 10])
Output shape from checkpoint-reloaded model: torch.Size([64, 10])