Dedicated deployments host any Hugging Face model on reserved GPU replicas and expose it through an OpenAI-compatible Chat Completions endpoint.

Deploy

CLI

lyceum infer deploy meta-llama/Llama-3.1-8B-Instruct \
  --hardware-profile gpu.a100 \
  --min-replicas 1 \
  --max-replicas 3

REST API

curl -X POST https://api.lyceum.technology/api/v2/external/inference/create \
  -H "Authorization: Bearer $LYCEUM_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "hf_model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "hardware_profile": "gpu.a100",
    "min_replicas": 1,
    "max_replicas": 3
  }'
For gated models, include hf_token in the request body or pass --hf-token to the CLI.
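
If you're scripting the create call, a minimal sketch with Python's requests library is below. The request body matches the curl example above; the shape of the create response isn't documented here, so the printed body is illustrative only.

import os

import requests

payload = {
    "hf_model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "hardware_profile": "gpu.a100",
    "min_replicas": 1,
    "max_replicas": 3,
}
# For gated models, include the token as described in the note above
if os.environ.get("HF_TOKEN"):
    payload["hf_token"] = os.environ["HF_TOKEN"]

resp = requests.post(
    "https://api.lyceum.technology/api/v2/external/inference/create",
    headers={"Authorization": f"Bearer {os.environ['LYCEUM_API_KEY']}"},
    json=payload,
)
resp.raise_for_status()
print(resp.json())  # response should include the deployment ID used in later steps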

Wait for it to be healthy

lyceum infer status <deployment_id> --wait
Or poll the API:
curl "https://api.lyceum.technology/api/v2/external/inference/get?deployment_id=<deployment_id>" \
  -H "Authorization: Bearer $LYCEUM_API_KEY"

Call the deployment

curl https://api.lyceum.technology/api/v2/external/v1/chat/completions \
  -H "Authorization: Bearer $LYCEUM_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "<deployment_id>",
    "messages": [{"role": "user", "content": "Say hello in three languages."}]
  }'
The response body mirrors OpenAI's Chat Completions schema, with the usual top-level fields: id, choices, usage, model, created.
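
The same call from plain Python with the requests library, reading the fields listed above (choices[0].message.content follows the standard OpenAI layout, as the SDK example below also shows):

import os

import requests

r = requests.post(
    "https://api.lyceum.technology/api/v2/external/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['LYCEUM_API_KEY']}"},
    json={
        "model": "<deployment_id>",
        "messages": [{"role": "user", "content": "Say hello in three languages."}],
    },
)
r.raise_for_status()
data = r.json()
print(data["model"], data["created"])
print(data["choices"][0]["message"]["content"])
print(data["usage"])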

From the OpenAI Python SDK

Because the endpoint is OpenAI-compatible, you can point the official openai client at it:
from openai import OpenAI

client = OpenAI(
    api_key="<your_lyceum_api_key>",
    base_url="https://api.lyceum.technology/api/v2/external",
)

resp = client.chat.completions.create(
    model="<deployment_id>",
    messages=[{"role": "user", "content": "hello"}],
)
print(resp.choices[0].message.content)

Stop the deployment

CLI

lyceum infer stop <deployment_id>

REST API

curl -X DELETE https://api.lyceum.technology/api/v2/external/inference/stop \
  -H "Authorization: Bearer $LYCEUM_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{"deployment_id": "<deployment_id>"}'