diff --git a/examples/tutorials/gpu_monitoring/README.md b/examples/tutorials/gpu_monitoring/README.md
new file mode 100644
index 00000000000..4744f3ced43
--- /dev/null
+++ b/examples/tutorials/gpu_monitoring/README.md
@@ -0,0 +1,115 @@
+# GPU Monitoring Example
+
+This example demonstrates how to monitor GPU usage in Metaflow flows, which is essential for ML training workloads.
+
+## Features
+
+- GPU availability detection
+- GPU memory monitoring
+- Resource allocation with `@resources` decorator
+- Simulated training workflow
+
+## Running the Example
+
+```bash
+python gpu_flow.py run
+```
+
+To run with custom epochs:
+```bash
+python gpu_flow.py run --epochs 5
+```
+
+## Use Cases
+
+This pattern is useful for:
+- ML model training pipelines
+- GPU utilization monitoring
+- Cost optimization for GPU workloads
+- Multi-GPU training setup validation
+
+## Requirements
+
+- Metaflow
+- PyTorch (optional, for actual GPU detection)
+
+## Notes
+
+The example gracefully handles environments without GPUs or PyTorch installed,
+making it suitable for testing in various environments.
+
+## Example Output
+
+When running on a GPU-enabled environment:
+```
+Starting GPU monitoring example flow
+GPU Available: True
+GPU Name: NVIDIA Tesla V100
+GPU Memory: 16.00 GB
+Training for 3 epochs...
+Epoch 1/3 completed
+  GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB
+Epoch 2/3 completed
+  GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB
+Epoch 3/3 completed
+  GPU Memory - Allocated: 0.12 GB, Reserved: 0.25 GB
+
+Flow completed successfully!
+Training completed: True
+Used GPU: NVIDIA Tesla V100
+```
+
+When running on a CPU-only environment:
+```
+Starting GPU monitoring example flow
+PyTorch not installed, skipping GPU check
+Training for 3 epochs...
+Epoch 1/3 completed
+Epoch 2/3 completed
+Epoch 3/3 completed
+
+Flow completed successfully!
+Training completed: True
+```
+
+## Advanced Usage
+
+### Monitoring Multiple GPUs
+
+To extend this example for multi-GPU monitoring, you can modify the `check_gpu` step:
+
+```python
+@resources(gpu=2, memory=32000)
+@step
+def check_gpu(self):
+    """Check multiple GPUs."""
+    import torch
+    if torch.cuda.is_available():
+        gpu_count = torch.cuda.device_count()
+        print(f"Number of GPUs: {gpu_count}")
+        for i in range(gpu_count):
+            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+```
+
+### Real-time Monitoring
+
+For production use cases, you might want to log GPU metrics to a monitoring system:
+
+```python
+# Log to your monitoring system
+metrics = {
+    'gpu_utilization': torch.cuda.utilization(),
+    'gpu_memory_used': torch.cuda.memory_allocated(),
+    'gpu_temperature': torch.cuda.temperature()
+}
+# Send metrics to CloudWatch, Datadog, etc.
+```
+
+## Related Examples
+
+- See the `pytorch_tutorial` for more PyTorch-specific patterns
+- Check `distributed_training` for multi-node GPU training examples
+
+## Contributing
+
+Found an issue or want to improve this example? Pull requests are welcome!
\ No newline at end of file
diff --git a/examples/tutorials/gpu_monitoring/gpu_flow.py b/examples/tutorials/gpu_monitoring/gpu_flow.py
new file mode 100644
index 00000000000..2920d5843b8
--- /dev/null
+++ b/examples/tutorials/gpu_monitoring/gpu_flow.py
@@ -0,0 +1,82 @@
+"""
+GPU Monitoring Example for Metaflow
+
+This example demonstrates how to monitor GPU usage in Metaflow flows,
+which is useful for ML training workloads.
+"""
+
+from metaflow import FlowSpec, step, resources, Parameter
+import time
+
+class GPUMonitorFlow(FlowSpec):
+    """
+    A flow that demonstrates GPU monitoring capabilities in Metaflow.
+
+    This is particularly useful for ML engineers working on training
+    infrastructure who need to track GPU utilization and memory usage.
+    """
+
+    epochs = Parameter('epochs', default=3, help='Number of training epochs')
+
+    @step
+    def start(self):
+        """Initialize the flow and check GPU availability."""
+        print("Starting GPU monitoring example flow")
+        self.next(self.check_gpu)
+
+    @resources(gpu=1, memory=16000)
+    @step
+    def check_gpu(self):
+        """Check GPU availability and print GPU information."""
+        try:
+            import torch
+            self.gpu_available = torch.cuda.is_available()
+            if self.gpu_available:
+                self.gpu_name = torch.cuda.get_device_name(0)
+                self.gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
+                print(f"GPU Available: {self.gpu_available}")
+                print(f"GPU Name: {self.gpu_name}")
+                print(f"GPU Memory: {self.gpu_memory:.2f} GB")
+            else:
+                print("No GPU available, will simulate training")
+        except ImportError:
+            print("PyTorch not installed, skipping GPU check")
+            self.gpu_available = False
+
+        self.next(self.train)
+
+    @resources(gpu=1, memory=16000, cpu=4)
+    @step
+    def train(self):
+        """Simulate a training workload with GPU monitoring."""
+        print(f"Training for {self.epochs} epochs...")
+
+        for epoch in range(self.epochs):
+            # Simulate training
+            time.sleep(1)
+            print(f"Epoch {epoch + 1}/{self.epochs} completed")
+
+            # In real scenario, you would monitor GPU here
+            if hasattr(self, 'gpu_available') and self.gpu_available:
+                try:
+                    import torch
+                    # Check GPU memory usage
+                    allocated = torch.cuda.memory_allocated() / 1e9
+                    reserved = torch.cuda.memory_reserved() / 1e9
+                    print(f"  GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
+                except Exception:
+                    pass
+
+        self.training_completed = True
+        self.next(self.end)
+
+    @step
+    def end(self):
+        """Finalize the flow and print summary."""
+        print("\nFlow completed successfully!")
+        print(f"Training completed: {self.training_completed}")
+        if hasattr(self, 'gpu_name'):
+            print(f"Used GPU: {self.gpu_name}")
+
+if __name__ == '__main__':
+    GPUMonitorFlow()
\ No newline at end of file