A rough comparison of training times for CPU vs. GPU on Google Colab. Exact times will vary depending on hardware availability, model characteristics, and current environmental conditions (e.g. a noisy neighbor), but this should provide a crude baseline for comparison. The benchmark consists of training a CNN on randomly generated training data.
import timeit

import torch
import torch.nn as nn
import torchvision.models as models
model = models.vgg19()
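VGG-19 is a deliberately heavy model for this kind of benchmark (roughly 144M parameters). As a quick sanity check of its size:

n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params / 1e6:.1f}M parameters')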
def train(model, input_size, output_size, optimizer, loss_fn, epochs, device):
    model.requires_grad_(True)
    model.to(device)
    model.train()
    for _ in range(epochs):
        # Each "epoch" is one forward/backward pass over a single random batch.
        input = torch.rand(input_size, device=device)
        target = torch.rand(output_size, device=device)
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
def benchmark(device):
    # ImageNet-shaped inputs: 16 images of 3x224x224, 1000 output classes.
    batch_size = 16
    channels = 3
    input_width = 224
    input_height = 224
    classes = 1000
    input_size = (batch_size, channels, input_width, input_height)
    output_size = (batch_size, classes)
    train(model,
          input_size,
          output_size,
          optimizer=torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9),
          loss_fn=nn.CrossEntropyLoss(),
          epochs=2,
          device=device)
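Note: passing class probabilities of shape (batch_size, classes) as the target to nn.CrossEntropyLoss requires PyTorch 1.10 or newer; earlier releases only accept a (batch_size,) tensor of integer class indices. Random targets are fine here since we only care about wall-clock time, not convergence.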
timeit.timeit('benchmark(torch.device("cpu"))', number=1, globals=globals())
61.28752636399999
timeit.timeit('benchmark(torch.device("cuda"))', number=1, globals=globals())
10.384778273999984
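That works out to roughly a 5.9x speedup for the GPU (61.29 s / 10.38 s). One caveat: CUDA kernels launch asynchronously, so wall-clock timing can miss work still queued on the device when the timed call returns. A minimal sketch of a tighter measurement using CUDA events (the timed_gpu helper is ours, not part of PyTorch):

def timed_gpu(fn):
    torch.cuda.synchronize()  # drain any kernels already in flight
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    fn()
    end.record()
    torch.cuda.synchronize()  # wait for the timed work to actually finish
    return start.elapsed_time(end) / 1000  # elapsed_time reports milliseconds

timed_gpu(lambda: benchmark(torch.device('cuda')))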
Let's see what kind of GPU Colab is using.
torch.cuda.get_device_name()
'Tesla T4'
torch.cuda.get_device_properties(torch.cuda.current_device())
_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15109MB, multi_processor_count=40)
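Here major=7, minor=5 means compute capability 7.5, i.e. NVIDIA's Turing architecture; the T4 reports 40 streaming multiprocessors and about 15 GB of usable memory out of its nominal 16 GB.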
!pip install pynvml
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynvml
  Downloading pynvml-11.4.1-py3-none-any.whl (46 kB)
Installing collected packages: pynvml
Successfully installed pynvml-11.4.1
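pynvml is what backs torch.cuda.utilization() and torch.cuda.memory_usage(), which wrap NVIDIA's NVML. For reference, the same counters can be queried from NVML directly; a minimal sketch:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # percent of recent time kernels were executing
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # total / free / used, in bytes
print('GPU utilization:', util.gpu, '%')
print('Memory used:', mem.used // 2**20, 'MB')
pynvml.nvmlShutdown()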
print('Memory usage:', torch.cuda.memory_usage(torch.cuda.current_device()))
print('Utilization:', torch.cuda.utilization(torch.cuda.current_device()))
print('GPU Processes:', torch.cuda.list_gpu_processes(torch.cuda.current_device()))
print('Mem info:', torch.cuda.mem_get_info(torch.cuda.current_device()))
print('Memory stats:', torch.cuda.memory_stats(torch.cuda.current_device()))
print('Memory summary:', torch.cuda.memory_summary(torch.cuda.current_device()))
Memory usage: 0
Utilization: 0
GPU Processes: GPU:0
process       2302 uses     4937.000 MB GPU memory
Mem info: (10663755776, 15843721216)
Memory stats: OrderedDict([('active.all.allocated', 476), ('active.all.current', 76), ('active.all.freed', 400), ('active.all.peak', 156), ('active.large_pool.allocated', 297), ('active.large_pool.current', 30), ('active.large_pool.freed', 267), ('active.large_pool.peak', 75), ('active.small_pool.allocated', 179), ('active.small_pool.current', 46), ('active.small_pool.freed', 133), ('active.small_pool.peak', 83), ('active_bytes.all.allocated', 30445222912), ('active_bytes.all.current', 1150126080), ('active_bytes.all.freed', 29295096832), ('active_bytes.all.peak', 3395650560), ('active_bytes.large_pool.allocated', 30426529792), ('active_bytes.large_pool.current', 1147928576), ('active_bytes.large_pool.freed', 29278601216), ('active_bytes.large_pool.peak', 3391946752), ('active_bytes.small_pool.allocated', 18693120), ('active_bytes.small_pool.current', 2197504), ('active_bytes.small_pool.freed', 16495616), ('active_bytes.small_pool.peak', 5716992), ('allocated_bytes.all.allocated', 30445222912), ('allocated_bytes.all.current', 1150126080), ('allocated_bytes.all.freed', 29295096832), ('allocated_bytes.all.peak', 3395650560), ('allocated_bytes.large_pool.allocated', 30426529792), ('allocated_bytes.large_pool.current', 1147928576), ('allocated_bytes.large_pool.freed', 29278601216), ('allocated_bytes.large_pool.peak', 3391946752), ('allocated_bytes.small_pool.allocated', 18693120), ('allocated_bytes.small_pool.current', 2197504), ('allocated_bytes.small_pool.freed', 16495616), ('allocated_bytes.small_pool.peak', 5716992), ('allocation.all.allocated', 476), ('allocation.all.current', 76), ('allocation.all.freed', 400), ('allocation.all.peak', 156), ('allocation.large_pool.allocated', 297), ('allocation.large_pool.current', 30), ('allocation.large_pool.freed', 267), ('allocation.large_pool.peak', 75), ('allocation.small_pool.allocated', 179), ('allocation.small_pool.current', 46), ('allocation.small_pool.freed', 133), ('allocation.small_pool.peak', 83), ('inactive_split.all.allocated', 139), ('inactive_split.all.current', 10), ('inactive_split.all.freed', 129), ('inactive_split.all.peak', 18), ('inactive_split.large_pool.allocated', 112), ('inactive_split.large_pool.current', 9), ('inactive_split.large_pool.freed', 103), ('inactive_split.large_pool.peak', 14), ('inactive_split.small_pool.allocated', 27), ('inactive_split.small_pool.current', 1), ('inactive_split.small_pool.freed', 26), ('inactive_split.small_pool.peak', 6), ('inactive_split_bytes.all.allocated', 32807424000), ('inactive_split_bytes.all.current', 32667648), ('inactive_split_bytes.all.freed', 32774756352), ('inactive_split_bytes.all.peak', 1207795712), ('inactive_split_bytes.large_pool.allocated', 32783597568), ('inactive_split_bytes.large_pool.current', 30670848), ('inactive_split_bytes.large_pool.freed', 32752926720), ('inactive_split_bytes.large_pool.peak', 1205927936), ('inactive_split_bytes.small_pool.allocated', 23826432), ('inactive_split_bytes.small_pool.current', 1996800), ('inactive_split_bytes.small_pool.freed', 21829632), ('inactive_split_bytes.small_pool.peak', 2604032), ('max_split_size', -1), ('num_alloc_retries', 0), ('num_ooms', 0), ('oversize_allocations.allocated', 0), ('oversize_allocations.current', 0), ('oversize_allocations.freed', 0), ('oversize_allocations.peak', 0), ('oversize_segments.allocated', 0), ('oversize_segments.current', 0), ('oversize_segments.freed', 0), ('oversize_segments.peak', 0), ('reserved_bytes.all.allocated', 4016046080), ('reserved_bytes.all.current', 4016046080), ('reserved_bytes.all.freed', 0), ('reserved_bytes.all.peak', 4016046080), ('reserved_bytes.large_pool.allocated', 4009754624), ('reserved_bytes.large_pool.current', 4009754624), ('reserved_bytes.large_pool.freed', 0), ('reserved_bytes.large_pool.peak', 4009754624), ('reserved_bytes.small_pool.allocated', 6291456), ('reserved_bytes.small_pool.current', 6291456), ('reserved_bytes.small_pool.freed', 0), ('reserved_bytes.small_pool.peak', 6291456), ('segment.all.allocated', 24), ('segment.all.current', 24), ('segment.all.freed', 0), ('segment.all.peak', 24), ('segment.large_pool.allocated', 21), ('segment.large_pool.current', 21), ('segment.large_pool.freed', 0), ('segment.large_pool.peak', 21), ('segment.small_pool.allocated', 3), ('segment.small_pool.current', 3), ('segment.small_pool.freed', 0), ('segment.small_pool.peak', 3)])
Memory summary:
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    1096 MB |    3238 MB |   29034 MB |   27937 MB |
|       from large pool |    1094 MB |    3234 MB |   29017 MB |   27922 MB |
|       from small pool |       2 MB |       5 MB |      17 MB |      15 MB |
|---------------------------------------------------------------------------|
| Active memory         |    1096 MB |    3238 MB |   29034 MB |   27937 MB |
|       from large pool |    1094 MB |    3234 MB |   29017 MB |   27922 MB |
|       from small pool |       2 MB |       5 MB |      17 MB |      15 MB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |    3830 MB |    3830 MB |    3830 MB |       0 B  |
|       from large pool |    3824 MB |    3824 MB |    3824 MB |       0 B  |
|       from small pool |       6 MB |       6 MB |       6 MB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |   31902 KB |    1151 MB |   31287 MB |   31256 MB |
|       from large pool |   29952 KB |    1150 MB |   31264 MB |   31235 MB |
|       from small pool |    1950 KB |       2 MB |      22 MB |      20 MB |
|---------------------------------------------------------------------------|
| Allocations           |      76    |     156    |     476    |     400    |
|       from large pool |      30    |      75    |     297    |     267    |
|       from small pool |      46    |      83    |     179    |     133    |
|---------------------------------------------------------------------------|
| Active allocs         |      76    |     156    |     476    |     400    |
|       from large pool |      30    |      75    |     297    |     267    |
|       from small pool |      46    |      83    |     179    |     133    |
|---------------------------------------------------------------------------|
| GPU reserved segments |      24    |      24    |      24    |       0    |
|       from large pool |      21    |      21    |      21    |       0    |
|       from small pool |       3    |       3    |       3    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      10    |      18    |     139    |     129    |
|       from large pool |       9    |      14    |     112    |     103    |
|       from small pool |       1    |       6    |      27    |      26    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|
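Note the gap between "GPU reserved memory" (3830 MB) and "Allocated memory" (1096 MB): the difference is blocks held by PyTorch's caching allocator rather than live tensors. If another process needs that memory, the cache can be handed back to the driver; a minimal sketch:

torch.cuda.empty_cache()  # releases cached, unused blocks; live tensors are untouched
print(torch.cuda.memory_reserved() // 2**20, 'MB still reserved')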