{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 98598, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>Kernel: 98598us</b><br>Percentage: 52.46%</div>", 1941, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>Memcpy: 1941us</b><br>Percentage: 1.03%</div>", 90, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>Memset: 90us</b><br>Percentage: 0.05%</div>", 2796, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>Runtime: 2796us</b><br>Percentage: 1.49%</div>", 69317, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>DataLoader: 69317us</b><br>Percentage: 36.88%</div>", 14091, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>CPU Exec: 14091us</b><br>Percentage: 7.5%</div>", 1115, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 187948us<br><b>Other: 1115us</b><br>Percentage: 0.59%</div>"], ["6", 98570, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>Kernel: 98570us</b><br>Percentage: 56.28%</div>", 1947, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>Memcpy: 1947us</b><br>Percentage: 1.11%</div>", 89, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>Memset: 89us</b><br>Percentage: 0.05%</div>", 2762, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>Runtime: 2762us</b><br>Percentage: 1.58%</div>", 57669, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>DataLoader: 57669us</b><br>Percentage: 32.92%</div>", 12968, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>CPU Exec: 12968us</b><br>Percentage: 7.4%</div>", 1148, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 175153us<br><b>Other: 1148us</b><br>Percentage: 0.66%</div>"], ["7", 98596, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>Kernel: 98596us</b><br>Percentage: 54.86%</div>", 1931, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>Memcpy: 1931us</b><br>Percentage: 1.07%</div>", 91, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>Memset: 91us</b><br>Percentage: 0.05%</div>", 2877, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>Runtime: 2877us</b><br>Percentage: 1.6%</div>", 61257, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>DataLoader: 61257us</b><br>Percentage: 34.08%</div>", 13768, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>CPU Exec: 13768us</b><br>Percentage: 7.66%</div>", 1213, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 179733us<br><b>Other: 1213us</b><br>Percentage: 0.67%</div>"], ["8", 98623, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>Kernel: 98623us</b><br>Percentage: 56.5%</div>", 1938, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>Memcpy: 1938us</b><br>Percentage: 1.11%</div>", 89, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>Memset: 89us</b><br>Percentage: 0.05%</div>", 2841, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>Runtime: 2841us</b><br>Percentage: 1.63%</div>", 56453, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>DataLoader: 56453us</b><br>Percentage: 32.34%</div>", 13420, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>CPU Exec: 13420us</b><br>Percentage: 7.69%</div>", 1200, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 174564us<br><b>Other: 1200us</b><br>Percentage: 0.69%</div>"], ["9", 98504, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>Kernel: 98504us</b><br>Percentage: 54.07%</div>", 1937, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>Memcpy: 1937us</b><br>Percentage: 1.06%</div>", 87, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>Memset: 87us</b><br>Percentage: 0.05%</div>", 2788, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>Runtime: 2788us</b><br>Percentage: 1.53%</div>", 62690, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>DataLoader: 62690us</b><br>Percentage: 34.41%</div>", 15025, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>CPU Exec: 15025us</b><br>Percentage: 8.25%</div>", 1141, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 182172us<br><b>Other: 1141us</b><br>Percentage: 0.63%</div>"], ["10", 98641, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>Kernel: 98641us</b><br>Percentage: 59.43%</div>", 1798, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>Memcpy: 1798us</b><br>Percentage: 1.08%</div>", 88, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>Memset: 88us</b><br>Percentage: 0.05%</div>", 3381, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>Runtime: 3381us</b><br>Percentage: 2.04%</div>", 48185, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>DataLoader: 48185us</b><br>Percentage: 29.03%</div>", 12773, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>CPU Exec: 12773us</b><br>Percentage: 7.7%</div>", 1117, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 165983us<br><b>Other: 1117us</b><br>Percentage: 0.67%</div>"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 177592, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 98589, "extra": 55.51}, {"name": "Memcpy", "description": "", "value": 1915, "extra": 1.08}, {"name": "Memset", "description": "", "value": 89, "extra": 0.05}, {"name": "Runtime", "description": "", "value": 2908, "extra": 1.64}, {"name": "DataLoader", "description": "", "value": 59262, "extra": 33.37}, {"name": "CPU Exec", "description": "", "value": 13674, "extra": 7.7}, {"name": "Other", "description": "", "value": 1156, "extra": 0.65}]}], "recommendations": "<ul><li>This run has high time cost on input data loading. 33.4% of the step time is in DataLoader. You could try to set num_workers on DataLoader's construction and <a href=\"https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading\" target=\"_blank\">enable multi-processes on data loading</a>.</li><li>Kernels with 68% time are launched by Tensor Cores eligible operators. You could enable <a href=\"https://pytorch.org/docs/stable/amp.html\" target=\"_blank\">Automatic Mixed Precision</a> to speedup by using FP16.</li></ul>", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "55.51 %"}, {"title": "Est. SM Efficiency", "value": "54.68 %"}, {"title": "Est. Achieved Occupancy", "value": "49.13 %"}, {"title": "Kernel Time using Tensor Cores", "value": "0.0 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The higher, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The higher, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization.\n\nKernel using Tensor Cores:\nTotal GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n"}}
{"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 273428], ["CudnnConvolutionBackward", 273428], ["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::_convolution", 126619], ["aten::convolution", 126619], ["aten::conv2d", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["CudnnBatchNormBackward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::_batch_norm_impl_index", 34245], ["aten::batch_norm", 34245], ["aten::threshold_backward", 27298], ["ReluBackward1", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::clamp_min_", 17860], ["aten::relu_", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::to", 11492], ["aten::max_pool2d_with_indices_backward", 4677], ["MaxPool2DWithIndicesBackward", 4677], ["torch::autograd::AccumulateGrad", 3030], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::zero_", 1881], ["aten::max_pool2d_with_indices", 1420], ["aten::max_pool2d", 1420], ["aten::mm", 275], ["AddmmBackward", 275], ["aten::mean", 212], ["aten::adaptive_avg_pool2d", 212], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 144], ["MeanBackward1", 144], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 53], ["LogSoftmaxBackward", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 142461], ["aten::cudnn_convolution_backward_input", 130967], ["aten::cudnn_convolution", 126619], ["aten::cudnn_batch_norm_backward", 61939], ["aten::cudnn_batch_norm", 34245], ["aten::threshold_backward", 27298], ["aten::add_", 24098], ["aten::clamp_min", 17860], ["aten::add", 16038], ["aten::copy_", 11492], ["aten::max_pool2d_with_indices_backward", 3822], ["aten::mul_", 2409], ["aten::fill_", 1887], ["aten::max_pool2d_with_indices", 1420], ["aten::mm", 275], ["aten::mean", 212], ["aten::addmm", 197], ["aten::div", 144], ["aten::_log_softmax_backward_data", 53], ["aten::sum", 44], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 90989], ["aten::batch_norm", 87977], ["aten::cudnn_convolution_backward", 87772], ["aten::add_", 78125], ["aten::_batch_norm_impl_index", 78071], ["aten::conv2d", 77781], ["aten::cudnn_batch_norm", 71527], ["aten::convolution", 70394], ["aten::empty", 68147], ["aten::to", 64332], ["aten::_convolution", 64243], ["aten::cudnn_convolution", 56998], ["aten::copy_", 52853], ["aten::cudnn_convolution_backward_input", 41445], ["aten::cudnn_convolution_backward_weight", 40246], ["aten::div", 35158], ["CudnnBatchNormBackward", 34608], ["aten::contiguous", 31137], ["aten::cudnn_batch_norm_backward", 30460], ["aten::mul_", 29081], ["torch::autograd::AccumulateGrad", 28494], ["aten::zero_", 27597], ["aten::empty_like", 26064], ["aten::stack", 24346], ["aten::relu_", 24181], ["aten::add", 19289], ["aten::cat", 17085], ["aten::fill_", 17059], ["aten::_cat", 16933], ["aten::clamp_min_", 15665], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::empty_strided", 11829], ["ReluBackward1", 11656], ["aten::clamp_min", 10311], ["aten::permute", 9775], ["aten::threshold_backward", 9482], ["aten::as_strided", 7600], ["aten::unsqueeze", 6603], ["aten::linear", 1408], ["AddmmBackward", 1303], ["aten::cross_entropy_loss", 1180], ["aten::zeros", 1105], ["aten::addmm", 1034], ["MeanBackward1", 987], ["aten::mm", 860], ["NllLossBackward", 716], ["aten::max_pool2d", 687], ["aten::nll_loss_backward", 614], ["aten::t", 584], ["aten::log_softmax", 567], ["aten::max_pool2d_with_indices", 562], ["aten::adaptive_avg_pool2d", 561], ["aten::nll_loss_nd", 495], ["MaxPool2DWithIndicesBackward", 484], ["aten::ones_like", 452], ["aten::mean", 445], ["aten::_log_softmax", 433], ["aten::nll_loss", 414], ["aten::max_pool2d_with_indices_backward", 411], ["LogSoftmaxBackward", 359], ["aten::narrow", 350], ["aten::nll_loss_forward", 346], ["aten::transpose", 329], ["aten::sum", 327], ["aten::_log_softmax_backward_data", 306], ["aten::expand", 229], ["aten::slice", 223], ["aten::detach_", 208], ["AddBackward0", 175], ["aten::flatten", 164], ["TBackward", 103], ["detach_", 100], ["ViewBackward", 80], ["aten::reshape", 55], ["aten::conj", 12]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 68147], ["aten::add_", 51013], ["aten::copy_", 40255], ["aten::cudnn_convolution", 33121], ["aten::cudnn_convolution_backward_input", 29324], ["aten::cudnn_convolution_backward_weight", 22804], ["aten::mul_", 20515], ["aten::div", 20135], ["aten::cudnn_batch_norm", 19843], ["aten::_cat", 16282], ["aten::to", 14834], ["aten::add", 14329], ["aten::view", 14027], ["aten::resize_", 12406], ["aten::cudnn_batch_norm_backward", 12238], ["aten::empty_strided", 11829], ["aten::empty_like", 11742], ["aten::zero_", 10693], ["aten::batch_norm", 9906], ["aten::fill_", 9879], ["aten::relu_", 8516], ["aten::as_strided", 7600], ["aten::conv2d", 7387], ["aten::_convolution", 7245], ["aten::clamp_min", 7106], ["aten::_batch_norm_impl_index", 6544], ["aten::convolution", 6151], ["aten::threshold_backward", 6090], ["aten::cudnn_convolution_backward", 6081], ["aten::permute", 5515], ["aten::contiguous", 5510], ["torch::autograd::AccumulateGrad", 5457], ["aten::clamp_min_", 5354], ["CudnnBatchNormBackward", 4148], ["aten::unsqueeze", 3574], ["CudnnConvolutionBackward", 3217], ["ReluBackward1", 2174], ["aten::zeros", 659], ["aten::stack", 658], ["aten::addmm", 639], ["aten::mm", 575], ["MeanBackward1", 541], ["aten::max_pool2d_with_indices", 477], ["aten::nll_loss_backward", 388], ["aten::nll_loss_forward", 266], ["aten::t", 255], ["aten::mean", 234], ["aten::transpose", 197], ["AddmmBackward", 182], ["aten::max_pool2d_with_indices_backward", 176], ["AddBackward0", 175], ["aten::_log_softmax", 170], ["aten::sum", 153], ["aten::cat", 152], ["aten::expand", 150], ["aten::narrow", 127], ["aten::max_pool2d", 125], ["aten::linear", 124], ["aten::slice", 123], ["aten::cross_entropy_loss", 118], ["aten::adaptive_avg_pool2d", 116], ["aten::detach_", 108], ["aten::_log_softmax_backward_data", 108], ["NllLossBackward", 102], ["detach_", 100], ["aten::ones_like", 95], ["aten::log_softmax", 90], ["aten::flatten", 84], ["aten::nll_loss_nd", 81], ["MaxPool2DWithIndicesBackward", 73], ["aten::nll_loss", 68], ["LogSoftmaxBackward", 53], ["aten::reshape", 29], ["ViewBackward", 25], ["TBackward", 18], ["aten::conj", 12]]}}
{"metadata": {"sort": "device_self_duration", "tooltips": {"tc_eligible": "Whether this operator is eligible to use Tensor Cores.", "tc_self_ratio": "Time of self-kernels with Tensor Cores / Time of self-kernels.", "tc_total_ratio": "Time of kernels with Tensor Cores / Time of kernels."}}, "data": [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 142461, "device_total_duration": 142461, "host_self_duration": 22804, "host_total_duration": 40246, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 130967, "device_total_duration": 130967, "host_self_duration": 29324, "host_total_duration": 41445, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 126619, "device_total_duration": 126619, "host_self_duration": 33121, "host_total_duration": 56998, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61939, "device_total_duration": 61939, "host_self_duration": 12238, "host_total_duration": 30460, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34245, "device_total_duration": 34245, "host_self_duration": 19843, "host_total_duration": 71527, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27298, "device_total_duration": 27298, "host_self_duration": 6090, "host_total_duration": 9482, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24098, "device_total_duration": 24098, "host_self_duration": 51013, "host_total_duration": 78125, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17860, "device_total_duration": 17860, "host_self_duration": 7106, "host_total_duration": 10311, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16038, "device_total_duration": 16038, "host_self_duration": 14329, "host_total_duration": 19289, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::copy_", "calls": 588, "device_self_duration": 11492, "device_total_duration": 11492, "host_self_duration": 40255, "host_total_duration": 52853, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3822, "device_total_duration": 4677, "host_self_duration": 176, "host_total_duration": 411, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2409, "device_total_duration": 2409, "host_self_duration": 20515, "host_total_duration": 29081, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1887, "device_total_duration": 1887, "host_self_duration": 9879, "host_total_duration": 17059, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1420, "device_total_duration": 1420, "host_self_duration": 477, "host_total_duration": 562, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 275, "device_total_duration": 275, "host_self_duration": 575, "host_total_duration": 860, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 212, "device_total_duration": 212, "host_self_duration": 234, "host_total_duration": 445, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 639, "host_total_duration": 1034, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::div", "calls": 198, "device_self_duration": 144, "device_total_duration": 144, "host_self_duration": 20135, "host_total_duration": 35158, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 53, "device_total_duration": 53, "host_self_duration": 108, "host_total_duration": 306, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 44, "device_total_duration": 44, "host_self_duration": 153, "host_total_duration": 327, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 170, "host_total_duration": 433, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 266, "host_total_duration": 346, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 388, "host_total_duration": 614, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::empty", "calls": 4404, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 68147, "host_total_duration": 68147, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1881, "host_self_duration": 10693, "host_total_duration": 27597, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 659, "host_total_duration": 1105, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::view", "calls": 846, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 14027, "host_total_duration": 14027, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 432, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 7600, "host_total_duration": 7600, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::permute", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5515, "host_total_duration": 9775, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 528, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11742, "host_total_duration": 26064, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::contiguous", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 5510, "host_total_duration": 31137, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 402, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 11829, "host_total_duration": 11829, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::to", "calls": 414, "device_self_duration": 0, "device_total_duration": 11492, "host_self_duration": 14834, "host_total_duration": 64332, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::unsqueeze", "calls": 192, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 3574, "host_total_duration": 6603, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1902, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12406, "host_total_duration": 12406, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::slice", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 123, "host_total_duration": 223, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::narrow", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 127, "host_total_duration": 350, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 16282, "host_total_duration": 16933, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::cat", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 17085, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::stack", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 658, "host_total_duration": 24346, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 100, "host_total_duration": 100, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::detach_", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 108, "host_total_duration": 208, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7245, "host_total_duration": 64243, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 6151, "host_total_duration": 70394, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 126619, "host_self_duration": 7387, "host_total_duration": 77781, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 6544, "host_total_duration": 78071, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34245, "host_self_duration": 9906, "host_total_duration": 87977, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 5354, "host_total_duration": 15665, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17860, "host_self_duration": 8516, "host_total_duration": 24181, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1420, "host_self_duration": 125, "host_total_duration": 687, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 212, "host_self_duration": 116, "host_total_duration": 561, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 84, "host_total_duration": 164, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 197, "host_total_duration": 329, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 255, "host_total_duration": 584, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 150, "host_total_duration": 229, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 124, "host_total_duration": 1408, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 90, "host_total_duration": 567, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 68, "host_total_duration": 414, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 81, "host_total_duration": 495, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 118, "host_total_duration": 1180, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 95, "host_total_duration": 452, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 102, "host_total_duration": 716, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 53, "host_self_duration": 53, "host_total_duration": 359, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 12, "host_total_duration": 12, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 275, "host_self_duration": 182, "host_total_duration": 1303, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3030, "host_self_duration": 5457, "host_total_duration": 28494, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 18, "host_total_duration": 103, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 29, "host_total_duration": 55, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 25, "host_total_duration": 80, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 144, "host_self_duration": 541, "host_total_duration": 987, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27298, "host_self_duration": 2174, "host_total_duration": 11656, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 175, "host_total_duration": 175, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61939, "host_self_duration": 4148, "host_total_duration": 34608, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 6081, "host_total_duration": 87772, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 273428, "host_self_duration": 3217, "host_total_duration": 90989, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4677, "host_self_duration": 73, "host_total_duration": 484, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}]}
{"metadata": {"sort": "Total Duration (us)"}, "data": {"columns": [{"type": "string", "name": "Name"}, {"type": "string", "name": "Tensor Cores Used", "tooltip": "Whether this kernel uses Tensor Cores."}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. It shows fine-grained low-level GPU utilization."}], "rows": [["void cudnn::detail::dgrad_engine<float, 512, 6, 5, 3, 3, 3, false>(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", "No", 162, 80756, 498, 1017, 323, 42.25, 29.97], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 7, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 156, 66472, 426, 745, 345, 9.78, 38.0], ["void cudnn::bn_bw_1C11_kernel_new<float, float, float2, 512, true, 1>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", "No", 264, 59642, 226, 915, 45, 4.34, 67.98], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor<float>, at::detail::Array<char*, 3> >(int, at::native::AddFunctor<float>, at::detail::Array<char*, 3>)", "No", 3090, 39814, 13, 378, 1, 641.54, 92.32], ["void implicit_convolve_sgemm<float, float, 1024, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 90, 36957, 411, 748, 347, 12.34, 50.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3>)", "No", 294, 27298, 93, 377, 13, 653.06, 100.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, 512, true, 1>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", "No", 150, 27060, 180, 452, 53, 3.12, 64.06], ["void implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 60, 25782, 430, 729, 352, 3.9, 42.09], ["volta_sgemm_64x64_nt", "No", 102, 21084, 207, 279, 184, 10.24, 19.38], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", "No", 48, 20448, 426, 676, 307, 6.83, 25.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2> >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>)", "No", 294, 17860, 61, 252, 5, 666.65, 100.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", "No", 36, 12704, 353, 362, 344, 22.4, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", "No", 30, 9597, 320, 510, 252, 12.9, 19.0], ["volta_sgemm_128x32_nt", "No", 24, 8629, 360, 477, 18, 0.97, 11.51], ["volta_sgemm_64x64_nn", "No", 42, 8551, 204, 217, 195, 12.34, 24.14], ["volta_scudnn_128x64_relu_interior_nn_v1", "No", 30, 8022, 267, 316, 94, 37.1, 25.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", "No", 12, 7817, 651, 671, 635, 15.96, 19.0], ["void cudnn::bn_fw_tr_1C11_singleread<float, 512, true, 1, 2, 0>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", "No", 168, 7185, 43, 89, 13, 12.57, 75.0], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 5, 5, 3, 3, 3, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 12, 7068, 589, 987, 193, 85.34, 37.5], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)", "No", 120, 5369, 45, 73, 19, 10.0, 50.0], ["void implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 5219, 435, 437, 432, 9.8, 31.0], ["void explicit_convolve_sgemm<float, int, 1024, 5, 5, 3, 3, 3, 0, false>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", "No", 6, 4759, 793, 796, 790, 9.8, 31.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", "No", 120, 4710, 39, 66, 17, 10.11, 50.0], ["volta_scudnn_128x128_stridedB_interior_nn_v1", "No", 18, 4693, 261, 281, 252, 9.8, 25.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", "No", 78, 4692, 60, 126, 20, 15.46, 38.0], ["void cudnn::ops::scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)", "No", 162, 4631, 29, 143, 5, 496.39, 100.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)", "No", 78, 4573, 59, 125, 17, 15.69, 50.0], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 8, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 6, 4065, 678, 692, 652, 6.4, 25.0], ["void implicit_convolve_sgemm<float, float, 512, 6, 8, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 6, 3917, 653, 686, 595, 4.9, 25.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw<float, float>(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", "No", 6, 3822, 637, 638, 636, 1254.4, 100.0], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", "No", 6, 3720, 620, 623, 614, 5.6, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", "No", 6, 3627, 604, 606, 603, 39.2, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", "No", 12, 3501, 292, 296, 286, 19.6, 25.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", "No", 6, 3270, 545, 627, 526, 4.9, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", "No", 12, 3265, 272, 279, 254, 9.8, 25.0], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", "No", 6, 3200, 533, 607, 516, 4.9, 19.0], ["volta_sgemm_32x128_nn", "No", 18, 3053, 170, 171, 168, 22.05, 50.0], ["volta_scudnn_128x128_relu_interior_nn_v1", "No", 6, 3010, 502, 508, 495, 9.8, 25.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", "No", 6, 2995, 499, 505, 493, 19.6, 25.0], ["volta_sgemm_32x128_nt", "No", 18, 2843, 158, 159, 156, 22.05, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)", "No", 120, 2662, 22, 67, 5, 8.68, 73.22], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2> >(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>)", "No", 966, 2409, 2, 25, 1, 43.72, 58.39], ["void cudnn::bn_bw_1C11_singleread<float, 512, true, 1, 2, 0>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", "No", 54, 2297, 43, 73, 18, 20.81, 75.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<float>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<float>, at::detail::Array<char*, 1>)", "No", 978, 1887, 2, 143, 0, 599.07, 86.78], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)", "No", 78, 1504, 19, 69, 5, 8.06, 41.33], ["void at::native::(anonymous namespace)::max_pool_forward_nchw<float, float>(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", "No", 6, 1420, 237, 239, 234, 313.6, 100.0], ["void cudnn::cnn::im2col4d_kernel<float, long>(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", "No", 6, 614, 102, 103, 101, 0.95, 24.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", "No", 6, 584, 97, 100, 93, 9.8, 19.0], ["void nchwToNhwcKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", "No", 12, 453, 38, 68, 9, 73.28, 100.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "No", 138, 342, 2, 4, 1, 0.13, 1.73], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2>)", "No", 318, 322, 1, 2, 1, 0.01, 0.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4>)", "No", 6, 212, 35, 36, 35, 51.2, 100.0], ["volta_sgemm_64x32_sliced1x4_nn", "No", 6, 150, 25, 26, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", "No", 6, 149, 25, 26, 24, 1.0, 13.0], ["void at::native::unrolled_elementwise_kernel<at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "No", 6, 144, 24, 24, 24, 156.8, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)", "No", 36, 134, 4, 5, 2, 0.4, 3.0], ["void nhwcToNchwKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", "No", 6, 105, 18, 18, 17, 22.4, 100.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", "No", 66, 81, 1, 2, 1, 0.15, 1.68], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", "No", 66, 81, 1, 2, 1, 0.02, 0.0], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", "No", 72, 73, 1, 2, 1, 0.02, 0.0], ["void (anonymous namespace)::softmax_warp_backward<float, float, float, 10, true>(float*, float const*, float const*, int, int, int)", "No", 6, 53, 9, 9, 8, 0.1, 1.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "No", 6, 44, 7, 8, 7, 0.03, 0.0], ["void (anonymous namespace)::softmax_warp_forward<float, float, float, 10, true>(float*, float const*, int, int, int)", "No", 6, 42, 7, 7, 7, 0.1, 1.0], ["void splitKreduce_kernel<float, float, float, float>(cublasSplitKParams<float>, float const*, float const*, float*, float const*, float const*, float const*)", "No", 12, 30, 2, 3, 2, 4.44, 28.0], ["void at::native::unrolled_elementwise_kernel<at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast)", "No", 6, 30, 5, 5, 5, 1.56, 5.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)", "No", 6, 18, 3, 3, 3, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)", "No", 6, 12, 2, 2, 2, 0.01, 0.0]]}}
{"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["void cudnn::detail::dgrad_engine<float, 512, 6, 5, 3, 3, 3, false>(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 80756], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 7, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 66472], ["void cudnn::bn_bw_1C11_kernel_new<float, float, float2, 512, true, 1>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59642], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor<float>, at::detail::Array<char*, 3> >(int, at::native::AddFunctor<float>, at::detail::Array<char*, 3>)", 39814], ["void implicit_convolve_sgemm<float, float, 1024, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 36957], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3>)", 27298], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, 512, true, 1>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27060], ["void implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 25782], ["volta_sgemm_64x64_nt", 21084], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20448], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2> >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>)", 17860], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12704], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9597], ["volta_sgemm_128x32_nt", 8629], ["volta_sgemm_64x64_nn", 8551], ["volta_scudnn_128x64_relu_interior_nn_v1", 8022], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7817], ["void cudnn::bn_fw_tr_1C11_singleread<float, 512, true, 1, 2, 0>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7185], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 5, 5, 3, 3, 3, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)", 5369], ["void implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5219], ["void explicit_convolve_sgemm<float, int, 1024, 5, 5, 3, 3, 3, 0, false>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, unsigned long long, int, unsigned long long, int, float, float, int, float const*, float const*)", 4759], ["void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", 4710], ["volta_scudnn_128x128_stridedB_interior_nn_v1", 4693], ["void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", 4692], ["void cudnn::ops::scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)", 4631], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)", 4573], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 8, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4065], ["void implicit_convolve_sgemm<float, float, 512, 6, 8, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 3917], ["void at::native::(anonymous namespace)::max_pool_backward_nchw<float, float>(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3822], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 3720], ["volta_scudnn_128x64_relu_medium_nn_v1", 3627], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3501], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_nhwc_tn_v1", 3270], ["volta_scudnn_128x64_relu_small_nn_v1", 3265], ["volta_scudnn_128x64_relu_xregs_large_nn_v1", 3200], ["volta_sgemm_32x128_nn", 3053], ["volta_scudnn_128x128_relu_interior_nn_v1", 3010], ["volta_scudnn_128x128_stridedB_small_nn_v1", 2995], ["volta_sgemm_32x128_nt", 2843], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)", 2662], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2> >(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>)", 2409], ["void cudnn::bn_bw_1C11_singleread<float, 512, true, 1, 2, 0>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2297], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<float>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<float>, at::detail::Array<char*, 1>)", 1887], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)", 1504], ["void at::native::(anonymous namespace)::max_pool_forward_nchw<float, float>(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1420], ["void cudnn::cnn::im2col4d_kernel<float, long>(cudnn::cnn::im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*)", 614], ["volta_scudnn_128x64_stridedB_small_nn_v1", 584], ["void nchwToNhwcKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", 453], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 342], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2>)", 322], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4>)", 212], ["volta_sgemm_64x32_sliced1x4_nn", 150], ["volta_sgemm_64x32_sliced1x4_tn", 149], ["void at::native::unrolled_elementwise_kernel<at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 144], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)", 134], ["void nhwcToNchwKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", 105], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 81], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 81], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 73], ["void (anonymous namespace)::softmax_warp_backward<float, float, float, 10, true>(float*, float const*, float const*, int, int, int)", 53], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 44], ["void (anonymous namespace)::softmax_warp_forward<float, float, float, 10, true>(float*, float const*, int, int, int)", 42], ["void splitKreduce_kernel<float, float, float, float>(cublasSplitKParams<float>, float const*, float const*, float*, float const*, float const*, float const*)", 30], ["void at::native::unrolled_elementwise_kernel<at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast)", 30], ["void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)", 18], ["void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)", 12]]}}
{"steps": {"columns": [{"type": "string", "name": "Step"}, {"type": "number", "name": "Kernel"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memcpy"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Memset"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Runtime"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "DataLoader"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "CPU Exec"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}, {"type": "number", "name": "Other"}, {"type": "string", "role": "tooltip", "p": {"html": "true"}}], "rows": [["5", 99778, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>Kernel: 99778us</b><br>Percentage: 54.73%</div>", 3606, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>Memcpy: 3606us</b><br>Percentage: 1.98%</div>", 98, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>Memset: 98us</b><br>Percentage: 0.05%</div>", 41028, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>Runtime: 41028us</b><br>Percentage: 22.51%</div>", 4341, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>DataLoader: 4341us</b><br>Percentage: 2.38%</div>", 27460, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>CPU Exec: 27460us</b><br>Percentage: 15.06%</div>", 5995, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 5<br>Total: 182306us<br><b>Other: 5995us</b><br>Percentage: 3.29%</div>"], ["6", 99208, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>Kernel: 99208us</b><br>Percentage: 78.62%</div>", 2948, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>Memcpy: 2948us</b><br>Percentage: 2.34%</div>", 98, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>Memset: 98us</b><br>Percentage: 0.08%</div>", 3406, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>Runtime: 3406us</b><br>Percentage: 2.7%</div>", 0, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>DataLoader: 0us</b><br>Percentage: 0.0%</div>", 16404, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>CPU Exec: 16404us</b><br>Percentage: 13.0%</div>", 4119, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 6<br>Total: 126183us<br><b>Other: 4119us</b><br>Percentage: 3.26%</div>"], ["7", 99114, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>Kernel: 99114us</b><br>Percentage: 77.93%</div>", 2949, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>Memcpy: 2949us</b><br>Percentage: 2.32%</div>", 98, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>Memset: 98us</b><br>Percentage: 0.08%</div>", 3417, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>Runtime: 3417us</b><br>Percentage: 2.69%</div>", 6, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>DataLoader: 6us</b><br>Percentage: 0.0%</div>", 19521, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>CPU Exec: 19521us</b><br>Percentage: 15.35%</div>", 2076, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 7<br>Total: 127181us<br><b>Other: 2076us</b><br>Percentage: 1.63%</div>"], ["8", 99021, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>Kernel: 99021us</b><br>Percentage: 80.45%</div>", 2975, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>Memcpy: 2975us</b><br>Percentage: 2.42%</div>", 97, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>Memset: 97us</b><br>Percentage: 0.08%</div>", 3544, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>Runtime: 3544us</b><br>Percentage: 2.88%</div>", 0, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>DataLoader: 0us</b><br>Percentage: 0.0%</div>", 15464, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>CPU Exec: 15464us</b><br>Percentage: 12.56%</div>", 1978, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 8<br>Total: 123079us<br><b>Other: 1978us</b><br>Percentage: 1.61%</div>"], ["9", 98791, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>Kernel: 98791us</b><br>Percentage: 60.44%</div>", 3596, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>Memcpy: 3596us</b><br>Percentage: 2.2%</div>", 97, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>Memset: 97us</b><br>Percentage: 0.06%</div>", 8275, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>Runtime: 8275us</b><br>Percentage: 5.06%</div>", 1370, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>DataLoader: 1370us</b><br>Percentage: 0.84%</div>", 43905, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>CPU Exec: 43905us</b><br>Percentage: 26.86%</div>", 7427, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 9<br>Total: 163461us<br><b>Other: 7427us</b><br>Percentage: 4.54%</div>"], ["10", 98956, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>Kernel: 98956us</b><br>Percentage: 79.68%</div>", 2885, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>Memcpy: 2885us</b><br>Percentage: 2.32%</div>", 98, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>Memset: 98us</b><br>Percentage: 0.08%</div>", 3714, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>Runtime: 3714us</b><br>Percentage: 2.99%</div>", 1400, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>DataLoader: 1400us</b><br>Percentage: 1.13%</div>", 13235, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>CPU Exec: 13235us</b><br>Percentage: 10.66%</div>", 3910, "<div class=\"visualization-tooltip\" style=\"white-space: nowrap;\">Step 10<br>Total: 124198us<br><b>Other: 3910us</b><br>Percentage: 3.15%</div>"]]}, "performance": [{"name": "Average Step Time", "description": "", "value": 141068, "extra": 100, "children": [{"name": "Kernel", "description": "", "value": 99145, "extra": 70.28}, {"name": "Memcpy", "description": "", "value": 3160, "extra": 2.24}, {"name": "Memset", "description": "", "value": 98, "extra": 0.07}, {"name": "Runtime", "description": "", "value": 10564, "extra": 7.49}, {"name": "DataLoader", "description": "", "value": 1186, "extra": 0.84}, {"name": "CPU Exec", "description": "", "value": 22665, "extra": 16.07}, {"name": "Other", "description": "", "value": 4251, "extra": 3.01}]}], "recommendations": "<ul><li>Kernels with 68% time are launched by Tensor Cores eligible operators. You could enable <a href=\"https://pytorch.org/docs/stable/amp.html\" target=\"_blank\">Automatic Mixed Precision</a> to speedup by using FP16.</li></ul>", "environments": [{"title": "Number of Worker(s)", "value": "1"}, {"title": "Device Type", "value": "GPU"}], "gpu_metrics": {"title": "GPU Summary", "data": [{"title": "GPU 0:", "value": ""}, {"title": "Name", "value": "Tesla V100-DGXS-32GB"}, {"title": "Memory", "value": "31.74 GB"}, {"title": "Compute Capability", "value": "7.0"}, {"title": "GPU Utilization", "value": "70.27 %"}, {"title": "Est. SM Efficiency", "value": "69.22 %"}, {"title": "Est. Achieved Occupancy", "value": "48.91 %"}, {"title": "Kernel Time using Tensor Cores", "value": "0.0 %"}], "tooltip": "The GPU usage metrics:\n\nGPU Utilization:\nGPU busy time / All steps time. The higher, the better. GPU busy time is the time during which there is at least one GPU kernel running on it. All steps time is the total time of all profiler steps(or called as iterations).\n\nEst. SM Efficiency:\nEstimated Stream Multiprocessor Efficiency. The higher, the better. This metric of a kernel, SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%). This overall number is the sum of all kernels' SM_Eff_K weighted by kernel's execution duration, divided by all steps time.\n\nEst. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This overall number is the weighted average of all kernels' OCC_K using kernel's execution duration as weight. It shows fine-grained low-level GPU utilization.\n\nKernel using Tensor Cores:\nTotal GPU Time for Tensor Core kernels / Total GPU Time for all kernels.\n"}}
{"device_total_time": {"title": "Device Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward", 274794], ["CudnnConvolutionBackward", 274794], ["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::_convolution", 128683], ["aten::convolution", 128683], ["aten::conv2d", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["CudnnBatchNormBackward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::_batch_norm_impl_index", 34315], ["aten::batch_norm", 34315], ["aten::threshold_backward", 27280], ["ReluBackward1", 27280], ["aten::add_", 24052], ["aten::to", 18959], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::clamp_min_", 17862], ["aten::relu_", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 4695], ["MaxPool2DWithIndicesBackward", 4695], ["torch::autograd::AccumulateGrad", 3012], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::zero_", 1882], ["aten::max_pool2d_with_indices", 1422], ["aten::max_pool2d", 1422], ["aten::mm", 274], ["AddmmBackward", 274], ["aten::mean", 210], ["aten::adaptive_avg_pool2d", 210], ["aten::addmm", 197], ["aten::linear", 197], ["aten::div", 145], ["MeanBackward1", 145], ["aten::cross_entropy_loss", 60], ["aten::_log_softmax_backward_data", 51], ["LogSoftmaxBackward", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss", 18], ["aten::nll_loss_nd", 18], ["aten::nll_loss_backward", 18], ["NllLossBackward", 18], ["aten::ones_like", 6]]}, "device_self_time": {"title": "Device Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::cudnn_convolution_backward_weight", 141300], ["aten::cudnn_convolution_backward_input", 133494], ["aten::cudnn_convolution", 128683], ["aten::cudnn_batch_norm_backward", 61899], ["aten::cudnn_batch_norm", 34315], ["aten::threshold_backward", 27280], ["aten::add_", 24052], ["aten::copy_", 18959], ["aten::clamp_min", 17862], ["aten::add", 16026], ["aten::max_pool2d_with_indices_backward", 3838], ["aten::mul_", 2395], ["aten::fill_", 1888], ["aten::max_pool2d_with_indices", 1422], ["aten::mm", 274], ["aten::mean", 210], ["aten::addmm", 197], ["aten::div", 145], ["aten::_log_softmax_backward_data", 51], ["aten::sum", 45], ["aten::_log_softmax", 42], ["aten::nll_loss_forward", 18], ["aten::nll_loss_backward", 18]]}, "host_total_time": {"title": "Host Total Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["CudnnConvolutionBackward", 119890], ["aten::cudnn_convolution_backward", 115797], ["aten::batch_norm", 105589], ["aten::add_", 97540], ["aten::_batch_norm_impl_index", 95925], ["aten::conv2d", 91000], ["aten::cudnn_batch_norm", 87823], ["aten::empty", 82024], ["aten::convolution", 81781], ["aten::_convolution", 74086], ["aten::cudnn_convolution", 64167], ["aten::cudnn_convolution_backward_weight", 60712], ["aten::to", 57776], ["aten::copy_", 56915], ["aten::cudnn_convolution_backward_input", 47359], ["CudnnBatchNormBackward", 41825], ["torch::autograd::AccumulateGrad", 37189], ["aten::cudnn_batch_norm_backward", 36641], ["aten::mul_", 35389], ["aten::relu_", 29432], ["aten::zero_", 28309], ["aten::add", 23831], ["aten::clamp_min_", 19059], ["aten::empty_like", 18591], ["aten::fill_", 17657], ["aten::resize_", 15019], ["ReluBackward1", 14944], ["aten::clamp_min", 12503], ["aten::threshold_backward", 12062], ["aten::view", 9046], ["AddmmBackward", 2026], ["aten::linear", 1463], ["aten::mm", 1424], ["aten::zeros", 1319], ["aten::cross_entropy_loss", 1225], ["aten::addmm", 1060], ["NllLossBackward", 889], ["aten::nll_loss_backward", 747], ["aten::t", 725], ["MeanBackward1", 663], ["aten::max_pool2d", 599], ["MaxPool2DWithIndicesBackward", 590], ["aten::adaptive_avg_pool2d", 581], ["aten::log_softmax", 580], ["aten::nll_loss_nd", 507], ["LogSoftmaxBackward", 500], ["aten::max_pool2d_with_indices_backward", 493], ["aten::ones_like", 470], ["aten::div", 469], ["aten::mean", 454], ["aten::empty_strided", 453], ["aten::_log_softmax_backward_data", 424], ["aten::max_pool2d_with_indices", 422], ["aten::_log_softmax", 420], ["aten::nll_loss", 418], ["aten::transpose", 413], ["aten::sum", 411], ["aten::nll_loss_forward", 343], ["aten::detach_", 323], ["aten::as_strided", 244], ["aten::expand", 237], ["aten::set_", 221], ["AddBackward0", 200], ["aten::flatten", 163], ["detach_", 156], ["TBackward", 151], ["ViewBackward", 132], ["aten::reshape", 88], ["aten::conj", 15]]}, "host_self_time": {"title": "Host Self Time (us)", "columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["aten::empty", 82024], ["aten::add_", 62385], ["aten::cudnn_convolution", 35632], ["aten::cudnn_convolution_backward_input", 31902], ["aten::cudnn_convolution_backward_weight", 30672], ["aten::mul_", 24617], ["aten::cudnn_batch_norm", 23800], ["aten::add", 17808], ["aten::cudnn_batch_norm_backward", 15118], ["aten::resize_", 15019], ["aten::zero_", 10815], ["aten::relu_", 10373], ["aten::_convolution", 9919], ["aten::batch_norm", 9664], ["aten::fill_", 9660], ["aten::conv2d", 9219], ["aten::view", 9046], ["aten::clamp_min", 8409], ["aten::empty_like", 8385], ["aten::_batch_norm_impl_index", 8102], ["aten::threshold_backward", 7820], ["aten::cudnn_convolution_backward", 7726], ["aten::convolution", 7695], ["torch::autograd::AccumulateGrad", 7181], ["aten::clamp_min_", 6556], ["CudnnBatchNormBackward", 5184], ["CudnnConvolutionBackward", 4093], ["ReluBackward1", 2882], ["aten::mm", 1032], ["aten::zeros", 877], ["aten::addmm", 652], ["aten::to", 547], ["aten::nll_loss_backward", 463], ["aten::empty_strided", 453], ["aten::div", 343], ["aten::max_pool2d_with_indices", 325], ["aten::t", 312], ["aten::nll_loss_forward", 264], ["aten::transpose", 254], ["aten::as_strided", 244], ["AddmmBackward", 244], ["aten::mean", 233], ["aten::copy_", 230], ["aten::set_", 221], ["aten::max_pool2d_with_indices_backward", 213], ["aten::sum", 201], ["AddBackward0", 200], ["aten::max_pool2d", 177], ["aten::_log_softmax", 168], ["aten::detach_", 167], ["detach_", 156], ["aten::expand", 152], ["NllLossBackward", 142], ["aten::_log_softmax_backward_data", 142], ["aten::linear", 139], ["aten::cross_entropy_loss", 138], ["aten::adaptive_avg_pool2d", 127], ["aten::log_softmax", 106], ["MaxPool2DWithIndicesBackward", 97], ["aten::ones_like", 96], ["MeanBackward1", 95], ["aten::nll_loss_nd", 89], ["aten::flatten", 88], ["LogSoftmaxBackward", 76], ["aten::nll_loss", 75], ["ViewBackward", 44], ["aten::reshape", 43], ["TBackward", 33], ["aten::conj", 15]]}}
{"metadata": {"sort": "device_self_duration", "tooltips": {"tc_eligible": "Whether this operator is eligible to use Tensor Cores.", "tc_self_ratio": "Time of self-kernels with Tensor Cores / Time of self-kernels.", "tc_total_ratio": "Time of kernels with Tensor Cores / Time of kernels."}}, "data": [{"name": "aten::cudnn_convolution_backward_weight", "calls": 318, "device_self_duration": 141300, "device_total_duration": 141300, "host_self_duration": 30672, "host_total_duration": 60712, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward_input", "calls": 312, "device_self_duration": 133494, "device_total_duration": 133494, "host_self_duration": 31902, "host_total_duration": 47359, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution", "calls": 318, "device_self_duration": 128683, "device_total_duration": 128683, "host_self_duration": 35632, "host_total_duration": 64167, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cudnn_batch_norm_backward", "calls": 318, "device_self_duration": 61899, "device_total_duration": 61899, "host_self_duration": 15118, "host_total_duration": 36641, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_batch_norm", "calls": 318, "device_self_duration": 34315, "device_total_duration": 34315, "host_self_duration": 23800, "host_total_duration": 87823, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::threshold_backward", "calls": 294, "device_self_duration": 27280, "device_total_duration": 27280, "host_self_duration": 7820, "host_total_duration": 12062, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::add_", "calls": 2994, "device_self_duration": 24052, "device_total_duration": 24052, "host_self_duration": 62385, "host_total_duration": 97540, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::copy_", "calls": 12, "device_self_duration": 18959, "device_total_duration": 18959, "host_self_duration": 230, "host_total_duration": 56915, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min", "calls": 294, "device_self_duration": 17862, "device_total_duration": 17862, "host_self_duration": 8409, "host_total_duration": 12503, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::add", "calls": 414, "device_self_duration": 16026, "device_total_duration": 16026, "host_self_duration": 17808, "host_total_duration": 23831, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices_backward", "calls": 6, "device_self_duration": 3838, "device_total_duration": 4695, "host_self_duration": 213, "host_total_duration": 493, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mul_", "calls": 966, "device_self_duration": 2395, "device_total_duration": 2395, "host_self_duration": 24617, "host_total_duration": 35389, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::fill_", "calls": 978, "device_self_duration": 1888, "device_total_duration": 1888, "host_self_duration": 9660, "host_total_duration": 17657, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d_with_indices", "calls": 6, "device_self_duration": 1422, "device_total_duration": 1422, "host_self_duration": 325, "host_total_duration": 422, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::mm", "calls": 12, "device_self_duration": 274, "device_total_duration": 274, "host_self_duration": 1032, "host_total_duration": 1424, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::mean", "calls": 6, "device_self_duration": 210, "device_total_duration": 210, "host_self_duration": 233, "host_total_duration": 454, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::addmm", "calls": 6, "device_self_duration": 197, "device_total_duration": 197, "host_self_duration": 652, "host_total_duration": 1060, "tc_eligible": "Yes", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::div", "calls": 6, "device_self_duration": 145, "device_total_duration": 145, "host_self_duration": 343, "host_total_duration": 469, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax_backward_data", "calls": 6, "device_self_duration": 51, "device_total_duration": 51, "host_self_duration": 142, "host_total_duration": 424, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::sum", "calls": 6, "device_self_duration": 45, "device_total_duration": 45, "host_self_duration": 201, "host_total_duration": 411, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::_log_softmax", "calls": 6, "device_self_duration": 42, "device_total_duration": 42, "host_self_duration": 168, "host_total_duration": 420, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_forward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 264, "host_total_duration": 343, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_backward", "calls": 6, "device_self_duration": 18, "device_total_duration": 18, "host_self_duration": 463, "host_total_duration": 747, "tc_eligible": "No", "tc_self_ratio": 0.0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::empty", "calls": 4212, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 82024, "host_total_duration": 82024, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::zero_", "calls": 996, "device_self_duration": 0, "device_total_duration": 1882, "host_self_duration": 10815, "host_total_duration": 28309, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::zeros", "calls": 24, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 877, "host_total_duration": 1319, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::to", "calls": 36, "device_self_duration": 0, "device_total_duration": 18959, "host_self_duration": 547, "host_total_duration": 57776, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 156, "host_total_duration": 156, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::detach_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 167, "host_total_duration": 323, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::set_", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 221, "host_total_duration": 221, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::empty_strided", "calls": 18, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 453, "host_total_duration": 453, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::resize_", "calls": 1896, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15019, "host_total_duration": 15019, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9919, "host_total_duration": 74086, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::convolution", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 7695, "host_total_duration": 81781, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::conv2d", "calls": 318, "device_self_duration": 0, "device_total_duration": 128683, "host_self_duration": 9219, "host_total_duration": 91000, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::empty_like", "calls": 336, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 8385, "host_total_duration": 18591, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::view", "calls": 654, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 9046, "host_total_duration": 9046, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::_batch_norm_impl_index", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 8102, "host_total_duration": 95925, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::batch_norm", "calls": 318, "device_self_duration": 0, "device_total_duration": 34315, "host_self_duration": 9664, "host_total_duration": 105589, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::clamp_min_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 6556, "host_total_duration": 19059, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::relu_", "calls": 294, "device_self_duration": 0, "device_total_duration": 17862, "host_self_duration": 10373, "host_total_duration": 29432, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::max_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 1422, "host_self_duration": 177, "host_total_duration": 599, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::adaptive_avg_pool2d", "calls": 6, "device_self_duration": 0, "device_total_duration": 210, "host_self_duration": 127, "host_total_duration": 581, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::flatten", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 88, "host_total_duration": 163, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::as_strided", "calls": 42, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 244, "host_total_duration": 244, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::transpose", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 254, "host_total_duration": 413, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::t", "calls": 30, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 312, "host_total_duration": 725, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::expand", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 152, "host_total_duration": 237, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": true}, {"name": "aten::linear", "calls": 6, "device_self_duration": 0, "device_total_duration": 197, "host_self_duration": 139, "host_total_duration": 1463, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::log_softmax", "calls": 6, "device_self_duration": 0, "device_total_duration": 42, "host_self_duration": 106, "host_total_duration": 580, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 75, "host_total_duration": 418, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::nll_loss_nd", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 89, "host_total_duration": 507, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::cross_entropy_loss", "calls": 6, "device_self_duration": 0, "device_total_duration": 60, "host_self_duration": 138, "host_total_duration": 1225, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "aten::ones_like", "calls": 6, "device_self_duration": 0, "device_total_duration": 6, "host_self_duration": 96, "host_total_duration": 470, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": true}, {"name": "NllLossBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 18, "host_self_duration": 142, "host_total_duration": 889, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "LogSoftmaxBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 51, "host_self_duration": 76, "host_total_duration": 500, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::conj", "calls": 12, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 15, "host_total_duration": 15, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "AddmmBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 274, "host_self_duration": 244, "host_total_duration": 2026, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "torch::autograd::AccumulateGrad", "calls": 966, "device_self_duration": 0, "device_total_duration": 3012, "host_self_duration": 7181, "host_total_duration": 37189, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "TBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 33, "host_total_duration": 151, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "aten::reshape", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 43, "host_total_duration": 88, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "ViewBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 44, "host_total_duration": 132, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "MeanBackward1", "calls": 6, "device_self_duration": 0, "device_total_duration": 145, "host_self_duration": 95, "host_total_duration": 663, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "ReluBackward1", "calls": 294, "device_self_duration": 0, "device_total_duration": 27280, "host_self_duration": 2882, "host_total_duration": 14944, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "AddBackward0", "calls": 96, "device_self_duration": 0, "device_total_duration": 0, "host_self_duration": 200, "host_total_duration": 200, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0, "has_call_stack": false}, {"name": "CudnnBatchNormBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 61899, "host_self_duration": 5184, "host_total_duration": 41825, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "aten::cudnn_convolution_backward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 7726, "host_total_duration": 115797, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "CudnnConvolutionBackward", "calls": 318, "device_self_duration": 0, "device_total_duration": 274794, "host_self_duration": 4093, "host_total_duration": 119890, "tc_eligible": "Yes", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}, {"name": "MaxPool2DWithIndicesBackward", "calls": 6, "device_self_duration": 0, "device_total_duration": 4695, "host_self_duration": 97, "host_total_duration": 590, "tc_eligible": "No", "tc_self_ratio": 0, "tc_total_ratio": 0.0, "has_call_stack": false}]}
{"metadata": {"sort": "Total Duration (us)"}, "data": {"columns": [{"type": "string", "name": "Name"}, {"type": "string", "name": "Tensor Cores Used", "tooltip": "Whether this kernel uses Tensor Cores."}, {"type": "number", "name": "Calls"}, {"type": "number", "name": "Total Duration (us)"}, {"type": "number", "name": "Mean Duration (us)"}, {"type": "number", "name": "Max Duration (us)"}, {"type": "number", "name": "Min Duration (us)"}, {"type": "number", "name": "Mean Blocks Per SM", "tooltip": "Blocks Per SM = blocks of this kernel / SM number of this GPU.\nIf this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.\n\"Mean Blocks per SM\" is the weighted average of all calls of this kernel, using each call's execution duration as weight."}, {"type": "number", "name": "Mean Est. Achieved Occupancy (%)", "tooltip": "Est. Achieved Occupancy:\nFor most cases such as memory bandwidth bounded kernels, the higher the better. Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is upper limit occupancy of this kernel, limited by multiple factors such as kernel shape, kernel used resource, and the GPU compute capability.\nEst. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel). This \"Mean\" number is the weighted average of all calls' OCC_K of the kernel, using each call's execution duration as weight. It shows fine-grained low-level GPU utilization."}], "rows": [["void cudnn::detail::dgrad_engine<float, 512, 6, 5, 3, 3, 3, false>(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", "No", 180, 86855, 483, 1023, 323, 45.33, 30.04], ["void cudnn::bn_bw_1C11_kernel_new<float, float, float2, 512, true, 1>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", "No", 264, 59568, 226, 923, 45, 4.33, 67.92], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 7, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 90, 43471, 483, 742, 363, 8.18, 38.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor<float>, at::detail::Array<char*, 3> >(int, at::native::AddFunctor<float>, at::detail::Array<char*, 3>)", "No", 3090, 39753, 13, 376, 1, 641.51, 92.35], ["void implicit_convolve_sgemm<float, float, 1024, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 90, 37016, 411, 735, 346, 12.39, 50.0], ["void implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 72, 35106, 488, 822, 350, 3.83, 41.64], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3>)", "No", 294, 27280, 93, 377, 13, 653.26, 100.0], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, 512, true, 1>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", "No", 150, 27084, 181, 454, 53, 3.12, 64.02], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", "No", 72, 25342, 352, 629, 323, 3.21, 25.0], ["volta_sgemm_64x64_nt", "No", 102, 21125, 207, 281, 184, 10.28, 19.38], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", "No", 48, 20473, 427, 681, 309, 6.82, 25.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2> >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>)", "No", 294, 17862, 61, 252, 5, 666.77, 100.0], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", "No", 36, 12761, 354, 365, 344, 22.4, 25.0], ["volta_scudnn_128x64_stridedB_interior_nn_v1", "No", 30, 9559, 319, 508, 255, 12.91, 19.0], ["volta_sgemm_128x32_nt", "No", 24, 8658, 361, 479, 18, 0.97, 11.51], ["volta_sgemm_64x64_nn", "No", 42, 8544, 203, 210, 197, 12.35, 24.14], ["volta_scudnn_128x64_relu_interior_nn_v1", "No", 30, 7976, 266, 316, 92, 37.08, 25.0], ["void implicit_convolve_sgemm<float, float, 512, 6, 8, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 7939, 662, 733, 584, 7.54, 25.0], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", "No", 12, 7819, 652, 670, 634, 15.96, 19.0], ["void cudnn::bn_fw_tr_1C11_singleread<float, 512, true, 1, 2, 0>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", "No", 168, 7231, 43, 89, 11, 12.63, 75.0], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 5, 5, 3, 3, 3, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 12, 7068, 589, 990, 192, 85.38, 37.51], ["void cudnn::ops::scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)", "No", 180, 5901, 33, 142, 5, 525.02, 100.0], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)", "No", 120, 5314, 44, 72, 20, 10.02, 50.0], ["void implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", "No", 12, 5221, 435, 440, 431, 9.8, 31.0], ["void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", "No", 78, 4681, 60, 126, 20, 15.46, 38.0], ["void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", "No", 120, 4648, 39, 67, 17, 10.15, 50.0], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)", "No", 78, 4559, 58, 126, 17, 15.71, 50.0], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 8, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", "No", 6, 4038, 673, 691, 649, 6.4, 25.0], ["void at::native::(anonymous namespace)::max_pool_backward_nchw<float, float>(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", "No", 6, 3838, 640, 643, 637, 1254.4, 100.0], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", "No", 6, 3697, 616, 621, 614, 2.6, 25.0], ["volta_scudnn_128x64_relu_medium_nn_v1", "No", 6, 3647, 608, 620, 602, 39.2, 25.0], ["volta_scudnn_128x128_stridedB_medium_nn_v1", "No", 12, 3550, 296, 309, 286, 19.6, 25.0], ["volta_scudnn_128x64_relu_small_nn_v1", "No", 12, 3273, 273, 286, 258, 9.8, 25.0], ["volta_sgemm_32x128_nn", "No", 18, 3059, 170, 173, 167, 22.05, 50.0], ["volta_scudnn_128x128_stridedB_small_nn_v1", "No", 6, 3034, 506, 520, 491, 19.6, 25.0], ["volta_sgemm_32x128_nt", "No", 18, 2837, 158, 159, 156, 22.05, 50.0], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)", "No", 120, 2632, 22, 67, 4, 8.75, 73.78], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2> >(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>)", "No", 966, 2395, 2, 25, 1, 44.01, 58.56], ["void cudnn::bn_bw_1C11_singleread<float, 512, true, 1, 2, 0>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", "No", 54, 2331, 43, 75, 19, 20.83, 75.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<float>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<float>, at::detail::Array<char*, 1>)", "No", 978, 1888, 2, 143, 0, 600.2, 86.95], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)", "No", 78, 1484, 19, 69, 3, 8.13, 41.71], ["void at::native::(anonymous namespace)::max_pool_forward_nchw<float, float>(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", "No", 6, 1422, 237, 243, 234, 313.6, 100.0], ["volta_scudnn_128x64_stridedB_small_nn_v1", "No", 6, 582, 97, 99, 94, 9.8, 19.0], ["void nchwToNhwcKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", "No", 12, 383, 32, 34, 29, 71.72, 100.0], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2>)", "No", 318, 325, 1, 2, 1, 0.01, 0.0], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", "No", 108, 216, 2, 5, 1, 0.16, 2.0], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4>)", "No", 6, 210, 35, 35, 35, 51.2, 100.0], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", "No", 132, 155, 1, 2, 1, 0.16, 1.83], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", "No", 132, 150, 1, 2, 1, 0.02, 0.0], ["volta_sgemm_64x32_sliced1x4_nn", "No", 6, 149, 25, 25, 24, 2.0, 25.0], ["volta_sgemm_64x32_sliced1x4_tn", "No", 6, 148, 25, 25, 24, 1.0, 13.0], ["void at::native::unrolled_elementwise_kernel<at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", "No", 6, 145, 24, 25, 24, 156.8, 100.0], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)", "No", 36, 126, 4, 5, 2, 0.4, 3.0], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", "No", 54, 57, 1, 2, 1, 0.02, 0.0], ["void nhwcToNchwKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", "No", 6, 54, 9, 10, 8, 12.8, 100.0], ["void (anonymous namespace)::softmax_warp_backward<float, float, float, 10, true>(float*, float const*, float const*, int, int, int)", "No", 6, 51, 8, 9, 8, 0.1, 1.0], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "No", 6, 45, 8, 8, 7, 0.03, 0.0], ["void (anonymous namespace)::softmax_warp_forward<float, float, float, 10, true>(float*, float const*, int, int, int)", "No", 6, 42, 7, 7, 7, 0.1, 1.0], ["void splitKreduce_kernel<float, float, float, float>(cublasSplitKParams<float>, float const*, float const*, float*, float const*, float const*, float const*)", "No", 12, 31, 3, 4, 2, 4.39, 27.74], ["void at::native::unrolled_elementwise_kernel<at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast)", "No", 6, 30, 5, 5, 5, 1.56, 5.0], ["void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)", "No", 6, 18, 3, 3, 3, 0.01, 0.0], ["void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)", "No", 6, 12, 2, 2, 2, 0.01, 0.0]]}}
{"total": {"columns": [{"type": "string", "name": "name"}, {"type": "number", "name": "value"}], "rows": [["void cudnn::detail::dgrad_engine<float, 512, 6, 5, 3, 3, 3, false>(int, int, int, float const*, int, float const*, int, float*, kernel_grad_params, unsigned long long, int, unsigned long long, int, float, int, int, int)", 86855], ["void cudnn::bn_bw_1C11_kernel_new<float, float, float2, 512, true, 1>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float)", 59568], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 7, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 43471], ["void at::native::vectorized_elementwise_kernel<4, at::native::AddFunctor<float>, at::detail::Array<char*, 3> >(int, at::native::AddFunctor<float>, at::detail::Array<char*, 3>)", 39753], ["void implicit_convolve_sgemm<float, float, 1024, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 37016], ["void implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 35106], ["void at::native::vectorized_elementwise_kernel<4, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::threshold_kernel_impl<float>(at::TensorIteratorBase&, float, float)::{lambda(float, float)#1}, at::detail::Array<char*, 3>)", 27280], ["void cudnn::bn_fw_tr_1C11_kernel_NCHW<float, float, 512, true, 1>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float)", 27084], ["volta_scudnn_128x128_stridedB_splitK_medium_nn_v1", 25342], ["volta_sgemm_64x64_nt", 21125], ["volta_scudnn_128x128_stridedB_splitK_small_nn_v1", 20473], ["void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2> >(int, at::native::(anonymous namespace)::clamp_min_scalar_kernel_impl(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>)", 17862], ["volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1", 12761], ["volta_scudnn_128x64_stridedB_interior_nn_v1", 9559], ["volta_sgemm_128x32_nt", 8658], ["volta_sgemm_64x64_nn", 8544], ["volta_scudnn_128x64_relu_interior_nn_v1", 7976], ["void implicit_convolve_sgemm<float, float, 512, 6, 8, 3, 3, 5, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 7939], ["volta_scudnn_128x64_stridedB_splitK_xregs_large_nn_v1", 7819], ["void cudnn::bn_fw_tr_1C11_singleread<float, 512, true, 1, 2, 0>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float const*, float, float, float*, float*, float*, float*, float, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)", 7231], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 5, 5, 3, 3, 3, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 7068], ["void cudnn::ops::scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)", 5901], ["void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)", 5314], ["void implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, false, true, true>(int, int, int, float const*, int, float*, float const*, kernel_conv_params, unsigned long long, int, float, float, int, float const*, float const*, bool, int, int)", 5221], ["void cudnn::winograd_nonfused::winogradWgradData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", 4681], ["void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)", 4648], ["void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)", 4559], ["void cudnn::cnn::wgrad_alg0_engine<float, 128, 6, 8, 3, 3, 5, false, 512>(int, int, int, float const*, int, float*, float const*, kernel_grad_params, unsigned long long, int, float, int, int, int, int)", 4038], ["void at::native::(anonymous namespace)::max_pool_backward_nchw<float, float>(int, float const*, long const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*)", 3838], ["volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nhwc_tn_v1", 3697], ["volta_scudnn_128x64_relu_medium_nn_v1", 3647], ["volta_scudnn_128x128_stridedB_medium_nn_v1", 3550], ["volta_scudnn_128x64_relu_small_nn_v1", 3273], ["volta_sgemm_32x128_nn", 3059], ["volta_scudnn_128x128_stridedB_small_nn_v1", 3034], ["volta_sgemm_32x128_nt", 2837], ["void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)", 2632], ["void at::native::vectorized_elementwise_kernel<4, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2> >(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>)", 2395], ["void cudnn::bn_bw_1C11_singleread<float, 512, true, 1, 2, 0>(float, float, float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float const*, float*, float*, float const*, float const*, float, cudnn::reduced_divisor, int, cudnn::reduced_divisor, cudnn::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool)", 2331], ["void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<float>, at::detail::Array<char*, 1> >(int, at::native::FillFunctor<float>, at::detail::Array<char*, 1>)", 1888], ["void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)", 1484], ["void at::native::(anonymous namespace)::max_pool_forward_nchw<float, float>(int, float const*, int, int, int, int, int, int, int, int, int, int, int, int, int, int, float*, long*)", 1422], ["volta_scudnn_128x64_stridedB_small_nn_v1", 582], ["void nchwToNhwcKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", 383], ["void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2> >(int, at::native::BUnaryFunctor<at::native::AddFunctor<long> >, at::detail::Array<char*, 2>)", 325], ["cask_cudnn::computeOffsetsKernel(cask_cudnn::ComputeOffsetsParams)", 216], ["void at::native::reduce_kernel<512, 1, at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::MeanOps<float, float>, unsigned int, float, 4>)", 210], ["cask_cudnn::computeWgradSplitKOffsetsKernel(cask_cudnn::ComputeSplitKOffsetsParams)", 155], ["cask_cudnn::computeWgradBOffsetsKernel(cask_cudnn::ComputeWgradBOffsetsParams)", 150], ["volta_sgemm_64x32_sliced1x4_nn", 149], ["volta_sgemm_64x32_sliced1x4_tn", 148], ["void at::native::unrolled_elementwise_kernel<at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, at::native::MulScalarFunctor<float, float>, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, OffsetCalculator<1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast)", 145], ["void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)", 126], ["cask_cudnn::computeBOffsetsKernel(cask_cudnn::ComputeBOffsetsParams)", 57], ["void nhwcToNchwKernel<float, float, float, true, false, (cudnnKernelDataType_t)0>(int, int, int, int, float const*, float*, float, float)", 54], ["void (anonymous namespace)::softmax_warp_backward<float, float, float, 10, true>(float*, float const*, float const*, int, int, int)", 51], ["void at::native::reduce_kernel<128, 4, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", 45], ["void (anonymous namespace)::softmax_warp_forward<float, float, float, 10, true>(float*, float const*, int, int, int)", 42], ["void splitKreduce_kernel<float, float, float, float>(cublasSplitKParams<float>, float const*, float const*, float*, float const*, float const*, float const*)", 31], ["void at::native::unrolled_elementwise_kernel<at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast>(int, at::native::copy_device_to_device(at::TensorIterator&, bool)::{lambda()#2}::operator()() const::{lambda()#8}::operator()() const::{lambda(float)#1}, at::detail::Array<char*, 2>, OffsetCalculator<1, unsigned int>, char*, at::native::memory::LoadWithoutCast, at::detail::Array<char*, 2>::StoreWithoutCast)", 30], ["void cunn_ClassNLLCriterion_updateOutput_kernel<float, float>(float*, float*, float*, long*, float*, int, int, int, int, long)", 18], ["void cunn_ClassNLLCriterion_updateGradInput_kernel<float>(float*, float*, long*, float*, float*, int, int, int, int, long)", 12]]}}
