The art of doing more with less.
blockDim
dim3 block_dim_1(128);      // for 1-D launches
dim3 block_dim_2(16, 16);   // for 2-D launches
dim3 block_dim_3(8, 8, 8);  // for 3-D launches
dim3 block_dim_v100(1024);  // saturates a V100; many older GPUs only support up to 768 threads per block
cudaOccupancyMaxPotentialBlockSize
从 CUDA 6.5 开始,提供了一个很有用的函数 cudaOccupancyMaxPotentialBlockSize,该函数定义在 <cuda_runtime.h>,接口及含义见代码中的注释。
<cuda_runtime.h>
// Verbatim from <cuda_runtime.h> (available since CUDA 6.5): suggests a
// grid/block size pair that maximizes occupancy for kernel `func`.
template <class T>
cudaError_t __inline__ __host__ CUDART_DEVICE cudaOccupancyMaxPotentialBlockSize(
    int *minGridSize,           // Suggested min grid size to achieve a full machine launch.
    int *blockSize,             // Suggested block size to achieve maximum occupancy.
    T func,                     // Kernel function.
    size_t dynamicSMemSize = 0, // Size of dynamically allocated shared memory. Of course, it is known at runtime before any kernel launch. The size of the statically allocated shared memory is not needed as it is inferred by the properties of func.
    int blockSizeLimit = 0)     // blockSizeLimit = Maximum size for each block. In the case of 1D kernels, it can coincide with the number of input elements.
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMem(
        minGridSize, blockSize, func,
        __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
}
gridDim
cudaOccupancyMaxActiveBlocksPerMultiprocessor
// Naive split-complex copy: exactly one element per thread, no bounds check.
// PRECONDITION: the launch must satisfy gridDim.x * blockDim.x == element
// count, otherwise threads past the tail read/write out of bounds.
// (The article's later ifZcopy/simpleZcopy variants remove this restriction.)
void __global__ primitiveZcopy(
    const double *real_in, const double *imag_in,
    double *real_out, double *imag_out)
{
    // Flat 1-D global index of this thread.
    const size_t i = blockDim.x * blockIdx.x + threadIdx.x;
    real_out[i] = real_in[i];
    imag_out[i] = imag_in[i];
}
blockDim.x * gridDim.x
19260817
// Copy n complex values stored as split real/imaginary arrays.
// One element per thread; surplus threads past the tail simply exit,
// so the grid may be rounded up to any multiple of the block size.
void __global__ ifZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    const size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n)
        return; // guard clause: this thread has no element to copy
    real_out[idx] = real_in[idx];
    imag_out[idx] = imag_in[idx];
}
if
// Grid-stride split-complex copy: correct for ANY launch configuration,
// since each thread walks the array in steps of the total thread count.
void __global__ simpleZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    const size_t first = blockDim.x * blockIdx.x + threadIdx.x;
    const size_t stride = blockDim.x * gridDim.x; // total threads in the grid
    for (size_t i = first; i < n; i += stride)
    {
        real_out[i] = real_in[i];
        imag_out[i] = imag_in[i];
    }
}
for
#pragma omp parallel for
// Grid-stride split-complex copy with a fixed unroll factor of 32.
// #pragma unroll(32) is a compiler hint: the generated code remains
// correct even when the per-thread trip count is not a multiple of 32,
// but it is most profitable when it is (see the dispatch snippet below).
void __global__ simpleZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
#pragma unroll(32)
    for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        real_out[i] = real_in[i];
        imag_out[i] = imag_in[i];
    }
}
#pragma unroll(32)
template
// Grid-stride split-complex copy whose unroll factor is a compile-time
// template parameter, so the host can pick a specialization per launch
// (nvcc accepts an integral constant expression in #pragma unroll).
template<size_t UNROLL_SIZE>
void __global__ simpleZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    const size_t step = blockDim.x * gridDim.x; // total threads in the grid
#pragma unroll(UNROLL_SIZE)
    for (size_t pos = blockDim.x * blockIdx.x + threadIdx.x; pos < n; pos += step)
    {
        real_out[pos] = real_in[pos];
        imag_out[pos] = imag_in[pos];
    }
}
#define
// Pick the occupancy-optimal launch configuration for the unrolled kernel,
// then dispatch: when the element count splits evenly over all threads AND
// each thread's trip count is a multiple of 32, the fully unrolled
// specialization has no remainder iterations; otherwise fall back to the
// unroll-factor-1 specialization.
// NOTE(review): CUDA API return codes and cudaGetLastError() after the
// launch are not checked here — fine for an article snippet, not for
// production code. `n`, `real_in`, etc. are assumed declared by the
// surrounding (out-of-view) host code.
int numThreads, minGridSize, blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, simpleZcopy<32>);
numThreads = minGridSize * blockSize;
if (n % numThreads == 0 && n / numThreads % 32 == 0)
    simpleZcopy<32><<<minGridSize, blockSize>>>(n, real_in, imag_in, real_out, imag_out);
else
    simpleZcopy<1><<<minGridSize, blockSize>>>(n, real_in, imag_in, real_out, imag_out);
thrust
thrust::copy
cublas_v2
cublasDcopy
cudaMemcpy
// Tiled matrix multiply: C = A * B, where
//   Ac : m x n, COLUMN-major (element A(r,k) at Ac[k * m + r])
//   B  : n x p, row-major     (element B(k,c) at B[k * p + c])
//   C  : m x p, row-major
// Launch with blockDim = (BLOCK_SIZE, BLOCK_SIZE); grid must cover
// ceil(p / BLOCK_SIZE) x ceil(m / BLOCK_SIZE). Each block stages one
// BLOCK_SIZE x BLOCK_SIZE tile of each operand in shared memory.
template <size_t BLOCK_SIZE>
void __global__ simpleMatMatMul(
    const float *Ac, const float *B, float *C,
    const size_t m, const size_t n, const size_t p)
{
    const size_t r = blockIdx.y * blockDim.y + threadIdx.y,
                 c = blockIdx.x * blockDim.x + threadIdx.x;
    float res = 0;
    for (size_t t = 0; t < n; t += BLOCK_SIZE)
    {
        // sAc[y][k] holds A(row_base + y, t + k); sB[x][k] holds B(t + k, col_base + x).
        // Out-of-range tile slots are zero-filled so partial edge tiles stay correct.
        float __shared__ sAc[BLOCK_SIZE][BLOCK_SIZE],
                         sB[BLOCK_SIZE][BLOCK_SIZE];
        __syncthreads(); // previous iteration's reads must finish before tiles are overwritten
        sAc[threadIdx.y][threadIdx.x] =
            r < m && t + threadIdx.x < n ? Ac[(t + threadIdx.x) * m + r] : 0;
        sB[threadIdx.x][threadIdx.y] =
            c < p && t + threadIdx.y < n ? B[(t + threadIdx.y) * p + c] : 0;
        __syncthreads(); // tiles fully staged before they are consumed
        // BUG FIX: the original accumulated sAc[i][threadIdx.y] * sB[i][threadIdx.x],
        // pairing mismatched k-indices (A's k stuck at t+threadIdx.y, B's at
        // t+threadIdx.x) — not a matrix product. The correct inner product walks
        // the shared k dimension of both staged tiles. The loop bound is the
        // compile-time BLOCK_SIZE (the stores already require blockDim.x ==
        // blockDim.y == BLOCK_SIZE), which also lets the compiler unroll it.
        for (size_t i = 0; i < BLOCK_SIZE; ++i)
            res += sAc[threadIdx.y][i] * sB[threadIdx.x][i];
    }
    if (r < m && c < p)
        C[r * p + c] = res; // bounds-checked write for edge tiles
}
// Tiled matrix multiply with shared-memory PADDING: identical contract to
// simpleMatMatMul (Ac column-major m x n, B row-major n x p, C row-major
// m x p; blockDim = BLOCK_SIZE x BLOCK_SIZE), but each tile's inner
// dimension is padded to BLOCK_SIZE | 1 (odd for even BLOCK_SIZE) so the
// column-style reads below hit distinct shared-memory banks instead of
// serializing on bank conflicts.
template <size_t BLOCK_SIZE>
void __global__ naiveMatMatMul(
    const float *Ac, const float *B, float *C,
    const size_t m, const size_t n, const size_t p)
{
    const size_t r = blockIdx.y * blockDim.y + threadIdx.y,
                 c = blockIdx.x * blockDim.x + threadIdx.x;
    float res = 0;
    for (size_t t = 0; t < n; t += BLOCK_SIZE)
    {
        // sAc[y][k] holds A(row_base + y, t + k); sB[x][k] holds B(t + k, col_base + x).
        // The | 1 padding makes the row stride odd, eliminating bank conflicts.
        float __shared__ sAc[BLOCK_SIZE][BLOCK_SIZE | 1],
                         sB[BLOCK_SIZE][BLOCK_SIZE | 1];
        __syncthreads(); // previous iteration's reads must finish before tiles are overwritten
        sAc[threadIdx.y][threadIdx.x] =
            r < m && t + threadIdx.x < n ? Ac[(t + threadIdx.x) * m + r] : 0;
        sB[threadIdx.x][threadIdx.y] =
            c < p && t + threadIdx.y < n ? B[(t + threadIdx.y) * p + c] : 0;
        __syncthreads(); // tiles fully staged before they are consumed
        // BUG FIX: as in simpleMatMatMul, the original paired mismatched
        // k-indices (sAc[i][threadIdx.y] * sB[i][threadIdx.x]); the correct
        // inner product walks the shared k dimension of both tiles, with the
        // compile-time BLOCK_SIZE bound enabling full unrolling.
        for (size_t i = 0; i < BLOCK_SIZE; ++i)
            res += sAc[threadIdx.y][i] * sB[threadIdx.x][i];
    }
    if (r < m && c < p)
        C[r * p + c] = res; // bounds-checked write for edge tiles
}