《GPU高性能编程CUDA实战》
Chapter 1~3 催眠曲
select_device();
cudaDeviceProp prop; int dev; HANDLE_ERROR( cudaGetDevice( &dev ) ); printf( "ID of current CUDA device: %d\n", dev ); memset( &prop, 0, sizeof( cudaDeviceProp ) ); prop.major = 1; prop.minor = 3; HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) ); printf( "ID of CUDA device closest to revision 1.3: %d\n", dev ); HANDLE_ERROR( cudaSetDevice( dev ) );
Chapter 4 kernel并行编程
kernel<<< blocksPerGrid(dim3), threadsPerBlock(dim3) >>>( args... )
Chapter 5 线程协作
分治的思想
__shared__ memory; __syncthreads(); dim3 blocksPerGrid( , );// grid 旧架构最大2维(CC>=2.0 起为3维),cuda核心 dim3 threadsPerBlock( , , );// block 最大3维 x*y*z <= 1024 设备有限制的 我的1060super限制为 1024.
Chapter 6 常量内存
__constant__ ;// 优化内存的 读取 性能。
Chapter 7 纹理内存 - 只读 n[i] ,n [i, j]
texture<float> texIn; cudaBindTexture(NULL, texIn, dev_ptr, size); tex1Dfetch( texIn, i ); // python 的 texIn[i] 的实现。 cudaUnbindTexture(texIn); texture<float, 2> texIn2; cudaBindTexture2D(NULL, texIn2, dev_ptr, desc, width, height, pitch); tex2D( texIn2, i, j ); // python 的 texIn2[i, j] 的实现。 cudaUnbindTexture(texIn2);
Chapter 8 图形互操作性 引用&别名
GLuint buffer; // for 显示display GL类内部直接使用 static cudaGraphicsResource *resource; // for 动态渲染 static uint4* devPtr; // for write modify & // GL端: glGenBuffers(1, &buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, buffer ); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, size, NULL, GL_DYNAMIC_DRAW_ARB); // cuda C 端: cudaGraphicsGLRegisterBuffer(&resource, buffer, cudaGraphicsMapFlagsNone); cudaGraphicsMapResources(1, &resource, NULL); cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource); operater_kernel<<< DIM / 1024 , 1024(max) >>> (devPtr); your_GL_display(); // ~ cudaGraphicsUnmapResources(1, &resource, NULL); cudaGraphicsUnregisterResource(resource); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0 ); glDeleteBuffers(1, &buffer);
Chapter 9 原子性
多线程读写 冲突 等待队列特长
/*分治:对一片显存区域的多线程读写,使用共享显存处理每个block。*/ __shared__ unsigned int temp[256]; temp[threadIdx.x] = 0; __syncthreads(); int i = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; while (i < size) {
atomicAdd( &temp[buffer[i]], 1 ); i += stride; } __syncthreads(); // 分 atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );// 合
Chapter 10 流
多上下文编程,JUST LIKE TensorRT 的 IExecutionContext。
HANDLE_ERROR( cudaMalloc( (void**)&dev_c0, N * sizeof(int) ) ); HANDLE_ERROR( cudaHostAlloc( (void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault ) ); ...coding... HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b0, N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) ); HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b1, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) ); ...coding... kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 ); kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 ); HANDLE_ERROR( cudaStreamSynchronize( stream0 ) ); HANDLE_ERROR( cudaStreamSynchronize( stream1 ) ); ~ .......cudaFreeHost cudaFree cudaStreamDestroy ......
Chapter 11 多GPU的CUDA C
cudaHostAlloc(); malloc(); cudaMalloc();
使用主机内存条来充当显存使用。
|............................................................................................................................... float *a, *b; float *dev_a, *dev_b; // allocate memory on the CPU side a = (float*)malloc( size*sizeof(float) ); HANDLE_ERROR( cudaMalloc( (void**)&dev_a, size*sizeof(float) ) ); HANDLE_ERROR( cudaMemcpy( dev_a, a, size*sizeof(float), cudaMemcpyHostToDevice ) ); kernel<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a); HANDLE_ERROR( cudaMemcpy( a, dev_a, size*sizeof(float), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaMemcpy( b, dev_b, size*sizeof(float), cudaMemcpyDeviceToHost ) ); |............................................................................................................................... // allocate the memory on the CPU HANDLE_ERROR( cudaHostAlloc( (void**)&a, size*sizeof(float), cudaHostAllocWriteCombined | // up the performence of read cudaHostAllocMapped ) ); // 映射主机内存到GPU // find out the GPU pointers HANDLE_ERROR( cudaHostGetDevicePointer( &dev_a, a, 0 ) ); // 之后 获取 GPU 的指针 kernel<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a); HANDLE_ERROR( cudaDeviceSynchronize() ); HANDLE_ERROR( cudaFreeHost( a ) );