gpu高性能编程cuda实战 pdf_python编程从入门

(26) 2024-10-04 18:01:01

《GPU高性能编程CUDA实战》

Chapter 1~3 催眠曲

select_device();

 cudaDeviceProp prop; int dev; HANDLE_ERROR( cudaGetDevice( &dev ) ); printf( "ID of current CUDA device: %d\n", dev ); memset( &prop, 0, sizeof( cudaDeviceProp ) ); prop.major = 1; prop.minor = 3; HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) ); printf( "ID of CUDA device closest to revision 1.3: %d\n", dev ); HANDLE_ERROR( cudaSetDevice( dev ) ); 

Chapter 4 kernel并行编程

 kernel<<< grids_dim3, threads_dim3 >>>( args ); // <<< grid 维度, block 维度 >>>( 参数列表 )

Chapter 5 线程协作

分治的思想

 __shared__ memory; __syncthreads(); dim3 blocksPerGrid( , );// grid 最大2维 无限制,cuda核心 dim3 threadsPerBlock( , , );// block 最大3维 x*y*z <= 1024 设备有限制的 我的1060super限制为 1024. 

Chapter 6 常量内存

 __constant__ ;// 优化内存的 读取 性能。 

Chapter 7 纹理内存 - 只读 n[i] ,n [i, j]

 texture<float> texIn; cudaBindTexture(NULL, texIn, cuda_param, size); tex1Dfetch( texIn, i ); // 相当于 python 的 texIn[i] 的实现。 cudaUnbindTexture(texIn); texture<float, 2> texIn2D; cudaBindTexture2D(NULL, texIn2D, cuda_param, desc, width, height, pitch); tex2D( texIn2D, i, j ); // 相当于 python 的 texIn[i, j] 的实现。 cudaUnbindTexture(texIn2D); 

Chapter 8 图形互操作性 引用&别名

 GLuint buffer; // for 显示display GL类内部直接使用 static cudaGraphicsResource *resource; // for 动态渲染 static uint4* devPtr; // for write modify // GL端: glGenBuffers(1, &buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, buffer); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, size, NULL, GL_DYNAMIC_DRAW_ARB); // cuda C 端: cudaGraphicsGLRegisterBuffer(&resource, buffer, cudaGraphicsMapFlagsNone); cudaGraphicsMapResources(1, &resource, NULL); cudaGraphicsResourceGetMappedPointer((void**)&devPtr, &size, resource); operate_kernel<<< DIM / 1024, 1024 >>>(devPtr); // 每个 block 最多 1024 线程 your_GL_display(); // ~ cudaGraphicsUnmapResources(1, &resource, NULL); cudaGraphicsUnregisterResource(resource); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glDeleteBuffers(1, &buffer);

Chapter 9 原子性

多线程读写冲突时,等待队列会特别长

/*分治:对一片显存区域的多线程读写,使用共享显存处理每个block。*/ __shared__ unsigned int temp[256]; temp[threadIdx.x] = 0; __syncthreads(); int i = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; while (i < size) { 
    atomicAdd( &temp[buffer[i]], 1 ); i += stride; } __syncthreads(); // 分 atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );// 合 

Chapter 10 流

多上下文编程,JUST LIKE TensorRT::IExecutionContext。

 HANDLE_ERROR( cudaMalloc( (void**)&dev_c0, N * sizeof(int) ) ); HANDLE_ERROR( cudaHostAlloc( (void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault ) ); ...coding... HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b0, N * sizeof(int), cudaMemcpyHostToDevice, stream0 ) ); HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b1, N * sizeof(int), cudaMemcpyHostToDevice, stream1 ) ); ...coding... kernel<<<N/256,256,0,stream0>>>( dev_a0, dev_b0, dev_c0 ); kernel<<<N/256,256,0,stream1>>>( dev_a1, dev_b1, dev_c1 ); HANDLE_ERROR( cudaStreamSynchronize( stream0 ) ); HANDLE_ERROR( cudaStreamSynchronize( stream1 ) ); ~ .......cudaFreeHost cudaFree cudaStreamDestroy ...... 

Chapter 11 多GPU的CUDA C

cudaHostAlloc(); malloc(); cudaMalloc();
使用主机内存条来充当显存使用。

 |............................................................................................................................... float *a, *b; float *dev_a, *dev_b; // allocate memory on the CPU side a = (float*)malloc( size*sizeof(float) ); HANDLE_ERROR( cudaMalloc( (void**)&dev_a, size*sizeof(float) ) ); HANDLE_ERROR( cudaMemcpy( dev_a, a, size*sizeof(float), cudaMemcpyHostToDevice ) ); kernel<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a); HANDLE_ERROR( cudaMemcpy( a, dev_a, size*sizeof(float), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaMemcpy( b, dev_b, size*sizeof(float), cudaMemcpyDeviceToHost ) ); // kernel 之后是把结果拷回主机,方向是 DeviceToHost |............................................................................................................................... // allocate the memory on the CPU HANDLE_ERROR( cudaHostAlloc( (void**)&a, size*sizeof(float), cudaHostAllocWriteCombined | // up the performance of read cudaHostAllocMapped ) ); // 映射主机内存到GPU // find out the GPU pointers HANDLE_ERROR( cudaHostGetDevicePointer( &dev_a, a, 0 ) ); // 之后 获取 GPU 的指针 kernel<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a); HANDLE_ERROR( cudaDeviceSynchronize() ); // cudaThreadSynchronize 已废弃 HANDLE_ERROR( cudaFreeHost( a ) ); 
THE END

发表回复