{"id":4960,"date":"2024-10-04T18:01:01","date_gmt":"2024-10-04T10:01:01","guid":{"rendered":""},"modified":"2024-10-04T18:01:01","modified_gmt":"2024-10-04T10:01:01","slug":"gpu\u9ad8\u6027\u80fd\u7f16\u7a0bcuda\u5b9e\u6218 pdf_python\u7f16\u7a0b\u4ece\u5165\u95e8","status":"publish","type":"post","link":"https:\/\/mushiming.com\/4960.html","title":{"rendered":"gpu\u9ad8\u6027\u80fd\u7f16\u7a0bcuda\u5b9e\u6218 pdf_python\u7f16\u7a0b\u4ece\u5165\u95e8"},"content":{"rendered":"

\n <\/path> \n<\/svg> <\/p>\n

\u300aGPU\u9ad8\u6027\u80fd\u7f16\u7a0bCUDA\u5b9e\u6218\u300b<\/p>\n

\n

Chapter 1\uff5e3 \u50ac\u7720\u66f2<\/h3>\n

select_device();<\/p>\n<\/blockquote>\n

 cudaDeviceProp prop;<\/span> int<\/span> dev;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaGetDevice<\/span>(<\/span> &<\/span>dev )<\/span> )<\/span>;<\/span> printf<\/span>(<\/span> \"ID of current CUDA device: %d\\n\"<\/span>,<\/span> dev )<\/span>;<\/span> memset<\/span>(<\/span> &<\/span>prop,<\/span> 0<\/span>,<\/span> sizeof<\/span>(<\/span> cudaDeviceProp )<\/span> )<\/span>;<\/span> prop.<\/span>major =<\/span> 1<\/span>;<\/span> prop.<\/span>minor =<\/span> 3<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaChooseDevice<\/span>(<\/span> &<\/span>dev,<\/span> &<\/span>prop )<\/span> )<\/span>;<\/span> printf<\/span>(<\/span> \"ID of CUDA device closest to revision 1.3: %d\\n\"<\/span>,<\/span> dev )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaSetDevice<\/span>(<\/span> dev )<\/span> )<\/span>;<\/span> <\/code><\/pre>\n
\n

Chapter 4 kernel\u5e76\u884c\u7f16\u7a0b<\/h3>\n<\/blockquote>\n
 kernel <<<\/span><<\/span> grids\u2014dim3,<\/span>threads-<\/span>dim3 >><\/span>><\/span> \uff08 *<\/span>args \uff09 <\/code><\/pre>\n
\n

Chapter 5 \u7ebf\u7a0b\u534f\u4f5c<\/h3>\n

\u5206\u6cbb\u7684\u601d\u60f3<\/p>\n<\/blockquote>\n

 __shared__ memory;<\/span> __syncthreads<\/span>(<\/span>)<\/span>;<\/span> dim3 blocksPerGrid<\/span>(<\/span> ,<\/span> )<\/span>;<\/span>\/\/ grid \u6700\u59272\u7ef4 \u65e0\u9650\u5236\uff0ccuda\u6838\u5fc3<\/span> dim3 threadsPerBlock<\/span>(<\/span> ,<\/span> ,<\/span> )<\/span>;<\/span>\/\/ block \u6700\u59273\u7ef4 x*y*z <= 1024 \u8bbe\u5907\u6709\u9650\u5236\u7684 \u6211\u76841060super\u9650\u5236\u4e3a 1024.<\/span> <\/code><\/pre>\n
\n

Chapter 6 \u5e38\u91cf\u5185\u5b58<\/h3>\n<\/blockquote>\n
 __constant__ ;<\/span>\/\/ \u4f18\u5316\u5185\u5b58\u7684 \u8bfb\u53d6 \u6027\u80fd\u3002<\/span> <\/code><\/pre>\n
\n

Chapter 7 \u7eb9\u7406\u5185\u5b58 - \u53ea\u8bfb n[i] ,n [i, j]<\/h3>\n<\/blockquote>\n
 texture<<\/span>float<\/span>><\/span> textIn;<\/span> cudaBindTexture<\/span>(<\/span>NULL<\/span>,<\/span> textIn,<\/span> cuda_param,<\/span> int<\/span> size)<\/span>;<\/span> tex1Dfetch<\/span>(<\/span> textIn,<\/span> i )<\/span>;<\/span> python\u7684 textIn[<\/span>i]<\/span> de\u5b9e\u73b0\u3002 cudaUnbindTexture<\/span>(<\/span>textIn)<\/span>;<\/span> texture<<\/span>float<\/span>\uff0c 2<\/span>><\/span> textIn;<\/span> cudaBindTexture2D<\/span>(<\/span>NULL<\/span>,<\/span> textIn,<\/span> cuda_param,<\/span> int<\/span> size)<\/span>;<\/span> tex2D<\/span>(<\/span> textIn,<\/span> i \uff0cj )<\/span>;<\/span> python\u7684 textIn[<\/span>i\uff0cj]<\/span> de\u5b9e\u73b0\u3002 cudaUnbindTexture<\/span>(<\/span>textIn)<\/span>;<\/span> <\/code><\/pre>\n
\n

Chapter 8 \u56fe\u5f62\u4e92\u64cd\u4f5c\u6027 \u5f15\u7528&\u522b\u540d<\/h3>\n<\/blockquote>\n
 GLuint buffer;<\/span> \/\/ for \u663e\u793adisplay GL\u7c7b\u5185\u90e8\u76f4\u63a5\u4f7f\u7528 static<\/span> cudaGraphicsResource *<\/span>source;<\/span> \/\/ for \u52a8\u6001\u6e32\u67d3 static<\/span> uint4*<\/span> devPtr;<\/span> \/\/ for write modify &<\/span> \/\/ GL\u7aef\uff1a<\/span> glGenBuffers<\/span>(<\/span>1<\/span>,<\/span> &<\/span>buffer)<\/span>\uff1b glBindBuffer<\/span>(<\/span>GL_PIXEL_UNPACK_BUFFER_ARB,<\/span> buffer )<\/span>;<\/span> glBufferData<\/span>(<\/span>GL_PIXEL_UNPACK_BUFFER_ARB,<\/span> int<\/span> size,<\/span>buffer,<\/span> NULL<\/span>,<\/span> 0<\/span>)<\/span>;<\/span> \/\/ cuda C \u7aef\uff1a<\/span> cudaGraphicsGLRegisterBuffer<\/span>(<\/span>&<\/span>resource ,<\/span>buffer,<\/span> cudaGraphicsMapFlagsNone)<\/span>;<\/span> cudaGraphicsMapResources<\/span>(<\/span>1<\/span>,<\/span> &<\/span>resource,<\/span> NULL<\/span>)<\/span>;<\/span> cudaGraphicsResourcesMapPointer<\/span>(<\/span>(<\/span>void<\/span>*<\/span>*<\/span>)<\/span>&<\/span>devPtr,<\/span> &<\/span>size,<\/span> resource)<\/span>;<\/span> operater_kernel<<<\/span><<\/span> DIM \/<\/span> 1024<\/span> ,<\/span> 1024<\/span>(<\/span>max)<\/span> >><\/span>><\/span> (<\/span>devPtr)<\/span>;<\/span> your_GL_display<\/span>(<\/span>)<\/span>;<\/span> \/\/ \uff5e <\/span> cudaGraphicsUnMapResources<\/span>(<\/span>1<\/span>,<\/span> &<\/span>resource,<\/span> NULL<\/span>)<\/span>;<\/span> cudaGraphicsGLUnRegisterBuffer<\/span>(<\/span>resource)<\/span>;<\/span> glBindBuffer<\/span>(<\/span>GL_PIXEL_UNPACK_BUFFER_ARB,<\/span> 0<\/span> )<\/span>;<\/span> glDeleteBuffers<\/span>(<\/span>1<\/span>,<\/span> &<\/span>buffer)<\/span>\uff1b <\/code><\/pre>\n
\n

Chapter 9 \u539f\u5b50\u6027<\/h3>\n

\u591a\u7ebf\u7a0b\u8bfb\u5199 \u51b2\u7a81 \u7b49\u5f85\u961f\u5217\u7279\u957f<\/p>\n<\/blockquote>\n

\/*\u5206\u6cbb\uff1a\u5bf9\u4e00\u7247\u663e\u5b58\u533a\u57df\u7684\u591a\u7ebf\u7a0b\u8bfb\u5199\uff0c\u4f7f\u7528\u5171\u4eab\u663e\u5b58\u5904\u7406\u6bcf\u4e2ablock\u3002*\/<\/span> __shared__ unsigned<\/span> int<\/span> temp[<\/span>256<\/span>]<\/span>;<\/span> temp[<\/span>threadIdx.<\/span>x]<\/span> =<\/span> 0<\/span>;<\/span> __syncthreads<\/span>(<\/span>)<\/span>;<\/span> int<\/span> i =<\/span> threadIdx.<\/span>x +<\/span> blockIdx.<\/span>x *<\/span> blockDim.<\/span>x;<\/span> int<\/span> stride =<\/span> blockDim.<\/span>x *<\/span> gridDim.<\/span>x;<\/span> while<\/span> (<\/span>i <<\/span> size)<\/span> { \n   <\/span> atomicAdd<\/span>(<\/span> &<\/span>temp[<\/span>buffer[<\/span>i]<\/span>]<\/span>,<\/span> 1<\/span> )<\/span>;<\/span> i +<\/span>=<\/span> stride;<\/span> }<\/span> __syncthreads<\/span>(<\/span>)<\/span>;<\/span> \/\/ \u5206<\/span> atomicAdd<\/span>(<\/span> &<\/span>(<\/span>histo[<\/span>threadIdx.<\/span>x]<\/span>)<\/span>,<\/span> temp[<\/span>threadIdx.<\/span>x]<\/span> )<\/span>;<\/span>\/\/ \u5408<\/span> <\/code><\/pre>\n
\n

Chapter 10 \u6d41<\/h3>\n

\u591a\u4e0a\u4e0b\u6587\u7f16\u7a0b\uff0cJUST LIKE TensorRT::Contexture\u3002<\/p>\n<\/blockquote>\n

 HANDLE_ERROR<\/span>(<\/span> cudaMalloc<\/span>(<\/span> (<\/span>void<\/span>*<\/span>*<\/span>)<\/span>&<\/span>dev_c1,<\/span> N *<\/span> sizeof<\/span>(<\/span>int<\/span>)<\/span> )<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaHostAlloc<\/span>(<\/span> (<\/span>void<\/span>*<\/span>*<\/span>)<\/span>&<\/span>host_c,<\/span> FULL_DATA_SIZE *<\/span> sizeof<\/span>(<\/span>int<\/span>)<\/span>,<\/span> cudaHostAllocDefault )<\/span> )<\/span>;<\/span> .<\/span>.<\/span>.<\/span>coding.<\/span>.<\/span>.<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMemcpyAsync<\/span>(<\/span> dev_b0,<\/span> host_b0,<\/span> N *<\/span> sizeof<\/span>(<\/span>int<\/span>)<\/span>,<\/span> cudaMemcpyHostToDevice,<\/span> stream0 )<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMemcpyAsync<\/span>(<\/span> dev_b1,<\/span> host_b1,<\/span> N *<\/span> sizeof<\/span>(<\/span>int<\/span>)<\/span>,<\/span> cudaMemcpyHostToDevice,<\/span> stream1 )<\/span> )<\/span>;<\/span> .<\/span>.<\/span>.<\/span>coding.<\/span>.<\/span>.<\/span> kernel<<<\/span><<\/span>N\/<\/span>256<\/span>,<\/span>256<\/span>,<\/span>0<\/span>,<\/span>stream0>><\/span>><\/span>(<\/span> dev_a1,<\/span> dev_b1,<\/span> dev_c1 )<\/span>;<\/span> kernel<<<\/span><<\/span>N\/<\/span>256<\/span>,<\/span>256<\/span>,<\/span>0<\/span>,<\/span>stream1>><\/span>><\/span>(<\/span> dev_a2,<\/span> dev_b2,<\/span> dev_c2 )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaStreamSynchronize<\/span>(<\/span> stream0 )<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaStreamSynchronize<\/span>(<\/span> stream1 )<\/span> )<\/span>;<\/span> ~<\/span> .<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>cudaFreeHost cudaFree cudaStreamDestroy .<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span> <\/code><\/pre>\n
\n

Chapter 11 \u591aGPU\u7684CUDA C<\/h3>\n

cudaHostAlloc(); malloc(); cudaMalloc();
\u4f7f\u7528\u4e3b\u673a\u5185\u5b58\u6761\u6765\u5145\u5f53\u663e\u5b58\u4f7f\u7528\u3002<\/p>\n<\/blockquote>\n

|<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span> float<\/span> *<\/span>a,<\/span> *<\/span>b;<\/span> float<\/span> *<\/span>dev_a,<\/span> *<\/span>dev_b;<\/span> \/\/ allocate memory on the CPU side<\/span> a =<\/span> (<\/span>float<\/span>*<\/span>)<\/span>malloc<\/span>(<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMalloc<\/span>(<\/span> (<\/span>void<\/span>*<\/span>*<\/span>)<\/span>&<\/span>dev_a,<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span> )<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMemcpy<\/span>(<\/span> dev_a,<\/span> a,<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span>,<\/span> cudaMemcpyHostToDevice )<\/span> )<\/span>;<\/span> kernel<<<\/span><<\/span>blocksPerGrid,<\/span>threadsPerBlock>><\/span>><\/span>(<\/span> size,<\/span> 
dev_a)<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMemcpy<\/span>(<\/span> dev_a,<\/span> a,<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span>,<\/span> cudaMemcpyHostToDevice )<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaMemcpy<\/span>(<\/span> dev_b,<\/span> b,<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span>,<\/span> cudaMemcpyHostToDevice )<\/span> )<\/span>;<\/span> |<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span>.<\/span> \/\/ allocate the memory on the CPU<\/span> HANDLE_ERROR<\/span>(<\/span> cudaHostAlloc<\/span>(<\/span> (<\/span>void<\/span>*<\/span>*<\/span>)<\/span>&<\/span>a,<\/span> size*<\/span>sizeof<\/span>(<\/span>float<\/span>)<\/span>,<\/span> cudaHostAllocWriteCombined |<\/span> \/\/ up the performence of read<\/span> cudaHostAllocMapped )<\/span> )<\/span>;<\/span> \/\/ 
\u6620\u5c04\u4e3b\u673a\u5185\u5b58\u5230GPU<\/span> \/\/ find out the GPU pointers<\/span> HANDLE_ERROR<\/span>(<\/span> cudaHostGetDevicePointer<\/span>(<\/span> &<\/span>dev_a,<\/span> a,<\/span> 0<\/span> )<\/span> )<\/span>;<\/span> \/\/ \u4e4b\u540e \u83b7\u53d6 GPU \u7684\u6307\u9488<\/span> kernel<<<\/span><<\/span>blocksPerGrid,<\/span>threadsPerBlock>><\/span>><\/span>(<\/span> size,<\/span> dev_a)<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaThreadSynchronize<\/span>(<\/span>)<\/span> )<\/span>;<\/span> HANDLE_ERROR<\/span>(<\/span> cudaFreeHost<\/span>(<\/span> a )<\/span> )<\/span>;<\/span> <\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"gpu\u9ad8\u6027\u80fd\u7f16\u7a0bcuda\u5b9e\u6218 pdf_python\u7f16\u7a0b\u4ece\u5165\u95e8\u300aGPU\u9ad8\u6027\u80fd\u7f16\u7a0bCUDA\u5b9e\u6218\u300bChapter1\uff5e3\u5439\u7720\u66f2Chapter4kernel\u5e76\u884c\u7f16\u7a0b kernelgrid...","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"_links":{"self":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/4960"}],"collection":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/comments?post=4960"}],"version-history":[{"count":0,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/posts\/4960\/revisions"}],"wp:attachment":[{"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/media?parent=4960"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/categories?post=4960"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mushiming.com\/wp-json\/wp\/v2\/tags?post=4960"}],"curies":[{"name":"wp","href":"http
s:\/\/api.w.org\/{rel}","templated":true}]}}