#include <cooperative_groups.h>
#include <cassert>
#include <cstddef>
#include <random>
#include <vector>

/*
 * Histogram kernel using distributed shared memory (thread block clusters).
 * Requires SM90+ and launch via cudaLaunchKernelEx with a
 * cudaLaunchAttributeClusterDimension attribute.
 *
 * Strategy: the nbins histogram bins are partitioned across the blocks of a
 * cluster (bins_per_block each) and kept in shared memory; once all input has
 * been consumed, each block flushes its slice to global memory.
 *
 * Preconditions:
 *   - dynamic shared memory >= bins_per_block * sizeof(int) per block
 *   - bins[] must be zero-initialized by the caller (kernel only accumulates)
 *   - bins_per_block * cluster_size covers all nbins bins
 */
__global__ void clusterHist_kernel(int *bins, const int nbins,
                                   const int bins_per_block,
                                   const int *__restrict__ input,
                                   size_t array_size) {
  // Dynamically sized shared memory: bins_per_block counters per block.
  extern __shared__ int smem[];
  namespace cg = cooperative_groups;
  size_t tid = cg::this_grid().thread_rank();

  cg::cluster_group cluster = cg::this_cluster();

  // Zero this block's slice of the distributed histogram.
  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x) {
    smem[i] = 0;
  }

  // Cluster sync guarantees every block's shared memory is zeroed before any
  // block starts scattering into it, and that all blocks of the cluster are
  // resident and running concurrently.
  cluster.sync();

  // Grid-stride loop over the input. size_t index/stride avoids the
  // signed/unsigned mismatch and overflow of an int index when
  // array_size > INT_MAX.
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = tid; i < array_size; i += stride) {
    int ldata = input[i];

    // Clamp out-of-range values into the valid bin range [0, nbins).
    int binid = ldata;
    if (ldata < 0) {
      binid = 0;
    } else if (ldata >= nbins) {
      binid = nbins - 1;
    }

    // Locate the cluster block that owns this bin and the offset within it.
    int dst_block_rank = binid / bins_per_block;
    int dst_offset = binid % bins_per_block;

    // Pointer into the owning block's shared memory (distributed shared
    // memory access across the cluster).
    int *dst_smem = cluster.map_shared_rank(smem, dst_block_rank);

    atomicAdd(dst_smem + dst_offset, 1);
  }

  // Required so no block exits (tearing down its shared memory) while other
  // blocks may still be writing into it via map_shared_rank.
  cluster.sync();

  // Flush this block's slice into global memory.
  // BUG FIX: the destination must be offset by this block's rank within the
  // cluster; without the offset every block wrote into bins[0..bins_per_block)
  // and bins beyond bins_per_block were never updated.
  int *lbins = bins + cluster.block_rank() * bins_per_block;
  for (int i = threadIdx.x; i < bins_per_block; i += blockDim.x) {
    atomicAdd(&lbins[i], smem[i]);
  }
}
// ---- Host-side setup and launch of the distributed-shared-memory histogram ----
constexpr int array_size = 64;
constexpr int threads_per_block = 16;
constexpr int nbins = 16;

cudaLaunchConfig_t config = {0};
config.gridDim = array_size / threads_per_block;
config.blockDim = threads_per_block;

// cluster_size depends on the histogram size.
// (cluster_size == 1) implies no distributed shared memory, just thread block
// local shared memory.
int cluster_size = 2;  // size 2 is an example here
int nbins_per_block = nbins / cluster_size;

// Dynamic shared memory size is specified per block; the distributed total is
// cluster_size * nbins_per_block * sizeof(int).
config.dynamicSmemBytes = nbins_per_block * sizeof(int);

// Opt in to the required maximum dynamic shared memory size.
auto e = cudaFuncSetAttribute((void *)clusterHist_kernel,
                              cudaFuncAttributeMaxDynamicSharedMemorySize,
                              config.dynamicSmemBytes);
assert(e == cudaSuccess);

cudaLaunchAttribute attribute[1];
attribute[0].id = cudaLaunchAttributeClusterDimension;
attribute[0].val.clusterDim.x = cluster_size;
attribute[0].val.clusterDim.y = 1;
attribute[0].val.clusterDim.z = 1;
config.numAttrs = 1;
config.attrs = attribute;

int *dev_bins;
e = cudaMalloc(&dev_bins, nbins * sizeof(int));
assert(e == cudaSuccess);
// BUG FIX: cudaMalloc does not zero memory, and the kernel only accumulates
// with atomicAdd — without this memset the histogram starts from garbage.
e = cudaMemset(dev_bins, 0, nbins * sizeof(int));
assert(e == cudaSuccess);

std::vector<int> input;
std::random_device seed_gen;
std::mt19937 engine(seed_gen());
// Deliberately includes out-of-range values (-1 and nbins) to exercise the
// kernel's clamping of out-of-range inputs.
std::uniform_int_distribution<int> dist(-1, nbins);
for (std::size_t i = 0; i < array_size; ++i) {
  input.push_back(dist(engine));
}

int *dev_input;
e = cudaMalloc(&dev_input, input.size() * sizeof(int));
assert(e == cudaSuccess);
e = cudaMemcpy(dev_input, input.data(), input.size() * sizeof(int),
               cudaMemcpyHostToDevice);
assert(e == cudaSuccess);

// Cast array_size explicitly: the kernel parameter is size_t and
// cudaLaunchKernelEx's template packs arguments by the kernel's
// parameter types.
e = cudaLaunchKernelEx(&config, clusterHist_kernel, dev_bins, nbins,
                       nbins_per_block, dev_input,
                       static_cast<size_t>(array_size));
assert(e == cudaSuccess);