Visual Studio 2012 - Getting CUB DeviceScan to work when called from a kernel
I am trying to learn how to use CUB to perhaps rewrite my integrator code. I've been looking at the examples and code snippets in the docs, but I have not yet found an example of what I'm trying to do. Specifically, I want to run an InclusiveSum called from a master thread. From what I've seen, all the examples call the function from the host rather than from the device, but the docs hint that it can be done here: http://nvlabs.github.io/cub/structcub_1_1_device_scan.html#a7bcc25e4d9c14a23f71431cf1a6b2bd5
"When calling this method from kernel code, be sure to define the CUB_CDP macro in your compiler's macro definitions."
I've tried adding this in Visual Studio 2012 by going to the project's Properties -> CUDA Linker -> Command Line and adding "-DCUB_CDP". I'm not sure if that is correct, but I get the following build line:
"nvcc.exe" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_amd64" -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -DCUB_CDP -g -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MT " -o "x64\Debug\algorithm test.cu.obj" "C:\Users...\algorithm test.cu"
My test code involves a test kernel run with 1 thread to simulate how my actual code works.
// Minimal reproduction: a single-thread kernel fills an array and then calls
// cub::DeviceScan::InclusiveSum from device code (CUDA dynamic parallelism).
//
// Build requirements visible from the build line this program was posted with:
// relocatable device code (-rdc=true), an sm_35 target, and the CUB_CDP macro
// defined on the compiler command line (-DCUB_CDP).

#define CUB_STDERR
#define gpuerrchk(ans) { gpuassert((ans), __FILE__, __LINE__); }
// Arithmetic macros are parenthesized so they expand safely inside expressions.
#define NUMPOINTS (5*1024)
#define NUMTHREADSPERBLOCK 256
#define NUMBLOCKSPERGRID 32
#define MAXLENGTH (NUMTHREADSPERBLOCK*NUMBLOCKSPERGRID) // use a multiple of 256

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdio>    // fprintf, used by gpuassert
#include <iostream>
#include <fstream>
#include <iomanip>   // std::fixed / std::setprecision
#include <math.h>
#include <arrayfunctions.h>  // project header; spelling kept as posted
#include <ctime>     // for timers
#include <sstream>   // for filename
#include <assert.h>
#include <stdlib.h>
#include <cub/cub.cuh>

// Compile assert() away on device architectures below sm_20.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#undef assert
#define assert(arg)
#endif

__device__ __constant__ int numthreads = NUMTHREADSPERBLOCK; // number of threads per block
__device__ __constant__ int numblocks  = NUMBLOCKSPERGRID;   // number of blocks per grid
__device__ __constant__ int maxlength  = MAXLENGTH;
__device__ double concsort[MAXLENGTH];

// Host-side error check: prints the CUDA error string with file/line and, when
// `abort` is true, terminates the process with the error code as exit status.
// `file` is const because it receives the __FILE__ string literal.
inline void gpuassert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

using namespace std;
using namespace cub;

__global__ void test(double*);

int main(int argc, char** argv)
{
    // Select the device first so the shared-memory configuration below is
    // applied to it, and check both calls instead of ignoring their results.
    gpuerrchk(cudaSetDevice(0));
    gpuerrchk(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

    std::cout << std::fixed;
    std::cout << std::setprecision(16); // fixed notation, 16 digits after the decimal point

    const int maxlength = MAXLENGTH;    // number of discrete concentrations being tracked
    double concs[maxlength] = {};       // initial concentrations, zero-initialized

    std::cout << " ";
    std::cout << "\n";

    double *d_concs;                    // device copy of the concentrations
    size_t size_concs = sizeof(concs);

    gpuerrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuerrchk(cudaMemcpy(d_concs, concs, size_concs, cudaMemcpyHostToDevice));

    // Run the integrator (here: the single-thread test kernel) and time it.
    std::clock_t start;
    double duration;
    start = std::clock();

    test<<<1,1>>>(d_concs);
    std::cout << "\n";
    gpuerrchk( cudaPeekAtLastError() );    // launch-configuration errors
    gpuerrchk( cudaDeviceSynchronize() );  // errors raised while the kernel ran

    duration = (std::clock() - start) / (double) CLOCKS_PER_SEC;
    std::cout << "The calculation took this long: " << duration << '\n';
    std::cout << "\n";

    // Blocking copy; the device was already synchronized above.
    gpuerrchk(cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost));

    // Print the first 33 results, one per line.
    for (int i = 0; i < 33; i++)
    {
        std::cout << "\n";
        std::cout << concs[i];
    }

    gpuerrchk(cudaFree(d_concs));
    cudaDeviceReset(); // clean up device state
    return 0;
}

// Test kernel, launched as <<<1,1>>>: the single thread writes
// concs[i] = i * 1e-11 for every element and then runs an in-place inclusive
// prefix sum over concs via cub::DeviceScan, called from device code.
//
// concs must point to at least `maxlength` doubles of device memory.
// Errors are reported with CubDebug(), which only prints; CubDebugExit() cannot
// be used here because it calls the host-only exit().
__global__ void test(double* concs)
{
    int size = maxlength;

    for (int i = 0; i < size; i++)
        concs[i] = i * .00000000001;

    void   *d_temp_storage = NULL;
    size_t  temp_storage_bytes = 0;

    // First call with a NULL workspace only reports the required size.
    if (CubDebug(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, concs, concs, size)))
        return;

    // Device-side allocation; bail out rather than scanning with a NULL workspace.
    if (CubDebug(cudaMalloc(&d_temp_storage, temp_storage_bytes)))
        return;

    CubDebug(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, concs, concs, size));

    // Wait for the child kernels before releasing their workspace.
    // NOTE(review): device-side cudaDeviceSynchronize() matches the CUDA 6.0
    // toolkit in the build line; newer toolkits no longer allow it — confirm
    // before porting.
    CubDebug(cudaDeviceSynchronize());
    CubDebug(cudaFree(d_temp_storage));
}
I get the following errors which, according to the following post, suggest that my definition of the CUB_CDP macro is the error:
1>c:/users/karsten chu/new google drive/research/visual studio 2012/projects/dynamic parallelism test/dynamic parallelism test/algorithm test.cu(146): error : calling a __host__ function("exit") from a __global__ function("test") is not allowed 1>c:/users/karsten chu/new google drive/research/visual studio 2012/projects/dynamic parallelism test/dynamic parallelism test/algorithm test.cu(148): error : calling a __host__ function("exit") from a __global__ function("test") is not allowed
https://groups.google.com/forum/#!searchin/cub-users/cub_cdp/cub-users/9ltp52ohosg/um9_ruy11e0j
I'd appreciate any help, as I think learning how to use this library could help me start focusing on the physics rather than... anything but the physics.
Remove the CubDebugExit()
wrapper from the CUB calls in your test kernel. Then your code will compile.
Instead of this:
// Does NOT compile inside a kernel: CubDebugExit() calls the host-only exit().
CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concsort, maxlength));
cudaMalloc(&d_temp_storage, temp_storage_bytes);
CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concsort, maxlength));
Do this:
// Same two-phase call without the wrapper: the first call (NULL workspace)
// only reports the required temp_storage_bytes, the second performs the scan.
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concsort, maxlength);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, concs, concsort, maxlength);
The CubDebugExit macro is not usable in device code.
If you prefer, you can also use the CubDebug()
wrapper/macro instead of the CubDebugExit()
wrapper/macro.
Comments
Post a Comment