cuda - What is the significance of 'sharedMemBytes' argument in kernel call cuLaunchKernel()? -


I am trying to implement a simple matrix multiplication program using shared memory in JCuda.

The following is the JCudaSharedMatrixMul.java code:

import static jcuda.driver.jcudadriver.cuctxcreate; import static jcuda.driver.jcudadriver.cuctxsynchronize; import static jcuda.driver.jcudadriver.cudeviceget; import static jcuda.driver.jcudadriver.cuinit; import static jcuda.driver.jcudadriver.culaunchkernel; import static jcuda.driver.jcudadriver.cumemalloc; import static jcuda.driver.jcudadriver.cumemfree; import static jcuda.driver.jcudadriver.cumemcpydtoh; import static jcuda.driver.jcudadriver.cumemcpyhtod; import static jcuda.driver.jcudadriver.cumodulegetfunction; import static jcuda.driver.jcudadriver.cumoduleload; import static jcuda.runtime.jcuda.cudaeventcreate; import static jcuda.runtime.jcuda.cudaeventrecord; import static jcuda.runtime.jcuda.*;  import java.io.bytearrayoutputstream; import java.io.file; import java.io.ioexception; import java.io.inputstream; import java.util.scanner;  import jcuda.pointer; import jcuda.sizeof; import jcuda.driver.cucontext; import jcuda.driver.cudevice; import jcuda.driver.cudeviceptr; import jcuda.driver.cufunction; import jcuda.driver.cumodule; import jcuda.driver.jcudadriver; import jcuda.runtime.cudaevent_t;    public class jcudasharedmatrixmul {      public static void main(string[] args) throws ioexception      {         // enable exceptions , omit subsequent error checks         jcudadriver.setexceptionsenabled(true);          // create ptx file calling nvcc         string ptxfilename = prepareptxfile("jcudasharedmatrixmulkernel.cu");          //initialize driver , create context first device.         
cuinit(0);         cudevice device = new cudevice();         cudeviceget (device, 0);         cucontext context = new cucontext();         cuctxcreate(context, 0, device);          //load ptx file         cumodule module = new cumodule();         cumoduleload(module,ptxfilename);          //obtain function pointer add function         cufunction function = new cufunction();         cumodulegetfunction(function, module, "jcudasharedmatrixmulkernel");          int numrows = 16;         int numcols = 16;          //allocate , fill host input matrices:         float hostmatrixa[] = new float[numrows*numcols];         float hostmatrixb[] = new float[numrows*numcols];         float hostmatrixc[] = new float[numrows*numcols];           for(int = 0; i<numrows; i++)          {             for(int j = 0; j<numcols; j++)             {                 hostmatrixa[i*numcols+j] = (float) 1;                 hostmatrixb[i*numcols+j] = (float) 1;             }         }         // allocate device input data, , copy         // host input data device         cudeviceptr devmatrixa = new cudeviceptr();         cumemalloc(devmatrixa, numrows * numcols * sizeof.float);          //this part gives me error         cumemcpyhtod(devmatrixa, pointer.to(hostmatrixa), numrows * numcols * sizeof.float);          cudeviceptr devmatrixb = new cudeviceptr();         cumemalloc(devmatrixb, numrows * numcols * sizeof.float);          //this part gives me error         cumemcpyhtod(devmatrixb, pointer.to(hostmatrixb ), numrows * numcols * sizeof.float);          //allocate device matrix c store output         cudeviceptr devmatrixc = new cudeviceptr();         cumemalloc(devmatrixc, numrows * numcols * sizeof.float);          // set kernel parameters: pointer array         // of pointers point actual values.          
pointer kernelparameters = pointer.to(                                 pointer.to(new int[]{numcols}),                                  pointer.to(devmatrixa),                                 pointer.to(devmatrixb),                                 pointer.to(devmatrixc));          //kernel thread configuration         int blocksize = 16;         int gridsize = 1;          cudaevent_t start = new cudaevent_t();         cudaevent_t stop = new cudaevent_t();         cudaeventcreate(start);         cudaeventcreate(stop);         long start_nano=system.nanotime();         cudaeventrecord(start, null);          culaunchkernel(function,                         gridsize, 1, 1,                        blocksize, 16, 1,                        250, null, kernelparameters, null);          cuctxsynchronize();         cudaeventrecord(stop, null);         long end_nano=system.nanotime();         float elapsedtimemsarray[] = { float.nan };         cudaeventelapsedtime(elapsedtimemsarray, start, stop);         float elapsedtimems = elapsedtimemsarray[0];         system.out.println("time required (using cudaevent elapsed time) = " + " " +elapsedtimems+                 "time required (using nanotime)= "+(end_nano-start_nano)/1000000);         // allocate host output memory , copy device output         // host.          
//this part gives me error         cumemcpydtoh(pointer.to(hostmatrixc), devmatrixc, numrows * numcols * sizeof.float);            //verify result         (int =0; i<numrows; i++)         {             (int j =0; j<numrows; j++)             {                 system.out.print("   "+ hostmatrixc[i*numcols+j]);             }             system.out.println("");         }          cumemfree(devmatrixa);         cumemfree(devmatrixb);         cumemfree(devmatrixc);     }      private static string prepareptxfile(string cufilename) throws ioexception     {         int endindex = cufilename.lastindexof('.');         if (endindex == -1)             endindex = cufilename.length()-1;         {         }         string ptxfilename = cufilename.substring(0, endindex+1)+"ptx";         file ptxfile = new file(ptxfilename);         if (ptxfile.exists())         {             return ptxfilename;         }          file cufile = new file(cufilename);         if (!cufile.exists())         {             throw new ioexception("input file not found: "+cufilename);         }         string modelstring = "-m"+system.getproperty("sun.arch.data.model");         string command = "nvcc " + modelstring + " -ptx "+ cufile.getpath()+" -o "+ptxfilename;          system.out.println("executing\n"+command);         process process = runtime.getruntime().exec(command);          string errormessage = new string(tobytearray(process.geterrorstream()));         string outputmessage = new string(tobytearray(process.getinputstream()));         int exitvalue = 0;         try         {             exitvalue = process.waitfor();         }         catch (interruptedexception e)         {             thread.currentthread().interrupt();             throw new ioexception(                 "interrupted while waiting nvcc output", e);         }          if (exitvalue != 0)         {             system.out.println("nvcc process exitvalue "+exitvalue);             system.out.println("errormessage:\n"+errormessage);    
         system.out.println("outputmessage:\n"+outputmessage);             throw new ioexception(                 "could not create .ptx file: "+errormessage);         }          system.out.println("finished creating ptx file");         return ptxfilename;      }      private static byte[] tobytearray(inputstream inputstream) throws ioexception     {         bytearrayoutputstream baos = new bytearrayoutputstream();         byte buffer[] = new byte[8192];         while (true)         {             int read = inputstream.read(buffer);             if (read == -1)             {                 break;             }             baos.write(buffer, 0, read);         }         return baos.tobytearray();     }   } 

The following is the JCudaSharedMatrixMulKernel.cu code:

extern "c" __global__ void jcudasharedmatrixmulkernel(int n,float *ad,float *bd,float *cd) {     float pvalue=0;     int tile=blockdim.x;     int ty=threadidx.y;     int tx=threadidx.x;      __shared__ float ads[4][4];     __shared__ float bds[4][4];      int row = blockidx.y * blockdim.y + threadidx.y;     int col = blockidx.x * blockdim.x + threadidx.x;      for(int i=0;i< n/tile;++i)     {         ads[ty][tx] = ad[row * n + (i * tile) + tx];         bds[ty][tx] = bd[(i * tile + ty) * n + col];          __syncthreads();          for(int k=0;k<tile;k++)                 pvalue += ads[ty][k] * bds[k][tx];          __syncthreads();       }      cd[row * n + col] = pvalue; } 

In the above example, the total shared memory used per block is 2*4*4*4 = 128 bytes. In cuLaunchKernel, when I define the sharedMemBytes parameter as 0 (zero), it gives me the following error:

**Exception in thread "main" jcuda.CudaException: CUDA_ERROR_LAUNCH_FAILED
    at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:282)
    at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1795)
    at JCudaSharedMatrixMul.main(JCudaSharedMatrixMul.java:121)**

When I define it as 128, it gives the same error as above. But when I make it 129, it gives me the correct output! In fact, any value between 129 and 49024 gives the correct result. My question is: why am I not able to get correct output when defining it as 128? What is the maximum shared memory that can be defined? And why does the 129–49024 range work here?

you're launching blocks of 16x16 threads:

    cuLaunchKernel(function,
                   gridSize, 1, 1,
                   blockSize, 16, 1,  <-- first 2 params are block.x and block.y
                   250, null, kernelParameters, null);

So `__shared__ float ads[4][4];` should not be working at all. For example, these lines of kernel code access those shared arrays out-of-bounds for most threads:

    ads[ty][tx] = ad[row * n + (i * tile) + tx];
    bds[ty][tx] = bd[(i * tile + ty) * n + col];
         ^   ^
         |   tx goes from 0..15 for a 16x16 threadblock
         ty goes from 0..15 for a 16x16 threadblock

Your code is broken in this respect. If you run your code with cuda-memcheck, it may catch these out-of-bounds accesses, even in your "passing" case. Looking at the matrixMulDrv CUDA sample code would be instructive: you'll see that its shared memory allocation is 2*block_size*block_size, as it should be in your case as well — your shared memory definitions should be [16][16], not [4][4]. It may be that the shared memory allocation granularity just happens to make your code work once you exceed 128 bytes, but there is a defect in your code.

your shared definitions should be:

__shared__ float ads[16][16]; __shared__ float bds[16][16]; 

Since the above allocations are static allocations, and the sharedMemBytes parameter is defined as the *dynamic* shared memory allocation, for this example you don't need to allocate any (0 is fine) dynamic shared memory, and it still works. The difference between static and dynamic shared memory allocation is covered here.

The maximum shared memory per block is available in the documentation, or by running the CUDA deviceQuery sample code. It is 48K bytes for cc 2.0 and newer devices.


Comments

Popular posts from this blog

javascript - Jquery show_hide, what to add in order to make the page scroll to the bottom of the hidden field once button is clicked -

javascript - Highcharts multi-color line -

javascript - Enter key does not work in search box -