I am just starting to learn how to use CUDA. I am trying to run some simple example code:
/* Round-trip a small float array: host (ah) -> device (ad) -> device (bd) -> host (bh). */
float *ah, *bh, *ad, *bd;
/* Host-side buffers, 4 floats each. */
ah = (float *)malloc(sizeof(float)*4);
bh = (float *)malloc(sizeof(float)*4);
/* Device-side buffers, 4 floats each. The cudaError_t returned by cudaMalloc is
   discarded here, so a failed allocation would go unnoticed. */
cudaMalloc((void **) &ad, sizeof(float)*4);
cudaMalloc((void **) &bd, sizeof(float)*4);
... initialize ah ...
/* Copy ah to the device, duplicate it device-to-device, then copy the duplicate
   back into bh. The return values of these calls are discarded as well.
   NOTE(review): each copy moves N floats while every buffer above holds 4 floats;
   N is not defined in this snippet -- confirm that N == 4. */
cudaMemcpy(ad,ah,sizeof(float)*N,cudaMemcpyHostToDevice);
cudaMemcpy(bd,ad,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaMemcpy(bh,bd,sizeof(float)*N,cudaMemcpyDeviceToHost);
When I run in emulation mode (nvcc -deviceemu) it runs fine (and actually copies the array). But when I run it in regular mode, it runs without error, but never copies the data. It's as if the cudaMemcpy lines are just ignored.
What am I doing wrong?
Thank you very much, Jason
-
You should check for errors, ideally on each malloc and memcpy, but just doing it once at the end will be sufficient (`cudaGetErrorString(cudaGetLastError())`).
Just to check the obvious:
- You do have a CUDA capable GPU, right? Run the `deviceQuery` SDK sample to check the device is working correctly and all the drivers are installed and working.
- `N` (in the memcpy) is equal to 4 (in the malloc), right?
-
I am having a similar problem. I have CUDA driver 3.0 and CUDA toolkit v3.0 installed, and an NVIDIA G210M. When I run a very simple array-reverse program normally (using nvcc file.cu), it runs but the output comes out wrong (no reversal). However, the very same program, when run under -deviceemu mode, gives the right output. Why is this so?
-
See if you have a CUDA enabled device. Probably you can try running the code below and see what info you get:
#include <cstdio>

// Prints the properties of every CUDA device the runtime can see.
//
// Build with nvcc as a .cu file so the CUDA runtime API declarations are in scope.
//
// Returns 0 on success (including the case where no device is present) and 1 if
// a CUDA runtime call fails; the failure reason is written to stderr.
int main( void )
{
    // Initialised so the value is well defined even if the query below fails.
    int count = 0;

    cudaError_t err = cudaGetDeviceCount( &count );
    if (err != cudaSuccess) {
        fprintf( stderr, "cudaGetDeviceCount failed: %s\n",
                 cudaGetErrorString( err ) );
        return 1;
    }

    if (count == 0) {
        // Say so explicitly instead of exiting silently with no output.
        printf( "No CUDA capable device found\n" );
        return 0;
    }

    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;

        err = cudaGetDeviceProperties( &prop, i );
        if (err != cudaSuccess) {
            fprintf( stderr, "cudaGetDeviceProperties failed for device %d: %s\n",
                     i, cudaGetErrorString( err ) );
            return 1;
        }

        printf( " --- General Information for device %d ---\n", i );
        printf( "Name: %s\n", prop.name );
        printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
        printf( "Clock rate: %d\n", prop.clockRate );
        printf( "Device copy overlap: %s\n",
                prop.deviceOverlap ? "Enabled" : "Disabled" );
        printf( "Kernel execution timeout : %s\n",
                prop.kernelExecTimeoutEnabled ? "Enabled" : "Disabled" );

        // The memory-size fields are size_t, so they are printed with %zu;
        // %ld mismatches the type and truncates where long is 32 bits wide.
        printf( " --- Memory Information for device %d ---\n", i );
        printf( "Total global mem: %zu\n", prop.totalGlobalMem );
        printf( "Total constant Mem: %zu\n", prop.totalConstMem );
        printf( "Max mem pitch: %zu\n", prop.memPitch );
        printf( "Texture Alignment: %zu\n", prop.textureAlignment );

        printf( " --- MP Information for device %d ---\n", i );
        printf( "Multiprocessor count: %d\n", prop.multiProcessorCount );
        // sharedMemPerBlock and regsPerBlock are per-block limits, so the
        // labels say "per block" rather than "per mp".
        printf( "Shared mem per block: %zu\n", prop.sharedMemPerBlock );
        printf( "Registers per block: %d\n", prop.regsPerBlock );
        printf( "Threads in warp: %d\n", prop.warpSize );
        printf( "Max threads per block: %d\n", prop.maxThreadsPerBlock );
        printf( "Max thread dimensions: (%d, %d, %d)\n",
                prop.maxThreadsDim[0], prop.maxThreadsDim[1],
                prop.maxThreadsDim[2] );
        printf( "Max grid dimensions: (%d, %d, %d)\n",
                prop.maxGridSize[0], prop.maxGridSize[1],
                prop.maxGridSize[2] );
        printf( "\n" );
    }

    return 0;
}
0 comments:
Post a Comment