2014-04-14 8 views
6

Vielen Dank für das Lesen meiner Themen.cudaDeviceSynchronize() Fehlercode 77: cudaErrorIllegalAddress

Ich mache CUDA arbeiten, aber immer cudaDeviceSynchronize() Fehlercode 77: cudaErrorIllegalAddress, ohne eine Ahnung warum. Ich habe sowohl nach dem Code als auch nach der Funktion gesucht, überraschenderweise sind nur ein paar Datensätze aufgetaucht. Sehr eigenartig.

Ich fasse im Grunde alle Pixel von Bildern zusammen. Um meine Fragen haben so viel Bezug zu machen, wie es kann, ich alle Code meine CUDA bin hier zeigt:

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include "thorcalgpu.h" 
#include <stdio.h> 
#include "math.h" 
#include <vector> 
#include <algorithm> 
#include <stdlib.h> 
#include <stdio.h> 
#include <vector> 
#include <numeric> 
#include <iostream> 

using namespace std; 

float random_float(void) 
{ 
    return static_cast<float>(rand())/RAND_MAX; 
} 


__global__ void reduceSum(unsigned short *input, 
          unsigned long long *per_block_results, 
          const int n) 
{ 
    extern __shared__ unsigned long long sdata[]; 

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 

    // load input into __shared__ memory 
    unsigned short x = 0; 
    if(i < n) 
    { 
     x = input[i]; 
    } 
    sdata[threadIdx.x] = x; 
    __syncthreads(); 

    // contiguous range pattern 
    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) 
    { 
     if(threadIdx.x < offset) 
     { 
      // add a partial sum upstream to our own 
      sdata[threadIdx.x] += sdata[threadIdx.x + offset]; 
     } 

     // wait until all threads in the block have 
     // updated their partial sums 
     __syncthreads(); 
    } 

    // thread 0 writes the final result 
    if(threadIdx.x == 0) 
    { 
     per_block_results[blockIdx.x] = sdata[0]; 
    } 
} 

// Helper function for using CUDA to add vectors in parallel. 
//template <class T> 
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist) 
{ 
    int size = N*N; 
    unsigned long long* dev_sum = 0; 
    unsigned short* dev_img = 0; 
    cudaError_t cudaStatus; 
    const int block_size = 512; 
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0); 
    int L = filelist.size(); 

    // Choose which GPU to run on, change this on a multi-GPU system. 

    double totalgpuinittime = 0; 
    StartCounter(7); 

    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    // Allocate GPU buffers for three vectors (two input, one output) . 
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long)); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    totalgpuinittime = GetCounter(7); 

    unsigned short* img; 
    unsigned short* pimg; 
    unsigned long long* sum = new unsigned long long[num_blocks]; 
    unsigned long long* psum = sum; 

    cout<<endl; 
    cout << "gpu looping starts, and in progress ..." << endl; 
    StartCounter(6); 

    double totalfileiotime = 0; 
    double totalh2dcpytime = 0; 
    double totalkerneltime = 0; 
    double totald2hcpytime = 0; 
    double totalcpusumtime = 0; 
    double totalloopingtime = 0; 

    for (int k = 0; k < L; k++) 
    { 
     StartCounter(1); 
     img = (unsigned short*)LoadTIFF(filelist[k].c_str()); 
     totalfileiotime += GetCounter(1); 

     psum = sum; 
     pimg = img; 

     float gpumean = 0; 

     memset(psum, 0, sizeof(unsigned long long)*num_blocks); 

     StartCounter(2); 
     // Copy input vectors from host memory to GPU buffers. 
     cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice); 
     if (cudaStatus != cudaSuccess) 
     { 
      fprintf(stderr, "cudaMemcpy failed!"); 
      goto Error; 
     } 

     cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice); 
     if (cudaStatus != cudaSuccess) 
     { 
      fprintf(stderr, "cudaMemcpy failed!"); 
      goto Error; 
     } 

     totalh2dcpytime += GetCounter(2); 

     StartCounter(3); 
     //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size); 
     //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size); 
      reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size); 
     totalkerneltime += GetCounter(3); 

     // Check for any errors launching the kernel 
     cudaStatus = cudaGetLastError(); 
     if (cudaStatus != cudaSuccess) 
     { 
      fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
      goto Error; 
     } 

     // cudaDeviceSynchronize waits for the kernel to finish, and returns 
     // any errors encountered during the launch. 

       // !!!!!! following is where the code 77 error occurs!!!!!!! 
     cudaStatus = cudaDeviceSynchronize(); 
     if (cudaStatus != cudaSuccess) 
     { 
      fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); 
      goto Error; 
     } 

     // Copy output vector from GPU buffer to host memory. 
     StartCounter(4); 
     cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost); 
     if (cudaStatus != cudaSuccess) 
     { 
      fprintf(stderr, "cudaMemcpy failed!"); 
      goto Error; 
     } 
     totald2hcpytime += GetCounter(4); 

     StartCounter(5); 
     for (int i = 0; i < num_blocks; i++) 
     { 
      gpumean += *psum; 
      psum++; 
     } 

     gpumean /= N*N; 
     totalcpusumtime += GetCounter(5); 

     delete img; 
     img = NULL; 

     cout<<gpumean<<endl; 

    } 

    int S = 1e+6; 
    int F = filelist.size(); 
    float R = S/F; 

    totalloopingtime = GetCounter(6); 
    cout<<"gpu looping ends."<<endl<<endl; 
    cout<< "analysis:"<<endl; 
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl; 
    cout<<"file I/O time: "<<endl; 
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl; 
    cout<<"host-to-device copy time: "<<endl; 
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl; 
    cout<<"pure gpu kerneling time: "<<endl; 
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl; 
    cout<<"device-to-host copy time: "<<endl; 
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl; 
    /*cout<<"cpu summing time: "<<endl; 
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/ 

    /*cout <<"gpu looping time: " << endl; 
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/ 


Error: 
    cudaFree(dev_sum); 
    cudaFree(dev_img); 

    delete sum; 
    sum = NULL; 

    return cudaStatus; 
} 

void kernel(float* &mean, int N, vector<string> filelist) 
{ 
    // wrapper and kernel 
    cudaError_t cudaStatus = gpuWrapper(mean, N, filelist); 

    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "gpuWapper failed!"); 

    } 

    // printf("mean is: %f\n", mean); 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 

    StartCounter(8); 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) 
    { 
     fprintf(stderr, "cudaDeviceReset failed!"); 

    } 
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl; 
    //return *mean; 
} 

Ich habe zugewiesen genug und äquivalente Speicherplatz sowohl für Host- und Gerätespeicher. Irgendwelche Kommentare werden geschätzt.

Antwort

7

Dies ist möglicherweise nicht die einzige Fehlerquelle im Code, aber Sie weisen keinen dynamischen gemeinsamen Speicher für den Reduktionskernel zu, was zu dem rechtswidrigen Adressierungsfehler führt, den Sie sehen. Der richtige Kernel-Start sollte bei der Reduktion Kernel läuft für jeden Thread etwas wie

size_t shm_size = block_size * sizeof(unsigned long long); 
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size); 

Dies ordnet das Äquivalent einer unsigned long long sein, die (durch meine sehr flüchtiges Lesen des Codes), um die Shared-Memory-Array machen sollte sdata Die richtige Größe für den Kernel, um ohne Zugriff außerhalb des Bereichs auf dieses Array zu laufen.