2016-05-25 12 views

(Entschuldigung für mein Englisch) Ich studiere CUDA. Ich habe diesen Code in C CUDA über Hitze 2D, aber wenn ich den Code kompiliere, erhalten Fehler (nicht Syntaxfehler). Dieser Fehler ist "Verstoßsegment". Ich denke, dass es für eine Speicherkapazität ist, aber ich bin mir nicht sicher, und ich weiß nicht, was ich tun soll. Hilf mir bitte.Lösen von Hitze2D in CUDA

#include <stdio.h> 
    #include <stdlib.h> 
    #include <math.h> 
    #include <string.h> 
    #include <sys/time.h> 
    #include <getopt.h> 

    #include <cuda.h> 

    #define MAXSTEP  1000 
    #define CX   0.001 
    #define CY   0.001 

    #define NTHREADS 32 

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey); 
    void initializeArray   (double** X, unsigned long int sizex, unsigned long int sizey); 

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey); 
    void  free2DDoubleArray (double **X, unsigned long int size); 
    void  save (double** X, unsigned long int x, unsigned long int y, char* filename); 
    double gettime(void); 

    * subroutine update - CUDA implementation 

    __global__ void update(unsigned long int NXPROB, unsigned long int NYPROB, double *X1, double *X2) 
     long int i, j; 
     long int CENTRE, NORD, SUD, EST, OEST; 

     i = blockIdx.x * blockDim.x + threadIdx.x; 
     j = blockIdx.y * blockDim.y + threadIdx.y; 

     CENTRE = i + j*NXPROB;  //(i,j) - CENTER 
     NORD = i + (j+1)*NXPROB; //(i,j+1) - N 
     SUD = i + (j-1)*NXPROB;  //(i,j-1) - S 
     EST = (i+1) + j*NXPROB;  //(i+1,j) - E 
     OEST = (i-1) + j*NXPROB; //(i-1,j) - W 

     //ERROR: expression must have pointer-to-object type 
     // --> it's refering to X2 and X1 
     /*if(i>0 && i<NXPROB && j>0 && j<NYPROB) 
      X2[i][j] = X1[i][j] 
        + CX * (X1[i+1][j] + X1[i-1][j] - 2.0 * X1[i][j]) 
        + CY * (X1[i][j+1] + X1[i][j-1] - 2.0 * X1[i][j]); 

     //ALTERNATIVE 1: 
     if(i>0 && i<NXPROB && j>0 && j<NYPROB) 
      X2[CENTRE] = X1[CENTRE] 
        + CX * (X1[EST] + X1[OEST] - 2.0 * X1[CENTRE]) 
        + CY * (X1[NORD] + X1[SUD] - 2.0 * X1[CENTRE]); 

     * Main 

    int main(int argc, char* argv[]) { 
     long int s, it; 
     unsigned int flag, verbose; 
     unsigned int NX, NY, NXPROB, NYPROB; 
     double start, end; 
     int iz; 

     // Defaut values 
     NX = 100; 
     NY = 100; 

     // create file and verbose flags 
     flag = 0; 
     verbose = 0; 

     // Parse command line options 
     int opt; 
     char *file = NULL; 
     while ((opt = getopt(argc, argv, "hvs:f:")) != -1) { 
      switch (opt) { 
      case 'v': 
       verbose = 1; 
      case 's': 
       if(!(s=atoi(optarg))) { 
        fprintf(stderr, "Cannot parse %s value.\n", optarg); 
       NX = NY = s; 
      case 'f': 
       file = optarg; 
       flag = 1; 
      case 'h': 
       fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]); 

     // Set initial data values 
     NXPROB = NX - 1; 
     NYPROB = NY - 1; 

     if(verbose) { 
      fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY); 
      fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP); 
     if(verbose && flag) { 
      fprintf(stdout, "[INFO] Using output file %s\n", file); 

     // Program starts here 
     start = gettime(); 

     // CPU Memory allocation   
     double** X[2]; 
     X[0] = make2DDoubleArray (NX, NY); 
     X[1] = make2DDoubleArray (NX, NY); 

     // Set initial and boundary conditions 
     initializeArray (X[0], NX, NY); 
     setupBoundaryConditions(X[0], NX, NY); 
     setupBoundaryConditions(X[1], NX, NY); 

     // GPU Memory allocation 
     double *d_X1, *d_X2; 
     cudaMalloc((void **)&d_X1, NX*NY*sizeof(double)); 
     cudaMalloc((void **)&d_X2, NX*NY*sizeof(double)); 

     // Copy CPU --> GPU 
     cudaMemcpy(d_X1, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice); 
     cudaMemcpy(d_X2, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice); 

     dim3 dimBlock(NTHREADS,NTHREADS); 
     dim3 dimGrid(1,1); 

     // Main calculations 
     iz = 0; 
     for (it = 0; it < MAXSTEP; it++) 
      if(verbose && (it%(MAXSTEP/10) == 0)) { 
       fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start); 
      // The first update has an error: d_X1 and d_X2 
      //update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1[iz], d_X2[1-iz]); 
      update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1, d_X2); 
      iz = 1 - iz; 

     // Copy GPU --> CPU 
     cudaMemcpy(X, d_X1, NX*NY*sizeof(double), cudaMemcpyDeviceToHost); 
     cudaMemcpy(X, d_X2, NX*NY*sizeof(double), cudaMemcpyDeviceToHost); 


     // Save output file 
     if(flag) save(X[iz], NX, NY, file); 

     free2DDoubleArray(X[0], NX); 
     free2DDoubleArray(X[1], NX); 

     // End time 
     end = gettime(); 

     // Get information: wall clock time, problem size, ... 
      fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP); 
      fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY); 
      fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start)); 
      if(flag) fprintf(stdout, "[INFO] Output file %s\n", file); 
      printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY); 



    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) { 
     /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */ 
     unsigned long int i, j; 
     double leftBC = 0, 
      rightBC = 0, 
      topBC = 0, 
      bottomBC = 0; 

     /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */ 
     for (i = 0; i < x; i++) 
      X[i][0] = bottomBC; //bottom BC 
      X[i][y-1] = topBC; //top BC 

     /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */ 
     for (j = 0; j < y; j++) 
      X[0][j] = leftBC; //left BC 
      X[x-1][j] = rightBC; //right BC 

     /* set the values at the corner nodes as averages of both sides*/ 
     // bottom left 
     X[0][0]  = 0.5 * (leftBC + bottomBC); 
     // top left 
     X[0][y-1] = 0.5 * (topBC + leftBC); 
     // top right 
     X[x-1][y-1] = 0.5 * (topBC + rightBC); 
     // bottom right 
     X[x-1][0] = 0.5 * (bottomBC + rightBC); 

    void initializeArray(double** X, unsigned long int x, unsigned long int y) { 
     unsigned long int i, j; 

     for (i = 1; i < x; i++) 
      for (j = 1; j < y; j++) 
       X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1)); 

     for (i = 0; i < x; i++) 
      X[i][0] = 0; //bottom BC 
      X[i][y-1] = 0; //top BC 
     for (j = 1; j < y; j++) 
      X[0][j] = 0; //left BC 
      X[x-1][j] = 0; //right BC 

    void free2DDoubleArray(double **X, unsigned long int size) { 
     unsigned long int i; 
     for (i = 0; i < size; ++i) { 

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) { 
      unsigned long int ix; 
     double** X; 
     X = (double**) malloc(x*sizeof(double*)); 
     for (ix = 0; ix < x; ix++) { 
      X[ix] =(double*) malloc(y*sizeof(double)); 
     return X; 

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) { 
     unsigned long int i, j; 
     FILE* file; 
     file = fopen(filename,"w"); 

     for (i = 0; i < x; i++) 
      for (j = 0; j < y; j++) 
       fprintf(file,"%8.3f ", X[i][j]); 

    /* Timing function */ 
    double gettime(void) { 
     struct timeval tv; 
     return tv.tv_sec + 1e-6*tv.tv_usec; 

Was ist die Frage, die Sie stellen möchten? Welches Problem stellst du fest? –


Scheint dies ist eine andere "Bitte mach es für mich" -Frage ... Es sei denn, Sie klären mit echter Frage. – Taro


Ich habe die Frage bearbeitet. Kann es jemand erklären? Vielen Dank – RogerRF



Der Fehler Speicherkapazität bestand darin, zwei Matrix/Arrays in das Gerät zu erstellen. Dieser Teil:

// GPU Memory allocation 
double *d_X1, *d_X2; 
cudaMalloc((void **)&d_X1, NX*NY*sizeof(double)); 
cudaMalloc((void **)&d_X2, NX*NY*sizeof(double)); 

Die Lösung:

// GPU Memory allocation 
double *d_X[2]; 
cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double)); 
cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double)); 

komplette Code:

#include <stdio.h> 
    #include <stdlib.h> 
    #include <math.h> 
    #include <string.h> 
    #include <sys/time.h> 
    #include <getopt.h> 

    #include <cuda.h> 

    #define MAXSTEP  1000 
    #define CX   0.001 
    #define CY   0.001 

    #define NTHREADS 32 

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey); 
    void initializeArray   (double** X, unsigned long int sizex, unsigned long int sizey); 

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey); 
    void  save (double** X, unsigned long int x, unsigned long int y, char* filename); 
    double gettime(void); 

    * subroutine update - CUDA implementation 

    __global__ void update(bool shared_memory, unsigned int NX, unsigned long int NXPROB, unsigned long int NYPROB, double *d_X1, double *d_X2) 
      long int i; 
      long int CENTRE, NORD, SUD, EST, OEST; 

      i = blockIdx.x * blockDim.x + threadIdx.x; 

      CENTRE = i;    //(i,j) 
      NORD = i + (NXPROB+1); //(i,j+1) 
      SUD = i + (NXPROB-1); //(i,j-1) 
      EST = i + 1;   //(i+1,j) 
      OEST = i - 1;   //(i-1,j) 

      if(i>NYPROB && i<NXPROB*NYPROB) 
       d_X2[CENTRE] = d_X1[CENTRE] 
         + CX * (d_X1[EST] + d_X1[OEST] - 2.0 * d_X1[CENTRE]) 
         + CY * (d_X1[NORD] + d_X1[SUD] - 2.0 * d_X1[CENTRE]); 
     * If you want execute this part, 
     * then you put true to "shared_memory" in main 
      long int i, j, tx, ty, i2d; 
      __shared__ double temp[NTHREADS][NTHREADS]; 
      double part_x, part_y; 

      tx = threadIdx.x; 
      ty = threadIdx.y; 
      i = blockIdx.x * blockDim.x + tx; 
      j = blockIdx.y * blockDim.y + ty; 

      i2d = i + NXPROB*j; 

      if(i2d < NXPROB*NYPROB) 
       temp[tx][ty] = d_X1[i2d]; 


       if((tx>0) && (tx < NTHREADS-1) && tx < NTHREADS-1){ 
        part_x = (temp[tx+1][ty] - 2*temp[tx][ty] + temp[tx-1][ty]); 
        part_y = (temp[tx][ty+1] - 2*temp[tx][ty] + temp[tx][ty-1]); 
       d_X2[i2d] = d_X1[i2d] + CX*CY*(part_x + part_y); 

     * Main 

    int main(int argc, char* argv[]) { 
     long int s, it; 
     unsigned int flag, verbose; 
     unsigned int NX, NY, NXPROB, NYPROB; 
     double start, end; 
     int iz; 

     dim3 dimBlock, dimGrid; 
     bool shared_memory = false;  //FALSE = no shared memory - TRUE = shared memory 

     // Defaut values 
     NX = 100; 
     NY = 100; 

     // create file and verbose flags 
     flag = 0; 
     verbose = 0; 

     // Parse command line options 
     int opt; 
     char *file = NULL; 
     while ((opt = getopt(argc, argv, "hvs:f:")) != -1) { 
      switch (opt) { 
      case 'v': 
       verbose = 1; 
      case 's': 
       if(!(s=atoi(optarg))) { 
        fprintf(stderr, "Cannot parse %s value.\n", optarg); 
       NX = NY = s; 
      case 'f': 
       file = optarg; 
       flag = 1; 
      case 'h': 
       fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]); 

     // Set initial data values 
     NXPROB = NX - 1; 
     NYPROB = NY - 1; 

     if(verbose) { 
      fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY); 
      fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP); 
     if(verbose && flag) { 
      fprintf(stdout, "[INFO] Using output file %s\n", file); 

     // Program starts here 
     start = gettime(); 

     // CPU Memory allocation   
     double** X[2]; 
     X[0] = make2DDoubleArray (NX, NY); 
     X[1] = make2DDoubleArray (NX, NY); 

     // Set initial and boundary conditions 
     initializeArray (X[0], NX, NY); 
     setupBoundaryConditions(X[0], NX, NY); 
     setupBoundaryConditions(X[1], NX, NY); 

     // GPU Memory allocation 
     double *d_X[2]; 
     cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double)); 
     cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double)); 

     // Copy CPU --> GPU 
     cudaMemcpy(d_X[0], X[0], NX*NY*sizeof(double), cudaMemcpyHostToDevice); 
     cudaMemcpy(d_X[1], X[1], NX*NY*sizeof(double), cudaMemcpyHostToDevice); 

     dimBlock = dim3(NTHREADS,NTHREADS); 
     dimGrid = dim3(ceil(NX/dimBlock.x),ceil(NY/dimBlock.y)); 

     // Main calculations 
     iz = 0; 
     for (it = 0; it < MAXSTEP; it++) 
      if(verbose && (it%(MAXSTEP/10) == 0)) { 
       fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start); 
      update<<<dimGrid, dimBlock>>>(shared_memory, NX, NXPROB, NYPROB, d_X[iz], d_X[1-iz]); 
      iz = 1 - iz; 


     // Copy GPU --> CPU 
     //cudaMemcpy(X[iz], d_X[iz], NX*NY*sizeof(double), cudaMemcpyDeviceToHost); 
     cudaMemcpy(X, d_X, NX*NY*sizeof(double), cudaMemcpyDeviceToHost); 

     // Save output file 
     if(flag) save(X[iz], NX, NY, file); 

     // End time 
     end = gettime(); 

     // Get information: wall clock time, problem size, ... 
      fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP); 
      fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY); 
      fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start)); 
      if(flag) fprintf(stdout, "[INFO] Output file %s\n", file); 
      printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY); 



    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) { 
     /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */ 
     unsigned long int i, j; 
     double leftBC = 0, 
      rightBC = 0, 
      topBC = 0, 
      bottomBC = 0; 

     /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */ 
     for (i = 0; i < x; i++) 
      X[i][0] = bottomBC; //bottom BC 
      X[i][y-1] = topBC; //top BC 

     /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */ 
     for (j = 0; j < y; j++) 
      X[0][j] = leftBC; //left BC 
      X[x-1][j] = rightBC; //right BC 

     /* set the values at the corner nodes as averages of both sides*/ 
     // bottom left 
     X[0][0]  = 0.5 * (leftBC + bottomBC); 
     // top left 
     X[0][y-1] = 0.5 * (topBC + leftBC); 
     // top right 
     X[x-1][y-1] = 0.5 * (topBC + rightBC); 
     // bottom right 
     X[x-1][0] = 0.5 * (bottomBC + rightBC); 

    void initializeArray(double** X, unsigned long int x, unsigned long int y) { 
     unsigned long int i, j; 

     for (i = 1; i < x; i++) 
      for (j = 1; j < y; j++) 
       X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1)); 

     for (i = 0; i < x; i++) 
      X[i][0] = 0; //bottom BC 
      X[i][y-1] = 0; //top BC 
     for (j = 1; j < y; j++) 
      X[0][j] = 0; //left BC 
      X[x-1][j] = 0; //right BC 

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) { 
      unsigned long int ix; 
     double** X; 
     X = (double**) malloc(x*sizeof(double*)); 
     for (ix = 0; ix < x; ix++) { 
      X[ix] =(double*) malloc(y*sizeof(double)); 
     return X; 

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) { 
     unsigned long int i, j; 
     FILE* file; 
     file = fopen(filename,"w"); 

     for (i = 0; i < x; i++) 
      for (j = 0; j < y; j++) 
       fprintf(file,"%8.3f ", X[i][j]); 

    /* Timing function */ 
    double gettime(void) { 
     struct timeval tv; 
     return tv.tv_sec + 1e-6*tv.tv_usec; 

Obwohl es einige Fehler in Folge sein.