So vermeiden Sie illegalen Speicherzugriff in CUDA

-1

Ich habe in CUDA Probleme mit dem Speicherzugriff festgestellt. Der Kern meines Codes istSo vermeiden Sie illegalen Speicherzugriff in CUDA

long long addr0,addr1; 
addr0=(long long)my_array; 
addr1 = (addr0^(1 << position)); 
long long *r_addr0, *r_addr1; 
r_addr0 = (long long *)addr0; 
r_addr1 = (long long *)addr1; 
i = *r_addr0; 
j = *r_addr1;

Wo my_array die Adresse des Geräts Array ist.

I speichert die Adresse des my_array in r_addr0, dann drehen i das Bit von r_addr0 eins nach dem anderen. z.B.

0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0000 0 addr of my_array 
0000 0000 1011 0000 0011 1111 1110 0000 0000 0000 0001 1 flip last bit 
0000 0000 1011 0100 0011 1111 1110 0000 0000 0000 0000 31 flip 31 bit.

drucke ich die Adresse von r_addr0 und r_addr1 jedes Mal, und es funktioniert gut für die ersten 31 Bits, aber ich begegnete illegale Speicheradresse Ausgabe nach 32 Bit. Ich benutze Tesla K80, die 12GB Speicher an Bord haben.

Weiß jemand, wie in CUDA

kompletten Code siehe unten illegalen Speicherzugriff zu vermeiden:

# include <stdio.h> 
# include <stdint.h> 
# include "cuda_runtime.h" 

//compile nvcc test.cu -o test 

__global__ void global_latency (int * my_array, int position, int *d_time); 
int row_bits(int * h_a, long long N, int pos, int * h_time); 

int main(){ 
    cudaSetDevice(0); 
    long long i, N; 
    int *h_a; 
    int h_time0; 
    int h_time1; 
    int *h_time; 
    N = 2*1024*1024*1024L;//2G elements, 4 bytes per element, 8 GB memory used. 
    printf("\n=====%10.4f GB array with %d GB elements,discover row bits====\n", sizeof(int)*(float)N/1024/1024/1024,N/1024/1024/1024); 
    /* allocate arrays on CPU */ 
    h_a = (int *)malloc(sizeof(int) * N); 
    h_time = (int *)malloc(sizeof(int)*N); 
/* initialize array elements*/ 
    for (i=0L; i<N; i++){ 
    h_a[i] = i%(1024*1024); 
    } 

    for (int k=0;k<2;k++){ 
    h_time[k]=0; 
    } 
    printf("... ... ...\n... ... ...\n"); 
    for (int pos = 0; pos < 64; pos++){ 
    h_time0=0; 
    h_time1=0; 
    for (int j=0;j<5;j++){ 
    row_bits(h_a,N,pos,h_time); 
    h_time0 +=h_time[0]; 
    h_time1 +=h_time[1]; 
    } 
    printf("position = %d, time0 = %d, time1 = %d\n", pos+1,h_time0/5, h_time1/5); 
    } 
    printf("===============================================\n\n"); 
    free(h_a); 
    return 0; 
} 

int row_bits(int * h_a, long long N, int pos, int * h_time) { 
    cudaError_t error_id; 
    int *d_a; 
    /* allocate arrays on GPU */ 
    error_id = cudaMalloc ((void **) &d_a, sizeof(int) * N); 
    if (error_id != cudaSuccess) { 
printf("Error 1.0 is %s\n", cudaGetErrorString(error_id)); 
    } 
    /* copy array elements from CPU to GPU */ 
    error_id = cudaMemcpy(d_a, h_a, sizeof(int) * N, cudaMemcpyHostToDevice); 
    if (error_id != cudaSuccess) { 
    printf("Error 1.1 is %s\n", cudaGetErrorString(error_id)); 
    } 

    //int *h_time = (int *)malloc(sizeof(int)); 
    int *d_time; 
    error_id = cudaMalloc ((void **) &d_time, 4*sizeof(int)); 
    if (error_id != cudaSuccess) 
    printf("Error 1.2 is %s\n", cudaGetErrorString(error_id)); 

    cudaThreadSynchronize(); 
    /* launch kernel*/ 
    dim3 Db = dim3(1); 
    dim3 Dg = dim3(1,1,1); 

    global_latency <<<Dg, Db>>>(d_a, pos,d_time); 

    cudaThreadSynchronize(); 

    error_id = cudaGetLastError(); 
    if (error_id != cudaSuccess) { 
    printf("Error kernel is %s\n", cudaGetErrorString(error_id)); 
    } 

    /* copy results from GPU to CPU */ 
    cudaThreadSynchronize(); 

    error_id = cudaMemcpy((void *)h_time, (void *)d_time, 4*sizeof(int),  cudaMemcpyDeviceToHost); 
    if (error_id != cudaSuccess) { 
    printf("Error 2.0 is %s\n", cudaGetErrorString(error_id)); 
    } 
    cudaThreadSynchronize(); 

    /* free memory on GPU */ 
    cudaFree(d_a); 
    cudaFree(d_time); 


    cudaDeviceReset(); 
    return 0; 
} 


__global__ void global_latency (int * my_array, int position, int *d_time) { 

    //int tid = blockIdx.x*blockDim.x+threadIdx.x; 

    int start_time=0; 
    int mid_time=0; 
    int end_time=0; 

__shared__ int s_tvalue[2];//2: number of threads per block 

    int i, j; 
    s_tvalue[0]=0; 
    s_tvalue[1]=0; 
    long long addr0,addr1; 
    //printf("%p\n",my_array); 
    //int * p = (int *)0x0; 
    //addr0 = (long long)p; 
    addr0=(long long)my_array; 
    //printf("Address i :%p\n",addr0); 
    addr1 = (addr0^(1 << position)); 
    //printf("Address i':%p\n",addr1); 
    //start_time = clock(); 
    long long *r_addr0, *r_addr1; 
    r_addr0 = (long long *)addr0; 
    r_addr1 = (long long *)addr1; 

    start_time = clock(); 

    i = *r_addr0; 
    s_tvalue[0] = i; 
    mid_time = clock(); 
    j = *r_addr1; 
    s_tvalue[1] = j; 
    //printf("%p",p); 
    //k =(int)p; 
    //printf("%d\n",k); 

    //printf("%d",k); 
    //__syncthreads(); 
    end_time = clock(); 

    d_time[0] = mid_time-start_time; 
    d_time[1] = end_time-mid_time; 
    d_time[2] = s_tvalue[0]; 
    //printf("[%p]=%lld\n",addr0,d_time[1]); 
    d_time[3] = s_tvalue[1]; 
    //printf("[%p]=%lld\n",addr1,d_time[2]); 
}

Quelle

2016-06-09 Steven Huang

Wenn position=0 und ursprüngliche Adresse Bit 0 ist 0, die Sie versuchen zu setzen

j=*(int*)&(((char*)my_array)[1]);

, die die 4-Byte-Ausrichtung von t bricht er tippt int. Dies wird Ihr Programm zum Absturz bringen.

Wenn position=3 und ursprüngliche Adresse Bit 3 ist, sagen wir 1, Sie

j=*(int*)&(((char*)my_array)[-8]);

, wo die Adresse, die Sie versuchen zu setzen versuchen, bevor my_array zu lesen ist. Es ist definitiv ein unzulässiger Speicherzugriff. Tatsächlich bedeutet das Umkehren eines Bits, das ursprünglich gleich 1 ist, einen negativen Array-Index.

Auch sollten Sie besser nutzen unsigned long long oder size_t statt long long und 1ull << position statt 1 << position, um sicherzustellen, dass Sie nicht durch das Vorzeichenbit und Überlaufproblem gestört werden.

Quelle

2016-06-09 19:28:34 kangshiyin

Die Adresse von my_array ist 0xb03fe0000. Position 1 ist 0xb03fe0001, Position 2 ist 0xb03fe0002, . Die Adresse in Position 3 ist 0xb03fe0004. Diese Adresse ist immer noch vor my_arryay. Wenn Sie den Code ausführen, ist alles korrekt. Wenn die Adresse "32" erreicht wird, ändert sich die Adresse nicht mehr und es erfolgt ein Zugriff auf die ungültige Speicheradresse. –

@StevenHuang Das Umkehren eines Bits, das ursprünglich gleich 1 ist, bedeutet einen negativen Array-Index. – kangshiyin

@StevenHuang verwenden '1ull' dann. 'nicht mehr ändern' scheint übergelaufen zu sein. – kangshiyin

So vermeiden Sie illegalen Speicherzugriff in CUDA

Antwort

Verwandte Themen