Meine Cuda-Skript-Array-Ausgabe ist falsch

Dieses Wochenende versuche ich Cuda zu lernen.Meine Cuda-Skript-Array-Ausgabe ist falsch

Was ich tun möchte, ist c = a + b. Jede der Variablen (a, b und c) ist ein Array mit 5 Elementen.

Ich habe ein Problem mit dem Ergebnis. Das ist mein gewünschtes Ergebnis:

{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000} 
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000} 
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 36.000000, 48.000000, 62.000000}

Aber das ist, was ich habe:

PS E:\testing\cuda2\Debug> .\cuda2.exe 
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000} 
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000} 
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 0.000000, 0.000000, 0.000000}

Wie Sie sehen können, ist das Ergebnis (c3, c4, c5) ist falsch.
Bitte sagen Sie mir, wie Sie den Code unten machen das Richtige tun.

Ich bin mit VS2015 und Cuda-Toolkit 8. Es gibt 3 Dateien, die ich in meinem Projekt Lösung geschaffen: main.cpp, simple_math.cu, simple_math.cuh;

main.cpp

#include "simple_math.cuh" 
#include <iostream> // fprintf 


int main() 
{ 
    const int arraySize = 5; 
    float a[arraySize] = { 11, 21, 31, 41, 51 }; 
    float b[arraySize] = { 1, 3, 5, 7, 11 }; 
    double c[arraySize] = { 0, 0, 0, 0, 0 }; 

    cudaError_t cudaStatus = mathWithCuda(c, a, b, arraySize, ADD); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "mathWithCuda failed!"); 
     return 1; 
    } 


    fprintf(stdout, "{a1, a2, a3, a4, a5} = {%f, %f, %f, %f, %f} \n{b1, b2, b3, b4, b5} = {%f, %f, %f, %f, %f} \n{c1, c2, c3, c4, c5} = {%f, %f, %f, %f, %f}", 
     a[0], a[1], a[2], a[3], a[4], b[0], b[1], b[2], b[3], b[4], c[0], c[1], c[2], c[3], c[4]); 


    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceReset failed!"); 
     return 1; 
    } 

    return 0; 
}

simple_math.cuh

#ifndef SIMPLE_MATH_CUH_ 
#define SIMPLE_MATH_CUH_ 


#include <cuda_runtime.h> // cudaError_t 

#define ADD 0 
#define SUB 1 
#define MUL 2 
#define DIV 3 


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int size, int mode); 

__global__ void addKernel(double *c, const float *a, const float *b); 
__global__ void subKernel(double *c, const float *a, const float *b); 
__global__ void mulKernel(double *c, const float *a, const float *b); 
__global__ void divKernel(double *c, const float *a, const float *b); 


#endif

simple_math.cu

#include <device_launch_parameters.h> // threadIdx 
#include <stdio.h> // fprintf 
#include <math.h> // ceil 
#include "simple_math.cuh" 


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int arraySize, int mode) 
{ 
    float *dev_a, *dev_b; 
    double *dev_c; 
    cudaError_t cudaStatus; 


    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 


    if ((cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(double))) != cudaSuccess || 
     (cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float))) != cudaSuccess || 
     (cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float))) != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 


    if ((cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess || 
     (cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess) 
    { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 


    int blocksPerGrid, threadsPerBlock; 
    if (arraySize < 512) { 
     blocksPerGrid = 1; 
     threadsPerBlock = arraySize; 
    } 
    else { 
     blocksPerGrid = ceil(double(arraySize)/double(threadsPerBlock)); 
     threadsPerBlock = 512; 
    } 


    switch (mode) 
    { 
    case 0: 
     addKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b); 
     break; 
    case 1: 
     subKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b); 
     break; 
    case 2: 
     mulKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b); 
     break; 
    case 3: 
     divKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b); 
     break; 
    default: 
     // nothing 
     break; 
    } 


    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
     goto Error; 
    } 


    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus); 
     goto Error; 
    } 


    cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 


Error: 
    cudaFree(dev_c); 
    cudaFree(dev_a); 
    cudaFree(dev_b); 


    return cudaStatus; 
} 


__global__ void addKernel(double *c, const float *a, const float *b) 
{ 
    int i = threadIdx.x; 
    c[i] = __fadd_rn(a[i], b[i]); // a + b 
}  

__global__ void subKernel(double *c, const float *a, const float *b) 
{ 
    int i = threadIdx.x; 
    c[i] = __fsub_rn(a[i], b[i]); // a - b 
} 

__global__ void mulKernel(double *c, const float *a, const float *b) 
{ 
    int i = threadIdx.x; 
    c[i] = __fmul_rn(a[i], b[i]); // a * b 
} 

__global__ void divKernel(double *c, const float *a, const float *b) 
{ 
    int i = threadIdx.x; 
    c[i] = __fdividef(a[i], b[i]); // a/b 
}

Quelle

2017-05-01 aww

Das Problem hier zu sein scheint:

cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);

I Sache, die Sie kopieren sollten arraySize * sizeof(double) Bisse.

Quelle

2017-05-01 11:02:51 Alex

Verdammt, nur ein paar Minuten und Sie haben es gelöst wie nichts. Vielen Dank. – aww

@aww Ein frisches Aussehen hilft manchmal) – Alex

Meine Cuda-Skript-Array-Ausgabe ist falsch

Antwort

Verwandte Themen