2016-04-28 5 views
-3

Aus irgendeinem Grund, wenn ich mein Programm ausführe, haben die Gerätevariablen einen Nullwert. Kurz bevor ich den Cuda-Kernel ausführe, haben die Gerätevariablen die richtigen Werte. Das Ausgabebild ist nur schwarz von der ursprünglichen Bildgröße. Alle Speicherzuweisungen und das Kopieren von und zum Host scheinen korrekt zu sein.Cuda - Gerätewerte 0 nach Kernel-Ausführung

Danke für jede Hilfe!

// Includes, system 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 

#ifdef _WIN32 
# define WINDOWS_LEAN_AND_MEAN 
# define NOMINMAX 
# include <windows.h> 
#endif 

#define Image_Size 512 
#define Kernel_Size 3 

// Includes CUDA 
#include <cuda_runtime.h> 

// Utilities and timing functions 
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h 

// CUDA helper functions 
#include "./inc/helper_cuda.h"   // helper functions for CUDA error check 

const char *imageFilename = "lena_bw.pgm"; 

const char *sampleName = "simpleTexture"; 

#define C_PI 3.141592653589793238462643383279502884197169399375 

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor) 
{ 
    // This function effectively swirls an image 
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup 
    // Each thread on the GPU will process exactly one pixel 
    // Before doing anything, we need to determine the current pixel we are calculating in this thread 
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel 

    int i = blockIdx.y * blockDim.y + threadIdx.y; 
    int j = blockIdx.x * blockDim.x + threadIdx.x; 
    // Test to see if we're testing a valid pixel 
    if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location 

    double cX = (double)width/2.0f; 
    double cY = (double)height/2.0f; 
    double relY = cY-i; 
    double relX = j-cX; 
    // relX and relY are points in our UV space 
    // Calculate the angle our points are relative to UV origin. Everything is in radians. 
    double originalAngle; 
    if (relX != 0) 
    { 
     originalAngle = atan(abs(relY)/abs(relX)); 
     if (relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle; 
     else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle; 
     else if (relX <=0 && relY <0) originalAngle += C_PI; 
    } 
    else 
    { 
     // Take care of rare special case 
     if (relY >= 0) originalAngle = 0.5f * C_PI; 
     else originalAngle = 1.5f * C_PI; 
    } 
    // Calculate the distance from the center of the UV using pythagorean distance 
    double radius = sqrt(relX*relX + relY*relY); 
    // Use any equation we want to determine how much to rotate image by 
    //double newAngle = originalAngle + factor*radius; // a progressive twist 
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI)); 
    // Transform source UV coordinates back into bitmap coordinates 
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f)); 
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f)); 
    srcX += cX; 
    srcY += cY; 
    srcY = height - srcY; 
    // Clamp the source to legal image pixel 
    if (srcX < 0) srcX = 0; 
    else if (srcX >= width) srcX = width-1; 
    if (srcY < 0) srcY = 0; 
    else if (srcY >= height) srcY = height-1; 
    // Set the pixel color 
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here 
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX]; 
} 




//////////////////////////////////////////////////////////////////////////////// 
// Declaration, forward 
void runTest(int argc, char **argv); 

//////////////////////////////////////////////////////////////////////////////// 
// Program main 
//////////////////////////////////////////////////////////////////////////////// 
int main(int argc, char **argv) 
{ 
    printf("%s starting...\n", sampleName); 

    // Process command-line arguments 
    if (argc > 1) 
    { 
     if (checkCmdLineFlag(argc, (const char **) argv, "input")) 
     { 
      getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename); 
     } 
     else if (checkCmdLineFlag(argc, (const char **) argv, "reference")) 
     { 
      printf("-reference flag should be used with -input flag"); 
      exit(EXIT_FAILURE); 
     } 
    } 

    runTest(argc, argv); 

    cudaDeviceReset(); 
    printf("%s completed", 
      sampleName); 
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); 
} 

//////////////////////////////////////////////////////////////////////////////// 
//! Run a simple test for CUDA 
//////////////////////////////////////////////////////////////////////////////// 
void runTest(int argc, char **argv) 
{ 
    int devID = findCudaDevice(argc, (const char **) argv); 
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float); 
    // load image from disk 
    float *hData = NULL; 
    float *host_array_kernel = 0; 

    float *device_array_Image = 0; 
    float *device_array_kernel = 0; 
    float *device_array_Result = 0; 


    unsigned int width, height; 
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]); 

    if (imagePath == NULL) 
    { 
     printf("Unable to source image file: %s\n", imageFilename); 
     exit(EXIT_FAILURE); 
    } 

    sdkLoadPGM(imagePath, &hData, &width, &height); 

    unsigned int size = width * height * sizeof(float); 
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); 

    // Allocation of device arrays using CudaMalloc 
    cudaMalloc((void**)&device_array_Image, size); 
    cudaMalloc((void**)&device_array_kernel, kernel_bytes); 
    cudaMalloc((void**)&device_array_Result, size); 


    host_array_kernel = (float*)malloc(kernel_bytes); // kernel 


    // Allocate mem for the result on host side 
    float *hOutputDataSharp = (float *) malloc(size); 

    GenerateKernel (host_array_kernel); 


// copy arrays and kernel from host to device 
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice)); 
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice)); 



    dim3 dimBlock(16, 16, 1); 
    dim3 dimGrid(width/dimBlock.x, height/dimBlock.y, 1); 

    //Do the Convolution 
    printf("DImage : '%.8f'\n",device_array_Image); 
    printf("DKernel : '%.8f'\n",device_array_kernel); 
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp); 


    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f); 
    printf("DResult : '%.8f'\n",device_array_Result); 
    checkCudaErrors(cudaDeviceSynchronize()); 
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost); 
    printf("HResult : '%.8f'\n",hOutputDataSharp); 
    // Write result to file 
    char outputSharp[1024]; 

    strcpy(outputSharp, imagePath); 
    strcpy(outputSharp, "data/serial_sharptest.pgm"); 
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height); 

    cudaFree(device_array_Result); 
    cudaFree(device_array_Image); 
    cudaFree(device_array_kernel); 
    free(hData); 
    free(imagePath); 
    //free(host_array_Image); 
    free(host_array_kernel); 
    free(hOutputDataSharp); 
    //free(hOutputImage); 
    //free(hOutputKernel); 
} 
+2

keine Fehlerüberprüfung ?? –

+0

Es erzeugt keine Fehler. Das erzeugte Bild ist nur schwarz statt des verarbeiteten Bildes, das es sein soll. Ich bin wirklich neu in Cuda, im Wesentlichen vor ein paar Tagen begonnen, so dass ich immer noch in den Griff bekommen, es zu verstehen. – Alpha

+1

@Alpha: Woher wissen Sie, dass es keine Fehler erzeugt, wenn der Code nicht auf Fehler überprüft wird? – talonmies

Antwort

1

Ihr Code wird im Quellbild zu schreiben.

pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX]; 

, die device_array_Image schreibt, welche die Quelle ist, nicht das Ziel, das Sie führt zu erwarten sind

Außerdem bin ich sehr neugierig auf die Ausgabe von als device_array_Result ist in GPU-Adressraum und mit cudaMalloc zugeordnet. Auf welchem ​​Gerät liegst du?