Ich fand die Lösung. Das Problem kam nicht von clEnqueueNDRangeKernel()
früher geschah es in clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
. Ich habe die Build-Informationen mit clGetProgramBuildInfo()
abgerufen. Das Problem war, dass meine multiply_arrays.cl Datei kodiert nicht utf war 8
Für alle, die neu in OpenCL ist. Jede opencl-Funktion gibt eine Status-Ganzzahl zurück, die einem bestimmten Fehlercode zugeordnet ist. Wenn eine Funktion einen Statuscode zurückgibt, ist es möglich, einen Zeiger auf die Funktion zu übergeben. Siehe die Beispielfunktionen, die ich unten verlinkt habe. Dies ist sehr hilfreich, um Ihr Programm zu debuggen.
Returns Status Code
Status Code by Reference
main.cpp
/***
* Excerpted from "Seven Concurrency Models in Seven Weeks",
* published by The Pragmatic Bookshelf.
* Copyrights apply to this code. It may not be used to create training material,
* courses, books, articles, and the like. Contact us if you are in doubt.
* We make no guarantees that this code is fit for any purpose.
* Visit http://www.pragmaticprogrammer.com/titles/pb7con for more book information.
***/
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <mach/mach_time.h>
#else
#include <CL/cl.h>
#include <Windows.h>
#endif
#include <stdio.h>
#include<iostream>
#include <inttypes.h>
#include <chrono>
#define NUM_ELEMENTS (100000)
char* read_source(const char* filename) {
FILE *h = fopen(filename, "r");
fseek(h, 0, SEEK_END);
size_t s = ftell(h);
rewind(h);
char* program = (char*)malloc(s + 1);
fread(program, sizeof(char), s, h);
program[s] = '\0';
fclose(h);
return program;
}
void random_fill(cl_float array[], size_t size) {
for (int i = 0; i < size; ++i)
array[i] = (cl_float)rand()/RAND_MAX;
}
int main() {
//Status for Errorhandling
cl_int status;
//Identify Platform
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
//Get Id of GPU
cl_device_id device;
cl_uint num_devices = 0;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
// Create Context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
//Use context to create Command Queue
//Que enables us to send commands to the gpu device
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
//Load Kernel
char* source = read_source("multiply_arrays.cl");
cl_program program = clCreateProgramWithSource(context, 1,
(const char**)&source, NULL, &status);
free(source);
// Build Program
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
size_t len;
char *buffer;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
buffer = (char *) malloc(len);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
printf("%s\n", buffer);
//Create Kernel
cl_kernel kernel = clCreateKernel(program, "multiply_arrays", &status);
// Create Arrays with random Numbers
cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
random_fill(a, NUM_ELEMENTS);
random_fill(b, NUM_ELEMENTS);
//uint64_t startGPU = mach_absolute_time();
auto start = std::chrono::high_resolution_clock::now();
//Create Readonly input Buffers with value from a and b
cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
//Create Output buffer write Only
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
//set Kernel Arguments
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
NULL, 0, NULL,&timing_event);
cl_float results[NUM_ELEMENTS];
//Calculate Results and copy from output buffer to results
clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
results, 0, NULL, NULL);
//uint64_t endGPU = mach_absolute_time();
auto finish = std::chrono::high_resolution_clock::now();
//printf("Total (GPU): %lu ns\n\n", (unsigned long)(endGPU - startGPU));
std::cout << "Total(GPU) :"<< std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
cl_ulong starttime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &starttime, NULL);
cl_ulong endtime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endtime, NULL);
printf("Elapsed (GPU): %lu ns\n\n", (unsigned long)(endtime - starttime));
clReleaseEvent(timing_event);
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//uint64_t startCPU = mach_absolute_time();
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_ELEMENTS; ++i)
results[i] = a[i] * b[i];
//uint64_t endCPU = mach_absolute_time();
finish = std::chrono::high_resolution_clock::now();
//printf("Elapsed (CPU): %lu ns\n\n", (unsigned long)(endCPU - startCPU));
std::cout << "Elapsed (CPU) :" << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
return 0;
}
multiply_arrays.cl
__kernel void multiply_arrays(__global const float* inputA,
__global const float* inputB,
__global float* output) {
int i = get_global_id(0);
output[i] = inputA[i] * inputB[i];
}
//ö
ich, dass dieses mig spekulieren Wegen der Verwendung von CPU-Geräten und kleinen Arbeitslasten entscheidet die Implementierung, die Aufgabe inline auszuführen, so dass keine Ausführung zu verfolgen ist (sie ist bereits bei der Anrufrückgabe abgeschlossen). – DarkZeros
@DarkZeros danke für deinen Kommentar. Ich habe die Antwort gefunden. :-) –