Rechteckige Matrizen in CUDA multiplizieren

In dieser Hausaufgabe muss ich den Code vervollständigen, um zwei Rechteckmatrizen mit CUDA C zu multiplizieren. Nachdem ich den Code abgeschlossen hatte, reichte ich die Lösung für den Datensatz ein, während die Matrizen quadratisch waren Das Ergebnis stimmte nicht mit dem erwarteten Wert überein, wenn die Matrizen nicht quadratisch waren. HierRechteckige Matrizen in CUDA multiplizieren

ist der Code, nachdem ich die fehlenden Teile hinzugefügt:

#include <wb.h> 

#define wbCheck(stmt) do {        \ 
    cudaError_t err = stmt;       \ 
    if (err != cudaSuccess) {       \ 
     wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
     return -1;          \ 
    }             \ 
} while(0) 

// Compute C = A * B 
__global__ void matrixMultiply(float * A, float * B, float * C, 
       int numARows, int numAColumns, 
       int numBRows, int numBColumns, 
       int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    int Row = blockIdx.y * blockDim.y + threadIdx.y; 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    if (numAColumns != numBRows) return ; 
    if ((Row < numARows) && (Col < numBColumns)){ 
     float Cvalue = 0; 
     for (int k = 0 ; k < numAColumns ; ++k) 
     Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; 
     C[Row*numAColumns + Col] = Cvalue; 
    } 

    } 



int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 
    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 

    dim3 DimGrid(numARows/8 , numBColumns/8, 1); 
    dim3 DimBlock(8 , 8, 1); 

    wbTime_start(Compute, "Performing CUDA computation"); 

    //@@ Launch the GPU Kernel here 
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 

    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 
    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
}

Ich hoffe, dass Sie mir helfen können, zu finden, welcher Teil falsch ist.

Quelle

2012-12-15 mzn.rft

ersetzen:

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

mit

Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];

Quelle

2012-12-16 06:12:27 ahmad

hallo Ahmad, danke für den Versuch, mir zu helfen, zu finden, welchen Teil falsch ist, und Ihren Rat nach folgendem, fand ich, dass die Bearbeitungszeit war viel besser, aber das Ergebnis immer noch nicht das erwartete Ergebnis zum Beispiel eines Der Datensatz zeigt Folgendes: Die Lösung stimmte nicht mit den erwarteten Ergebnissen in Spalte 124 und Zeile 0 überein. Erwartete 457.153, erhielt aber 422.296. Jetzt stimme ich dir zu, muss ein Fehler in der Funktion MatrixMultiply sein, vielleicht muss ich etwas anderes ändern. –

ersetzen: for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; C[Row*numAColumns + Col] = Cvalue; }

mit for (int k = 0 ; k < numAColumns ; ++k) Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; C[Row*numCColumns + Col] = Cvalue; }

Quelle

2012-12-16 10:14:21 Ira

danke Ira, mit Ihrem Vorschlag habe ich noch eine Reihe von Daten korrekt, aber ich kann immer noch nicht vollständig erwartetes Ergebnis für alle Datensätze erhalten, zum Beispiel habe ich diesen Fall: Die Lösung stimmte nicht mit den erwarteten Ergebnissen überein in Spalte 200 und Zeile 0. Erwarte 415.556, bekam aber 0.807. Ich denke du hast Recht es ist etwas in der Funktion MatrixMultiply. –

ersetzen

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

für

Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col];

und

C[Row*numAColumns + Col] = Cvalue;

für

C[Row*numCColumns+Col] = Cvalue;

Quelle

2012-12-16 12:35:47 ram

danke ram, deine korrekturen sind wahr, sie ähneln denen, die Ahmad und Ira vorher erwähnt haben, auch wenn ich diese korrekturen gemacht habe, aber trotzdem kann ich das ergebnis nicht richtig bekommen. –

Nach der Hilfe von Ira, Ahmad, Widder und Oli Fly, bekam ich die richtige Antwort als folgt:

#include <wb.h> 

#define wbCheck(stmt) do {         \ 
     cudaError_t err = stmt;       \ 
     if (err != cudaSuccess) {       \ 
      wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
      return -1;          \ 
     }             \ 
    } while(0) 

// Compute C = A * B 
__global__ void matrixMultiply(float * A, float * B, float * C, 
        int numARows, int numAColumns, 
        int numBRows, int numBColumns, 
        int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    int Row = blockIdx.y * blockDim.y + threadIdx.y; 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    if (numAColumns != numBRows) return; 
    if ((Row < numARows) && (Col < numBColumns)){ 
    float Cvalue = 0; 
    for (int k = 0; k < numAColumns; ++k) 
    Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; 
    C[Row*numCColumns + Col] = Cvalue; 
    } 

} 

int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 
    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 

    dim3 DimGrid((numCColumns - 1)/8 + 1, (numCRows - 1)/8 + 1, 1); 
    dim3 DimBlock(8 , 8, 1); 

    wbTime_start(Compute, "Performing CUDA computation"); 

    //@@ Launch the GPU Kernel here 
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 

    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 
    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
}

Quelle

2012-12-16 16:28:43

Danke, dass Sie diese Frage gestellt haben. Es hat mir wirklich geholfen. Eine Sache, die ich gerne stellen würde, haben Sie es jemals für Datasets funktioniert, wo die Dimensionen der Matrix keine Vielfachen von 8 sind? –

@ Abraham, einige der Datensätze waren mit den folgenden Dimensionen: (die Abmessungen der Matrix A ist 200 * 100, Abmessungen der Matrix B ist 100 * 256) und es gab einen anderen Datensatz (Abmessungen von A ist 100 * 128, Abmessungen von B ist 128 * 50) –

Ich glaube, die Antwort für # 2 ist falsch mit diesem Code, weil eine der Dimensionen (100) nicht durch die Blockgröße (8) teilbar ist.Das Setup des Rasters berücksichtigt das nicht. –

Wir können tiled Matrixmultiplikation verwenden und ich fand, dass es eine bessere Ausführungszeit hat.

#include <wb.h> 

#define wbCheck(stmt) do {         \ 
     cudaError_t err = stmt;       \ 
     if (err != cudaSuccess) {       \ 
      wbLog(ERROR, "Failed to run stmt ", #stmt); \ 
      return -1;          \ 
     }             \ 
    } while(0) 

// Compute C = A * B 
__global__ void matrixMultiplyShared(float * A, float * B, float * C, 
          int numARows, int numAColumns, 
          int numBRows, int numBColumns, 
          int numCRows, int numCColumns) { 
    //@@ Insert code to implement matrix multiplication here 
    //@@ You have to use shared memory for this MP 
    const int TILE_WIDTH = 32; 
    __shared__ float sharedA[TILE_WIDTH][TILE_WIDTH]; 
    __shared__ float sharedB[TILE_WIDTH][TILE_WIDTH]; 
    int bx = blockIdx.x; 
    int by = blockIdx.y; 
    int tx = threadIdx.x; 
    int ty = threadIdx.y; 
    int Row = by*TILE_WIDTH + ty; 
    int Col = bx*TILE_WIDTH + tx; 
    float Cvalue = 0.0; 
    if (numAColumns != numBRows) return ; 
    for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++) 
    { 

     if (i*TILE_WIDTH + tx < numAColumns && Row < numARows){ 
      sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx]; 
     }else{ 
      sharedA[ty][tx] = 0.0; 
     } 

     if (i*TILE_WIDTH + ty < numBRows && Col < numBColumns){ 
      sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col]; 
     }else{ 
      sharedB[ty][tx] = 0.0; 
     } 
     __syncthreads(); 
     if(Row < numARows && Col < numBColumns){ 

      for(int j = 0; j < TILE_WIDTH; j++) 
      Cvalue += sharedA[ty][j] * sharedB[j][tx]; 
     } 

     __syncthreads(); 
    } 

    if (Row < numCRows && Col < numCColumns) 
     C[Row*numCColumns + Col] = Cvalue; 
}  




int main(int argc, char ** argv) { 
    wbArg_t args; 
    float * hostA; // The A matrix 
    float * hostB; // The B matrix 
    float * hostC; // The output C matrix 
    float * deviceA; 
    float * deviceB; 
    float * deviceC; 
    int numARows; // number of rows in the matrix A 
    int numAColumns; // number of columns in the matrix A 
    int numBRows; // number of rows in the matrix B 
    int numBColumns; // number of columns in the matrix B 
    int numCRows; // number of rows in the matrix C (you have to set this) 
    int numCColumns; // number of columns in the matrix C (you have to set this) 
    int TILE_WIDTH = 32; 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns); 
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns); 
    //@@ Set numCRows and numCColumns 
    numCRows = 0; 
    numCColumns = 0; 
    numCRows = numARows; 
    numCColumns = numBColumns; 
    //@@ Allocate the hostC matrix 
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 

    wbTime_start(GPU, "Allocating GPU memory."); 
    //@@ Allocate GPU memory here 
    cudaMalloc((void**)&deviceA , sizeof(float)*numARows*numAColumns); 
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns); 
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns); 

    wbTime_stop(GPU, "Allocating GPU memory."); 

    wbTime_start(GPU, "Copying input memory to the GPU."); 
    //@@ Copy memory to the GPU here 
    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice); 

    wbTime_stop(GPU, "Copying input memory to the GPU."); 

    //@@ Initialize the grid and block dimensions here 
    int dimX = (int)(ceil((float)numCColumns/TILE_WIDTH)); 
    int dimY = (int)(ceil((float)numCRows/TILE_WIDTH)); 
    dim3 DimGrid(dimX, dimY); 
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH); 



    wbTime_start(Compute, "Performing CUDA computation"); 
    //@@ Launch the GPU Kernel here 
    matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns); 

    cudaThreadSynchronize(); 
    wbTime_stop(Compute, "Performing CUDA computation"); 

    wbTime_start(Copy, "Copying output memory to the CPU"); 
    //@@ Copy the GPU memory back to the CPU here 
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost); 

    wbTime_stop(Copy, "Copying output memory to the CPU"); 

    wbTime_start(GPU, "Freeing GPU Memory"); 
    //@@ Free the GPU memory here 
    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 

    wbTime_stop(GPU, "Freeing GPU Memory"); 

    wbSolution(args, hostC, numCRows, numCColumns); 

    free(hostA); 
    free(hostB); 
    free(hostC); 

    return 0; 
}

Quelle

2013-01-05 15:57:27

Rechteckige Matrizen in CUDA multiplizieren

Antwort

Verwandte Themen