Parallele Implementierung für mehrere SVDs mit CUDA

Ich bin neu bei der parallelen Programmierung mit GPU, also entschuldige ich mich, wenn die Frage breit oder vage ist. Ich bin mir bewusst, dass es in der CULA-Bibliothek einige parallele SVD-Funktionen gibt, aber was sollte die Strategie sein, wenn ich eine große Anzahl von relativ kleinen Matrizen zur Faktorisierung habe? Zum Beispiel habe ich n Matrizen mit Dimension d, n ist groß und d ist klein. Wie kann dieser Prozess parallelisiert werden? Kann mir jemand einen Hinweis geben?Parallele Implementierung für mehrere SVDs mit CUDA

Quelle

2013-07-01 Logan Yang

Sie können einen Blick auf die Batched Operations Beitrag des CULA-Blogs für eine Diskussion Ihres Problems werfen. unter

EDIT

Von dem, was ich aus Ihrem Kommentar verstehen, würden Sie jeden Thread gerne einen separaten SVD zu berechnen. Im Grunde sollte jeder Thread ein standardmäßiges, sequentielles SVD-Schema ausführen. Für, dass einige möglicherweise nützliche Hinweise:

Numerical Recipes

Golub, Van Loan, Matrix Computations

Wenn Sie diesen Ansatz verwenden, aber ich fürchte, Sie nicht in der Lage sein wird, mehr cuBLAS zu verwenden, wie diejenigen sind host Funktionen nicht aufrufbar von der device (es sei denn, Sie haben keine Rechenleistung >3.5, siehe das simpleDevLibCUBLAS Beispiel.). Aber im Grunde denke ich, dass Sie das Chargenkonzept irgendwie selbst implementieren.

Wenn Sie zu einer Standard-Parallel-GPU Implementierung gehen zu entscheiden, unter der Referenz von Interesse sein könnte:

Singular Value Decomposition on GPU using CUDA

Quelle

2013-07-01 13:02:41 JackOLantern

Analog zur dosierten Löser/inverse Matrix-Code auf der CUDA geschrieben registrierten Entwickler Website finden Sie eine Matrix-per-Thread oder eine Matrix-per-Thread-Block-Ansatz in Erwägung ziehen könnte. Dies funktioniert gut, wenn die Stapelgröße groß ist und die Matrizen sehr klein sind. Was sind typische Werte für n und d in Ihrem Fall? – njuffa

BLAS Batch-Modus hat nur Matrixmultiplikation, oder? Wie kann ich es für SVD verwenden? Und könnten Sie mir ein Codebeispiel geben, wie man die Threads oder Blöcke in der GPU partitioniert und jede Einheit eine SVD parallel laufen lässt? Zum Beispiel wenn n = 500 d = 20 ist. Vielen Dank! –

Ich habe meinen Beitrag bearbeitet. Ich hoffe, es wird hilfreich sein. – JackOLantern

Meine vorherige Antwort ist jetzt out-of-date. Ab Februar 2015 bietet CUDA 7 (derzeit in der Release-Candidate-Version) vollständige SVD-Funktionen in seiner cuSOLVER-Bibliothek. Im Folgenden stelle ich ein Beispiel für die Generierung der Singulärwertzerlegung mit CUDA cuSOLVER vor.

In Bezug auf das spezifische Problem, das Sie steigen (Berechnung der SVD von mehreren Matrizen kleiner Größe), sollten Sie das Beispiel, das ich unten bereitstellen, mithilfe von Streams anpassen. einen Strom zu jeder Aufgabe zu assoziieren Sie

cudaStreamCreate()

und

cusolverDnSetStream()

kernel.cu

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include<iostream> 
#include<iomanip> 
#include<stdlib.h> 
#include<stdio.h> 
#include<assert.h> 
#include<math.h> 

#include <cusolverDn.h> 
#include <cuda_runtime_api.h> 

#include "Utilities.cuh" 

/********/ 
/* MAIN */ 
/********/ 
int main(){ 

    // --- gesvd only supports Nrows >= Ncols 
    // --- column major memory ordering 

    const int Nrows = 7; 
    const int Ncols = 5; 

    // --- cuSOLVE input/output parameters/arrays 
    int work_size = 0; 
    int *devInfo;   gpuErrchk(cudaMalloc(&devInfo,   sizeof(int))); 

    // --- CUDA solver initialization 
    cusolverDnHandle_t solver_handle; 
    cusolverDnCreate(&solver_handle); 

    // --- Setting the host, Nrows x Ncols matrix 
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double)); 
    for(int j = 0; j < Nrows; j++) 
     for(int i = 0; i < Ncols; i++) 
      h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j)); 

    // --- Setting the device matrix and moving the host matrix to the device 
    double *d_A;   gpuErrchk(cudaMalloc(&d_A,  Nrows * Ncols * sizeof(double))); 
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice)); 

    // --- host side SVD results space 
    double *h_U = (double *)malloc(Nrows * Nrows  * sizeof(double)); 
    double *h_V = (double *)malloc(Ncols * Ncols  * sizeof(double)); 
    double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double)); 

    // --- device side SVD workspace and matrices 
    double *d_U;   gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows  * sizeof(double))); 
    double *d_V;   gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols  * sizeof(double))); 
    double *d_S;   gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double))); 

    // --- CUDA SVD initialization 
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size)); 
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double))); 

    // --- CUDA SVD execution 
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo)); 
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost)); 
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n"; 

    // --- Moving the results from device to host 
    gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows  * sizeof(double), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols  * sizeof(double), cudaMemcpyDeviceToHost)); 

    std::cout << "Singular values\n"; 
    for(int i = 0; i < min(Nrows, Ncols); i++) 
     std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl; 

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n"; 
    for(int j = 0; j < Nrows; j++) { 
     printf("\n"); 
     for(int i = 0; i < Nrows; i++) 
      printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]); 
    } 

    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n"; 
    for(int i = 0; i < Ncols; i++) { 
     printf("\n"); 
     for(int j = 0; j < Ncols; j++) 
      printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]); 
    } 

    cusolverDnDestroy(solver_handle); 

    return 0; 

}

Utilities.cuh

#ifndef UTILITIES_CUH 
#define UTILITIES_CUH 

extern "C" int iDivUp(int, int); 
extern "C" void gpuErrchk(cudaError_t); 
extern "C" void cusolveSafeCall(cusolverStatus_t); 

#endif

012 verwenden können

Utilities.cu

#include <stdio.h> 
#include <assert.h> 

#include "cuda_runtime.h" 
#include <cuda.h> 

#include <cusolverDn.h> 

/*******************/ 
/* iDivUp FUNCTION */ 
/*******************/ 
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a/b + 1) : (a/b); } 

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api 
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) { exit(code); } 
    } 
} 

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); } 

/**************************/ 
/* CUSOLVE ERROR CHECKING */ 
/**************************/ 
static const char *_cudaGetErrorEnum(cusolverStatus_t error) 
{ 
    switch (error) 
    { 
     case CUSOLVER_STATUS_SUCCESS: 
      return "CUSOLVER_SUCCESS"; 

     case CUSOLVER_STATUS_NOT_INITIALIZED: 
      return "CUSOLVER_STATUS_NOT_INITIALIZED"; 

     case CUSOLVER_STATUS_ALLOC_FAILED: 
      return "CUSOLVER_STATUS_ALLOC_FAILED"; 

     case CUSOLVER_STATUS_INVALID_VALUE: 
      return "CUSOLVER_STATUS_INVALID_VALUE"; 

     case CUSOLVER_STATUS_ARCH_MISMATCH: 
      return "CUSOLVER_STATUS_ARCH_MISMATCH"; 

     case CUSOLVER_STATUS_EXECUTION_FAILED: 
      return "CUSOLVER_STATUS_EXECUTION_FAILED"; 

     case CUSOLVER_STATUS_INTERNAL_ERROR: 
      return "CUSOLVER_STATUS_INTERNAL_ERROR"; 

     case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: 
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 

    } 

    return "<unknown>"; 
} 

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line) 
{ 
    if(CUSOLVER_STATUS_SUCCESS != err) { 
     fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ 
           _cudaGetErrorEnum(err)); \ 
     cudaDeviceReset(); assert(0); \ 
    } 
} 

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

Quelle

2015-02-07 08:19:57 JackOLantern

Was halten Sie von diesem Ansatz gegenüber MAGMA? –

@AndreasYankopolus Ich habe die beiden Bibliotheken nicht verglichen, sorry. – JackOLantern

Parallele Implementierung für mehrere SVDs mit CUDA

Antwort

Verwandte Themen