Lösen linearer Gleichungssysteme AX = B mit CUDA

Kann ich die neue cuSOLVER Bibliothek verwenden (CUDA 7) lineare Systeme der FormLösen linearer Gleichungssysteme AX = B mit CUDA

AX = B

wo A, X und B sind NxN dichte Matrizen zu lösen?

Quelle

2015-03-01 Shay S.

Ja verwendet werden. Im Rahmen von cuSOLVER können Sie die QR-Zerlegung verwenden, siehe [QR-Zerlegung zur Lösung linearer Systeme in CUDA] (http://stackoverflow.com/questions/22399794/qr-decomposition-to-solve-linear-systems-in-cuda). Alternativ können Sie die Matrix-Inverse durch die sukzessive Aufzählung von 'cublas getrfBatched()' (die die LU-Zerlegung einer Matrix berechnet) und cublas getriBatched() '(die die Inverse der Matrix ausgehend von ihrer LU berechnet) berechnen Zersetzung). – JackOLantern

Eine letzte Möglichkeit ist die Verwendung von 'cublas getrfBatched()' gefolgt von einem zweifachen Aufruf von 'cublas trsm()' (welches obere oder untere dreieckige lineare Systeme löst). – JackOLantern

Ja.

nr Ansatz. 1

Im Rahmen von cuSOLVER können Sie QR-Zerlegung verwenden, siehe QR decomposition to solve linear systems in CUDA.

nr Ansatz. 2

Alternativ kann die inverse Matrix durch die sukzessive involation von

cublas<t>getrfBatched()

berechnen, die die LU-Zerlegung einer Matrix berechnet, und

cublas<t>getriBatched()

welche berechnet die Inverse der Matrix ausgehend von seiner LU-Zerlegung.

nr Ansatz. 3

Eine letzte Möglichkeit wird unter Verwendung von

cublas<t>getrfBatched()

durch einen zweifachen Aufruf von

gefolgt

cublas<t>trsm()

die oberen oder unteren dreieckigen linearen Systeme löst.

Wie von Robert Crovella darauf hingewiesen, kann die Antwort auf die Größe und den Typ der beteiligten Matrizen variieren.

-Code für Ansatz nr. 1

Bitte siehe QR decomposition to solve linear systems in CUDA.

Code für Ansätze nr. 2 und nr. 3

Unten, ich berichte ein ausgearbeitetes Beispiel für die Implementierung der Ansätze nr. 2 und 3. Hankel matrices werden verwendet, um die Ansätze mit gut konditionierten invertierbaren Matrizen zu versorgen. Bitte beachten Sie, dass Ansatz Nr. 3 erfordert das Permutieren (Umordnen) des Systemkoeffizientenvektors gemäß dem Pivot-Array, das nach dem Aufruf von cublas<t>getrfBatched() erhalten wurde. Diese Permutation kann bequem auf der CPU durchgeführt werden.

#include <stdio.h> 
#include <fstream> 
#include <iomanip> 
#include <stdlib.h>  /* srand, rand */ 
#include <time.h>  /* time */ 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include "cublas_v2.h" 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

#define prec_save 10 

#define BLOCKSIZE 256 

#define BLOCKSIZEX 16 
#define BLOCKSIZEY 16 

/************************************/ 
/* SAVE REAL ARRAY FROM CPU TO FILE */ 
/************************************/ 
template <class T> 
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) { 

    std::ofstream outfile; 
    outfile.open(filename); 
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; 
    outfile.close(); 

} 

/************************************/ 
/* SAVE REAL ARRAY FROM GPU TO FILE */ 
/************************************/ 
template <class T> 
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) { 

    T *h_in = (T *)malloc(M * sizeof(T)); 

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost)); 

    std::ofstream outfile; 
    outfile.open(filename); 
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; 
    outfile.close(); 

} 

/***************************************************/ 
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */ 
/***************************************************/ 
// --- https://en.wikipedia.org/wiki/Hankel_matrix 
void setHankelMatrix(double * __restrict h_A, const int N) { 

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double)); 

    // --- Initialize random seed 
    srand(time(NULL)); 

    // --- Generate random numbers 
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand(); 

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent. 
    for (int i = 0; i < N; i++) 
     for (int j = 0; j < N; j++) 
      h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2]; 

    free(h_atemp); 

} 

/***********************************************/ 
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */ 
/***********************************************/ 
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
           double * __restrict h_y, const int N) { 

    for (int k = 0; k < N; k++) h_y[k] = 0.f; 

    for (int m = 0; m < N; m++) 
     for (int n = 0; n < N; n++) 
      h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n]; 

} 

/************************************/ 
/* COEFFICIENT REARRANGING FUNCTION */ 
/************************************/ 
void rearrange(double *vec, int *pivotArray, int N){ 
    for (int i = 0; i < N; i++) { 
     double temp = vec[i]; 
     vec[i] = vec[pivotArray[i] - 1]; 
     vec[pivotArray[i] - 1] = temp; 
    } 
} 

/********/ 
/* MAIN */ 
/********/ 
int main() { 

    const unsigned int N = 1000; 

    const unsigned int Nmatrices = 1; 

    // --- CUBLAS initialization 
    cublasHandle_t cublas_handle; 
    cublasSafeCall(cublasCreate(&cublas_handle)); 

    TimingGPU timerLU, timerApproach1, timerApproach2; 
    double timingLU, timingApproach1, timingApproach2; 

    /***********************/ 
    /* SETTING THE PROBLEM */ 
    /***********************/ 
    // --- Matrices to be inverted (only one in this example) 
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double)); 

    // --- Setting the Hankel matrix 
    setHankelMatrix(h_A, N); 

    // --- Defining the solution 
    double *h_xref = (double *)malloc(N * sizeof(double)); 
    for (int k = 0; k < N; k++) h_xref[k] = 1.f; 

    // --- Coefficient vectors (only one in this example) 
    double *h_y = (double *)malloc(N * sizeof(double)); 

    computeCoefficientsVector(h_A, h_xref, h_y, N); 

    // --- Result (only one in this example) 
    double *h_x = (double *)malloc(N * sizeof(double)); 

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double))); 
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *     sizeof(double))); 
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *     sizeof(double))); 

    // --- Move the relevant matrices from host to device 
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_y, h_y, N *     sizeof(double), cudaMemcpyHostToDevice)); 

    /**********************************/ 
    /* COMPUTING THE LU DECOMPOSITION */ 
    /**********************************/ 
    timerLU.StartCounter(); 

    // --- Creating the array of pointers needed as input/output to the batched getrf 
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *)); 
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N; 

    double **d_inout_pointers; 
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *))); 
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice)); 
    free(h_inout_pointers); 

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int))); 
    int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray,  Nmatrices * sizeof(int))); 

    int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int)); 

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices)); 
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices)); 

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    for (int i = 0; i < Nmatrices; i++) 
     if (h_InfoArray[i] != 0) { 
      fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i); 
      cudaDeviceReset(); 
      exit(EXIT_FAILURE); 
     } 

    timingLU = timerLU.GetCounter(); 
    printf("Timing LU decomposition %f [ms]\n", timingLU); 

    /*********************************/ 
    /* CHECKING THE LU DECOMPOSITION */ 
    /*********************************/ 
    saveCPUrealtxt(h_A,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N); 
    saveCPUrealtxt(h_y,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N); 
    saveGPUrealtxt(d_A,   "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N); 
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N); 

    /******************************************************************************/ 
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */ 
    /******************************************************************************/ 
    timerApproach1.StartCounter(); 

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double))); 

    // --- Creating the array of pointers needed as output to the batched getri 
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *)); 
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double)); 

    double **d_out_pointers; 
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *))); 
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice)); 
    free(h_out_pointers); 

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices)); 

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    for (int i = 0; i < Nmatrices; i++) 
     if (h_InfoArray[i] != 0) { 
     fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
     } 

    double alpha1 = 1.f; 
    double beta1 = 0.f; 

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1)); 

    timingApproach1 = timingLU + timerApproach1.GetCounter(); 
    printf("Timing approach 1 %f [ms]\n", timingApproach1); 

    /**************************/ 
    /* CHECKING APPROACH NR.1 */ 
    /**************************/ 
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N); 

    /*************************************************************/ 
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */ 
    /*************************************************************/ 
    timerApproach2.StartCounter(); 

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double))); 

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); 

    rearrange(h_y, h_pivotArray, N); 
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); 

    // --- Now P*A=L*U 
    //  Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y 

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f; 

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part 
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N)); 

    // --- Upper triangular part 
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N)); 

    timingApproach2 = timingLU + timerApproach2.GetCounter(); 
    printf("Timing approach 2 %f [ms]\n", timingApproach2); 

    /**************************/ 
    /* CHECKING APPROACH NR.2 */ 
    /**************************/ 
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N); 

    return 0; 
}

Die Utilities.cu und Utilities.cuh Dateien ein solches Beispiel laufen benötigt werden, in diesem github page gehalten. Die TimingGPU.cu und TimingGPU.cuh Dateien werden bei diesem github page verwaltet.

einige nützliche Hinweise auf den dritten Ansatz:

NAG Fortran Library Routine Document

Scientific Computing Software Library (SCSL) User’s Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

EDIT

Timings (in ms) für Ansätze nr. 2 und 3 (Tests mit einer GTX960-Karte, cc. 5.2).

N  LU decomposition  Approach nr. 2  Approach nr. 3 
100  1.08     2.75     1.28 
500  45.4     161     45.7 
1000  302     1053     303

Wie sich herausstellt, nähern Sie sich nr. 3 ist bequemer und seine Kosten sind im Wesentlichen die Kosten der Berechnung der LU-Faktorisierung. Außerdem:

durch LU-Zerlegung lineare Gleichungssysteme zu lösen, ist schneller als die Verwendung von QR-Zerlegung (siehe QR decomposition to solve linear systems in CUDA);
LU-Zerlegung ist auf quadratische lineare Systeme beschränkt, während QR-Zerlegung bei nichtquadratischen linearen Systemen hilft.

Die unter Matlab-Code kann zur Überprüfung der Ergebnisse

clear all 
close all 
clc 

warning off 

N = 1000; 

% --- Setting the problem solution 
x = ones(N, 1); 

%%%%%%%%%%%%%%%%%%%%% 
% NxN HANKEL MATRIX % 
%%%%%%%%%%%%%%%%%%%%% 
% --- https://en.wikipedia.org/wiki/Hankel_matrix 
load A.txt 
load y.txt 

A = reshape(A, N, N); 

yMatlab = A * x; 
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2))/sum(sum(abs(yMatlab).^2)))); 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
% COMPUTATION OF THE LU DECOMPOSITION % 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
[Lmatlab, Umatlab] = lu(A); 

load Adecomposed.txt 
Adecomposed = reshape(Adecomposed, N, N); 

L = eye(N); 
for k = 1 : N 
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k); 
end 
U = zeros(N); 
for k = 1 : N 
    U(k, k : N) = Adecomposed(k, k : N); 
end 

load pivotArray.txt 
Pj = eye(N); 
for j = 1 : N 
    tempVector = Pj(j, :); 
    Pj(j, :) = Pj(pivotArray(j), :); 
    Pj(pivotArray(j), :) = tempVector; 
end 

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2))/sum(sum(abs(Pj * A).^2)))); 

xprime  = inv(Lmatlab) * yMatlab; 
xMatlab  = inv(Umatlab) * xprime; 

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2))/sum(sum(abs(x).^2)))); 

load xApproach1.txt 
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2))/sum(sum(abs(x).^2)))); 

load xApproach2.txt 
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2))/sum(sum(abs(x).^2))));

Quelle

2015-03-01 21:45:39 JackOLantern

Lösen linearer Gleichungssysteme AX = B mit CUDA

Antwort

Verwandte Themen