working dsyevd example on GPU with logging

This commit is contained in:
Anton Ljungdahl 2024-10-15 17:03:05 +02:00
parent 879ae444a1
commit 48b67775f8
15 changed files with 8512 additions and 33 deletions

2
.gitignore vendored
View File

@ -94,3 +94,5 @@ dkms.conf
*.out
*.app
#visual studio
.vs

View File

@ -4,16 +4,31 @@ ctime -begin timeBuild.ctm
@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
@rem set CommonCompilerFlags="/nologo /Zi /FC"
set CommonCompilerFlags=/nologo /Zi /FC
set CommonCompilerFlags=/nologo /Zi /FC /Od
@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
@rem
@rem
set cuda_root=D:/lib/cudatoolkit/lib/x64
set mkl_root=D:/lib/oneAPI_mkl/mkl/2021.3.0
set mkl_core=%mkl_root%/lib/intel64/mkl_core.lib
set mkl_intel_lp64=%mkl_root%/lib/intel64/mkl_intel_lp64.lib
set mkl_intel_thread=%mkl_root%/lib/intel64/mkl_intel_thread.lib
set MKLCOMPILER=D:/lib/oneAPI_mkl/compiler/2021.3.0/windows/compiler
set libiomp5md=%MKLCOMPILER%/lib/intel64_win/libiomp5md.lib
set CudaSources=../src/cuda_entry_point.cu
set CudaObj=cuda_entry_point.obj
set Sources=../src/main.c
set CudaLib=D:/lib/cudatoolkit/lib/x64
set CudaSolver=%CudaLib%/cusolver.lib
IF NOT EXIST .\build mkdir .\build
pushd .\build
cl %CommonCompilerFlags% %Sources% -Feprogram.exe
@rem nvcc -Xcompiler %CommonCompilerFlags% -o program.exe %Sources% %CudaSources% -lcusolver
rem cl /c %CommonCompilerFlags% %Sources% /I"%mkl_root%\include"
rem FOR MSVC /link %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md%
nvcc -c %CudaSources% -o %CudaObj% -g -G -lcusolver
rem nvcc -o program.exe -lcusolver -L%mkl_root%/lib/intel64 -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -l%MKLCOMPILER%/lib/intel64_win/libiomp5md
cl %CommonCompilerFlags% %Sources% /I"%mkl_root%\include" /link %CudaObj% %cuda_root%/cudart.lib %cuda_root%/cusolver.lib %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md%
set LastError=%ERRORLEVEL%
popd

BIN
build/main.rdi Normal file

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,8 @@
{
"folders":
[
{
"path": "."
}
]
}

File diff suppressed because one or more lines are too long

4927
external/cusolverDn.h vendored Normal file

File diff suppressed because it is too large Load Diff

318
external/cusolverMg.h vendored Normal file
View File

@ -0,0 +1,318 @@
/*
* Copyright 2019 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CUSOLVERMG_H_)
#define CUSOLVERMG_H_
#include <stdint.h>
#include "cusolverDn.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
struct cusolverMgContext;
typedef struct cusolverMgContext *cusolverMgHandle_t;
/**
* \beief This enum decides how 1D device Ids (or process ranks) get mapped to
* a 2D grid.
*/
typedef enum {
CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
} cusolverMgGridMapping_t;
/** \brief Opaque structure of the distributed grid */
typedef void *cudaLibMgGrid_t;
/** \brief Opaque structure of the distributed matrix descriptor */
typedef void *cudaLibMgMatrixDesc_t;
cusolverStatus_t CUSOLVERAPI cusolverMgCreate(cusolverMgHandle_t *handle);
cusolverStatus_t CUSOLVERAPI cusolverMgDestroy(cusolverMgHandle_t handle);
cusolverStatus_t CUSOLVERAPI cusolverMgDeviceSelect(
cusolverMgHandle_t handle,
int nbDevices,
int deviceId[]);
/**
* \brief Allocates resources related to the shared memory device grid.
* \param[out] grid the opaque data strcuture that holds the grid
* \param[in] numRowDevices number of devices in the row
* \param[in] numColDevices number of devices in the column
* \param[in] deviceId This array of size height * width stores the
* device-ids of the 2D grid; each entry must correspond to a valid
* gpu or to -1 (denoting CPU). \param[in] mapping whether the 2D grid is in
* row/column major \returns the status code
*/
cusolverStatus_t CUSOLVERAPI cusolverMgCreateDeviceGrid(
cudaLibMgGrid_t * grid,
int32_t numRowDevices,
int32_t numColDevices,
const int32_t deviceId[],
cusolverMgGridMapping_t mapping);
/**
* \brief Releases the allocated resources related to the distributed grid.
* \param[in] grid the opaque data strcuture that holds the distributed grid
* \returns the status code
*/
cusolverStatus_t CUSOLVERAPI cusolverMgDestroyGrid(cudaLibMgGrid_t grid);
/**
* \brief Allocates resources related to the distributed matrix descriptor.
* \param[out] desc the opaque data strcuture that holds the descriptor
* \param[in] numRows number of total rows
* \param[in] numCols number of total columns
* \param[in] rowBlockSize row block size
* \param[in] colBlockSize column block size
* \param[in] dataType the data type of each element in cudaDataType
* \param[in] grid the opaque data structure of the distributed grid
* \returns the status code
*/
cusolverStatus_t CUSOLVERAPI cusolverMgCreateMatrixDesc(
cudaLibMgMatrixDesc_t *desc,
int64_t numRows,
int64_t numCols,
int64_t rowBlockSize,
int64_t colBlockSize,
cudaDataType dataType,
const cudaLibMgGrid_t grid);
/**
* \brief Releases the allocated resources related to the distributed matrix
* descriptor. \param[in] desc the opaque data strcuture that holds the
* descriptor \returns the status code
*/
cusolverStatus_t CUSOLVERAPI
cusolverMgDestroyMatrixDesc(cudaLibMgMatrixDesc_t desc);
cusolverStatus_t CUSOLVERAPI cusolverMgSyevd_bufferSize(
cusolverMgHandle_t handle,
cusolverEigMode_t jobz,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
void * W,
cudaDataType dataTypeW,
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgSyevd(
cusolverMgHandle_t handle,
cusolverEigMode_t jobz,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
void * W,
cudaDataType dataTypeW,
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * info);
cusolverStatus_t CUSOLVERAPI cusolverMgGetrf_bufferSize(
cusolverMgHandle_t handle,
int M,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
int * array_d_IPIV[],
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgGetrf(
cusolverMgHandle_t handle,
int M,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
int * array_d_IPIV[],
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * info);
cusolverStatus_t CUSOLVERAPI cusolverMgGetrs_bufferSize(
cusolverMgHandle_t handle,
cublasOperation_t TRANS,
int N,
int NRHS,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
int * array_d_IPIV[],
void * array_d_B[],
int IB,
int JB,
cudaLibMgMatrixDesc_t descrB,
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgGetrs(
cusolverMgHandle_t handle,
cublasOperation_t TRANS,
int N,
int NRHS,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
int * array_d_IPIV[],
void * array_d_B[],
int IB,
int JB,
cudaLibMgMatrixDesc_t descrB,
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * info);
cusolverStatus_t CUSOLVERAPI cusolverMgPotrf_bufferSize(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgPotrf(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * h_info);
cusolverStatus_t CUSOLVERAPI cusolverMgPotrs_bufferSize(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int n,
int nrhs,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
void * array_d_B[],
int IB,
int JB,
cudaLibMgMatrixDesc_t descrB,
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgPotrs(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int n,
int nrhs,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
void * array_d_B[],
int IB,
int JB,
cudaLibMgMatrixDesc_t descrB,
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * h_info);
cusolverStatus_t CUSOLVERAPI cusolverMgPotri_bufferSize(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
cudaDataType computeType,
int64_t * lwork);
cusolverStatus_t CUSOLVERAPI cusolverMgPotri(
cusolverMgHandle_t handle,
cublasFillMode_t uplo,
int N,
void * array_d_A[],
int IA,
int JA,
cudaLibMgMatrixDesc_t descrA,
cudaDataType computeType,
void * array_d_work[],
int64_t lwork,
int * h_info);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // CUSOLVERMG_H_

339
external/cusolverRf.h vendored Normal file
View File

@ -0,0 +1,339 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CUSOLVERRF_H_)
#define CUSOLVERRF_H_
#include "driver_types.h"
#include "cuComplex.h"
#include "cusolver_common.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/* CUSOLVERRF mode */
typedef enum {
CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default
CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
} cusolverRfResetValuesFastMode_t;
/* CUSOLVERRF matrix format */
typedef enum {
CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default
CUSOLVERRF_MATRIX_FORMAT_CSC = 1
} cusolverRfMatrixFormat_t;
/* CUSOLVERRF unit diagonal */
typedef enum {
CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default
CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
} cusolverRfUnitDiagonal_t;
/* CUSOLVERRF factorization algorithm */
typedef enum {
CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default
CUSOLVERRF_FACTORIZATION_ALG1 = 1,
CUSOLVERRF_FACTORIZATION_ALG2 = 2,
} cusolverRfFactorization_t;
/* CUSOLVERRF triangular solve algorithm */
typedef enum {
CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default
CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
} cusolverRfTriangularSolve_t;
/* CUSOLVERRF numeric boost report */
typedef enum {
CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default
CUSOLVERRF_NUMERIC_BOOST_USED = 1
} cusolverRfNumericBoostReport_t;
/* Opaque structure holding CUSOLVERRF library common */
struct cusolverRfCommon;
typedef struct cusolverRfCommon* cusolverRfHandle_t;
/* CUSOLVERRF create (allocate memory) and destroy (free memory) in the handle
*/
cusolverStatus_t CUSOLVERAPI cusolverRfCreate(cusolverRfHandle_t* handle);
cusolverStatus_t CUSOLVERAPI cusolverRfDestroy(cusolverRfHandle_t handle);
/* CUSOLVERRF set and get input format */
cusolverStatus_t CUSOLVERAPI cusolverRfGetMatrixFormat(
cusolverRfHandle_t handle,
cusolverRfMatrixFormat_t* format,
cusolverRfUnitDiagonal_t* diag);
cusolverStatus_t CUSOLVERAPI cusolverRfSetMatrixFormat(
cusolverRfHandle_t handle,
cusolverRfMatrixFormat_t format,
cusolverRfUnitDiagonal_t diag);
/* CUSOLVERRF set and get numeric properties */
cusolverStatus_t CUSOLVERAPI cusolverRfSetNumericProperties(
cusolverRfHandle_t handle,
double zero,
double boost);
cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericProperties(
cusolverRfHandle_t handle,
double* zero,
double* boost);
cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericBoostReport(
cusolverRfHandle_t handle,
cusolverRfNumericBoostReport_t* report);
/* CUSOLVERRF choose the triangular solve algorithm */
cusolverStatus_t CUSOLVERAPI cusolverRfSetAlgs(
cusolverRfHandle_t handle,
cusolverRfFactorization_t factAlg,
cusolverRfTriangularSolve_t solveAlg);
cusolverStatus_t CUSOLVERAPI cusolverRfGetAlgs(
cusolverRfHandle_t handle,
cusolverRfFactorization_t* factAlg,
cusolverRfTriangularSolve_t* solveAlg);
/* CUSOLVERRF set and get fast mode */
cusolverStatus_t CUSOLVERAPI cusolverRfGetResetValuesFastMode(
cusolverRfHandle_t handle,
cusolverRfResetValuesFastMode_t* fastMode);
cusolverStatus_t CUSOLVERAPI cusolverRfSetResetValuesFastMode(
cusolverRfHandle_t handle,
cusolverRfResetValuesFastMode_t fastMode);
/*** Non-Batched Routines ***/
/* CUSOLVERRF setup of internal structures from host or device memory */
cusolverStatus_t CUSOLVERAPI
cusolverRfSetupHost(/* Input (in the host memory) */
int n,
int nnzA,
int* h_csrRowPtrA,
int* h_csrColIndA,
double* h_csrValA,
int nnzL,
int* h_csrRowPtrL,
int* h_csrColIndL,
double* h_csrValL,
int nnzU,
int* h_csrRowPtrU,
int* h_csrColIndU,
double* h_csrValU,
int* h_P,
int* h_Q,
/* Output */
cusolverRfHandle_t handle);
cusolverStatus_t CUSOLVERAPI
cusolverRfSetupDevice(/* Input (in the device memory) */
int n,
int nnzA,
int* csrRowPtrA,
int* csrColIndA,
double* csrValA,
int nnzL,
int* csrRowPtrL,
int* csrColIndL,
double* csrValL,
int nnzU,
int* csrRowPtrU,
int* csrColIndU,
double* csrValU,
int* P,
int* Q,
/* Output */
cusolverRfHandle_t handle);
/* CUSOLVERRF update the matrix values (assuming the reordering, pivoting
and consequently the sparsity pattern of L and U did not change),
and zero out the remaining values. */
cusolverStatus_t CUSOLVERAPI
cusolverRfResetValues(/* Input (in the device memory) */
int n,
int nnzA,
int* csrRowPtrA,
int* csrColIndA,
double* csrValA,
int* P,
int* Q,
/* Output */
cusolverRfHandle_t handle);
/* CUSOLVERRF analysis (for parallelism) */
cusolverStatus_t CUSOLVERAPI cusolverRfAnalyze(cusolverRfHandle_t handle);
/* CUSOLVERRF re-factorization (for parallelism) */
cusolverStatus_t CUSOLVERAPI cusolverRfRefactor(cusolverRfHandle_t handle);
/* CUSOLVERRF extraction: Get L & U packed into a single matrix M */
cusolverStatus_t CUSOLVERAPI
cusolverRfAccessBundledFactorsDevice(/* Input */
cusolverRfHandle_t handle,
/* Output (in the host memory) */
int* nnzM,
/* Output (in the device memory) */
int** Mp,
int** Mi,
double** Mx);
cusolverStatus_t CUSOLVERAPI
cusolverRfExtractBundledFactorsHost(/* Input */
cusolverRfHandle_t handle,
/* Output (in the host memory) */
int* h_nnzM,
int** h_Mp,
int** h_Mi,
double** h_Mx);
/* CUSOLVERRF extraction: Get L & U individually */
cusolverStatus_t CUSOLVERAPI
cusolverRfExtractSplitFactorsHost(/* Input */
cusolverRfHandle_t handle,
/* Output (in the host memory) */
int* h_nnzL,
int** h_csrRowPtrL,
int** h_csrColIndL,
double** h_csrValL,
int* h_nnzU,
int** h_csrRowPtrU,
int** h_csrColIndU,
double** h_csrValU);
/* CUSOLVERRF (forward and backward triangular) solves */
cusolverStatus_t CUSOLVERAPI
cusolverRfSolve(/* Input (in the device memory) */
cusolverRfHandle_t handle,
int* P,
int* Q,
int nrhs, // only nrhs=1 is supported
double* Temp, // of size ldt*nrhs (ldt>=n)
int ldt,
/* Input/Output (in the device memory) */
double* XF,
/* Input */
int ldxf);
/*** Batched Routines ***/
/* CUSOLVERRF-batch setup of internal structures from host */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchSetupHost(/* Input (in the host memory)*/
int batchSize,
int n,
int nnzA,
int* h_csrRowPtrA,
int* h_csrColIndA,
double* h_csrValA_array[],
int nnzL,
int* h_csrRowPtrL,
int* h_csrColIndL,
double* h_csrValL,
int nnzU,
int* h_csrRowPtrU,
int* h_csrColIndU,
double* h_csrValU,
int* h_P,
int* h_Q,
/* Output (in the device memory) */
cusolverRfHandle_t handle);
/* CUSOLVERRF-batch update the matrix values (assuming the reordering,
pivoting and consequently the sparsity pattern of L and U did not change),
and zero out the remaining values. */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchResetValues(/* Input (in the device memory) */
int batchSize,
int n,
int nnzA,
int* csrRowPtrA,
int* csrColIndA,
double* csrValA_array[],
int* P,
int* Q,
/* Output */
cusolverRfHandle_t handle);
/* CUSOLVERRF-batch analysis (for parallelism) */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchAnalyze(cusolverRfHandle_t handle);
/* CUSOLVERRF-batch re-factorization (for parallelism) */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchRefactor(cusolverRfHandle_t handle);
/* CUSOLVERRF-batch (forward and backward triangular) solves */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchSolve(/* Input (in the device memory) */
cusolverRfHandle_t handle,
int* P,
int* Q,
int nrhs, // only nrhs=1 is supported
double* Temp, // of size 2*batchSize*(n*nrhs)
int ldt, // only ldt=n is supported
/* Input/Output (in the device memory) */
double* XF_array[],
/* Input */
int ldxf);
/* CUSOLVERRF-batch obtain the position of zero pivot */
cusolverStatus_t CUSOLVERAPI
cusolverRfBatchZeroPivot(/* Input */
cusolverRfHandle_t handle,
/* Output (in the host memory) */
int* position);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* CUSOLVERRF_H_ */

923
external/cusolverSp.h vendored Normal file
View File

@ -0,0 +1,923 @@
/*
* Copyright 2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CUSOLVERSP_H_)
#define CUSOLVERSP_H_
#include "cusparse.h"
#include "cublas_v2.h"
#include "cusolver_common.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
struct cusolverSpContext;
typedef struct cusolverSpContext *cusolverSpHandle_t;
struct csrqrInfo;
typedef struct csrqrInfo *csrqrInfo_t;
cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle);
cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle);
cusolverStatus_t CUSOLVERAPI
cusolverSpSetStream(cusolverSpHandle_t handle, cudaStream_t streamId);
cusolverStatus_t CUSOLVERAPI
cusolverSpGetStream(cusolverSpHandle_t handle, cudaStream_t *streamId);
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrissymHost(
cusolverSpHandle_t handle,
int m,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrEndPtrA,
const int * csrColIndA,
int * issym);
/* -------- GPU linear solver by LU factorization
* solve A*x = b, A can be singular
* [ls] stands for linear solve
* [v] stands for vector
* [lu] stands for LU factorization
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvluHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const float * b,
float tol,
int reorder,
float * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvluHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const double * b,
double tol,
int reorder,
double * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvluHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuComplex * b,
float tol,
int reorder,
cuComplex * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvluHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuDoubleComplex * b,
double tol,
int reorder,
cuDoubleComplex * x,
int * singularity);
/* -------- GPU linear solver by QR factorization
* solve A*x = b, A can be singular
* [ls] stands for linear solve
* [v] stands for vector
* [qr] stands for QR factorization
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const float * b,
float tol,
int reorder,
float * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const double * b,
double tol,
int reorder,
double * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuComplex * b,
float tol,
int reorder,
cuComplex * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuDoubleComplex * b,
double tol,
int reorder,
cuDoubleComplex * x,
int * singularity);
/* -------- CPU linear solver by QR factorization
* solve A*x = b, A can be singular
* [ls] stands for linear solve
* [v] stands for vector
* [qr] stands for QR factorization
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqrHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const float * b,
float tol,
int reorder,
float * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqrHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const double * b,
double tol,
int reorder,
double * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqrHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuComplex * b,
float tol,
int reorder,
cuComplex * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqrHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuDoubleComplex * b,
double tol,
int reorder,
cuDoubleComplex * x,
int * singularity);
/* -------- CPU linear solver by Cholesky factorization
* solve A*x = b, A can be singular
* [ls] stands for linear solve
* [v] stands for vector
* [chol] stands for Cholesky factorization
*
* Only works for symmetric positive definite matrix.
* The upper part of A is ignored.
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvcholHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const float * b,
float tol,
int reorder,
float * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvcholHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const double * b,
double tol,
int reorder,
double * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvcholHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuComplex * b,
float tol,
int reorder,
cuComplex * x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvcholHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuDoubleComplex * b,
double tol,
int reorder,
cuDoubleComplex * x,
int * singularity);
/* -------- GPU linear solver by Cholesky factorization
* solve A*x = b, A can be singular
* [ls] stands for linear solve
* [v] stands for vector
* [chol] stands for Cholesky factorization
*
* Only works for symmetric positive definite matrix.
* The upper part of A is ignored.
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvchol(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const float * b,
float tol,
int reorder,
// output
float *x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvchol(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const double * b,
double tol,
int reorder,
// output
double *x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvchol(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuComplex * b,
float tol,
int reorder,
// output
cuComplex *x,
int * singularity);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvchol(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
const cuDoubleComplex * b,
double tol,
int reorder,
// output
cuDoubleComplex *x,
int * singularity);
/* ----------- CPU least square solver by QR factorization
* solve min|b - A*x|
* [lsq] stands for least square
* [v] stands for vector
* [qr] stands for QR factorization
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsqvqrHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const float * b,
float tol,
int * rankA,
float * x,
int * p,
float * min_norm);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsqvqrHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const double * b,
double tol,
int * rankA,
double * x,
int * p,
double * min_norm);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsqvqrHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuComplex * b,
float tol,
int * rankA,
cuComplex * x,
int * p,
float * min_norm);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsqvqrHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuDoubleComplex * b,
double tol,
int * rankA,
cuDoubleComplex * x,
int * p,
double * min_norm);
/* --------- CPU eigenvalue solver by shift inverse
* solve A*x = lambda * x
* where lambda is the eigenvalue nearest mu0.
* [eig] stands for eigenvalue solver
* [si] stands for shift-inverse
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsiHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
float mu0,
const float * x0,
int maxite,
float tol,
float * mu,
float * x);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsiHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
double mu0,
const double * x0,
int maxite,
double tol,
double * mu,
double * x);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsiHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuComplex mu0,
const cuComplex * x0,
int maxite,
float tol,
cuComplex * mu,
cuComplex * x);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsiHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuDoubleComplex mu0,
const cuDoubleComplex * x0,
int maxite,
double tol,
cuDoubleComplex * mu,
cuDoubleComplex * x);
/* --------- GPU eigenvalue solver by shift inverse
* solve A*x = lambda * x
* where lambda is the eigenvalue nearest mu0.
* [eig] stands for eigenvalue solver
* [si] stands for shift-inverse
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsi(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
float mu0,
const float * x0,
int maxite,
float eps,
float * mu,
float * x);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsi(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
double mu0,
const double * x0,
int maxite,
double eps,
double * mu,
double * x);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsi(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuComplex mu0,
const cuComplex * x0,
int maxite,
float eps,
cuComplex * mu,
cuComplex * x);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsi(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuDoubleComplex mu0,
const cuDoubleComplex * x0,
int maxite,
double eps,
cuDoubleComplex * mu,
cuDoubleComplex * x);
// ----------- enclosed eigenvalues
cusolverStatus_t CUSOLVERAPI cusolverSpScsreigsHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuComplex left_bottom_corner,
cuComplex right_upper_corner,
int * num_eigs);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigsHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuDoubleComplex left_bottom_corner,
cuDoubleComplex right_upper_corner,
int * num_eigs);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigsHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuComplex left_bottom_corner,
cuComplex right_upper_corner,
int * num_eigs);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigsHost(
cusolverSpHandle_t handle,
int m,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
cuDoubleComplex left_bottom_corner,
cuDoubleComplex right_upper_corner,
int * num_eigs);
/* --------- CPU symrcm
* Symmetric reverse Cuthill McKee permutation
*
*/
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymrcmHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
int * p);
/* --------- CPU symmdq
* Symmetric minimum degree algorithm by quotient graph
*
*/
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymmdqHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
int * p);
/* --------- CPU symmdq
* Symmetric Approximate minimum degree algorithm by quotient graph
*
*/
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymamdHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
int * p);
/* --------- CPU metis
* symmetric reordering
*/
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrmetisndHost(
cusolverSpHandle_t handle,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
const int64_t * options,
int * p);
/* --------- CPU zfd
* Zero free diagonal reordering
*/
cusolverStatus_t CUSOLVERAPI cusolverSpScsrzfdHost(
cusolverSpHandle_t handle,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
int * P,
int * numnz);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrzfdHost(
cusolverSpHandle_t handle,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
int * P,
int * numnz);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrzfdHost(
cusolverSpHandle_t handle,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
int * P,
int * numnz);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrzfdHost(
cusolverSpHandle_t handle,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
int * P,
int * numnz);
/* --------- CPU permuation
* P*A*Q^T
*
*/
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrperm_bufferSizeHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
const int * p,
const int * q,
size_t * bufferSizeInBytes);
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrpermHost(
cusolverSpHandle_t handle,
int m,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
int * csrRowPtrA,
int * csrColIndA,
const int * p,
const int * q,
int * map,
void * pBuffer);
/*
* Low-level API: Batched QR
*
*/
cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrqrInfo(csrqrInfo_t *info);
cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrqrInfo(csrqrInfo_t info);
cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnzA,
const cusparseMatDescr_t descrA,
const int * csrRowPtrA,
const int * csrColIndA,
csrqrInfo_t info);
cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrVal,
const int * csrRowPtr,
const int * csrColInd,
int batchSize,
csrqrInfo_t info,
size_t * internalDataInBytes,
size_t * workspaceInBytes);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrVal,
const int * csrRowPtr,
const int * csrColInd,
int batchSize,
csrqrInfo_t info,
size_t * internalDataInBytes,
size_t * workspaceInBytes);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
int batchSize,
csrqrInfo_t info,
size_t * internalDataInBytes,
size_t * workspaceInBytes);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrVal,
const int * csrRowPtr,
const int * csrColInd,
int batchSize,
csrqrInfo_t info,
size_t * internalDataInBytes,
size_t * workspaceInBytes);
cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrsvBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const float * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const float * b,
float * x,
int batchSize,
csrqrInfo_t info,
void * pBuffer);
cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrsvBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const double * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const double * b,
double * x,
int batchSize,
csrqrInfo_t info,
void * pBuffer);
cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrsvBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuComplex * b,
cuComplex * x,
int batchSize,
csrqrInfo_t info,
void * pBuffer);
cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrsvBatched(
cusolverSpHandle_t handle,
int m,
int n,
int nnz,
const cusparseMatDescr_t descrA,
const cuDoubleComplex * csrValA,
const int * csrRowPtrA,
const int * csrColIndA,
const cuDoubleComplex * b,
cuDoubleComplex * x,
int batchSize,
csrqrInfo_t info,
void * pBuffer);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // define CUSOLVERSP_H_

1107
external/cusolverSp_LOWLEVEL_PREVIEW.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,28 +1,174 @@
#include <stdio.h>
#include "../external/cusolver_common.h"
#define no_name_mangle extern "C"
#include <windows.h>
#define WIN32_LEAN_AND_MEAN
__global__ void hello_from_GPU() {
printf("HELLO FROM GPUUUU");
#include "../external/cusolver_common.h"
#include "../external/cusolverDn.h"
#ifdef __cplusplus
# define no_name_mangle extern "C"
#else
# define no_name_mangle
#endif
#define function static
#define GLOBAL_LOG_BUFFER_SIZE 4096*4096
__device__ int last_index;
__device__ char *device_log_buffer;
char *log_buffer;
#define D_LOG(msg) d_debug_printf((msg))
//#define D_LOGF(msg, ...) d_debug_printfv((msg), __VA_ARGS__)
#define H_LOG(msg) h_debug_printf((msg))
#define H_LOGF(msg, ...) h_debug_printfv((msg), __VA_ARGS__)
__host__ function void h_debug_printf(const char *msg);
__host__ function void h_debug_printfv(const char *fmt, ...);
__device__ function void d_debug_printf(const char *msg);
//__device__ function void d_debug_printfv(const char *fmt, ...);
__global__ void init_device_logging()
{
last_index = 0;
}
void check_cusolver_version() {
__host__ function void
h_debug_printf(const char* msg)
{
OutputDebugStringA(msg);
}
__host__ function void
h_debug_printfv(const char *fmt, ...)
{
char buffer[2048] = {0};
va_list args;
va_start(args, fmt);
vsnprintf(&buffer[0], sizeof(buffer), fmt, args);
va_end(args);
OutputDebugStringA(buffer);
}
__device__ function void
d_debug_printf(const char *msg) {
int i = 0;
while(msg[i] != '\0')
{
device_log_buffer[last_index + i] = msg[i];
i += 1;
}
last_index = last_index + 1;
}
__global__ void hello_from_GPU()
{
D_LOG("HELLO FROM GPUUUU\n");
}
void
check_cusolver_version()
{
int major=-1, minor=-1, patch=-1;
cusolverGetProperty(MAJOR_VERSION, &major);
cusolverGetProperty(MINOR_VERSION, &minor);
cusolverGetProperty(PATCH_LEVEL, &patch);
printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);
H_LOGF("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);
}
no_name_mangle void cuda_entry_point() {
no_name_mangle void
cuda_entry_point()
{
H_LOG("\n\n-- Entered CUDA code:\n\n");
char *temp_device_log_buffer;
cudaMalloc((void**)&temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
cudaMemcpyToSymbol(device_log_buffer, &temp_device_log_buffer, sizeof(char *));
check_cusolver_version();
hello_from_GPU<<<1, 1>>>();
init_device_logging<<<1, 1>>>();
cudaDeviceSynchronize();
hello_from_GPU<<<1, 1>>>();
cudaDeviceSynchronize();
log_buffer = (char *)malloc(GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
cudaMemcpy(log_buffer, temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char), cudaMemcpyDeviceToHost);
// Print all messages from cuda.
H_LOG(log_buffer);
H_LOG("\n\n Testing cuSolver Eigenvalue problem \n\n");
const int size = 5;
const int mat_size = size*size;
double A[mat_size] = {
6.39, 0.13, -8.23, 5.71, -3.18,
0.13, 8.37, -4.46, -6.10, 7.21,
-8.23, -4.46, -9.58, -9.25, -7.42,
5.71, -6.10, -9.25, 3.72, 8.54,
-3.18, 7.21, -7.42, 8.54, 2.51};
double *device_mat;
cudaMalloc((void**)&device_mat, mat_size * sizeof(double));
cudaMemcpy(device_mat, A, mat_size * sizeof(double), cudaMemcpyHostToDevice);
cusolverDnHandle_t cusolverH = 0;
cusolverDnCreate(&cusolverH);
double W[size];
double *d_W;
cudaMalloc((void**)&d_W, size * sizeof(double));
int lwork;
cusolverDnDsyevd_bufferSize(
cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
size, device_mat, size, d_W, &lwork
);
double *d_work;
cudaMalloc((void**)&d_work, lwork * sizeof(double));
int *device_info;
cudaMalloc((void**)&device_info, sizeof(int));
cusolverDnDsyevd(
cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
size, device_mat, size, d_W, d_work, lwork, device_info
);
double out_mat[mat_size];
cudaMemcpy(W, d_W, size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(out_mat, device_mat, mat_size * sizeof(double), cudaMemcpyDeviceToHost);
H_LOG("Results on GPU dsyevd:\n");
H_LOG("Eigenvalues:\n");
for(int i = 0; i < size; i++)
{
H_LOGF("%6.2f\n", W[i]);
}
H_LOG("\n");
H_LOG("Eigenvectors (columnwise):\n");
for(int i = 0; i < size; i++)
{
for(int j = 0; j < size; j++)
{
H_LOGF("%6.2f ", out_mat[j * size + i]);
}
H_LOG("\n");
}
cudaFree(temp_device_log_buffer);
}

View File

@ -14,7 +14,7 @@
#include "base/base_inc.c"
#include "os/os_inc.c"
#include "os/os_entry_point.c"
#include "test_mkl.c"
#define grid_file_path_bin "D:\\dev\\eigsol_gpu\\out\\grid.bin"
#define grid_file_path "D:\\dev\\eigsol_gpu\\out\\grid.dat"
@ -53,8 +53,6 @@ struct BSplineCtx
F64 *bsplines;
};
//~ Globals
global Grid g_grid = {0};
global BSplineCtx g_bspline_ctx = {0};
@ -205,11 +203,11 @@ set_up_bspline_context(Arena* arena)
{
// Create knotpoint sequence.
U32 k = 4;
U32 N = 14;
U32 bspl_N = 14;
g_bspline_ctx.order = k;
g_bspline_ctx.num_knotpoints = N;
g_bspline_ctx.num_bsplines = N-k;
g_bspline_ctx.num_phys_points = N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
g_bspline_ctx.num_knotpoints = bspl_N;
g_bspline_ctx.num_bsplines = bspl_N-k;
g_bspline_ctx.num_phys_points = bspl_N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
g_bspline_ctx.arena = arena;
g_bspline_ctx.knotpoints = PushArray(arena, F64, g_bspline_ctx.num_knotpoints);
// Set up physical points;
@ -308,36 +306,37 @@ write_bsplines_to_matrix_F64(Arena *arena)
str8_list_push(scratch.arena, &bspline_array_list, row_joined);
}
write_string_list_to_file(scratch.arena, str8_lit(bspline_array_file_path), &bspline_array_list);
}
function
void EntryPoint()
void EntryPoint(void)
{
OS_InitReceipt os_receipt = OS_init();
OS_InitGfxReceipt os_gfx_receipt = OS_gfx_init(os_receipt);
Arena *arena = m_make_arena();
//- Set up grid and write to file.
set_up_grid(arena);
write_array_binary_F64(str8_lit(grid_file_path_bin), g_grid.points, g_grid.num_steps);
write_array_F64(str8_lit(grid_file_path), g_grid.points, g_grid.num_steps, "%13.6e\n");
//- The BSpline context is the knotpoints and the BSpline order etc.
set_up_bspline_context(arena);
write_array_F64(str8_lit(knotpoints_file_path),
g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints,
"%13.6e\n");
write_array_F64(str8_lit(knotpoints_file_path), g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints, "%13.6e\n");
//- Then we generate the BSplines and save them off for reference and debugging.
write_bsplines_to_matrix_F64(arena);
//- Test MKL
test_mkl_zgeev();
test_mkl_dsyevd();
cuda_entry_point();
}

272
src/test_mkl.c Normal file
View File

@ -0,0 +1,272 @@
/*******************************************************************************
* Copyright 2009-2021 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/
/*
ZGEEV Example.
==============
Program computes the eigenvalues and left and right eigenvectors of a general
rectangular matrix A:
( -3.84, 2.25) ( -8.94, -4.75) ( 8.95, -6.53) ( -9.87, 4.82)
( -0.66, 0.83) ( -4.40, -3.82) ( -3.50, -4.26) ( -3.15, 7.36)
( -3.99, -4.73) ( -5.88, -6.60) ( -3.36, -0.40) ( -0.75, 5.23)
( 7.74, 4.18) ( 3.66, -7.53) ( 2.58, 3.60) ( 4.59, 5.41)
Description.
============
The routine computes for an n-by-n complex nonsymmetric matrix A, the
eigenvalues and, optionally, the left and/or right eigenvectors. The right
eigenvector v(j) of A satisfies
A*v(j)= lambda(j)*v(j)
where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies
u(j)H*A = lambda(j)*u(j)H
where u(j)H denotes the conjugate transpose of u(j). The computed
eigenvectors are normalized to have Euclidean norm equal to 1 and
largest component real.
Example Program Results.
========================
ZGEEV Example Program Results
Eigenvalues
( -9.43,-12.98) ( -3.44, 12.69) ( 0.11, -3.40) ( 5.76, 7.13)
Left eigenvectors
( 0.24, -0.18) ( 0.61, 0.00) ( -0.18, -0.33) ( 0.28, 0.09)
( 0.79, 0.00) ( -0.05, -0.27) ( 0.82, 0.00) ( -0.55, 0.16)
( 0.22, -0.27) ( -0.21, 0.53) ( -0.37, 0.15) ( 0.45, 0.09)
( -0.02, 0.41) ( 0.40, -0.24) ( 0.06, 0.12) ( 0.62, 0.00)
Right eigenvectors
( 0.43, 0.33) ( 0.83, 0.00) ( 0.60, 0.00) ( -0.31, 0.03)
( 0.51, -0.03) ( 0.08, -0.25) ( -0.40, -0.20) ( 0.04, 0.34)
( 0.62, 0.00) ( -0.25, 0.28) ( -0.09, -0.48) ( 0.36, 0.06)
( -0.23, 0.11) ( -0.10, -0.32) ( -0.43, 0.13) ( 0.81, 0.00)
*/
#include <stdlib.h>
#include <stdio.h>
#include <mkl.h>
/* Auxiliary routines prototypes */
function void
print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda );
#ifndef TEST_MKL_N
#define TEST_MKL_N 4
#endif
/* Main program */
static void
test_mkl_zgeev(void) {
/* Locals */
int N = TEST_MKL_N;
MKL_INT n = N, lda = N, ldvl = N, ldvr = N, info, lwork;
MKL_Complex16 wkopt;
MKL_Complex16* work;
/* Local arrays */
/* rwork dimension should be at least 2*n */
double rwork[2*TEST_MKL_N];
MKL_Complex16 w[TEST_MKL_N], vl[TEST_MKL_N*TEST_MKL_N], vr[TEST_MKL_N*TEST_MKL_N];
MKL_Complex16 a[TEST_MKL_N*TEST_MKL_N] = {
{-3.84, 2.25}, {-0.66, 0.83}, {-3.99, -4.73}, { 7.74, 4.18},
{-8.94, -4.75}, {-4.40, -3.82}, {-5.88, -6.60}, { 3.66, -7.53},
{ 8.95, -6.53}, {-3.50, -4.26}, {-3.36, -0.40}, { 2.58, 3.60},
{-9.87, 4.82}, {-3.15, 7.36}, {-0.75, 5.23}, { 4.59, 5.41}
};
/* Executable statements */
OutputDebugString( " ZGEEV Example Program Results\n" );
/* Query and allocate the optimal workspace */
lwork = -1;
zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
&wkopt, &lwork, rwork, &info );
lwork = (MKL_INT)wkopt.real;
work = (MKL_Complex16*)malloc( lwork*sizeof(MKL_Complex16) );
/* Solve eigenproblem */
zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
work, &lwork, rwork, &info );
/* Check for convergence */
if( info > 0 ) {
OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
exit( 1 );
}
/* Print eigenvalues */
print_matrix_cmplx16( "Eigenvalues", 1, n, w, 1 );
/* Print left eigenvectors */
print_matrix_cmplx16( "Left eigenvectors", n, n, vl, ldvl );
/* Print right eigenvectors */
print_matrix_cmplx16( "Right eigenvectors", n, n, vr, ldvr );
/* Free workspace */
free( (void*)work );
//exit( 0 );
} /* End of ZGEEV Example */
/* Auxiliary routine: printing a matrix */
function void
print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda ) {
ArenaTemp scratch = scratch_get(0,0);
int i, j;
String8 newline = str8_lit("\n");
String8 header = str8_pushf(scratch.arena, "\n %s\n", desc );
OutputDebugString(header.str);
//printf("\n %s \n", desc);
for( i = 0; i < m; i++ ) {
for( j = 0; j < n; j++ ) {
String8 outstr = str8_pushf(scratch.arena, " (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
OutputDebugString(outstr.str);
//printf(" (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
}
OutputDebugString(newline.str);
//printf("\n");
}
}
/*******************************************************************************
* Copyright 2009-2021 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/
/*
DSYEVD Example.
==============
Program computes all eigenvalues and eigenvectors of a real symmetric
matrix A using divide and conquer algorithm, where A is:
6.39 0.13 -8.23 5.71 -3.18
0.13 8.37 -4.46 -6.10 7.21
-8.23 -4.46 -9.58 -9.25 -7.42
5.71 -6.10 -9.25 3.72 8.54
-3.18 7.21 -7.42 8.54 2.51
Description.
============
The routine computes all eigenvalues and, optionally, eigenvectors of an
n-by-n real symmetric matrix A. The eigenvector v(j) of A satisfies
A*v(j) = lambda(j)*v(j)
where lambda(j) is its eigenvalue. The computed eigenvectors are
orthonormal.
If the eigenvectors are requested, then this routine uses a divide and
conquer algorithm to compute eigenvalues and eigenvectors.
Example Program Results.
========================
DSYEVD Example Program Results
Eigenvalues
-17.44 -11.96 6.72 14.25 19.84
Eigenvectors (stored columnwise)
-0.26 0.31 -0.74 0.33 0.42
-0.17 -0.39 -0.38 -0.80 0.16
-0.89 0.04 0.09 0.03 -0.45
-0.29 -0.59 0.34 0.31 0.60
-0.19 0.63 0.44 -0.38 0.48
*/
#include <stdlib.h>
#include <stdio.h>
#include <mkl.h>
/* Auxiliary routines prototypes */
extern void print_matrix( char* desc, int m, int n, double* a, int lda );
/* Parameters */
#define N 5
#define LDA N
/* Main program */
function void
test_mkl_dsyevd(void) {
/* Locals */
MKL_INT n = N, lda = LDA, info, lwork, liwork;
MKL_INT iwkopt;
MKL_INT* iwork;
double wkopt;
double* work;
/* Local arrays */
double w[N];
double a[LDA*N] = {
6.39, 0.00, 0.00, 0.00, 0.00,
0.13, 8.37, 0.00, 0.00, 0.00,
-8.23, -4.46, -9.58, 0.00, 0.00,
5.71, -6.10, -9.25, 3.72, 0.00,
-3.18, 7.21, -7.42, 8.54, 2.51
};
/* Executable statements */
OutputDebugString( " DSYEVD Example Program Results\n" );
/* Query and allocate the optimal workspace */
lwork = -1;
liwork = -1;
dsyevd( "Vectors", "Upper", &n, a, &lda, w, &wkopt, &lwork, &iwkopt,
&liwork, &info );
lwork = (MKL_INT)wkopt;
work = (double*)malloc( lwork*sizeof(double) );
liwork = iwkopt;
iwork = (MKL_INT*)malloc( liwork*sizeof(MKL_INT) );
/* Solve eigenproblem */
dsyevd( "Vectors", "Upper", &n, a, &lda, w, work, &lwork, iwork,
&liwork, &info );
/* Check for convergence */
if( info > 0 ) {
OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
exit( 1 );
}
/* Print eigenvalues */
print_matrix( "Eigenvalues", 1, n, w, 1 );
/* Print eigenvectors */
print_matrix( "Eigenvectors (stored columnwise)", n, n, a, lda );
/* Free workspace */
free( (void*)iwork );
free( (void*)work );
} /* End of DSYEVD Example */
/* Auxiliary routine: printing a matrix */
void print_matrix( char* desc, int m, int n, double* a, int lda ) {
ArenaTemp scratch = scratch_get(0, 0);
int i, j;
//printf( "\n %s\n", desc );
String8 header = str8_pushf(scratch.arena, "\n %s\n", desc);
OutputDebugString(header.str);
for( i = 0; i < m; i++ ) {
for( j = 0; j < n; j++ )
{
String8 out = str8_pushf(scratch.arena, " %6.2f", a[i+j*lda] );
OutputDebugString(out.str);
//printf( " %6.2f", a[i+j*lda] );
}
OutputDebugString(str8_lit("\n").str);
}
}

Binary file not shown.