working dsyevd example on GPU with logging

2024-10-15 17:03:05 +02:00 · 2024-10-15 17:03:05 +02:00 · 48b67775f8
commit 48b67775f8
parent 879ae444a1
15 changed files with 8512 additions and 33 deletions
--- a/.gitignore
+++ b/.gitignore
@ -94,3 +94,5 @@ dkms.conf
 *.out
 *.app
 #visual studio
 .vs
--- a/build.bat
+++ b/build.bat
@ -4,16 +4,31 @@ ctime -begin timeBuild.ctm
@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
@rem set CommonCompilerFlags="/nologo /Zi /FC"
-set CommonCompilerFlags=/nologo /Zi /FC 
+set CommonCompilerFlags=/nologo /Zi /FC /Od
@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
@rem
@rem
 set cuda_root=D:/lib/cudatoolkit/lib/x64
 set mkl_root=D:/lib/oneAPI_mkl/mkl/2021.3.0
 set mkl_core=%mkl_root%/lib/intel64/mkl_core.lib
 set mkl_intel_lp64=%mkl_root%/lib/intel64/mkl_intel_lp64.lib
 set mkl_intel_thread=%mkl_root%/lib/intel64/mkl_intel_thread.lib
 set MKLCOMPILER=D:/lib/oneAPI_mkl/compiler/2021.3.0/windows/compiler
 set libiomp5md=%MKLCOMPILER%/lib/intel64_win/libiomp5md.lib
 set CudaSources=../src/cuda_entry_point.cu
 set CudaObj=cuda_entry_point.obj
 set Sources=../src/main.c
-set CudaLib=D:/lib/cudatoolkit/lib/x64
+
 set CudaSolver=%CudaLib%/cusolver.lib
 IF NOT EXIST .\build mkdir .\build
 pushd .\build
-cl %CommonCompilerFlags% %Sources% -Feprogram.exe
+rem cl /c %CommonCompilerFlags% %Sources% /I"%mkl_root%\include"
-@rem nvcc -Xcompiler %CommonCompilerFlags% -o program.exe %Sources% %CudaSources% -lcusolver
+rem FOR MSVC /link %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md% 
 nvcc -c %CudaSources% -o %CudaObj% -g -G -lcusolver
 rem nvcc -o program.exe -lcusolver -L%mkl_root%/lib/intel64 -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -l%MKLCOMPILER%/lib/intel64_win/libiomp5md
 cl %CommonCompilerFlags% %Sources% /I"%mkl_root%\include" /link %CudaObj% %cuda_root%/cudart.lib %cuda_root%/cusolver.lib %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md% 
 set LastError=%ERRORLEVEL%
 popd
--- a/build/main.rdi
+++ b/build/main.rdi
--- a/build/program.rdi
+++ b/build/program.rdi
--- a/eigsol_gpu.sublime-project
+++ b/eigsol_gpu.sublime-project
@ -0,0 +1,8 @@
 {
 	"folders":
 	[
 		{
 			"path": "."
 		}
 	]
 }
--- a/eigsol_gpu.sublime-workspace
+++ b/eigsol_gpu.sublime-workspace
--- a/external/cusolverDn.h
+++ b/external/cusolverDn.h
--- a/external/cusolverMg.h
+++ b/external/cusolverMg.h
@ -0,0 +1,318 @@
 /*
 * Copyright 2019 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(CUSOLVERMG_H_)
  #define CUSOLVERMG_H_
  #include <stdint.h>
  #include "cusolverDn.h"
  #if defined(__cplusplus)
 extern "C" {
  #endif /* __cplusplus */
  struct cusolverMgContext;
  typedef struct cusolverMgContext *cusolverMgHandle_t;
  /**
   * \beief This enum decides how 1D device Ids (or process ranks) get mapped to
   * a 2D grid.
   */
  typedef enum {
    CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
    CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
  } cusolverMgGridMapping_t;
  /** \brief Opaque structure of the distributed grid */
  typedef void *cudaLibMgGrid_t;
  /** \brief Opaque structure of the distributed matrix descriptor */
  typedef void *cudaLibMgMatrixDesc_t;
  cusolverStatus_t CUSOLVERAPI cusolverMgCreate(cusolverMgHandle_t *handle);
  cusolverStatus_t CUSOLVERAPI cusolverMgDestroy(cusolverMgHandle_t handle);
  cusolverStatus_t CUSOLVERAPI cusolverMgDeviceSelect(
    cusolverMgHandle_t handle,
    int                nbDevices,
    int                deviceId[]);
  /**
   * \brief Allocates resources related to the shared memory device grid.
   * \param[out] grid the opaque data strcuture that holds the grid
   * \param[in] numRowDevices number of devices in the row
   * \param[in] numColDevices number of devices in the column
   * \param[in] deviceId This array of size height * width stores the
   *            device-ids of the 2D grid; each entry must correspond to a valid
   * gpu or to -1 (denoting CPU). \param[in] mapping whether the 2D grid is in
   * row/column major \returns the status code
   */
  cusolverStatus_t CUSOLVERAPI cusolverMgCreateDeviceGrid(
    cudaLibMgGrid_t *       grid,
    int32_t                 numRowDevices,
    int32_t                 numColDevices,
    const int32_t           deviceId[],
    cusolverMgGridMapping_t mapping);
  /**
   * \brief Releases the allocated resources related to the distributed grid.
   * \param[in] grid the opaque data strcuture that holds the distributed grid
   * \returns the status code
   */
  cusolverStatus_t CUSOLVERAPI cusolverMgDestroyGrid(cudaLibMgGrid_t grid);
  /**
   * \brief Allocates resources related to the distributed matrix descriptor.
   * \param[out] desc the opaque data strcuture that holds the descriptor
   * \param[in] numRows number of total rows
   * \param[in] numCols number of total columns
   * \param[in] rowBlockSize row block size
   * \param[in] colBlockSize column block size
   * \param[in] dataType the data type of each element in cudaDataType
   * \param[in] grid the opaque data structure of the distributed grid
   * \returns the status code
   */
  cusolverStatus_t CUSOLVERAPI cusolverMgCreateMatrixDesc(
    cudaLibMgMatrixDesc_t *desc,
    int64_t                numRows,
    int64_t                numCols,
    int64_t                rowBlockSize,
    int64_t                colBlockSize,
    cudaDataType           dataType,
    const cudaLibMgGrid_t  grid);
  /**
   * \brief Releases the allocated resources related to the distributed matrix
   * descriptor. \param[in] desc the opaque data strcuture that holds the
   * descriptor \returns the status code
   */
  cusolverStatus_t CUSOLVERAPI
    cusolverMgDestroyMatrixDesc(cudaLibMgMatrixDesc_t desc);
  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd_bufferSize(
    cusolverMgHandle_t    handle,
    cusolverEigMode_t     jobz,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    void *                W,
    cudaDataType          dataTypeW,
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd(
    cusolverMgHandle_t    handle,
    cusolverEigMode_t     jobz,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    void *                W,
    cudaDataType          dataTypeW,
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 info);
  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf_bufferSize(
    cusolverMgHandle_t    handle,
    int                   M,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    int *                 array_d_IPIV[],
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf(
    cusolverMgHandle_t    handle,
    int                   M,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    int *                 array_d_IPIV[],
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 info);
  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs_bufferSize(
    cusolverMgHandle_t    handle,
    cublasOperation_t     TRANS,
    int                   N,
    int                   NRHS,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    int *                 array_d_IPIV[],
    void *                array_d_B[],
    int                   IB,
    int                   JB,
    cudaLibMgMatrixDesc_t descrB,
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs(
    cusolverMgHandle_t    handle,
    cublasOperation_t     TRANS,
    int                   N,
    int                   NRHS,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    int *                 array_d_IPIV[],
    void *                array_d_B[],
    int                   IB,
    int                   JB,
    cudaLibMgMatrixDesc_t descrB,
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 info);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf_bufferSize(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 h_info);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs_bufferSize(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   n,
    int                   nrhs,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    void *                array_d_B[],
    int                   IB,
    int                   JB,
    cudaLibMgMatrixDesc_t descrB,
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   n,
    int                   nrhs,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    void *                array_d_B[],
    int                   IB,
    int                   JB,
    cudaLibMgMatrixDesc_t descrB,
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 h_info);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotri_bufferSize(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    cudaDataType          computeType,
    int64_t *             lwork);
  cusolverStatus_t CUSOLVERAPI cusolverMgPotri(
    cusolverMgHandle_t    handle,
    cublasFillMode_t      uplo,
    int                   N,
    void *                array_d_A[],
    int                   IA,
    int                   JA,
    cudaLibMgMatrixDesc_t descrA,
    cudaDataType          computeType,
    void *                array_d_work[],
    int64_t               lwork,
    int *                 h_info);
  #if defined(__cplusplus)
 }
  #endif /* __cplusplus */
 #endif // CUSOLVERMG_H_
--- a/external/cusolverRf.h
+++ b/external/cusolverRf.h
@ -0,0 +1,339 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(CUSOLVERRF_H_)
  #define CUSOLVERRF_H_
  #include "driver_types.h"
  #include "cuComplex.h"
  #include "cusolver_common.h"
  #if defined(__cplusplus)
 extern "C" {
  #endif /* __cplusplus */
  /* CUSOLVERRF mode */
  typedef enum {
    CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default
    CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
  } cusolverRfResetValuesFastMode_t;
  /* CUSOLVERRF matrix format */
  typedef enum {
    CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default
    CUSOLVERRF_MATRIX_FORMAT_CSC = 1
  } cusolverRfMatrixFormat_t;
  /* CUSOLVERRF unit diagonal */
  typedef enum {
    CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default
    CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
  } cusolverRfUnitDiagonal_t;
  /* CUSOLVERRF factorization algorithm */
  typedef enum {
    CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default
    CUSOLVERRF_FACTORIZATION_ALG1 = 1,
    CUSOLVERRF_FACTORIZATION_ALG2 = 2,
  } cusolverRfFactorization_t;
  /* CUSOLVERRF triangular solve algorithm */
  typedef enum {
    CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default
    CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
    CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
  } cusolverRfTriangularSolve_t;
  /* CUSOLVERRF numeric boost report */
  typedef enum {
    CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default
    CUSOLVERRF_NUMERIC_BOOST_USED = 1
  } cusolverRfNumericBoostReport_t;
  /* Opaque structure holding CUSOLVERRF library common */
  struct cusolverRfCommon;
  typedef struct cusolverRfCommon* cusolverRfHandle_t;
  /* CUSOLVERRF create (allocate memory) and destroy (free memory) in the handle
   */
  cusolverStatus_t CUSOLVERAPI cusolverRfCreate(cusolverRfHandle_t* handle);
  cusolverStatus_t CUSOLVERAPI cusolverRfDestroy(cusolverRfHandle_t handle);
  /* CUSOLVERRF set and get input format */
  cusolverStatus_t CUSOLVERAPI cusolverRfGetMatrixFormat(
    cusolverRfHandle_t        handle,
    cusolverRfMatrixFormat_t* format,
    cusolverRfUnitDiagonal_t* diag);
  cusolverStatus_t CUSOLVERAPI cusolverRfSetMatrixFormat(
    cusolverRfHandle_t       handle,
    cusolverRfMatrixFormat_t format,
    cusolverRfUnitDiagonal_t diag);
  /* CUSOLVERRF set and get numeric properties */
  cusolverStatus_t CUSOLVERAPI cusolverRfSetNumericProperties(
    cusolverRfHandle_t handle,
    double             zero,
    double             boost);
  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericProperties(
    cusolverRfHandle_t handle,
    double*            zero,
    double*            boost);
  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericBoostReport(
    cusolverRfHandle_t              handle,
    cusolverRfNumericBoostReport_t* report);
  /* CUSOLVERRF choose the triangular solve algorithm */
  cusolverStatus_t CUSOLVERAPI cusolverRfSetAlgs(
    cusolverRfHandle_t          handle,
    cusolverRfFactorization_t   factAlg,
    cusolverRfTriangularSolve_t solveAlg);
  cusolverStatus_t CUSOLVERAPI cusolverRfGetAlgs(
    cusolverRfHandle_t           handle,
    cusolverRfFactorization_t*   factAlg,
    cusolverRfTriangularSolve_t* solveAlg);
  /* CUSOLVERRF set and get fast mode */
  cusolverStatus_t CUSOLVERAPI cusolverRfGetResetValuesFastMode(
    cusolverRfHandle_t               handle,
    cusolverRfResetValuesFastMode_t* fastMode);
  cusolverStatus_t CUSOLVERAPI cusolverRfSetResetValuesFastMode(
    cusolverRfHandle_t              handle,
    cusolverRfResetValuesFastMode_t fastMode);
  /*** Non-Batched Routines ***/
  /* CUSOLVERRF setup of internal structures from host or device memory */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfSetupHost(/* Input (in the host memory) */
                        int     n,
                        int     nnzA,
                        int*    h_csrRowPtrA,
                        int*    h_csrColIndA,
                        double* h_csrValA,
                        int     nnzL,
                        int*    h_csrRowPtrL,
                        int*    h_csrColIndL,
                        double* h_csrValL,
                        int     nnzU,
                        int*    h_csrRowPtrU,
                        int*    h_csrColIndU,
                        double* h_csrValU,
                        int*    h_P,
                        int*    h_Q,
                        /* Output */
                        cusolverRfHandle_t handle);
  cusolverStatus_t CUSOLVERAPI
    cusolverRfSetupDevice(/* Input (in the device memory) */
                          int     n,
                          int     nnzA,
                          int*    csrRowPtrA,
                          int*    csrColIndA,
                          double* csrValA,
                          int     nnzL,
                          int*    csrRowPtrL,
                          int*    csrColIndL,
                          double* csrValL,
                          int     nnzU,
                          int*    csrRowPtrU,
                          int*    csrColIndU,
                          double* csrValU,
                          int*    P,
                          int*    Q,
                          /* Output */
                          cusolverRfHandle_t handle);
  /* CUSOLVERRF update the matrix values (assuming the reordering, pivoting
     and consequently the sparsity pattern of L and U did not change),
     and zero out the remaining values. */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfResetValues(/* Input (in the device memory) */
                          int     n,
                          int     nnzA,
                          int*    csrRowPtrA,
                          int*    csrColIndA,
                          double* csrValA,
                          int*    P,
                          int*    Q,
                          /* Output */
                          cusolverRfHandle_t handle);
  /* CUSOLVERRF analysis (for parallelism) */
  cusolverStatus_t CUSOLVERAPI cusolverRfAnalyze(cusolverRfHandle_t handle);
  /* CUSOLVERRF re-factorization (for parallelism) */
  cusolverStatus_t CUSOLVERAPI cusolverRfRefactor(cusolverRfHandle_t handle);
  /* CUSOLVERRF extraction: Get L & U packed into a single matrix M */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfAccessBundledFactorsDevice(/* Input */
                                         cusolverRfHandle_t handle,
                                         /* Output (in the host memory) */
                                         int* nnzM,
                                         /* Output (in the device memory) */
                                         int**    Mp,
                                         int**    Mi,
                                         double** Mx);
  cusolverStatus_t CUSOLVERAPI
    cusolverRfExtractBundledFactorsHost(/* Input */
                                        cusolverRfHandle_t handle,
                                        /* Output (in the host memory) */
                                        int*     h_nnzM,
                                        int**    h_Mp,
                                        int**    h_Mi,
                                        double** h_Mx);
  /* CUSOLVERRF extraction: Get L & U individually */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfExtractSplitFactorsHost(/* Input */
                                      cusolverRfHandle_t handle,
                                      /* Output (in the host memory) */
                                      int*     h_nnzL,
                                      int**    h_csrRowPtrL,
                                      int**    h_csrColIndL,
                                      double** h_csrValL,
                                      int*     h_nnzU,
                                      int**    h_csrRowPtrU,
                                      int**    h_csrColIndU,
                                      double** h_csrValU);
  /* CUSOLVERRF (forward and backward triangular) solves */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfSolve(/* Input (in the device memory) */
                    cusolverRfHandle_t handle,
                    int*               P,
                    int*               Q,
                    int                nrhs, // only nrhs=1 is supported
                    double*            Temp, // of size ldt*nrhs (ldt>=n)
                    int                ldt,
                    /* Input/Output (in the device memory) */
                    double* XF,
                    /* Input */
                    int ldxf);
  /*** Batched Routines ***/
  /* CUSOLVERRF-batch setup of internal structures from host */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchSetupHost(/* Input (in the host memory)*/
                             int     batchSize,
                             int     n,
                             int     nnzA,
                             int*    h_csrRowPtrA,
                             int*    h_csrColIndA,
                             double* h_csrValA_array[],
                             int     nnzL,
                             int*    h_csrRowPtrL,
                             int*    h_csrColIndL,
                             double* h_csrValL,
                             int     nnzU,
                             int*    h_csrRowPtrU,
                             int*    h_csrColIndU,
                             double* h_csrValU,
                             int*    h_P,
                             int*    h_Q,
                             /* Output (in the device memory) */
                             cusolverRfHandle_t handle);
  /* CUSOLVERRF-batch update the matrix values (assuming the reordering,
     pivoting and consequently the sparsity pattern of L and U did not change),
     and zero out the remaining values. */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchResetValues(/* Input (in the device memory) */
                               int     batchSize,
                               int     n,
                               int     nnzA,
                               int*    csrRowPtrA,
                               int*    csrColIndA,
                               double* csrValA_array[],
                               int*    P,
                               int*    Q,
                               /* Output */
                               cusolverRfHandle_t handle);
  /* CUSOLVERRF-batch analysis (for parallelism) */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchAnalyze(cusolverRfHandle_t handle);
  /* CUSOLVERRF-batch re-factorization (for parallelism) */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchRefactor(cusolverRfHandle_t handle);
  /* CUSOLVERRF-batch (forward and backward triangular) solves */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchSolve(/* Input (in the device memory) */
                         cusolverRfHandle_t handle,
                         int*               P,
                         int*               Q,
                         int                nrhs, // only nrhs=1 is supported
                         double* Temp, // of size 2*batchSize*(n*nrhs)
                         int     ldt,  // only ldt=n is supported
                         /* Input/Output (in the device memory) */
                         double* XF_array[],
                         /* Input */
                         int ldxf);
  /* CUSOLVERRF-batch obtain the position of zero pivot */
  cusolverStatus_t CUSOLVERAPI
    cusolverRfBatchZeroPivot(/* Input */
                             cusolverRfHandle_t handle,
                             /* Output (in the host memory) */
                             int* position);
  #if defined(__cplusplus)
 }
  #endif /* __cplusplus */
 #endif /* CUSOLVERRF_H_ */
--- a/external/cusolverSp.h
+++ b/external/cusolverSp.h
@ -0,0 +1,923 @@
 /*
 * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(CUSOLVERSP_H_)
  #define CUSOLVERSP_H_
  #include "cusparse.h"
  #include "cublas_v2.h"
  #include "cusolver_common.h"
  #if defined(__cplusplus)
 extern "C" {
  #endif /* __cplusplus */
  struct cusolverSpContext;
  typedef struct cusolverSpContext *cusolverSpHandle_t;
  struct csrqrInfo;
  typedef struct csrqrInfo *csrqrInfo_t;
  cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle);
  cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle);
  cusolverStatus_t CUSOLVERAPI
    cusolverSpSetStream(cusolverSpHandle_t handle, cudaStream_t streamId);
  cusolverStatus_t CUSOLVERAPI
    cusolverSpGetStream(cusolverSpHandle_t handle, cudaStream_t *streamId);
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrissymHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrEndPtrA,
    const int *              csrColIndA,
    int *                    issym);
  /* -------- GPU linear solver by LU factorization
   *       solve A*x = b, A can be singular
   * [ls] stands for linear solve
   * [v] stands for vector
   * [lu] stands for LU factorization
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvluHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const float *            b,
    float                    tol,
    int                      reorder,
    float *                  x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvluHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const double *           b,
    double                   tol,
    int                      reorder,
    double *                 x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvluHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuComplex *        b,
    float                    tol,
    int                      reorder,
    cuComplex *              x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvluHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuDoubleComplex *  b,
    double                   tol,
    int                      reorder,
    cuDoubleComplex *        x,
    int *                    singularity);
  /* -------- GPU linear solver by QR factorization
   *       solve A*x = b, A can be singular
   * [ls] stands for linear solve
   * [v] stands for vector
   * [qr] stands for QR factorization
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const float *            b,
    float                    tol,
    int                      reorder,
    float *                  x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const double *           b,
    double                   tol,
    int                      reorder,
    double *                 x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuComplex *        b,
    float                    tol,
    int                      reorder,
    cuComplex *              x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuDoubleComplex *  b,
    double                   tol,
    int                      reorder,
    cuDoubleComplex *        x,
    int *                    singularity);
  /* -------- CPU linear solver by QR factorization
   *       solve A*x = b, A can be singular
   * [ls] stands for linear solve
   * [v] stands for vector
   * [qr] stands for QR factorization
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const float *            b,
    float                    tol,
    int                      reorder,
    float *                  x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const double *           b,
    double                   tol,
    int                      reorder,
    double *                 x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuComplex *        b,
    float                    tol,
    int                      reorder,
    cuComplex *              x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuDoubleComplex *  b,
    double                   tol,
    int                      reorder,
    cuDoubleComplex *        x,
    int *                    singularity);
  /* -------- CPU linear solver by Cholesky factorization
   *       solve A*x = b, A can be singular
   * [ls] stands for linear solve
   * [v] stands for vector
   * [chol] stands for Cholesky factorization
   *
   * Only works for symmetric positive definite matrix.
   * The upper part of A is ignored.
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvcholHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const float *            b,
    float                    tol,
    int                      reorder,
    float *                  x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvcholHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const double *           b,
    double                   tol,
    int                      reorder,
    double *                 x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvcholHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuComplex *        b,
    float                    tol,
    int                      reorder,
    cuComplex *              x,
    int *                    singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvcholHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuDoubleComplex *  b,
    double                   tol,
    int                      reorder,
    cuDoubleComplex *        x,
    int *                    singularity);
  /* -------- GPU linear solver by Cholesky factorization
   *       solve A*x = b, A can be singular
   * [ls] stands for linear solve
   * [v] stands for vector
   * [chol] stands for Cholesky factorization
   *
   * Only works for symmetric positive definite matrix.
   * The upper part of A is ignored.
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvchol(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const float *            b,
    float                    tol,
    int                      reorder,
    // output
    float *x,
    int *  singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvchol(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const double *           b,
    double                   tol,
    int                      reorder,
    // output
    double *x,
    int *   singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvchol(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuComplex *        b,
    float                    tol,
    int                      reorder,
    // output
    cuComplex *x,
    int *      singularity);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvchol(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    const cuDoubleComplex *  b,
    double                   tol,
    int                      reorder,
    // output
    cuDoubleComplex *x,
    int *            singularity);
  /* ----------- CPU least square solver by QR factorization
   *       solve min|b - A*x|
   * [lsq] stands for least square
   * [v] stands for vector
   * [qr] stands for QR factorization
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsqvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const float *            b,
    float                    tol,
    int *                    rankA,
    float *                  x,
    int *                    p,
    float *                  min_norm);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsqvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const double *           b,
    double                   tol,
    int *                    rankA,
    double *                 x,
    int *                    p,
    double *                 min_norm);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsqvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuComplex *        b,
    float                    tol,
    int *                    rankA,
    cuComplex *              x,
    int *                    p,
    float *                  min_norm);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsqvqrHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuDoubleComplex *  b,
    double                   tol,
    int *                    rankA,
    cuDoubleComplex *        x,
    int *                    p,
    double *                 min_norm);
  /* --------- CPU eigenvalue solver by shift inverse
   *      solve A*x = lambda * x
   *   where lambda is the eigenvalue nearest mu0.
   * [eig] stands for eigenvalue solver
   * [si] stands for shift-inverse
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsiHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    float                    mu0,
    const float *            x0,
    int                      maxite,
    float                    tol,
    float *                  mu,
    float *                  x);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsiHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    double                   mu0,
    const double *           x0,
    int                      maxite,
    double                   tol,
    double *                 mu,
    double *                 x);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsiHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuComplex                mu0,
    const cuComplex *        x0,
    int                      maxite,
    float                    tol,
    cuComplex *              mu,
    cuComplex *              x);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsiHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuDoubleComplex          mu0,
    const cuDoubleComplex *  x0,
    int                      maxite,
    double                   tol,
    cuDoubleComplex *        mu,
    cuDoubleComplex *        x);
  /* --------- GPU eigenvalue solver by shift inverse
   *      solve A*x = lambda * x
   *   where lambda is the eigenvalue nearest mu0.
   * [eig] stands for eigenvalue solver
   * [si] stands for shift-inverse
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsi(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    float                    mu0,
    const float *            x0,
    int                      maxite,
    float                    eps,
    float *                  mu,
    float *                  x);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsi(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    double                   mu0,
    const double *           x0,
    int                      maxite,
    double                   eps,
    double *                 mu,
    double *                 x);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsi(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuComplex                mu0,
    const cuComplex *        x0,
    int                      maxite,
    float                    eps,
    cuComplex *              mu,
    cuComplex *              x);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsi(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuDoubleComplex          mu0,
    const cuDoubleComplex *  x0,
    int                      maxite,
    double                   eps,
    cuDoubleComplex *        mu,
    cuDoubleComplex *        x);
  // ----------- enclosed eigenvalues
  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigsHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuComplex                left_bottom_corner,
    cuComplex                right_upper_corner,
    int *                    num_eigs);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigsHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuDoubleComplex          left_bottom_corner,
    cuDoubleComplex          right_upper_corner,
    int *                    num_eigs);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigsHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuComplex                left_bottom_corner,
    cuComplex                right_upper_corner,
    int *                    num_eigs);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigsHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    cuDoubleComplex          left_bottom_corner,
    cuDoubleComplex          right_upper_corner,
    int *                    num_eigs);
  /* --------- CPU symrcm
   *   Symmetric reverse Cuthill McKee permutation
   *
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymrcmHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    p);
  /* --------- CPU symmdq
   *   Symmetric minimum degree algorithm by quotient graph
   *
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymmdqHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    p);
  /* --------- CPU symmdq
   *   Symmetric Approximate minimum degree algorithm by quotient graph
   *
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymamdHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    p);
  /* --------- CPU metis
   *   symmetric reordering
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrmetisndHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const int64_t *          options,
    int *                    p);
  /* --------- CPU zfd
   *  Zero free diagonal reordering
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrzfdHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    P,
    int *                    numnz);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrzfdHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    P,
    int *                    numnz);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrzfdHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    P,
    int *                    numnz);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrzfdHost(
    cusolverSpHandle_t       handle,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    int *                    P,
    int *                    numnz);
  /* --------- CPU permuation
   *   P*A*Q^T
   *
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrperm_bufferSizeHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const int *              p,
    const int *              q,
    size_t *                 bufferSizeInBytes);
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrpermHost(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    int *                    csrRowPtrA,
    int *                    csrColIndA,
    const int *              p,
    const int *              q,
    int *                    map,
    void *                   pBuffer);
  /*
   *  Low-level API: Batched QR
   *
   */
  cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrqrInfo(csrqrInfo_t *info);
  cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrqrInfo(csrqrInfo_t info);
  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnzA,
    const cusparseMatDescr_t descrA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    csrqrInfo_t              info);
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    int                      batchSize,
    csrqrInfo_t              info,
    size_t *                 internalDataInBytes,
    size_t *                 workspaceInBytes);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    int                      batchSize,
    csrqrInfo_t              info,
    size_t *                 internalDataInBytes,
    size_t *                 workspaceInBytes);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    int                      batchSize,
    csrqrInfo_t              info,
    size_t *                 internalDataInBytes,
    size_t *                 workspaceInBytes);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrVal,
    const int *              csrRowPtr,
    const int *              csrColInd,
    int                      batchSize,
    csrqrInfo_t              info,
    size_t *                 internalDataInBytes,
    size_t *                 workspaceInBytes);
  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrsvBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const float *            csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const float *            b,
    float *                  x,
    int                      batchSize,
    csrqrInfo_t              info,
    void *                   pBuffer);
  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrsvBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const double *           csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const double *           b,
    double *                 x,
    int                      batchSize,
    csrqrInfo_t              info,
    void *                   pBuffer);
  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrsvBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuComplex *        csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuComplex *        b,
    cuComplex *              x,
    int                      batchSize,
    csrqrInfo_t              info,
    void *                   pBuffer);
  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrsvBatched(
    cusolverSpHandle_t       handle,
    int                      m,
    int                      n,
    int                      nnz,
    const cusparseMatDescr_t descrA,
    const cuDoubleComplex *  csrValA,
    const int *              csrRowPtrA,
    const int *              csrColIndA,
    const cuDoubleComplex *  b,
    cuDoubleComplex *        x,
    int                      batchSize,
    csrqrInfo_t              info,
    void *                   pBuffer);
  #if defined(__cplusplus)
 }
  #endif /* __cplusplus */
 #endif // define CUSOLVERSP_H_
--- a/external/cusolverSp_LOWLEVEL_PREVIEW.h
+++ b/external/cusolverSp_LOWLEVEL_PREVIEW.h
--- a/src/cuda_entry_point.cu
+++ b/src/cuda_entry_point.cu
@ -1,28 +1,174 @@
 #include <stdio.h>
-#include "../external/cusolver_common.h"
+#include <windows.h>
-#define no_name_mangle extern "C"
+#define WIN32_LEAN_AND_MEAN
-__global__ void hello_from_GPU() {
+#include "../external/cusolver_common.h"
-  printf("HELLO FROM GPUUUU");
+#include "../external/cusolverDn.h"
 #ifdef __cplusplus
 # define no_name_mangle extern "C"
 #else
 # define no_name_mangle 
 #endif
 #define function static
 #define GLOBAL_LOG_BUFFER_SIZE 4096*4096
 __device__ int last_index;
 __device__ char *device_log_buffer;
 char *log_buffer;
 #define D_LOG(msg) d_debug_printf((msg))
 //#define D_LOGF(msg, ...) d_debug_printfv((msg), __VA_ARGS__)
 #define H_LOG(msg) h_debug_printf((msg))
 #define H_LOGF(msg, ...) h_debug_printfv((msg), __VA_ARGS__)
 __host__ function void h_debug_printf(const char *msg); 
 __host__ function void h_debug_printfv(const char *fmt, ...); 
 __device__ function void d_debug_printf(const char *msg);
 //__device__ function void d_debug_printfv(const char *fmt, ...);
 __global__ void init_device_logging()
 {
  last_index = 0;
 }
-void check_cusolver_version() {
+
 __host__ function void
 h_debug_printf(const char* msg) 
 {
  OutputDebugStringA(msg);
 }
 __host__ function void
 h_debug_printfv(const char *fmt, ...)
 {
  char buffer[2048] = {0};
  va_list args;
  va_start(args, fmt);
  vsnprintf(&buffer[0], sizeof(buffer), fmt, args);
  va_end(args);
  OutputDebugStringA(buffer);
 }
 __device__ function void
 d_debug_printf(const char *msg) {
  int i = 0;
  while(msg[i] != '\0')
  {
    device_log_buffer[last_index + i] = msg[i];
    i += 1;
  }
  last_index = last_index + 1;
 }
 __global__ void hello_from_GPU()
 {
  D_LOG("HELLO FROM GPUUUU\n");
 }
 void
 check_cusolver_version() 
 {
  int major=-1, minor=-1, patch=-1;
  cusolverGetProperty(MAJOR_VERSION, &major);
  cusolverGetProperty(MINOR_VERSION, &minor);
  cusolverGetProperty(PATCH_LEVEL, &patch);
-  printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);
+  H_LOGF("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);
 }
-no_name_mangle void cuda_entry_point() {
+no_name_mangle void 
-
+cuda_entry_point()
 {
  H_LOG("\n\n-- Entered CUDA code:\n\n");
  char *temp_device_log_buffer;
  cudaMalloc((void**)&temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
  cudaMemcpyToSymbol(device_log_buffer, &temp_device_log_buffer, sizeof(char *));
  check_cusolver_version();
-  hello_from_GPU<<<1, 1>>>();
+  init_device_logging<<<1, 1>>>();
  cudaDeviceSynchronize();
  hello_from_GPU<<<1, 1>>>();
  cudaDeviceSynchronize();
  log_buffer = (char *)malloc(GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
  cudaMemcpy(log_buffer, temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char), cudaMemcpyDeviceToHost);
  // Print all messages from cuda.
  H_LOG(log_buffer);
  H_LOG("\n\n Testing cuSolver Eigenvalue problem \n\n");
  const int size = 5;
  const int mat_size = size*size;
  double A[mat_size] = {
    6.39,   0.13,  -8.23,   5.71,  -3.18,
   0.13,   8.37,  -4.46,  -6.10,   7.21,
   -8.23,  -4.46,  -9.58,  -9.25,  -7.42,
   5.71,  -6.10,  -9.25,   3.72,   8.54,
   -3.18,   7.21,  -7.42,   8.54,   2.51};
  double *device_mat;
  cudaMalloc((void**)&device_mat, mat_size * sizeof(double));
  cudaMemcpy(device_mat, A, mat_size * sizeof(double), cudaMemcpyHostToDevice);
  cusolverDnHandle_t cusolverH = 0;
  cusolverDnCreate(&cusolverH);
  double W[size];
  double *d_W;
  cudaMalloc((void**)&d_W, size * sizeof(double));
  int lwork;
  cusolverDnDsyevd_bufferSize(
      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
      size, device_mat, size, d_W, &lwork
      );
  double *d_work;
  cudaMalloc((void**)&d_work, lwork * sizeof(double));
  int *device_info;
  cudaMalloc((void**)&device_info, sizeof(int));
  cusolverDnDsyevd(
      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
      size, device_mat, size, d_W, d_work, lwork, device_info
      );
  double out_mat[mat_size];
  cudaMemcpy(W, d_W, size * sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(out_mat, device_mat, mat_size * sizeof(double), cudaMemcpyDeviceToHost);
  H_LOG("Results on GPU dsyevd:\n");
  H_LOG("Eigenvalues:\n");
  for(int i = 0; i < size; i++)
  {
    H_LOGF("%6.2f\n", W[i]);
  }
  H_LOG("\n");
  H_LOG("Eigenvectors (columnwise):\n");
  for(int i = 0; i < size; i++)
  {
    for(int j = 0; j < size; j++)
    {
      H_LOGF("%6.2f ", out_mat[j * size + i]); 
    }
    H_LOG("\n");
  }
  cudaFree(temp_device_log_buffer);
 }
--- a/src/main.c
+++ b/src/main.c
@ -14,7 +14,7 @@
 #include "base/base_inc.c"
 #include "os/os_inc.c"
 #include "os/os_entry_point.c"
-
+#include "test_mkl.c"
 #define grid_file_path_bin "D:\\dev\\eigsol_gpu\\out\\grid.bin"
 #define grid_file_path "D:\\dev\\eigsol_gpu\\out\\grid.dat"
@ -53,8 +53,6 @@ struct BSplineCtx
  F64 *bsplines;
 };
 //~ Globals
 global Grid g_grid = {0};
 global BSplineCtx g_bspline_ctx = {0};
@ -205,11 +203,11 @@ set_up_bspline_context(Arena* arena)
 {
  // Create knotpoint sequence.
  U32 k = 4;
-  U32 N = 14;
+  U32 bspl_N = 14;
  g_bspline_ctx.order = k; 
-  g_bspline_ctx.num_knotpoints = N; 
+  g_bspline_ctx.num_knotpoints = bspl_N; 
-  g_bspline_ctx.num_bsplines = N-k;
+  g_bspline_ctx.num_bsplines = bspl_N-k;
-  g_bspline_ctx.num_phys_points = N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
+  g_bspline_ctx.num_phys_points = bspl_N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
  g_bspline_ctx.arena = arena;
  g_bspline_ctx.knotpoints = PushArray(arena, F64, g_bspline_ctx.num_knotpoints);
  // Set up physical points;
@ -308,36 +306,37 @@ write_bsplines_to_matrix_F64(Arena *arena)
    str8_list_push(scratch.arena, &bspline_array_list, row_joined);
  }
  write_string_list_to_file(scratch.arena, str8_lit(bspline_array_file_path), &bspline_array_list);
 }
-  function
+function
-void EntryPoint() 
+void EntryPoint(void) 
 {
  OS_InitReceipt os_receipt = OS_init();
  OS_InitGfxReceipt os_gfx_receipt = OS_gfx_init(os_receipt);
  Arena *arena = m_make_arena();
  //- Set up grid and write to file.
  set_up_grid(arena);
  write_array_binary_F64(str8_lit(grid_file_path_bin), g_grid.points, g_grid.num_steps);
  write_array_F64(str8_lit(grid_file_path), g_grid.points, g_grid.num_steps, "%13.6e\n");
  //- The BSpline context is the knotpoints and the BSpline order etc.
  set_up_bspline_context(arena);
  write_array_F64(str8_lit(knotpoints_file_path), 
      g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints,
      "%13.6e\n");
-
+  //- Then we generate the BSplines and save them off for reference and debugging.
  write_array_F64(str8_lit(knotpoints_file_path), g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints, "%13.6e\n");
  write_bsplines_to_matrix_F64(arena);
  //- Test MKL
  test_mkl_zgeev();
  test_mkl_dsyevd();
  cuda_entry_point();
 }
--- a/src/test_mkl.c
+++ b/src/test_mkl.c
@ -0,0 +1,272 @@
 /*******************************************************************************
 * Copyright 2009-2021 Intel Corporation.
 *
 * This software and the related documents are Intel copyrighted  materials,  and
 * your use of  them is  governed by the  express license  under which  they were
 * provided to you (License).  Unless the License provides otherwise, you may not
 * use, modify, copy, publish, distribute,  disclose or transmit this software or
 * the related documents without Intel's prior written permission.
 *
 * This software and the related documents  are provided as  is,  with no express
 * or implied  warranties,  other  than those  that are  expressly stated  in the
 * License.
 *******************************************************************************/
 /*
   ZGEEV Example.
   ==============
   Program computes the eigenvalues and left and right eigenvectors of a general
   rectangular matrix A:
   ( -3.84,  2.25) ( -8.94, -4.75) (  8.95, -6.53) ( -9.87,  4.82)
   ( -0.66,  0.83) ( -4.40, -3.82) ( -3.50, -4.26) ( -3.15,  7.36)
   ( -3.99, -4.73) ( -5.88, -6.60) ( -3.36, -0.40) ( -0.75,  5.23)
   (  7.74,  4.18) (  3.66, -7.53) (  2.58,  3.60) (  4.59,  5.41)
   Description.
   ============
   The routine computes for an n-by-n complex nonsymmetric matrix A, the
   eigenvalues and, optionally, the left and/or right eigenvectors. The right
   eigenvector v(j) of A satisfies
   A*v(j)= lambda(j)*v(j)
   where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies
   u(j)H*A = lambda(j)*u(j)H
   where u(j)H denotes the conjugate transpose of u(j). The computed
   eigenvectors are normalized to have Euclidean norm equal to 1 and
   largest component real.
   Example Program Results.
   ========================
   ZGEEV Example Program Results
   Eigenvalues
   ( -9.43,-12.98) ( -3.44, 12.69) (  0.11, -3.40) (  5.76,  7.13)
   Left eigenvectors
   (  0.24, -0.18) (  0.61,  0.00) ( -0.18, -0.33) (  0.28,  0.09)
   (  0.79,  0.00) ( -0.05, -0.27) (  0.82,  0.00) ( -0.55,  0.16)
   (  0.22, -0.27) ( -0.21,  0.53) ( -0.37,  0.15) (  0.45,  0.09)
   ( -0.02,  0.41) (  0.40, -0.24) (  0.06,  0.12) (  0.62,  0.00)
   Right eigenvectors
   (  0.43,  0.33) (  0.83,  0.00) (  0.60,  0.00) ( -0.31,  0.03)
   (  0.51, -0.03) (  0.08, -0.25) ( -0.40, -0.20) (  0.04,  0.34)
   (  0.62,  0.00) ( -0.25,  0.28) ( -0.09, -0.48) (  0.36,  0.06)
   ( -0.23,  0.11) ( -0.10, -0.32) ( -0.43,  0.13) (  0.81,  0.00)
   */
 #include <stdlib.h>
 #include <stdio.h>
 #include <mkl.h>
 /* Auxiliary routines prototypes */
 function void 
 print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda );
 #ifndef TEST_MKL_N
 #define TEST_MKL_N 4
 #endif
 /* Main program */
 static void
 test_mkl_zgeev(void) {
  /* Locals */
  int N = TEST_MKL_N;
  MKL_INT n = N, lda = N, ldvl = N, ldvr = N, info, lwork;
  MKL_Complex16 wkopt;
  MKL_Complex16* work;
  /* Local arrays */
  /* rwork dimension should be at least 2*n */
  double rwork[2*TEST_MKL_N];
  MKL_Complex16 w[TEST_MKL_N], vl[TEST_MKL_N*TEST_MKL_N], vr[TEST_MKL_N*TEST_MKL_N];
  MKL_Complex16 a[TEST_MKL_N*TEST_MKL_N] = {
    {-3.84,  2.25}, {-0.66,  0.83}, {-3.99, -4.73}, { 7.74,  4.18},
    {-8.94, -4.75}, {-4.40, -3.82}, {-5.88, -6.60}, { 3.66, -7.53},
    { 8.95, -6.53}, {-3.50, -4.26}, {-3.36, -0.40}, { 2.58,  3.60},
    {-9.87,  4.82}, {-3.15,  7.36}, {-0.75,  5.23}, { 4.59,  5.41}
  };
  /* Executable statements */
  OutputDebugString( " ZGEEV Example Program Results\n" );
  /* Query and allocate the optimal workspace */
  lwork = -1;
  zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
      &wkopt, &lwork, rwork, &info );
  lwork = (MKL_INT)wkopt.real;
  work = (MKL_Complex16*)malloc( lwork*sizeof(MKL_Complex16) );
  /* Solve eigenproblem */
  zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
      work, &lwork, rwork, &info );
  /* Check for convergence */
  if( info > 0 ) {
    OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
    exit( 1 );
  }
  /* Print eigenvalues */
  print_matrix_cmplx16( "Eigenvalues", 1, n, w, 1 );
  /* Print left eigenvectors */
  print_matrix_cmplx16( "Left eigenvectors", n, n, vl, ldvl );
  /* Print right eigenvectors */
  print_matrix_cmplx16( "Right eigenvectors", n, n, vr, ldvr );
  /* Free workspace */
  free( (void*)work );
  //exit( 0 );
 } /* End of ZGEEV Example */
 /* Auxiliary routine: printing a matrix */
 function void
 print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda ) {
  ArenaTemp scratch = scratch_get(0,0);
  int i, j;
  String8 newline = str8_lit("\n");
  String8 header = str8_pushf(scratch.arena, "\n %s\n", desc );
  OutputDebugString(header.str);
  //printf("\n %s \n", desc);
  for( i = 0; i < m; i++ ) {
    for( j = 0; j < n; j++ ) {
      String8 outstr = str8_pushf(scratch.arena, " (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
      OutputDebugString(outstr.str);
      //printf(" (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
    }
    OutputDebugString(newline.str);
    //printf("\n");
  }
 }
 /*******************************************************************************
 * Copyright 2009-2021 Intel Corporation.
 *
 * This software and the related documents are Intel copyrighted  materials,  and
 * your use of  them is  governed by the  express license  under which  they were
 * provided to you (License).  Unless the License provides otherwise, you may not
 * use, modify, copy, publish, distribute,  disclose or transmit this software or
 * the related documents without Intel's prior written permission.
 *
 * This software and the related documents  are provided as  is,  with no express
 * or implied  warranties,  other  than those  that are  expressly stated  in the
 * License.
 *******************************************************************************/
 /*
   DSYEVD Example.
   ==============
   Program computes all eigenvalues and eigenvectors of a real symmetric
   matrix A using divide and conquer algorithm, where A is:
   6.39   0.13  -8.23   5.71  -3.18
   0.13   8.37  -4.46  -6.10   7.21
   -8.23  -4.46  -9.58  -9.25  -7.42
   5.71  -6.10  -9.25   3.72   8.54
   -3.18   7.21  -7.42   8.54   2.51
   Description.
   ============
   The routine computes all eigenvalues and, optionally, eigenvectors of an
   n-by-n real symmetric matrix A. The eigenvector v(j) of A satisfies
   A*v(j) = lambda(j)*v(j)
   where lambda(j) is its eigenvalue. The computed eigenvectors are
   orthonormal.
   If the eigenvectors are requested, then this routine uses a divide and
   conquer algorithm to compute eigenvalues and eigenvectors.
   Example Program Results.
   ========================
   DSYEVD Example Program Results
   Eigenvalues
   -17.44 -11.96   6.72  14.25  19.84
   Eigenvectors (stored columnwise)
   -0.26   0.31  -0.74   0.33   0.42
   -0.17  -0.39  -0.38  -0.80   0.16
   -0.89   0.04   0.09   0.03  -0.45
   -0.29  -0.59   0.34   0.31   0.60
   -0.19   0.63   0.44  -0.38   0.48
   */
 #include <stdlib.h>
 #include <stdio.h>
 #include <mkl.h>
 /* Auxiliary routines prototypes */
 extern void print_matrix( char* desc, int m, int n, double* a, int lda );
 /* Parameters */
 #define N 5
 #define LDA N
 /* Main program */
 function void 
 test_mkl_dsyevd(void) {
  /* Locals */
  MKL_INT n = N, lda = LDA, info, lwork, liwork;
  MKL_INT iwkopt;
  MKL_INT* iwork;
  double wkopt;
  double* work;
  /* Local arrays */
  double w[N];
  double a[LDA*N] = {
    6.39,  0.00,  0.00,  0.00,  0.00,
    0.13,  8.37,  0.00,  0.00,  0.00,
    -8.23, -4.46, -9.58,  0.00,  0.00,
    5.71, -6.10, -9.25,  3.72,  0.00,
    -3.18,  7.21, -7.42,  8.54,  2.51
  };
  /* Executable statements */
  OutputDebugString( " DSYEVD Example Program Results\n" );
  /* Query and allocate the optimal workspace */
  lwork = -1;
  liwork = -1;
  dsyevd( "Vectors", "Upper", &n, a, &lda, w, &wkopt, &lwork, &iwkopt,
      &liwork, &info );
  lwork = (MKL_INT)wkopt;
  work = (double*)malloc( lwork*sizeof(double) );
  liwork = iwkopt;
  iwork = (MKL_INT*)malloc( liwork*sizeof(MKL_INT) );
  /* Solve eigenproblem */
  dsyevd( "Vectors", "Upper", &n, a, &lda, w, work, &lwork, iwork,
      &liwork, &info );
  /* Check for convergence */
  if( info > 0 ) {
    OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
    exit( 1 );
  }
  /* Print eigenvalues */
  print_matrix( "Eigenvalues", 1, n, w, 1 );
  /* Print eigenvectors */
  print_matrix( "Eigenvectors (stored columnwise)", n, n, a, lda );
  /* Free workspace */
  free( (void*)iwork );
  free( (void*)work );
 } /* End of DSYEVD Example */
 /* Auxiliary routine: printing a matrix */
 void print_matrix( char* desc, int m, int n, double* a, int lda ) {
  ArenaTemp scratch = scratch_get(0, 0);
  int i, j;
  //printf( "\n %s\n", desc );
  String8 header = str8_pushf(scratch.arena, "\n %s\n", desc);
  OutputDebugString(header.str);
  for( i = 0; i < m; i++ ) {
    for( j = 0; j < n; j++ ) 
    {
      String8 out = str8_pushf(scratch.arena, " %6.2f", a[i+j*lda] );
      OutputDebugString(out.str);
      //printf( " %6.2f", a[i+j*lda] );
    }
    OutputDebugString(str8_lit("\n").str);
  }
 }
--- a/timeBuild.ctm
+++ b/timeBuild.ctm