working dsyevd example on GPU with logging

2024-10-15 17:03:05 +02:00 · 2024-10-15 17:03:05 +02:00 · 48b67775f8
commit 48b67775f8
parent 879ae444a1
15 changed files with 8512 additions and 33 deletions
--- a/.gitignore
+++ b/.gitignore
@ -94,3 +94,5 @@ dkms.conf
 *.out
 *.app

+#visual studio
+.vs
--- a/build.bat
+++ b/build.bat
@ -4,16 +4,31 @@ ctime -begin timeBuild.ctm

@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
@rem set CommonCompilerFlags="/nologo /Zi /FC"
-set CommonCompilerFlags=/nologo /Zi /FC 
+set CommonCompilerFlags=/nologo /Zi /FC /Od
@rem /WX /W4 /wd4201 /wd4100 /wd4189 /wd4244 /wd4127 /wd4456
+@rem
+@rem
+
+set cuda_root=D:/lib/cudatoolkit/lib/x64
+
+set mkl_root=D:/lib/oneAPI_mkl/mkl/2021.3.0
+set mkl_core=%mkl_root%/lib/intel64/mkl_core.lib
+set mkl_intel_lp64=%mkl_root%/lib/intel64/mkl_intel_lp64.lib
+set mkl_intel_thread=%mkl_root%/lib/intel64/mkl_intel_thread.lib
+set MKLCOMPILER=D:/lib/oneAPI_mkl/compiler/2021.3.0/windows/compiler
+set libiomp5md=%MKLCOMPILER%/lib/intel64_win/libiomp5md.lib
+
 set CudaSources=../src/cuda_entry_point.cu
+set CudaObj=cuda_entry_point.obj
 set Sources=../src/main.c
-set CudaLib=D:/lib/cudatoolkit/lib/x64
-set CudaSolver=%CudaLib%/cusolver.lib
+
 IF NOT EXIST .\build mkdir .\build
 pushd .\build
-cl %CommonCompilerFlags% %Sources% -Feprogram.exe
-@rem nvcc -Xcompiler %CommonCompilerFlags% -o program.exe %Sources% %CudaSources% -lcusolver
+rem cl /c %CommonCompilerFlags% %Sources% /I"%mkl_root%\include"
+rem FOR MSVC /link %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md% 
+nvcc -c %CudaSources% -o %CudaObj% -g -G -lcusolver
+rem nvcc -o program.exe -lcusolver -L%mkl_root%/lib/intel64 -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -l%MKLCOMPILER%/lib/intel64_win/libiomp5md
+cl %CommonCompilerFlags% %Sources% /I"%mkl_root%\include" /link %CudaObj% %cuda_root%/cudart.lib %cuda_root%/cusolver.lib %mkl_core% %mkl_intel_lp64% %mkl_intel_thread% %libiomp5md% 
 set LastError=%ERRORLEVEL%
 popd

--- a/build/main.rdi
+++ b/build/main.rdi
--- a/build/program.rdi
+++ b/build/program.rdi
--- a/eigsol_gpu.sublime-project
+++ b/eigsol_gpu.sublime-project
@ -0,0 +1,8 @@
+{
+	"folders":
+	[
+		{
+			"path": "."
+		}
+	]
+}
--- a/eigsol_gpu.sublime-workspace
+++ b/eigsol_gpu.sublime-workspace
--- a/external/cusolverDn.h
+++ b/external/cusolverDn.h
--- a/external/cusolverMg.h
+++ b/external/cusolverMg.h
@ -0,0 +1,318 @@
+/*
+ * Copyright 2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERMG_H_)
+  #define CUSOLVERMG_H_
+
+  #include <stdint.h>
+  #include "cusolverDn.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  struct cusolverMgContext;
+  typedef struct cusolverMgContext *cusolverMgHandle_t;
+
+  /**
+   * \beief This enum decides how 1D device Ids (or process ranks) get mapped to
+   * a 2D grid.
+   */
+  typedef enum {
+
+    CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
+    CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
+
+  } cusolverMgGridMapping_t;
+
+  /** \brief Opaque structure of the distributed grid */
+  typedef void *cudaLibMgGrid_t;
+  /** \brief Opaque structure of the distributed matrix descriptor */
+  typedef void *cudaLibMgMatrixDesc_t;
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreate(cusolverMgHandle_t *handle);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgDestroy(cusolverMgHandle_t handle);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgDeviceSelect(
+    cusolverMgHandle_t handle,
+    int                nbDevices,
+    int                deviceId[]);
+
+  /**
+   * \brief Allocates resources related to the shared memory device grid.
+   * \param[out] grid the opaque data strcuture that holds the grid
+   * \param[in] numRowDevices number of devices in the row
+   * \param[in] numColDevices number of devices in the column
+   * \param[in] deviceId This array of size height * width stores the
+   *            device-ids of the 2D grid; each entry must correspond to a valid
+   * gpu or to -1 (denoting CPU). \param[in] mapping whether the 2D grid is in
+   * row/column major \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreateDeviceGrid(
+    cudaLibMgGrid_t *       grid,
+    int32_t                 numRowDevices,
+    int32_t                 numColDevices,
+    const int32_t           deviceId[],
+    cusolverMgGridMapping_t mapping);
+
+  /**
+   * \brief Releases the allocated resources related to the distributed grid.
+   * \param[in] grid the opaque data strcuture that holds the distributed grid
+   * \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgDestroyGrid(cudaLibMgGrid_t grid);
+
+  /**
+   * \brief Allocates resources related to the distributed matrix descriptor.
+   * \param[out] desc the opaque data strcuture that holds the descriptor
+   * \param[in] numRows number of total rows
+   * \param[in] numCols number of total columns
+   * \param[in] rowBlockSize row block size
+   * \param[in] colBlockSize column block size
+   * \param[in] dataType the data type of each element in cudaDataType
+   * \param[in] grid the opaque data structure of the distributed grid
+   * \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreateMatrixDesc(
+    cudaLibMgMatrixDesc_t *desc,
+    int64_t                numRows,
+    int64_t                numCols,
+    int64_t                rowBlockSize,
+    int64_t                colBlockSize,
+    cudaDataType           dataType,
+    const cudaLibMgGrid_t  grid);
+
+  /**
+   * \brief Releases the allocated resources related to the distributed matrix
+   * descriptor. \param[in] desc the opaque data strcuture that holds the
+   * descriptor \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverMgDestroyMatrixDesc(cudaLibMgMatrixDesc_t desc);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd_bufferSize(
+    cusolverMgHandle_t    handle,
+    cusolverEigMode_t     jobz,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                W,
+    cudaDataType          dataTypeW,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd(
+    cusolverMgHandle_t    handle,
+    cusolverEigMode_t     jobz,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                W,
+    cudaDataType          dataTypeW,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf_bufferSize(
+    cusolverMgHandle_t    handle,
+    int                   M,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf(
+    cusolverMgHandle_t    handle,
+    int                   M,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasOperation_t     TRANS,
+    int                   N,
+    int                   NRHS,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs(
+    cusolverMgHandle_t    handle,
+    cublasOperation_t     TRANS,
+    int                   N,
+    int                   NRHS,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   n,
+    int                   nrhs,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   n,
+    int                   nrhs,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotri_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotri(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // CUSOLVERMG_H_
--- a/external/cusolverRf.h
+++ b/external/cusolverRf.h
@ -0,0 +1,339 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERRF_H_)
+  #define CUSOLVERRF_H_
+
+  #include "driver_types.h"
+  #include "cuComplex.h"
+  #include "cusolver_common.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  /* CUSOLVERRF mode */
+  typedef enum {
+    CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default
+    CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
+  } cusolverRfResetValuesFastMode_t;
+
+  /* CUSOLVERRF matrix format */
+  typedef enum {
+    CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default
+    CUSOLVERRF_MATRIX_FORMAT_CSC = 1
+  } cusolverRfMatrixFormat_t;
+
+  /* CUSOLVERRF unit diagonal */
+  typedef enum {
+    CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default
+    CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
+    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
+    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
+  } cusolverRfUnitDiagonal_t;
+
+  /* CUSOLVERRF factorization algorithm */
+  typedef enum {
+    CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default
+    CUSOLVERRF_FACTORIZATION_ALG1 = 1,
+    CUSOLVERRF_FACTORIZATION_ALG2 = 2,
+  } cusolverRfFactorization_t;
+
+  /* CUSOLVERRF triangular solve algorithm */
+  typedef enum {
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
+  } cusolverRfTriangularSolve_t;
+
+  /* CUSOLVERRF numeric boost report */
+  typedef enum {
+    CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default
+    CUSOLVERRF_NUMERIC_BOOST_USED = 1
+  } cusolverRfNumericBoostReport_t;
+
+  /* Opaque structure holding CUSOLVERRF library common */
+  struct cusolverRfCommon;
+  typedef struct cusolverRfCommon* cusolverRfHandle_t;
+
+  /* CUSOLVERRF create (allocate memory) and destroy (free memory) in the handle
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverRfCreate(cusolverRfHandle_t* handle);
+  cusolverStatus_t CUSOLVERAPI cusolverRfDestroy(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF set and get input format */
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetMatrixFormat(
+    cusolverRfHandle_t        handle,
+    cusolverRfMatrixFormat_t* format,
+    cusolverRfUnitDiagonal_t* diag);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetMatrixFormat(
+    cusolverRfHandle_t       handle,
+    cusolverRfMatrixFormat_t format,
+    cusolverRfUnitDiagonal_t diag);
+
+  /* CUSOLVERRF set and get numeric properties */
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetNumericProperties(
+    cusolverRfHandle_t handle,
+    double             zero,
+    double             boost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericProperties(
+    cusolverRfHandle_t handle,
+    double*            zero,
+    double*            boost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericBoostReport(
+    cusolverRfHandle_t              handle,
+    cusolverRfNumericBoostReport_t* report);
+
+  /* CUSOLVERRF choose the triangular solve algorithm */
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetAlgs(
+    cusolverRfHandle_t          handle,
+    cusolverRfFactorization_t   factAlg,
+    cusolverRfTriangularSolve_t solveAlg);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetAlgs(
+    cusolverRfHandle_t           handle,
+    cusolverRfFactorization_t*   factAlg,
+    cusolverRfTriangularSolve_t* solveAlg);
+
+  /* CUSOLVERRF set and get fast mode */
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetResetValuesFastMode(
+    cusolverRfHandle_t               handle,
+    cusolverRfResetValuesFastMode_t* fastMode);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetResetValuesFastMode(
+    cusolverRfHandle_t              handle,
+    cusolverRfResetValuesFastMode_t fastMode);
+
+  /*** Non-Batched Routines ***/
+  /* CUSOLVERRF setup of internal structures from host or device memory */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSetupHost(/* Input (in the host memory) */
+                        int     n,
+                        int     nnzA,
+                        int*    h_csrRowPtrA,
+                        int*    h_csrColIndA,
+                        double* h_csrValA,
+                        int     nnzL,
+                        int*    h_csrRowPtrL,
+                        int*    h_csrColIndL,
+                        double* h_csrValL,
+                        int     nnzU,
+                        int*    h_csrRowPtrU,
+                        int*    h_csrColIndU,
+                        double* h_csrValU,
+                        int*    h_P,
+                        int*    h_Q,
+                        /* Output */
+                        cusolverRfHandle_t handle);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSetupDevice(/* Input (in the device memory) */
+                          int     n,
+                          int     nnzA,
+                          int*    csrRowPtrA,
+                          int*    csrColIndA,
+                          double* csrValA,
+                          int     nnzL,
+                          int*    csrRowPtrL,
+                          int*    csrColIndL,
+                          double* csrValL,
+                          int     nnzU,
+                          int*    csrRowPtrU,
+                          int*    csrColIndU,
+                          double* csrValU,
+                          int*    P,
+                          int*    Q,
+                          /* Output */
+                          cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF update the matrix values (assuming the reordering, pivoting
+     and consequently the sparsity pattern of L and U did not change),
+     and zero out the remaining values. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfResetValues(/* Input (in the device memory) */
+                          int     n,
+                          int     nnzA,
+                          int*    csrRowPtrA,
+                          int*    csrColIndA,
+                          double* csrValA,
+                          int*    P,
+                          int*    Q,
+                          /* Output */
+                          cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF analysis (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI cusolverRfAnalyze(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF re-factorization (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI cusolverRfRefactor(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF extraction: Get L & U packed into a single matrix M */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfAccessBundledFactorsDevice(/* Input */
+                                         cusolverRfHandle_t handle,
+                                         /* Output (in the host memory) */
+                                         int* nnzM,
+                                         /* Output (in the device memory) */
+                                         int**    Mp,
+                                         int**    Mi,
+                                         double** Mx);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfExtractBundledFactorsHost(/* Input */
+                                        cusolverRfHandle_t handle,
+                                        /* Output (in the host memory) */
+                                        int*     h_nnzM,
+                                        int**    h_Mp,
+                                        int**    h_Mi,
+                                        double** h_Mx);
+
+  /* CUSOLVERRF extraction: Get L & U individually */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfExtractSplitFactorsHost(/* Input */
+                                      cusolverRfHandle_t handle,
+                                      /* Output (in the host memory) */
+                                      int*     h_nnzL,
+                                      int**    h_csrRowPtrL,
+                                      int**    h_csrColIndL,
+                                      double** h_csrValL,
+                                      int*     h_nnzU,
+                                      int**    h_csrRowPtrU,
+                                      int**    h_csrColIndU,
+                                      double** h_csrValU);
+
+  /* CUSOLVERRF (forward and backward triangular) solves */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSolve(/* Input (in the device memory) */
+                    cusolverRfHandle_t handle,
+                    int*               P,
+                    int*               Q,
+                    int                nrhs, // only nrhs=1 is supported
+                    double*            Temp, // of size ldt*nrhs (ldt>=n)
+                    int                ldt,
+                    /* Input/Output (in the device memory) */
+                    double* XF,
+                    /* Input */
+                    int ldxf);
+
+  /*** Batched Routines ***/
+  /* CUSOLVERRF-batch setup of internal structures from host */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchSetupHost(/* Input (in the host memory)*/
+                             int     batchSize,
+                             int     n,
+                             int     nnzA,
+                             int*    h_csrRowPtrA,
+                             int*    h_csrColIndA,
+                             double* h_csrValA_array[],
+                             int     nnzL,
+                             int*    h_csrRowPtrL,
+                             int*    h_csrColIndL,
+                             double* h_csrValL,
+                             int     nnzU,
+                             int*    h_csrRowPtrU,
+                             int*    h_csrColIndU,
+                             double* h_csrValU,
+                             int*    h_P,
+                             int*    h_Q,
+                             /* Output (in the device memory) */
+                             cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch update the matrix values (assuming the reordering,
+     pivoting and consequently the sparsity pattern of L and U did not change),
+     and zero out the remaining values. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchResetValues(/* Input (in the device memory) */
+                               int     batchSize,
+                               int     n,
+                               int     nnzA,
+                               int*    csrRowPtrA,
+                               int*    csrColIndA,
+                               double* csrValA_array[],
+                               int*    P,
+                               int*    Q,
+                               /* Output */
+                               cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch analysis (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchAnalyze(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch re-factorization (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchRefactor(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch (forward and backward triangular) solves */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchSolve(/* Input (in the device memory) */
+                         cusolverRfHandle_t handle,
+                         int*               P,
+                         int*               Q,
+                         int                nrhs, // only nrhs=1 is supported
+                         double* Temp, // of size 2*batchSize*(n*nrhs)
+                         int     ldt,  // only ldt=n is supported
+                         /* Input/Output (in the device memory) */
+                         double* XF_array[],
+                         /* Input */
+                         int ldxf);
+
+  /* CUSOLVERRF-batch obtain the position of zero pivot */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchZeroPivot(/* Input */
+                             cusolverRfHandle_t handle,
+                             /* Output (in the host memory) */
+                             int* position);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif /* CUSOLVERRF_H_ */
--- a/external/cusolverSp.h
+++ b/external/cusolverSp.h
@ -0,0 +1,923 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERSP_H_)
+  #define CUSOLVERSP_H_
+
+  #include "cusparse.h"
+  #include "cublas_v2.h"
+  #include "cusolver_common.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  struct cusolverSpContext;
+  typedef struct cusolverSpContext *cusolverSpHandle_t;
+
+  struct csrqrInfo;
+  typedef struct csrqrInfo *csrqrInfo_t;
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle);
+  cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle);
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpSetStream(cusolverSpHandle_t handle, cudaStream_t streamId);
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpGetStream(cusolverSpHandle_t handle, cudaStream_t *streamId);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrissymHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrEndPtrA,
+    const int *              csrColIndA,
+    int *                    issym);
+
+  /* -------- GPU linear solver by LU factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [lu] stands for LU factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- GPU linear solver by QR factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- CPU linear solver by QR factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- CPU linear solver by Cholesky factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [chol] stands for Cholesky factorization
+   *
+   * Only works for symmetric positive definite matrix.
+   * The upper part of A is ignored.
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- GPU linear solver by Cholesky factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [chol] stands for Cholesky factorization
+   *
+   * Only works for symmetric positive definite matrix.
+   * The upper part of A is ignored.
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    // output
+    float *x,
+    int *  singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    // output
+    double *x,
+    int *   singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    // output
+    cuComplex *x,
+    int *      singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    // output
+    cuDoubleComplex *x,
+    int *            singularity);
+
+  /* ----------- CPU least square solver by QR factorization
+   *       solve min|b - A*x|
+   * [lsq] stands for least square
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int *                    rankA,
+    float *                  x,
+    int *                    p,
+    float *                  min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int *                    rankA,
+    double *                 x,
+    int *                    p,
+    double *                 min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int *                    rankA,
+    cuComplex *              x,
+    int *                    p,
+    float *                  min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int *                    rankA,
+    cuDoubleComplex *        x,
+    int *                    p,
+    double *                 min_norm);
+
+  /* --------- CPU eigenvalue solver by shift inverse
+   *      solve A*x = lambda * x
+   *   where lambda is the eigenvalue nearest mu0.
+   * [eig] stands for eigenvalue solver
+   * [si] stands for shift-inverse
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu0,
+    const float *            x0,
+    int                      maxite,
+    float                    tol,
+    float *                  mu,
+    float *                  x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu0,
+    const double *           x0,
+    int                      maxite,
+    double                   tol,
+    double *                 mu,
+    double *                 x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu0,
+    const cuComplex *        x0,
+    int                      maxite,
+    float                    tol,
+    cuComplex *              mu,
+    cuComplex *              x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu0,
+    const cuDoubleComplex *  x0,
+    int                      maxite,
+    double                   tol,
+    cuDoubleComplex *        mu,
+    cuDoubleComplex *        x);
+
+  /* --------- GPU eigenvalue solver by shift inverse
+   *      solve A*x = lambda * x
+   *   where lambda is the eigenvalue nearest mu0.
+   * [eig] stands for eigenvalue solver
+   * [si] stands for shift-inverse
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu0,
+    const float *            x0,
+    int                      maxite,
+    float                    eps,
+    float *                  mu,
+    float *                  x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu0,
+    const double *           x0,
+    int                      maxite,
+    double                   eps,
+    double *                 mu,
+    double *                 x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu0,
+    const cuComplex *        x0,
+    int                      maxite,
+    float                    eps,
+    cuComplex *              mu,
+    cuComplex *              x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu0,
+    const cuDoubleComplex *  x0,
+    int                      maxite,
+    double                   eps,
+    cuDoubleComplex *        mu,
+    cuDoubleComplex *        x);
+
+  // ----------- enclosed eigenvalues
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                left_bottom_corner,
+    cuComplex                right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          left_bottom_corner,
+    cuDoubleComplex          right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                left_bottom_corner,
+    cuComplex                right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          left_bottom_corner,
+    cuDoubleComplex          right_upper_corner,
+    int *                    num_eigs);
+
+  /* --------- CPU symrcm
+   *   Symmetric reverse Cuthill McKee permutation
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymrcmHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU symmdq
+   *   Symmetric minimum degree algorithm by quotient graph
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymmdqHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU symmdq
+   *   Symmetric Approximate minimum degree algorithm by quotient graph
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymamdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU metis
+   *   symmetric reordering
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrmetisndHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const int64_t *          options,
+    int *                    p);
+
+  /* --------- CPU zfd
+   *  Zero free diagonal reordering
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  /* --------- CPU permuation
+   *   P*A*Q^T
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrperm_bufferSizeHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const int *              p,
+    const int *              q,
+    size_t *                 bufferSizeInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrpermHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    int *                    csrRowPtrA,
+    int *                    csrColIndA,
+    const int *              p,
+    const int *              q,
+    int *                    map,
+    void *                   pBuffer);
+
+  /*
+   *  Low-level API: Batched QR
+   *
+   */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrqrInfo(csrqrInfo_t *info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrqrInfo(csrqrInfo_t info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float *                  x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double *                 x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    cuComplex *              x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    cuDoubleComplex *        x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // define CUSOLVERSP_H_
--- a/external/cusolverSp_LOWLEVEL_PREVIEW.h
+++ b/external/cusolverSp_LOWLEVEL_PREVIEW.h
--- a/src/cuda_entry_point.cu
+++ b/src/cuda_entry_point.cu
@ -1,28 +1,174 @@
 #include <stdio.h>
-#include "../external/cusolver_common.h"
-#define no_name_mangle extern "C"
+#include <windows.h>
+#define WIN32_LEAN_AND_MEAN

-__global__ void hello_from_GPU() {
-  printf("HELLO FROM GPUUUU");
+#include "../external/cusolver_common.h"
+#include "../external/cusolverDn.h"
+
+#ifdef __cplusplus
+# define no_name_mangle extern "C"
+#else
+# define no_name_mangle 
+#endif
+
+#define function static
+
+#define GLOBAL_LOG_BUFFER_SIZE 4096*4096
+
+__device__ int last_index;
+__device__ char *device_log_buffer;
+char *log_buffer;
+
+#define D_LOG(msg) d_debug_printf((msg))
+//#define D_LOGF(msg, ...) d_debug_printfv((msg), __VA_ARGS__)
+#define H_LOG(msg) h_debug_printf((msg))
+#define H_LOGF(msg, ...) h_debug_printfv((msg), __VA_ARGS__)
+
+__host__ function void h_debug_printf(const char *msg); 
+__host__ function void h_debug_printfv(const char *fmt, ...); 
+__device__ function void d_debug_printf(const char *msg);
+//__device__ function void d_debug_printfv(const char *fmt, ...);
+
+__global__ void init_device_logging()
+{
+  last_index = 0;
 }

-void check_cusolver_version() {
+
+__host__ function void
+h_debug_printf(const char* msg) 
+{
+  OutputDebugStringA(msg);
+}
+
+__host__ function void
+h_debug_printfv(const char *fmt, ...)
+{
+  char buffer[2048] = {0};
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(&buffer[0], sizeof(buffer), fmt, args);
+  va_end(args);
+  OutputDebugStringA(buffer);
+}
+
+__device__ function void
+d_debug_printf(const char *msg) {
+  int i = 0;
+  while(msg[i] != '\0')
+  {
+    device_log_buffer[last_index + i] = msg[i];
+    i += 1;
+  }
+  last_index = last_index + 1;
+
+}
+
+
+
+__global__ void hello_from_GPU()
+{
+
+  D_LOG("HELLO FROM GPUUUU\n");
+}
+
+void
+check_cusolver_version() 
+{

  int major=-1, minor=-1, patch=-1;
  cusolverGetProperty(MAJOR_VERSION, &major);
  cusolverGetProperty(MINOR_VERSION, &minor);
  cusolverGetProperty(PATCH_LEVEL, &patch);
-  printf("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);
+  H_LOGF("CUSOLVER Version (Major,Minor,PatchLevel): %d.%d.%d\n", major,minor,patch);

 }



-no_name_mangle void cuda_entry_point() {
-
+no_name_mangle void 
+cuda_entry_point()
+{
+  H_LOG("\n\n-- Entered CUDA code:\n\n");
+  char *temp_device_log_buffer;
+  cudaMalloc((void**)&temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
+  cudaMemcpyToSymbol(device_log_buffer, &temp_device_log_buffer, sizeof(char *));
  check_cusolver_version();

-  hello_from_GPU<<<1, 1>>>();
-
+  init_device_logging<<<1, 1>>>();
  cudaDeviceSynchronize();
+  hello_from_GPU<<<1, 1>>>();
+  cudaDeviceSynchronize();
+
+
+
+  log_buffer = (char *)malloc(GLOBAL_LOG_BUFFER_SIZE * sizeof(char));
+  cudaMemcpy(log_buffer, temp_device_log_buffer, GLOBAL_LOG_BUFFER_SIZE * sizeof(char), cudaMemcpyDeviceToHost);
+
+  // Print all messages from cuda.
+  H_LOG(log_buffer);
+
+  H_LOG("\n\n Testing cuSolver Eigenvalue problem \n\n");
+  
+  const int size = 5;
+  const int mat_size = size*size;
+  double A[mat_size] = {
+    6.39,   0.13,  -8.23,   5.71,  -3.18,
+   0.13,   8.37,  -4.46,  -6.10,   7.21,
+   -8.23,  -4.46,  -9.58,  -9.25,  -7.42,
+   5.71,  -6.10,  -9.25,   3.72,   8.54,
+   -3.18,   7.21,  -7.42,   8.54,   2.51};
+
+  double *device_mat;
+  cudaMalloc((void**)&device_mat, mat_size * sizeof(double));
+  cudaMemcpy(device_mat, A, mat_size * sizeof(double), cudaMemcpyHostToDevice);
+
+  cusolverDnHandle_t cusolverH = 0;
+  cusolverDnCreate(&cusolverH);
+
+  double W[size];
+  double *d_W;
+  cudaMalloc((void**)&d_W, size * sizeof(double));
+
+  int lwork;
+  cusolverDnDsyevd_bufferSize(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
+      size, device_mat, size, d_W, &lwork
+      );
+
+  double *d_work;
+  cudaMalloc((void**)&d_work, lwork * sizeof(double));
+
+  int *device_info;
+  cudaMalloc((void**)&device_info, sizeof(int));
+
+  cusolverDnDsyevd(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER,
+      size, device_mat, size, d_W, d_work, lwork, device_info
+      );
+  
+  double out_mat[mat_size];
+  cudaMemcpy(W, d_W, size * sizeof(double), cudaMemcpyDeviceToHost);
+  cudaMemcpy(out_mat, device_mat, mat_size * sizeof(double), cudaMemcpyDeviceToHost);
+
+  H_LOG("Results on GPU dsyevd:\n");
+  H_LOG("Eigenvalues:\n");
+  for(int i = 0; i < size; i++)
+  {
+    H_LOGF("%6.2f\n", W[i]);
+  }
+  H_LOG("\n");
+  H_LOG("Eigenvectors (columnwise):\n");
+  for(int i = 0; i < size; i++)
+  {
+    for(int j = 0; j < size; j++)
+    {
+      H_LOGF("%6.2f ", out_mat[j * size + i]); 
+    }
+    H_LOG("\n");
+  }
+
+
+
+  cudaFree(temp_device_log_buffer);
 }
--- a/src/main.c
+++ b/src/main.c
@ -14,7 +14,7 @@
 #include "base/base_inc.c"
 #include "os/os_inc.c"
 #include "os/os_entry_point.c"
-
+#include "test_mkl.c"

 #define grid_file_path_bin "D:\\dev\\eigsol_gpu\\out\\grid.bin"
 #define grid_file_path "D:\\dev\\eigsol_gpu\\out\\grid.dat"
@ -53,8 +53,6 @@ struct BSplineCtx
  F64 *bsplines;
 };

-
-
 //~ Globals
 global Grid g_grid = {0};
 global BSplineCtx g_bspline_ctx = {0};
@ -205,11 +203,11 @@ set_up_bspline_context(Arena* arena)
 {
  // Create knotpoint sequence.
  U32 k = 4;
-  U32 N = 14;
+  U32 bspl_N = 14;
  g_bspline_ctx.order = k; 
-  g_bspline_ctx.num_knotpoints = N; 
-  g_bspline_ctx.num_bsplines = N-k;
-  g_bspline_ctx.num_phys_points = N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
+  g_bspline_ctx.num_knotpoints = bspl_N; 
+  g_bspline_ctx.num_bsplines = bspl_N-k;
+  g_bspline_ctx.num_phys_points = bspl_N-(2*k)+2; // Remove k points at each end, and then add back the first and last points.
  g_bspline_ctx.arena = arena;
  g_bspline_ctx.knotpoints = PushArray(arena, F64, g_bspline_ctx.num_knotpoints);
  // Set up physical points;
@ -308,36 +306,37 @@ write_bsplines_to_matrix_F64(Arena *arena)
    str8_list_push(scratch.arena, &bspline_array_list, row_joined);
  }

-
  write_string_list_to_file(scratch.arena, str8_lit(bspline_array_file_path), &bspline_array_list);

-
-
-
-
 }


-  function
-void EntryPoint() 
+function
+void EntryPoint(void) 
 {
  OS_InitReceipt os_receipt = OS_init();
  OS_InitGfxReceipt os_gfx_receipt = OS_gfx_init(os_receipt);

  Arena *arena = m_make_arena();

+  //- Set up grid and write to file.
  set_up_grid(arena);
-
  write_array_binary_F64(str8_lit(grid_file_path_bin), g_grid.points, g_grid.num_steps);
  write_array_F64(str8_lit(grid_file_path), g_grid.points, g_grid.num_steps, "%13.6e\n");

+  //- The BSpline context is the knotpoints and the BSpline order etc.
  set_up_bspline_context(arena);
+  write_array_F64(str8_lit(knotpoints_file_path), 
+      g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints,
+      "%13.6e\n");

-
-  write_array_F64(str8_lit(knotpoints_file_path), g_bspline_ctx.knotpoints, g_bspline_ctx.num_knotpoints, "%13.6e\n");
-
+  //- Then we generate the BSplines and save them off for reference and debugging.
  write_bsplines_to_matrix_F64(arena);

+  //- Test MKL
+  test_mkl_zgeev();
+  test_mkl_dsyevd();
+  cuda_entry_point();

 }

--- a/src/test_mkl.c
+++ b/src/test_mkl.c
@ -0,0 +1,272 @@
+
+/*******************************************************************************
+ * Copyright 2009-2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted  materials,  and
+ * your use of  them is  governed by the  express license  under which  they were
+ * provided to you (License).  Unless the License provides otherwise, you may not
+ * use, modify, copy, publish, distribute,  disclose or transmit this software or
+ * the related documents without Intel's prior written permission.
+ *
+ * This software and the related documents  are provided as  is,  with no express
+ * or implied  warranties,  other  than those  that are  expressly stated  in the
+ * License.
+ *******************************************************************************/
+
+/*
+   ZGEEV Example.
+   ==============
+
+   Program computes the eigenvalues and left and right eigenvectors of a general
+   rectangular matrix A:
+
+   ( -3.84,  2.25) ( -8.94, -4.75) (  8.95, -6.53) ( -9.87,  4.82)
+   ( -0.66,  0.83) ( -4.40, -3.82) ( -3.50, -4.26) ( -3.15,  7.36)
+   ( -3.99, -4.73) ( -5.88, -6.60) ( -3.36, -0.40) ( -0.75,  5.23)
+   (  7.74,  4.18) (  3.66, -7.53) (  2.58,  3.60) (  4.59,  5.41)
+
+   Description.
+   ============
+
+   The routine computes for an n-by-n complex nonsymmetric matrix A, the
+   eigenvalues and, optionally, the left and/or right eigenvectors. The right
+   eigenvector v(j) of A satisfies
+
+   A*v(j)= lambda(j)*v(j)
+
+   where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies
+
+   u(j)H*A = lambda(j)*u(j)H
+
+   where u(j)H denotes the conjugate transpose of u(j). The computed
+   eigenvectors are normalized to have Euclidean norm equal to 1 and
+   largest component real.
+
+   Example Program Results.
+   ========================
+
+   ZGEEV Example Program Results
+
+   Eigenvalues
+   ( -9.43,-12.98) ( -3.44, 12.69) (  0.11, -3.40) (  5.76,  7.13)
+
+   Left eigenvectors
+   (  0.24, -0.18) (  0.61,  0.00) ( -0.18, -0.33) (  0.28,  0.09)
+   (  0.79,  0.00) ( -0.05, -0.27) (  0.82,  0.00) ( -0.55,  0.16)
+   (  0.22, -0.27) ( -0.21,  0.53) ( -0.37,  0.15) (  0.45,  0.09)
+   ( -0.02,  0.41) (  0.40, -0.24) (  0.06,  0.12) (  0.62,  0.00)
+
+   Right eigenvectors
+   (  0.43,  0.33) (  0.83,  0.00) (  0.60,  0.00) ( -0.31,  0.03)
+   (  0.51, -0.03) (  0.08, -0.25) ( -0.40, -0.20) (  0.04,  0.34)
+   (  0.62,  0.00) ( -0.25,  0.28) ( -0.09, -0.48) (  0.36,  0.06)
+   ( -0.23,  0.11) ( -0.10, -0.32) ( -0.43,  0.13) (  0.81,  0.00)
+   */
+#include <stdlib.h>
+#include <stdio.h>
+#include <mkl.h>
+
+/* Auxiliary routines prototypes */
+function void 
+print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda );
+
+#ifndef TEST_MKL_N
+#define TEST_MKL_N 4
+#endif
+
+/* Main program */
+static void
+test_mkl_zgeev(void) {
+  /* Locals */
+  int N = TEST_MKL_N;
+  MKL_INT n = N, lda = N, ldvl = N, ldvr = N, info, lwork;
+  MKL_Complex16 wkopt;
+  MKL_Complex16* work;
+  /* Local arrays */
+  /* rwork dimension should be at least 2*n */
+  double rwork[2*TEST_MKL_N];
+  MKL_Complex16 w[TEST_MKL_N], vl[TEST_MKL_N*TEST_MKL_N], vr[TEST_MKL_N*TEST_MKL_N];
+  MKL_Complex16 a[TEST_MKL_N*TEST_MKL_N] = {
+    {-3.84,  2.25}, {-0.66,  0.83}, {-3.99, -4.73}, { 7.74,  4.18},
+    {-8.94, -4.75}, {-4.40, -3.82}, {-5.88, -6.60}, { 3.66, -7.53},
+    { 8.95, -6.53}, {-3.50, -4.26}, {-3.36, -0.40}, { 2.58,  3.60},
+    {-9.87,  4.82}, {-3.15,  7.36}, {-0.75,  5.23}, { 4.59,  5.41}
+  };
+  /* Executable statements */
+  OutputDebugString( " ZGEEV Example Program Results\n" );
+  /* Query and allocate the optimal workspace */
+  lwork = -1;
+  zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
+      &wkopt, &lwork, rwork, &info );
+  lwork = (MKL_INT)wkopt.real;
+  work = (MKL_Complex16*)malloc( lwork*sizeof(MKL_Complex16) );
+  /* Solve eigenproblem */
+  zgeev( "Vectors", "Vectors", &n, a, &lda, w, vl, &ldvl, vr, &ldvr,
+      work, &lwork, rwork, &info );
+  /* Check for convergence */
+  if( info > 0 ) {
+    OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
+    exit( 1 );
+  }
+  /* Print eigenvalues */
+  print_matrix_cmplx16( "Eigenvalues", 1, n, w, 1 );
+  /* Print left eigenvectors */
+  print_matrix_cmplx16( "Left eigenvectors", n, n, vl, ldvl );
+  /* Print right eigenvectors */
+  print_matrix_cmplx16( "Right eigenvectors", n, n, vr, ldvr );
+  /* Free workspace */
+  free( (void*)work );
+  //exit( 0 );
+} /* End of ZGEEV Example */
+
+/* Auxiliary routine: printing a matrix */
+function void
+print_matrix_cmplx16( char* desc, int m, int n, MKL_Complex16* a, int lda ) {
+  ArenaTemp scratch = scratch_get(0,0);
+  int i, j;
+  String8 newline = str8_lit("\n");
+  String8 header = str8_pushf(scratch.arena, "\n %s\n", desc );
+  OutputDebugString(header.str);
+  //printf("\n %s \n", desc);
+  for( i = 0; i < m; i++ ) {
+    for( j = 0; j < n; j++ ) {
+      String8 outstr = str8_pushf(scratch.arena, " (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
+      OutputDebugString(outstr.str);
+      //printf(" (%6.2f,%6.2f)", a[i+j*lda].real, a[i+j*lda].imag );
+    }
+    OutputDebugString(newline.str);
+    //printf("\n");
+  }
+}
+
+/*******************************************************************************
+ * Copyright 2009-2021 Intel Corporation.
+ *
+ * This software and the related documents are Intel copyrighted  materials,  and
+ * your use of  them is  governed by the  express license  under which  they were
+ * provided to you (License).  Unless the License provides otherwise, you may not
+ * use, modify, copy, publish, distribute,  disclose or transmit this software or
+ * the related documents without Intel's prior written permission.
+ *
+ * This software and the related documents  are provided as  is,  with no express
+ * or implied  warranties,  other  than those  that are  expressly stated  in the
+ * License.
+ *******************************************************************************/
+
+/*
+   DSYEVD Example.
+   ==============
+
+   Program computes all eigenvalues and eigenvectors of a real symmetric
+   matrix A using divide and conquer algorithm, where A is:
+
+   6.39   0.13  -8.23   5.71  -3.18
+   0.13   8.37  -4.46  -6.10   7.21
+   -8.23  -4.46  -9.58  -9.25  -7.42
+   5.71  -6.10  -9.25   3.72   8.54
+   -3.18   7.21  -7.42   8.54   2.51
+
+   Description.
+   ============
+
+   The routine computes all eigenvalues and, optionally, eigenvectors of an
+   n-by-n real symmetric matrix A. The eigenvector v(j) of A satisfies
+
+   A*v(j) = lambda(j)*v(j)
+
+   where lambda(j) is its eigenvalue. The computed eigenvectors are
+   orthonormal.
+   If the eigenvectors are requested, then this routine uses a divide and
+   conquer algorithm to compute eigenvalues and eigenvectors.
+
+   Example Program Results.
+   ========================
+
+   DSYEVD Example Program Results
+
+   Eigenvalues
+   -17.44 -11.96   6.72  14.25  19.84
+
+   Eigenvectors (stored columnwise)
+   -0.26   0.31  -0.74   0.33   0.42
+   -0.17  -0.39  -0.38  -0.80   0.16
+   -0.89   0.04   0.09   0.03  -0.45
+   -0.29  -0.59   0.34   0.31   0.60
+   -0.19   0.63   0.44  -0.38   0.48
+   */
+#include <stdlib.h>
+#include <stdio.h>
+#include <mkl.h>
+
+/* Auxiliary routines prototypes */
+extern void print_matrix( char* desc, int m, int n, double* a, int lda );
+
+/* Parameters */
+#define N 5
+#define LDA N
+
+/* Main program */
+function void 
+test_mkl_dsyevd(void) {
+  /* Locals */
+  MKL_INT n = N, lda = LDA, info, lwork, liwork;
+  MKL_INT iwkopt;
+  MKL_INT* iwork;
+  double wkopt;
+  double* work;
+  /* Local arrays */
+  double w[N];
+  double a[LDA*N] = {
+    6.39,  0.00,  0.00,  0.00,  0.00,
+    0.13,  8.37,  0.00,  0.00,  0.00,
+    -8.23, -4.46, -9.58,  0.00,  0.00,
+    5.71, -6.10, -9.25,  3.72,  0.00,
+    -3.18,  7.21, -7.42,  8.54,  2.51
+  };
+  /* Executable statements */
+  OutputDebugString( " DSYEVD Example Program Results\n" );
+  /* Query and allocate the optimal workspace */
+  lwork = -1;
+  liwork = -1;
+  dsyevd( "Vectors", "Upper", &n, a, &lda, w, &wkopt, &lwork, &iwkopt,
+      &liwork, &info );
+  lwork = (MKL_INT)wkopt;
+  work = (double*)malloc( lwork*sizeof(double) );
+  liwork = iwkopt;
+  iwork = (MKL_INT*)malloc( liwork*sizeof(MKL_INT) );
+  /* Solve eigenproblem */
+  dsyevd( "Vectors", "Upper", &n, a, &lda, w, work, &lwork, iwork,
+      &liwork, &info );
+  /* Check for convergence */
+  if( info > 0 ) {
+    OutputDebugString( "The algorithm failed to compute eigenvalues.\n" );
+    exit( 1 );
+  }
+  /* Print eigenvalues */
+  print_matrix( "Eigenvalues", 1, n, w, 1 );
+  /* Print eigenvectors */
+  print_matrix( "Eigenvectors (stored columnwise)", n, n, a, lda );
+  /* Free workspace */
+  free( (void*)iwork );
+  free( (void*)work );
+} /* End of DSYEVD Example */
+
+/* Auxiliary routine: printing a matrix */
+void print_matrix( char* desc, int m, int n, double* a, int lda ) {
+  ArenaTemp scratch = scratch_get(0, 0);
+  int i, j;
+  //printf( "\n %s\n", desc );
+  String8 header = str8_pushf(scratch.arena, "\n %s\n", desc);
+  OutputDebugString(header.str);
+  for( i = 0; i < m; i++ ) {
+    for( j = 0; j < n; j++ ) 
+    {
+      String8 out = str8_pushf(scratch.arena, " %6.2f", a[i+j*lda] );
+      OutputDebugString(out.str);
+      //printf( " %6.2f", a[i+j*lda] );
+    }
+    OutputDebugString(str8_lit("\n").str);
+  }
+}
+
+
--- a/timeBuild.ctm
+++ b/timeBuild.ctm