C results

2026-03-15 18:26:40 +01:00 · 2026-03-15 18:26:40 +01:00 · 11ef165017
commit 11ef165017
parent da24f740ac
3 changed files with 356 additions and 51 deletions
--- a/build.bat
+++ b/build.bat
@ -15,6 +15,9 @@ set OUT=%~n1.exe
 pushd build
 del /F /Q *
 clang ../src/%SRC% -o %OUT% ^
+    -O3 ^
+    -march=native ^
+    -ffast-math ^
    -fopenmp ^
    -I"%MKL_ROOT%\include" ^
    -L"%MKL_ROOT%\lib" ^
--- a/cSparseResults.json
+++ b/cSparseResults.json
@ -0,0 +1,6 @@
+[
+  {"label":"FEM_3D_thermal2","rows":147900,"cols":147900,"nnz":3489300,"spmv_runs":32,"spmv_total_ns":3866100,"spmv_avg_ns":120815,"dense_rows":2048,"dense_cols":2048,"dense_runs":32,"dense_total_ns":957945100,"dense_avg_ns":29935784},
+  {"label":"ldoor","rows":952203,"cols":952203,"nnz":46522475,"spmv_runs":32,"spmv_total_ns":286909100,"spmv_avg_ns":8965909,"dense_rows":4096,"dense_cols":4096,"dense_runs":32,"dense_total_ns":7653100100,"dense_avg_ns":239159378},
+  {"label":"Cube_Coup_dt0","rows":2164760,"cols":2164760,"nnz":127206144,"spmv_runs":32,"spmv_total_ns":864947600,"spmv_avg_ns":27029612,"dense_rows":8192,"dense_cols":8192,"dense_runs":32,"dense_total_ns":59799139700,"dense_avg_ns":1868723115},
+  {"label":"nlpkkt200","rows":16240000,"cols":16240000,"nnz":448225632,"spmv_runs":32,"spmv_total_ns":2383754600,"spmv_avg_ns":74492331,"dense_rows":16384,"dense_cols":16384,"dense_runs":32,"dense_total_ns":471942950199,"dense_avg_ns":14748217193}
+]
--- a/src/main.c
+++ b/src/main.c
@ -4,16 +4,22 @@
 #include <string.h>
 #include <ctype.h>

+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
 #include "mkl.h"
 #include "mkl_spblas.h"

 typedef int32_t S32;
 typedef uint32_t U32;
+typedef int64_t S64;
+typedef uint64_t U64;
 typedef float F32;
 typedef double F64;
 typedef int32_t B32;

-static int g_spmv_runs = 16;
+static S32 g_spmv_runs = 32;
+static S32 g_dense_runs = 32;

 typedef struct CSRMatrix CSRMatrix;
 struct CSRMatrix {
@ -32,6 +38,37 @@ struct Triplet {
  double v;
 };

+// I just ahve a separate dense timing struct that the timing  
+// function fills out and then we transfer this information to the Timing struct
+// that is serialised to json
+typedef struct DenseTiming DenseTiming;
+struct DenseTiming {
+  S32 DenseRuns;
+  S32 DenseCols;
+  S32 DenseRows;
+  S64 DenseTotalNs;
+  S64 DenseAvgNs;
+};
+
+typedef struct Timing Timing;
+struct Timing {
+  char label[256];
+  S32 rows;
+  S32 cols;
+  S32 NNZ;
+
+  S32 SpMVRuns;
+  S64 SpMVTotalNs;
+  S64 SpMVAvgNs;
+
+  S32 DenseRows;
+  S32 DenseCols;
+
+  S32 DenseRuns;
+  S64 DenseTotalNs;
+  S64 DenseAvgNs;
+};
+
 static void panic(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
@ -51,21 +88,106 @@ static void *xcalloc(size_t count, size_t size) {
  return p;
 }

+static void matrix_label_from_path(const char *path, char *out, size_t out_size)
+{
+    const char *base = path;
+
+    const char *s1 = strrchr(path, '/');
+    const char *s2 = strrchr(path, '\\');
+
+    if (s1 && s1 >= base) base = s1 + 1;
+    if (s2 && s2 >= base) base = s2 + 1;
+
+    strncpy_s(out, out_size, base, _TRUNCATE);
+
+    char *ext = strrchr(out, '.');
+    if (ext && strcmp(ext, ".mtx") == 0) {
+        *ext = '\0';
+    }
+}
+
+static S64 now_ns(void)
+{
+    static LARGE_INTEGER freq;
+    static B32 initialized = 0;
+
+    if (!initialized) {
+        QueryPerformanceFrequency(&freq);
+        initialized = 1;
+    }
+
+    LARGE_INTEGER counter;
+    QueryPerformanceCounter(&counter);
+
+    return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
+}
+
+static void timing_write_json(FILE *fp, const Timing *t)
+{
+fprintf(fp,
+        "{"
+        "\"label\":\"%s\","
+        "\"rows\":%d,"
+        "\"cols\":%d,"
+        "\"nnz\":%d,"
+        "\"spmv_runs\":%d,"
+        "\"spmv_total_ns\":%lld,"
+        "\"spmv_avg_ns\":%lld,"
+        "\"dense_rows\":%d,"
+        "\"dense_cols\":%d,"
+        "\"dense_runs\":%d,"
+        "\"dense_total_ns\":%lld,"
+        "\"dense_avg_ns\":%lld"
+        "}",
+        t->label,
+        t->rows,
+        t->cols,
+        t->NNZ,
+        t->SpMVRuns,
+        (long long)t->SpMVTotalNs,
+        (long long)t->SpMVAvgNs,
+        t->DenseRows,
+        t->DenseCols,
+        t->DenseRuns,
+        (long long)t->DenseTotalNs,
+        (long long)t->DenseAvgNs
+    );
+}
+
+static void timings_write_json_file(
+    const char *path,
+    const Timing *timings,
+    int count)
+{
+    FILE *fp = NULL;
+
+    if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
+        panic("failed to open json file");
+
+    fprintf(fp, "[\n");
+
+    for (int i = 0; i < count; i++)
+    {
+        fprintf(fp, "  ");
+        timing_write_json(fp, &timings[i]);
+
+        if (i != count - 1)
+            fprintf(fp, ",");
+
+        fprintf(fp, "\n");
+    }
+
+    fprintf(fp, "]\n");
+
+    fclose(fp);
+}
+
 static char *trim_left(char *s) {
    while (*s && isspace((unsigned char)*s)) s++;
    return s;
 }

-/*
-static int starts_with(const char *s, const char *prefix) {
-    while (*prefix) {
-        if (*s++ != *prefix++) return 0;
-    }
-    return 1;
-}
-*/
-
-static int triplet_cmp(const void *a, const void *b) {
+static inline int triplet_cmp(const void *a, const void *b) {
    const Triplet *x = (const Triplet *)a;
    const Triplet *y = (const Triplet *)b;
    if (x->i < y->i) return -1;
@ -75,14 +197,17 @@ static int triplet_cmp(const void *a, const void *b) {
    return 0;
 }

-// Supports common Matrix Market coordinate cases:
-//   %%MatrixMarket matrix coordinate real general
-//   %%MatrixMarket matrix coordinate integer general
-//   %%MatrixMarket matrix coordinate pattern general
-//   %%MatrixMarket matrix coordinate real symmetric
+
+// Exact-semantics "fast" version of the slow Matrix Market reader.
+// Keeps:
+//   - general / symmetric handling
+//   - real / integer / pattern handling
+//   - duplicate summation via qsort + uniq
 //
-// For symmetric matrices, off-diagonal entries are mirrored.
-// Duplicate entries are summed during COO->CSR compaction.
+// Speeds up:
+//   - file I/O buffering
+//   - hot-loop parsing (strtol/strtod instead of sscanf_s)
+
 CSRMatrix read_matrix_market_to_csr(const char *path) {
    FILE *fp = NULL;
    errno_t err = fopen_s(&fp, path, "r");
@ -90,6 +215,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
        panic("failed to open Matrix Market file");
    }

+    // Bigger stdio buffer helps a lot for multi-GB files.
+    setvbuf(fp, NULL, _IOFBF, 1 << 22); // 4 MiB
+
    char line[4096];

    if (fgets(line, sizeof(line), fp) == NULL) {
@ -182,37 +310,50 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
            continue;
        }

-        int i = 0;
-        int j = 0;
+        char *p = s;
+        char *end = NULL;
+
+        long li = strtol(p, &end, 10);
+        if (end == p) {
+            fclose(fp);
+            free(trips);
+            panic("bad entry line: failed to parse row");
+        }
+        p = end;
+
+        long lj = strtol(p, &end, 10);
+        if (end == p) {
+            fclose(fp);
+            free(trips);
+            panic("bad entry line: failed to parse col");
+        }
+        p = end;
+
        double v = 1.0;

        if (is_pattern) {
-            scanned = sscanf_s(s, "%d %d", &i, &j);
-            if (scanned != 2) {
-                fclose(fp);
-                free(trips);
-                panic("bad pattern entry line");
-            }
+            // nothing else to parse
        } else if (is_integer) {
-            int iv = 0;
-            scanned = sscanf_s(s, "%d %d %d", &i, &j, &iv);
-            if (scanned != 3) {
+            long liv = strtol(p, &end, 10);
+            if (end == p) {
                fclose(fp);
                free(trips);
                panic("bad integer entry line");
            }
-            v = (double)iv;
+            v = (double)liv;
+            p = end;
        } else {
-            scanned = sscanf_s(s, "%d %d %lf", &i, &j, &v);
-            if (scanned != 3) {
+            v = strtod(p, &end);
+            if (end == p) {
                fclose(fp);
                free(trips);
                panic("bad real entry line");
            }
+            p = end;
        }

-        i -= 1;
-        j -= 1;
+        int i = (int)li - 1;
+        int j = (int)lj - 1;

        if (i < 0 || i >= rows || j < 0 || j >= cols) {
            fclose(fp);
@ -324,7 +465,8 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
    return H;
 }

-static void example_spmv(const CSRMatrix *A) {
+Timing timeSpMV(const CSRMatrix *A) {
+    Timing out = {0};
    sparse_matrix_t H = csr_to_mkl_handle(A);

    struct matrix_descr descr;
@ -333,16 +475,16 @@ static void example_spmv(const CSRMatrix *A) {
    descr.diag = SPARSE_DIAG_NON_UNIT;

    // Optional optimization path recommended by oneMKL.
-    mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, 1000);
+    mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, g_spmv_runs);
    mkl_sparse_optimize(H);

    double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
-    double *y = (double *)calloc((size_t)A->rows, sizeof(double));
-    if (!y) panic("out of memory");
+    double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));

    for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0;

-    for (int i = 0; i < g_spmv_runs; i += 1) {
+    // warmup
+    for(int i = 0; i < 2; i += 1) {
    sparse_status_t st = mkl_sparse_d_mv(
        SPARSE_OPERATION_NON_TRANSPOSE,
        1.0,
@ -355,25 +497,179 @@ static void example_spmv(const CSRMatrix *A) {
    if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
    }

+    S64 t0 = now_ns();
+    for (int i = 0; i < g_spmv_runs; i += 1) {
+    sparse_status_t st = mkl_sparse_d_mv(
+        SPARSE_OPERATION_NON_TRANSPOSE,
+        1.0,
+        H,
+        descr,
+        x,
+        0.0,
+        y
+    );
+    if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
+    }
+    S64 t1 = now_ns();
+    
+    S64 elapsed_ns = t1-t0;
+
+    out.rows = A->rows;
+    out.cols = A->cols;
+    out.NNZ = A->nnz;
+    out.SpMVRuns = g_spmv_runs;
+    out.SpMVTotalNs = elapsed_ns;
+    out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
+
    printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0));

+    F64 avg_ms = (F64)out.SpMVAvgNs/1e6;
+    printf("Average time for SpMV: %.3f ms \n", avg_ms);
+
    free(x);
    free(y);
+    
+
    mkl_sparse_destroy(H);
+
+
+    return out;
 }

-int main() {
-  //const char *thermal2Path = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx";
-  const char *bigPath = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx";
-  printf("Reading market matrix %s \n", bigPath);
+static DenseTiming timeDenseMatmul(int inRows) {
+  DenseTiming out = {0};
+
+  MKL_INT rows = inRows;
+  MKL_INT cols = inRows;
+
+  out.DenseRows = rows;
+  out.DenseCols = cols;
+
+  S64 byteReq = rows*cols*sizeof(F64);
+  F64 GBreq = (F64)byteReq/(1024.0 * 1024.0 * 1024.0);
+  printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq);
+
+  MKL_INT n = rows;
+
+ // Row-major dense matrices: C = A * B
+    F64 *left  = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
+    F64 *right = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
+    F64 *outMat = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
+
+    // Fill deterministically
+    for (MKL_INT i = 0; i < n * n; i++) {
+        left[i]  = (F64)((i % 13) + 1) * 0.1;
+        right[i] = (F64)((i % 17) + 1) * 0.1;
+        outMat[i] = 0.0;
+    }
+
+    // Warmup
+    cblas_dgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasNoTrans,
+        n, n, n,
+        1.0,
+        left, n,
+        right, n,
+        0.0,
+        outMat, n
+    );
+
+    S64 t0 = now_ns();
+
+    for (int i = 0; i < g_dense_runs; i += 1) {
+        cblas_dgemm(
+            CblasRowMajor,
+            CblasNoTrans,
+            CblasNoTrans,
+            n, n, n,
+            1.0,
+            left, n,
+            right, n,
+            0.0,
+            outMat, n
+        );
+    }
+
+    S64 t1 = now_ns();
+    S64 elapsed_ns = t1 - t0;
+
+    out.DenseRuns = g_dense_runs;
+    out.DenseTotalNs = elapsed_ns;
+    out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
+
+    printf("Dense matmul done for %d runs. C[0] = %.6g\n",
+           g_dense_runs, outMat[0]);
+
+    F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
+    printf("Average time for dense matmul: %.3f ms\n", avg_ms);
+
+    free(left);
+    free(right);
+    free(outMat);
+
+    return out;
+}
+
+Timing doSpMVTimings(const char *path) {
+  printf("Reading market matrix %s \n", path);
  CSRMatrix A;
-  A = read_matrix_market_to_csr(bigPath);
+  A = read_matrix_market_to_csr(path);
  printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz);
-  example_spmv(&A);
-
+  Timing out = timeSpMV(&A);
+  matrix_label_from_path(path, out.label, sizeof(out.label));
  free_csr(&A);
+  printf("Finished matrix %s \n", out.label);
+  return out;
+}



+int main() {
+
+  S32 numPaths = 4;
+  int denseRows[] = {2048, 4096, 4096*2, 4096*4};
+
+  const char *paths[] = {
+    "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
+    "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
+    "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
+    "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"
+  };
+
+// Sanity check for threading. 
+// Single thread gave avg 0.9 ms and 16 threads gave 0.14 msg avg
+#if 0
+  {
+    printf("Single thread:\n");
+    mkl_set_num_threads(1);
+    doTimings(paths[0]);
+
+    printf("16 threads:\n");
+    mkl_set_num_threads(16);
+    doTimings(paths[0]);
+  }
+#endif 
+
+  {
+    mkl_set_num_threads(16);   // pick your core count
+    mkl_set_dynamic(0);        // disable MKL changing thread count dynamically
+    printf("MKL max threads: %d\n", mkl_get_max_threads());
+  }
+
+  Timing timings[4];
+  for (S32 i = 0; i < numPaths; i += 1) {
+    timings[i] = doSpMVTimings(paths[i]);
+    DenseTiming dense = timeDenseMatmul(denseRows[i]);
+    timings[i].DenseRuns = dense.DenseRuns;
+    timings[i].DenseRows = dense.DenseRows;
+    timings[i].DenseCols = dense.DenseCols;
+    timings[i].DenseTotalNs = dense.DenseTotalNs;
+    timings[i].DenseAvgNs = dense.DenseAvgNs;
+  }
+
+  timings_write_json_file("cSparseResults.json", timings, numPaths);
+
  return 0;
 }