From 11ef16501716a7029e691e1cf6795beb70afabb4 Mon Sep 17 00:00:00 2001 From: antonl Date: Sun, 15 Mar 2026 18:26:40 +0100 Subject: [PATCH] C results --- build.bat | 3 + cSparseResults.json | 6 + src/main.c | 398 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 356 insertions(+), 51 deletions(-) create mode 100644 cSparseResults.json diff --git a/build.bat b/build.bat index ce75111..b8ca9c8 100644 --- a/build.bat +++ b/build.bat @@ -15,6 +15,9 @@ set OUT=%~n1.exe pushd build del /F /Q * clang ../src/%SRC% -o %OUT% ^ + -O3 ^ + -march=native ^ + -ffast-math ^ -fopenmp ^ -I"%MKL_ROOT%\include" ^ -L"%MKL_ROOT%\lib" ^ diff --git a/cSparseResults.json b/cSparseResults.json new file mode 100644 index 0000000..6dcef2e --- /dev/null +++ b/cSparseResults.json @@ -0,0 +1,6 @@ +[ + {"label":"FEM_3D_thermal2","rows":147900,"cols":147900,"nnz":3489300,"spmv_runs":32,"spmv_total_ns":3866100,"spmv_avg_ns":120815,"dense_rows":2048,"dense_cols":2048,"dense_runs":32,"dense_total_ns":957945100,"dense_avg_ns":29935784}, + {"label":"ldoor","rows":952203,"cols":952203,"nnz":46522475,"spmv_runs":32,"spmv_total_ns":286909100,"spmv_avg_ns":8965909,"dense_rows":4096,"dense_cols":4096,"dense_runs":32,"dense_total_ns":7653100100,"dense_avg_ns":239159378}, + {"label":"Cube_Coup_dt0","rows":2164760,"cols":2164760,"nnz":127206144,"spmv_runs":32,"spmv_total_ns":864947600,"spmv_avg_ns":27029612,"dense_rows":8192,"dense_cols":8192,"dense_runs":32,"dense_total_ns":59799139700,"dense_avg_ns":1868723115}, + {"label":"nlpkkt200","rows":16240000,"cols":16240000,"nnz":448225632,"spmv_runs":32,"spmv_total_ns":2383754600,"spmv_avg_ns":74492331,"dense_rows":16384,"dense_cols":16384,"dense_runs":32,"dense_total_ns":471942950199,"dense_avg_ns":14748217193} +] diff --git a/src/main.c b/src/main.c index da883bd..b6b7f56 100644 --- a/src/main.c +++ b/src/main.c @@ -4,16 +4,22 @@ #include #include +#define WIN32_LEAN_AND_MEAN +#include + #include "mkl.h" #include "mkl_spblas.h" typedef int32_t S32; typedef uint32_t U32; +typedef int64_t S64; +typedef uint64_t U64; typedef float F32; typedef double F64; typedef int32_t B32; -static int g_spmv_runs = 16; +static S32 g_spmv_runs = 32; +static S32 g_dense_runs = 32; typedef struct CSRMatrix CSRMatrix; struct CSRMatrix { @@ -32,6 +38,37 @@ struct Triplet { double v; }; +// I just ahve a separate dense timing struct that the timing +// function fills out and then we transfer this information to the Timing struct +// that is serialised to json +typedef struct DenseTiming DenseTiming; +struct DenseTiming { + S32 DenseRuns; + S32 DenseCols; + S32 DenseRows; + S64 DenseTotalNs; + S64 DenseAvgNs; +}; + +typedef struct Timing Timing; +struct Timing { + char label[256]; + S32 rows; + S32 cols; + S32 NNZ; + + S32 SpMVRuns; + S64 SpMVTotalNs; + S64 SpMVAvgNs; + + S32 DenseRows; + S32 DenseCols; + + S32 DenseRuns; + S64 DenseTotalNs; + S64 DenseAvgNs; +}; + static void panic(const char *msg) { fprintf(stderr, "%s\n", msg); exit(EXIT_FAILURE); @@ -51,21 +88,106 @@ static void *xcalloc(size_t count, size_t size) { return p; } +static void matrix_label_from_path(const char *path, char *out, size_t out_size) +{ + const char *base = path; + + const char *s1 = strrchr(path, '/'); + const char *s2 = strrchr(path, '\\'); + + if (s1 && s1 >= base) base = s1 + 1; + if (s2 && s2 >= base) base = s2 + 1; + + strncpy_s(out, out_size, base, _TRUNCATE); + + char *ext = strrchr(out, '.'); + if (ext && strcmp(ext, ".mtx") == 0) { + *ext = '\0'; + } +} + +static S64 now_ns(void) +{ + static LARGE_INTEGER freq; + static B32 initialized = 0; + + if (!initialized) { + QueryPerformanceFrequency(&freq); + initialized = 1; + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart); +} + +static void timing_write_json(FILE *fp, const Timing *t) +{ +fprintf(fp, + "{" + "\"label\":\"%s\"," + "\"rows\":%d," + "\"cols\":%d," + "\"nnz\":%d," + "\"spmv_runs\":%d," + "\"spmv_total_ns\":%lld," + "\"spmv_avg_ns\":%lld," + "\"dense_rows\":%d," + "\"dense_cols\":%d," + "\"dense_runs\":%d," + "\"dense_total_ns\":%lld," + "\"dense_avg_ns\":%lld" + "}", + t->label, + t->rows, + t->cols, + t->NNZ, + t->SpMVRuns, + (long long)t->SpMVTotalNs, + (long long)t->SpMVAvgNs, + t->DenseRows, + t->DenseCols, + t->DenseRuns, + (long long)t->DenseTotalNs, + (long long)t->DenseAvgNs + ); +} + +static void timings_write_json_file( + const char *path, + const Timing *timings, + int count) +{ + FILE *fp = NULL; + + if (fopen_s(&fp, path, "w") != 0 || fp == NULL) + panic("failed to open json file"); + + fprintf(fp, "[\n"); + + for (int i = 0; i < count; i++) + { + fprintf(fp, " "); + timing_write_json(fp, &timings[i]); + + if (i != count - 1) + fprintf(fp, ","); + + fprintf(fp, "\n"); + } + + fprintf(fp, "]\n"); + + fclose(fp); +} + static char *trim_left(char *s) { while (*s && isspace((unsigned char)*s)) s++; return s; } -/* -static int starts_with(const char *s, const char *prefix) { - while (*prefix) { - if (*s++ != *prefix++) return 0; - } - return 1; -} -*/ - -static int triplet_cmp(const void *a, const void *b) { +static inline int triplet_cmp(const void *a, const void *b) { const Triplet *x = (const Triplet *)a; const Triplet *y = (const Triplet *)b; if (x->i < y->i) return -1; @@ -75,14 +197,17 @@ static int triplet_cmp(const void *a, const void *b) { return 0; } -// Supports common Matrix Market coordinate cases: -// %%MatrixMarket matrix coordinate real general -// %%MatrixMarket matrix coordinate integer general -// %%MatrixMarket matrix coordinate pattern general -// %%MatrixMarket matrix coordinate real symmetric + +// Exact-semantics "fast" version of the slow Matrix Market reader. +// Keeps: +// - general / symmetric handling +// - real / integer / pattern handling +// - duplicate summation via qsort + uniq // -// For symmetric matrices, off-diagonal entries are mirrored. -// Duplicate entries are summed during COO->CSR compaction. +// Speeds up: +// - file I/O buffering +// - hot-loop parsing (strtol/strtod instead of sscanf_s) + CSRMatrix read_matrix_market_to_csr(const char *path) { FILE *fp = NULL; errno_t err = fopen_s(&fp, path, "r"); @@ -90,6 +215,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) { panic("failed to open Matrix Market file"); } + // Bigger stdio buffer helps a lot for multi-GB files. + setvbuf(fp, NULL, _IOFBF, 1 << 22); // 4 MiB + char line[4096]; if (fgets(line, sizeof(line), fp) == NULL) { @@ -106,10 +234,10 @@ CSRMatrix read_matrix_market_to_csr(const char *path) { int scanned = sscanf_s( line, "%63s %63s %63s %63s %63s", - banner, (unsigned)_countof(banner), - object, (unsigned)_countof(object), - format, (unsigned)_countof(format), - field, (unsigned)_countof(field), + banner, (unsigned)_countof(banner), + object, (unsigned)_countof(object), + format, (unsigned)_countof(format), + field, (unsigned)_countof(field), symmetry, (unsigned)_countof(symmetry) ); @@ -182,37 +310,50 @@ CSRMatrix read_matrix_market_to_csr(const char *path) { continue; } - int i = 0; - int j = 0; + char *p = s; + char *end = NULL; + + long li = strtol(p, &end, 10); + if (end == p) { + fclose(fp); + free(trips); + panic("bad entry line: failed to parse row"); + } + p = end; + + long lj = strtol(p, &end, 10); + if (end == p) { + fclose(fp); + free(trips); + panic("bad entry line: failed to parse col"); + } + p = end; + double v = 1.0; if (is_pattern) { - scanned = sscanf_s(s, "%d %d", &i, &j); - if (scanned != 2) { - fclose(fp); - free(trips); - panic("bad pattern entry line"); - } + // nothing else to parse } else if (is_integer) { - int iv = 0; - scanned = sscanf_s(s, "%d %d %d", &i, &j, &iv); - if (scanned != 3) { + long liv = strtol(p, &end, 10); + if (end == p) { fclose(fp); free(trips); panic("bad integer entry line"); } - v = (double)iv; + v = (double)liv; + p = end; } else { - scanned = sscanf_s(s, "%d %d %lf", &i, &j, &v); - if (scanned != 3) { + v = strtod(p, &end); + if (end == p) { fclose(fp); free(trips); panic("bad real entry line"); } + p = end; } - i -= 1; - j -= 1; + int i = (int)li - 1; + int j = (int)lj - 1; if (i < 0 || i >= rows || j < 0 || j >= cols) { fclose(fp); @@ -289,7 +430,7 @@ CSRMatrix read_matrix_market_to_csr(const char *path) { free(uniq); return A; -} +} static void free_csr(CSRMatrix *A) { if (!A) return; @@ -324,7 +465,8 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) { return H; } -static void example_spmv(const CSRMatrix *A) { +Timing timeSpMV(const CSRMatrix *A) { + Timing out = {0}; sparse_matrix_t H = csr_to_mkl_handle(A); struct matrix_descr descr; @@ -333,16 +475,16 @@ static void example_spmv(const CSRMatrix *A) { descr.diag = SPARSE_DIAG_NON_UNIT; // Optional optimization path recommended by oneMKL. - mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, 1000); + mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, g_spmv_runs); mkl_sparse_optimize(H); double *x = (double *)xmalloc((size_t)A->cols * sizeof(double)); - double *y = (double *)calloc((size_t)A->rows, sizeof(double)); - if (!y) panic("out of memory"); + double *y = (double *)xcalloc((size_t)A->rows, sizeof(double)); for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0; - for (int i = 0; i < g_spmv_runs; i += 1) { + // warmup + for(int i = 0; i < 2; i += 1) { sparse_status_t st = mkl_sparse_d_mv( SPARSE_OPERATION_NON_TRANSPOSE, 1.0, @@ -355,25 +497,179 @@ static void example_spmv(const CSRMatrix *A) { if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed"); } + S64 t0 = now_ns(); + for (int i = 0; i < g_spmv_runs; i += 1) { + sparse_status_t st = mkl_sparse_d_mv( + SPARSE_OPERATION_NON_TRANSPOSE, + 1.0, + H, + descr, + x, + 0.0, + y + ); + if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed"); + } + S64 t1 = now_ns(); + + S64 elapsed_ns = t1-t0; + + out.rows = A->rows; + out.cols = A->cols; + out.NNZ = A->nnz; + out.SpMVRuns = g_spmv_runs; + out.SpMVTotalNs = elapsed_ns; + out.SpMVAvgNs = elapsed_ns / g_spmv_runs; + printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0)); + F64 avg_ms = (F64)out.SpMVAvgNs/1e6; + printf("Average time for SpMV: %.3f ms \n", avg_ms); + free(x); free(y); + + mkl_sparse_destroy(H); + + + return out; } -int main() { - //const char *thermal2Path = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx"; - const char *bigPath = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"; - printf("Reading market matrix %s \n", bigPath); +static DenseTiming timeDenseMatmul(int inRows) { + DenseTiming out = {0}; + + MKL_INT rows = inRows; + MKL_INT cols = inRows; + + out.DenseRows = rows; + out.DenseCols = cols; + + S64 byteReq = rows*cols*sizeof(F64); + F64 GBreq = (F64)byteReq/(1024.0 * 1024.0 * 1024.0); + printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq); + + MKL_INT n = rows; + + // Row-major dense matrices: C = A * B + F64 *left = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64)); + F64 *right = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64)); + F64 *outMat = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64)); + + // Fill deterministically + for (MKL_INT i = 0; i < n * n; i++) { + left[i] = (F64)((i % 13) + 1) * 0.1; + right[i] = (F64)((i % 17) + 1) * 0.1; + outMat[i] = 0.0; + } + + // Warmup + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, n, n, + 1.0, + left, n, + right, n, + 0.0, + outMat, n + ); + + S64 t0 = now_ns(); + + for (int i = 0; i < g_dense_runs; i += 1) { + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, n, n, + 1.0, + left, n, + right, n, + 0.0, + outMat, n + ); + } + + S64 t1 = now_ns(); + S64 elapsed_ns = t1 - t0; + + out.DenseRuns = g_dense_runs; + out.DenseTotalNs = elapsed_ns; + out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs; + + printf("Dense matmul done for %d runs. C[0] = %.6g\n", + g_dense_runs, outMat[0]); + + F64 avg_ms = (F64)out.DenseAvgNs / 1e6; + printf("Average time for dense matmul: %.3f ms\n", avg_ms); + + free(left); + free(right); + free(outMat); + + return out; +} + +Timing doSpMVTimings(const char *path) { + printf("Reading market matrix %s \n", path); CSRMatrix A; - A = read_matrix_market_to_csr(bigPath); + A = read_matrix_market_to_csr(path); printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz); - example_spmv(&A); - + Timing out = timeSpMV(&A); + matrix_label_from_path(path, out.label, sizeof(out.label)); free_csr(&A); + printf("Finished matrix %s \n", out.label); + return out; +} +int main() { + + S32 numPaths = 4; + int denseRows[] = {2048, 4096, 4096*2, 4096*4}; + + const char *paths[] = { + "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx", + "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx", + "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx", + "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx" + }; + +// Sanity check for threading. +// Single thread gave avg 0.9 ms and 16 threads gave 0.14 msg avg +#if 0 + { + printf("Single thread:\n"); + mkl_set_num_threads(1); + doTimings(paths[0]); + + printf("16 threads:\n"); + mkl_set_num_threads(16); + doTimings(paths[0]); + } +#endif + + { + mkl_set_num_threads(16); // pick your core count + mkl_set_dynamic(0); // disable MKL changing thread count dynamically + printf("MKL max threads: %d\n", mkl_get_max_threads()); + } + + Timing timings[4]; + for (S32 i = 0; i < numPaths; i += 1) { + timings[i] = doSpMVTimings(paths[i]); + DenseTiming dense = timeDenseMatmul(denseRows[i]); + timings[i].DenseRuns = dense.DenseRuns; + timings[i].DenseRows = dense.DenseRows; + timings[i].DenseCols = dense.DenseCols; + timings[i].DenseTotalNs = dense.DenseTotalNs; + timings[i].DenseAvgNs = dense.DenseAvgNs; + } + + timings_write_json_file("cSparseResults.json", timings, numPaths); + return 0; }