C results

This commit is contained in:
antonl 2026-03-15 18:26:40 +01:00
parent da24f740ac
commit 11ef165017
3 changed files with 356 additions and 51 deletions

View File

@ -15,6 +15,9 @@ set OUT=%~n1.exe
pushd build
del /F /Q *
clang ../src/%SRC% -o %OUT% ^
-O3 ^
-march=native ^
-ffast-math ^
-fopenmp ^
-I"%MKL_ROOT%\include" ^
-L"%MKL_ROOT%\lib" ^

6
cSparseResults.json Normal file
View File

@ -0,0 +1,6 @@
[
{"label":"FEM_3D_thermal2","rows":147900,"cols":147900,"nnz":3489300,"spmv_runs":32,"spmv_total_ns":3866100,"spmv_avg_ns":120815,"dense_rows":2048,"dense_cols":2048,"dense_runs":32,"dense_total_ns":957945100,"dense_avg_ns":29935784},
{"label":"ldoor","rows":952203,"cols":952203,"nnz":46522475,"spmv_runs":32,"spmv_total_ns":286909100,"spmv_avg_ns":8965909,"dense_rows":4096,"dense_cols":4096,"dense_runs":32,"dense_total_ns":7653100100,"dense_avg_ns":239159378},
{"label":"Cube_Coup_dt0","rows":2164760,"cols":2164760,"nnz":127206144,"spmv_runs":32,"spmv_total_ns":864947600,"spmv_avg_ns":27029612,"dense_rows":8192,"dense_cols":8192,"dense_runs":32,"dense_total_ns":59799139700,"dense_avg_ns":1868723115},
{"label":"nlpkkt200","rows":16240000,"cols":16240000,"nnz":448225632,"spmv_runs":32,"spmv_total_ns":2383754600,"spmv_avg_ns":74492331,"dense_rows":16384,"dense_cols":16384,"dense_runs":32,"dense_total_ns":471942950199,"dense_avg_ns":14748217193}
]

View File

@ -4,16 +4,22 @@
#include <string.h>
#include <ctype.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include "mkl.h"
#include "mkl_spblas.h"
typedef int32_t S32;
typedef uint32_t U32;
typedef int64_t S64;
typedef uint64_t U64;
typedef float F32;
typedef double F64;
typedef int32_t B32;
static int g_spmv_runs = 16;
static S32 g_spmv_runs = 32;
static S32 g_dense_runs = 32;
typedef struct CSRMatrix CSRMatrix;
struct CSRMatrix {
@ -32,6 +38,37 @@ struct Triplet {
double v;
};
// I just ahve a separate dense timing struct that the timing
// function fills out and then we transfer this information to the Timing struct
// that is serialised to json
typedef struct DenseTiming DenseTiming;
struct DenseTiming {
S32 DenseRuns;
S32 DenseCols;
S32 DenseRows;
S64 DenseTotalNs;
S64 DenseAvgNs;
};
typedef struct Timing Timing;
struct Timing {
char label[256];
S32 rows;
S32 cols;
S32 NNZ;
S32 SpMVRuns;
S64 SpMVTotalNs;
S64 SpMVAvgNs;
S32 DenseRows;
S32 DenseCols;
S32 DenseRuns;
S64 DenseTotalNs;
S64 DenseAvgNs;
};
static void panic(const char *msg) {
fprintf(stderr, "%s\n", msg);
exit(EXIT_FAILURE);
@ -51,21 +88,106 @@ static void *xcalloc(size_t count, size_t size) {
return p;
}
static void matrix_label_from_path(const char *path, char *out, size_t out_size)
{
const char *base = path;
const char *s1 = strrchr(path, '/');
const char *s2 = strrchr(path, '\\');
if (s1 && s1 >= base) base = s1 + 1;
if (s2 && s2 >= base) base = s2 + 1;
strncpy_s(out, out_size, base, _TRUNCATE);
char *ext = strrchr(out, '.');
if (ext && strcmp(ext, ".mtx") == 0) {
*ext = '\0';
}
}
static S64 now_ns(void)
{
static LARGE_INTEGER freq;
static B32 initialized = 0;
if (!initialized) {
QueryPerformanceFrequency(&freq);
initialized = 1;
}
LARGE_INTEGER counter;
QueryPerformanceCounter(&counter);
return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
}
static void timing_write_json(FILE *fp, const Timing *t)
{
fprintf(fp,
"{"
"\"label\":\"%s\","
"\"rows\":%d,"
"\"cols\":%d,"
"\"nnz\":%d,"
"\"spmv_runs\":%d,"
"\"spmv_total_ns\":%lld,"
"\"spmv_avg_ns\":%lld,"
"\"dense_rows\":%d,"
"\"dense_cols\":%d,"
"\"dense_runs\":%d,"
"\"dense_total_ns\":%lld,"
"\"dense_avg_ns\":%lld"
"}",
t->label,
t->rows,
t->cols,
t->NNZ,
t->SpMVRuns,
(long long)t->SpMVTotalNs,
(long long)t->SpMVAvgNs,
t->DenseRows,
t->DenseCols,
t->DenseRuns,
(long long)t->DenseTotalNs,
(long long)t->DenseAvgNs
);
}
static void timings_write_json_file(
const char *path,
const Timing *timings,
int count)
{
FILE *fp = NULL;
if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
panic("failed to open json file");
fprintf(fp, "[\n");
for (int i = 0; i < count; i++)
{
fprintf(fp, " ");
timing_write_json(fp, &timings[i]);
if (i != count - 1)
fprintf(fp, ",");
fprintf(fp, "\n");
}
fprintf(fp, "]\n");
fclose(fp);
}
static char *trim_left(char *s) {
while (*s && isspace((unsigned char)*s)) s++;
return s;
}
/*
static int starts_with(const char *s, const char *prefix) {
while (*prefix) {
if (*s++ != *prefix++) return 0;
}
return 1;
}
*/
static int triplet_cmp(const void *a, const void *b) {
static inline int triplet_cmp(const void *a, const void *b) {
const Triplet *x = (const Triplet *)a;
const Triplet *y = (const Triplet *)b;
if (x->i < y->i) return -1;
@ -75,14 +197,17 @@ static int triplet_cmp(const void *a, const void *b) {
return 0;
}
// Supports common Matrix Market coordinate cases:
// %%MatrixMarket matrix coordinate real general
// %%MatrixMarket matrix coordinate integer general
// %%MatrixMarket matrix coordinate pattern general
// %%MatrixMarket matrix coordinate real symmetric
// Exact-semantics "fast" version of the slow Matrix Market reader.
// Keeps:
// - general / symmetric handling
// - real / integer / pattern handling
// - duplicate summation via qsort + uniq
//
// For symmetric matrices, off-diagonal entries are mirrored.
// Duplicate entries are summed during COO->CSR compaction.
// Speeds up:
// - file I/O buffering
// - hot-loop parsing (strtol/strtod instead of sscanf_s)
CSRMatrix read_matrix_market_to_csr(const char *path) {
FILE *fp = NULL;
errno_t err = fopen_s(&fp, path, "r");
@ -90,6 +215,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
panic("failed to open Matrix Market file");
}
// Bigger stdio buffer helps a lot for multi-GB files.
setvbuf(fp, NULL, _IOFBF, 1 << 22); // 4 MiB
char line[4096];
if (fgets(line, sizeof(line), fp) == NULL) {
@ -182,37 +310,50 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
continue;
}
int i = 0;
int j = 0;
char *p = s;
char *end = NULL;
long li = strtol(p, &end, 10);
if (end == p) {
fclose(fp);
free(trips);
panic("bad entry line: failed to parse row");
}
p = end;
long lj = strtol(p, &end, 10);
if (end == p) {
fclose(fp);
free(trips);
panic("bad entry line: failed to parse col");
}
p = end;
double v = 1.0;
if (is_pattern) {
scanned = sscanf_s(s, "%d %d", &i, &j);
if (scanned != 2) {
fclose(fp);
free(trips);
panic("bad pattern entry line");
}
// nothing else to parse
} else if (is_integer) {
int iv = 0;
scanned = sscanf_s(s, "%d %d %d", &i, &j, &iv);
if (scanned != 3) {
long liv = strtol(p, &end, 10);
if (end == p) {
fclose(fp);
free(trips);
panic("bad integer entry line");
}
v = (double)iv;
v = (double)liv;
p = end;
} else {
scanned = sscanf_s(s, "%d %d %lf", &i, &j, &v);
if (scanned != 3) {
v = strtod(p, &end);
if (end == p) {
fclose(fp);
free(trips);
panic("bad real entry line");
}
p = end;
}
i -= 1;
j -= 1;
int i = (int)li - 1;
int j = (int)lj - 1;
if (i < 0 || i >= rows || j < 0 || j >= cols) {
fclose(fp);
@ -324,7 +465,8 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
return H;
}
static void example_spmv(const CSRMatrix *A) {
Timing timeSpMV(const CSRMatrix *A) {
Timing out = {0};
sparse_matrix_t H = csr_to_mkl_handle(A);
struct matrix_descr descr;
@ -333,16 +475,16 @@ static void example_spmv(const CSRMatrix *A) {
descr.diag = SPARSE_DIAG_NON_UNIT;
// Optional optimization path recommended by oneMKL.
mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, 1000);
mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, g_spmv_runs);
mkl_sparse_optimize(H);
double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
double *y = (double *)calloc((size_t)A->rows, sizeof(double));
if (!y) panic("out of memory");
double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));
for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0;
for (int i = 0; i < g_spmv_runs; i += 1) {
// warmup
for(int i = 0; i < 2; i += 1) {
sparse_status_t st = mkl_sparse_d_mv(
SPARSE_OPERATION_NON_TRANSPOSE,
1.0,
@ -355,25 +497,179 @@ static void example_spmv(const CSRMatrix *A) {
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
}
S64 t0 = now_ns();
for (int i = 0; i < g_spmv_runs; i += 1) {
sparse_status_t st = mkl_sparse_d_mv(
SPARSE_OPERATION_NON_TRANSPOSE,
1.0,
H,
descr,
x,
0.0,
y
);
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
}
S64 t1 = now_ns();
S64 elapsed_ns = t1-t0;
out.rows = A->rows;
out.cols = A->cols;
out.NNZ = A->nnz;
out.SpMVRuns = g_spmv_runs;
out.SpMVTotalNs = elapsed_ns;
out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0));
F64 avg_ms = (F64)out.SpMVAvgNs/1e6;
printf("Average time for SpMV: %.3f ms \n", avg_ms);
free(x);
free(y);
mkl_sparse_destroy(H);
return out;
}
int main() {
//const char *thermal2Path = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx";
const char *bigPath = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx";
printf("Reading market matrix %s \n", bigPath);
static DenseTiming timeDenseMatmul(int inRows) {
DenseTiming out = {0};
MKL_INT rows = inRows;
MKL_INT cols = inRows;
out.DenseRows = rows;
out.DenseCols = cols;
S64 byteReq = rows*cols*sizeof(F64);
F64 GBreq = (F64)byteReq/(1024.0 * 1024.0 * 1024.0);
printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq);
MKL_INT n = rows;
// Row-major dense matrices: C = A * B
F64 *left = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
F64 *right = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
F64 *outMat = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
// Fill deterministically
for (MKL_INT i = 0; i < n * n; i++) {
left[i] = (F64)((i % 13) + 1) * 0.1;
right[i] = (F64)((i % 17) + 1) * 0.1;
outMat[i] = 0.0;
}
// Warmup
cblas_dgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
n, n, n,
1.0,
left, n,
right, n,
0.0,
outMat, n
);
S64 t0 = now_ns();
for (int i = 0; i < g_dense_runs; i += 1) {
cblas_dgemm(
CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
n, n, n,
1.0,
left, n,
right, n,
0.0,
outMat, n
);
}
S64 t1 = now_ns();
S64 elapsed_ns = t1 - t0;
out.DenseRuns = g_dense_runs;
out.DenseTotalNs = elapsed_ns;
out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
printf("Dense matmul done for %d runs. C[0] = %.6g\n",
g_dense_runs, outMat[0]);
F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
printf("Average time for dense matmul: %.3f ms\n", avg_ms);
free(left);
free(right);
free(outMat);
return out;
}
Timing doSpMVTimings(const char *path) {
printf("Reading market matrix %s \n", path);
CSRMatrix A;
A = read_matrix_market_to_csr(bigPath);
A = read_matrix_market_to_csr(path);
printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz);
example_spmv(&A);
Timing out = timeSpMV(&A);
matrix_label_from_path(path, out.label, sizeof(out.label));
free_csr(&A);
printf("Finished matrix %s \n", out.label);
return out;
}
int main() {
S32 numPaths = 4;
int denseRows[] = {2048, 4096, 4096*2, 4096*4};
const char *paths[] = {
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"
};
// Sanity check for threading.
// Single thread gave avg 0.9 ms and 16 threads gave 0.14 msg avg
#if 0
{
printf("Single thread:\n");
mkl_set_num_threads(1);
doTimings(paths[0]);
printf("16 threads:\n");
mkl_set_num_threads(16);
doTimings(paths[0]);
}
#endif
{
mkl_set_num_threads(16); // pick your core count
mkl_set_dynamic(0); // disable MKL changing thread count dynamically
printf("MKL max threads: %d\n", mkl_get_max_threads());
}
Timing timings[4];
for (S32 i = 0; i < numPaths; i += 1) {
timings[i] = doSpMVTimings(paths[i]);
DenseTiming dense = timeDenseMatmul(denseRows[i]);
timings[i].DenseRuns = dense.DenseRuns;
timings[i].DenseRows = dense.DenseRows;
timings[i].DenseCols = dense.DenseCols;
timings[i].DenseTotalNs = dense.DenseTotalNs;
timings[i].DenseAvgNs = dense.DenseAvgNs;
}
timings_write_json_file("cSparseResults.json", timings, numPaths);
return 0;
}