C results
This commit is contained in:
parent
da24f740ac
commit
11ef165017
@ -15,6 +15,9 @@ set OUT=%~n1.exe
|
||||
pushd build
|
||||
del /F /Q *
|
||||
clang ../src/%SRC% -o %OUT% ^
|
||||
-O3 ^
|
||||
-march=native ^
|
||||
-ffast-math ^
|
||||
-fopenmp ^
|
||||
-I"%MKL_ROOT%\include" ^
|
||||
-L"%MKL_ROOT%\lib" ^
|
||||
|
||||
6
cSparseResults.json
Normal file
6
cSparseResults.json
Normal file
@ -0,0 +1,6 @@
|
||||
[
|
||||
{"label":"FEM_3D_thermal2","rows":147900,"cols":147900,"nnz":3489300,"spmv_runs":32,"spmv_total_ns":3866100,"spmv_avg_ns":120815,"dense_rows":2048,"dense_cols":2048,"dense_runs":32,"dense_total_ns":957945100,"dense_avg_ns":29935784},
|
||||
{"label":"ldoor","rows":952203,"cols":952203,"nnz":46522475,"spmv_runs":32,"spmv_total_ns":286909100,"spmv_avg_ns":8965909,"dense_rows":4096,"dense_cols":4096,"dense_runs":32,"dense_total_ns":7653100100,"dense_avg_ns":239159378},
|
||||
{"label":"Cube_Coup_dt0","rows":2164760,"cols":2164760,"nnz":127206144,"spmv_runs":32,"spmv_total_ns":864947600,"spmv_avg_ns":27029612,"dense_rows":8192,"dense_cols":8192,"dense_runs":32,"dense_total_ns":59799139700,"dense_avg_ns":1868723115},
|
||||
{"label":"nlpkkt200","rows":16240000,"cols":16240000,"nnz":448225632,"spmv_runs":32,"spmv_total_ns":2383754600,"spmv_avg_ns":74492331,"dense_rows":16384,"dense_cols":16384,"dense_runs":32,"dense_total_ns":471942950199,"dense_avg_ns":14748217193}
|
||||
]
|
||||
388
src/main.c
388
src/main.c
@ -4,16 +4,22 @@
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
|
||||
#include "mkl.h"
|
||||
#include "mkl_spblas.h"
|
||||
|
||||
typedef int32_t S32;
|
||||
typedef uint32_t U32;
|
||||
typedef int64_t S64;
|
||||
typedef uint64_t U64;
|
||||
typedef float F32;
|
||||
typedef double F64;
|
||||
typedef int32_t B32;
|
||||
|
||||
static int g_spmv_runs = 16;
|
||||
static S32 g_spmv_runs = 32;
|
||||
static S32 g_dense_runs = 32;
|
||||
|
||||
typedef struct CSRMatrix CSRMatrix;
|
||||
struct CSRMatrix {
|
||||
@ -32,6 +38,37 @@ struct Triplet {
|
||||
double v;
|
||||
};
|
||||
|
||||
// I just ahve a separate dense timing struct that the timing
|
||||
// function fills out and then we transfer this information to the Timing struct
|
||||
// that is serialised to json
|
||||
typedef struct DenseTiming DenseTiming;
|
||||
struct DenseTiming {
|
||||
S32 DenseRuns;
|
||||
S32 DenseCols;
|
||||
S32 DenseRows;
|
||||
S64 DenseTotalNs;
|
||||
S64 DenseAvgNs;
|
||||
};
|
||||
|
||||
typedef struct Timing Timing;
|
||||
struct Timing {
|
||||
char label[256];
|
||||
S32 rows;
|
||||
S32 cols;
|
||||
S32 NNZ;
|
||||
|
||||
S32 SpMVRuns;
|
||||
S64 SpMVTotalNs;
|
||||
S64 SpMVAvgNs;
|
||||
|
||||
S32 DenseRows;
|
||||
S32 DenseCols;
|
||||
|
||||
S32 DenseRuns;
|
||||
S64 DenseTotalNs;
|
||||
S64 DenseAvgNs;
|
||||
};
|
||||
|
||||
static void panic(const char *msg) {
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
exit(EXIT_FAILURE);
|
||||
@ -51,21 +88,106 @@ static void *xcalloc(size_t count, size_t size) {
|
||||
return p;
|
||||
}
|
||||
|
||||
static void matrix_label_from_path(const char *path, char *out, size_t out_size)
|
||||
{
|
||||
const char *base = path;
|
||||
|
||||
const char *s1 = strrchr(path, '/');
|
||||
const char *s2 = strrchr(path, '\\');
|
||||
|
||||
if (s1 && s1 >= base) base = s1 + 1;
|
||||
if (s2 && s2 >= base) base = s2 + 1;
|
||||
|
||||
strncpy_s(out, out_size, base, _TRUNCATE);
|
||||
|
||||
char *ext = strrchr(out, '.');
|
||||
if (ext && strcmp(ext, ".mtx") == 0) {
|
||||
*ext = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
static S64 now_ns(void)
|
||||
{
|
||||
static LARGE_INTEGER freq;
|
||||
static B32 initialized = 0;
|
||||
|
||||
if (!initialized) {
|
||||
QueryPerformanceFrequency(&freq);
|
||||
initialized = 1;
|
||||
}
|
||||
|
||||
LARGE_INTEGER counter;
|
||||
QueryPerformanceCounter(&counter);
|
||||
|
||||
return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
|
||||
}
|
||||
|
||||
static void timing_write_json(FILE *fp, const Timing *t)
|
||||
{
|
||||
fprintf(fp,
|
||||
"{"
|
||||
"\"label\":\"%s\","
|
||||
"\"rows\":%d,"
|
||||
"\"cols\":%d,"
|
||||
"\"nnz\":%d,"
|
||||
"\"spmv_runs\":%d,"
|
||||
"\"spmv_total_ns\":%lld,"
|
||||
"\"spmv_avg_ns\":%lld,"
|
||||
"\"dense_rows\":%d,"
|
||||
"\"dense_cols\":%d,"
|
||||
"\"dense_runs\":%d,"
|
||||
"\"dense_total_ns\":%lld,"
|
||||
"\"dense_avg_ns\":%lld"
|
||||
"}",
|
||||
t->label,
|
||||
t->rows,
|
||||
t->cols,
|
||||
t->NNZ,
|
||||
t->SpMVRuns,
|
||||
(long long)t->SpMVTotalNs,
|
||||
(long long)t->SpMVAvgNs,
|
||||
t->DenseRows,
|
||||
t->DenseCols,
|
||||
t->DenseRuns,
|
||||
(long long)t->DenseTotalNs,
|
||||
(long long)t->DenseAvgNs
|
||||
);
|
||||
}
|
||||
|
||||
static void timings_write_json_file(
|
||||
const char *path,
|
||||
const Timing *timings,
|
||||
int count)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
|
||||
if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
|
||||
panic("failed to open json file");
|
||||
|
||||
fprintf(fp, "[\n");
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
fprintf(fp, " ");
|
||||
timing_write_json(fp, &timings[i]);
|
||||
|
||||
if (i != count - 1)
|
||||
fprintf(fp, ",");
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
|
||||
fprintf(fp, "]\n");
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
static char *trim_left(char *s) {
|
||||
while (*s && isspace((unsigned char)*s)) s++;
|
||||
return s;
|
||||
}
|
||||
|
||||
/*
|
||||
static int starts_with(const char *s, const char *prefix) {
|
||||
while (*prefix) {
|
||||
if (*s++ != *prefix++) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
*/
|
||||
|
||||
static int triplet_cmp(const void *a, const void *b) {
|
||||
static inline int triplet_cmp(const void *a, const void *b) {
|
||||
const Triplet *x = (const Triplet *)a;
|
||||
const Triplet *y = (const Triplet *)b;
|
||||
if (x->i < y->i) return -1;
|
||||
@ -75,14 +197,17 @@ static int triplet_cmp(const void *a, const void *b) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Supports common Matrix Market coordinate cases:
|
||||
// %%MatrixMarket matrix coordinate real general
|
||||
// %%MatrixMarket matrix coordinate integer general
|
||||
// %%MatrixMarket matrix coordinate pattern general
|
||||
// %%MatrixMarket matrix coordinate real symmetric
|
||||
|
||||
// Exact-semantics "fast" version of the slow Matrix Market reader.
|
||||
// Keeps:
|
||||
// - general / symmetric handling
|
||||
// - real / integer / pattern handling
|
||||
// - duplicate summation via qsort + uniq
|
||||
//
|
||||
// For symmetric matrices, off-diagonal entries are mirrored.
|
||||
// Duplicate entries are summed during COO->CSR compaction.
|
||||
// Speeds up:
|
||||
// - file I/O buffering
|
||||
// - hot-loop parsing (strtol/strtod instead of sscanf_s)
|
||||
|
||||
CSRMatrix read_matrix_market_to_csr(const char *path) {
|
||||
FILE *fp = NULL;
|
||||
errno_t err = fopen_s(&fp, path, "r");
|
||||
@ -90,6 +215,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
|
||||
panic("failed to open Matrix Market file");
|
||||
}
|
||||
|
||||
// Bigger stdio buffer helps a lot for multi-GB files.
|
||||
setvbuf(fp, NULL, _IOFBF, 1 << 22); // 4 MiB
|
||||
|
||||
char line[4096];
|
||||
|
||||
if (fgets(line, sizeof(line), fp) == NULL) {
|
||||
@ -182,37 +310,50 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
char *p = s;
|
||||
char *end = NULL;
|
||||
|
||||
long li = strtol(p, &end, 10);
|
||||
if (end == p) {
|
||||
fclose(fp);
|
||||
free(trips);
|
||||
panic("bad entry line: failed to parse row");
|
||||
}
|
||||
p = end;
|
||||
|
||||
long lj = strtol(p, &end, 10);
|
||||
if (end == p) {
|
||||
fclose(fp);
|
||||
free(trips);
|
||||
panic("bad entry line: failed to parse col");
|
||||
}
|
||||
p = end;
|
||||
|
||||
double v = 1.0;
|
||||
|
||||
if (is_pattern) {
|
||||
scanned = sscanf_s(s, "%d %d", &i, &j);
|
||||
if (scanned != 2) {
|
||||
fclose(fp);
|
||||
free(trips);
|
||||
panic("bad pattern entry line");
|
||||
}
|
||||
// nothing else to parse
|
||||
} else if (is_integer) {
|
||||
int iv = 0;
|
||||
scanned = sscanf_s(s, "%d %d %d", &i, &j, &iv);
|
||||
if (scanned != 3) {
|
||||
long liv = strtol(p, &end, 10);
|
||||
if (end == p) {
|
||||
fclose(fp);
|
||||
free(trips);
|
||||
panic("bad integer entry line");
|
||||
}
|
||||
v = (double)iv;
|
||||
v = (double)liv;
|
||||
p = end;
|
||||
} else {
|
||||
scanned = sscanf_s(s, "%d %d %lf", &i, &j, &v);
|
||||
if (scanned != 3) {
|
||||
v = strtod(p, &end);
|
||||
if (end == p) {
|
||||
fclose(fp);
|
||||
free(trips);
|
||||
panic("bad real entry line");
|
||||
}
|
||||
p = end;
|
||||
}
|
||||
|
||||
i -= 1;
|
||||
j -= 1;
|
||||
int i = (int)li - 1;
|
||||
int j = (int)lj - 1;
|
||||
|
||||
if (i < 0 || i >= rows || j < 0 || j >= cols) {
|
||||
fclose(fp);
|
||||
@ -324,7 +465,8 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
|
||||
return H;
|
||||
}
|
||||
|
||||
static void example_spmv(const CSRMatrix *A) {
|
||||
Timing timeSpMV(const CSRMatrix *A) {
|
||||
Timing out = {0};
|
||||
sparse_matrix_t H = csr_to_mkl_handle(A);
|
||||
|
||||
struct matrix_descr descr;
|
||||
@ -333,16 +475,16 @@ static void example_spmv(const CSRMatrix *A) {
|
||||
descr.diag = SPARSE_DIAG_NON_UNIT;
|
||||
|
||||
// Optional optimization path recommended by oneMKL.
|
||||
mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, 1000);
|
||||
mkl_sparse_set_mv_hint(H, SPARSE_OPERATION_NON_TRANSPOSE, descr, g_spmv_runs);
|
||||
mkl_sparse_optimize(H);
|
||||
|
||||
double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
|
||||
double *y = (double *)calloc((size_t)A->rows, sizeof(double));
|
||||
if (!y) panic("out of memory");
|
||||
double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));
|
||||
|
||||
for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0;
|
||||
|
||||
for (int i = 0; i < g_spmv_runs; i += 1) {
|
||||
// warmup
|
||||
for(int i = 0; i < 2; i += 1) {
|
||||
sparse_status_t st = mkl_sparse_d_mv(
|
||||
SPARSE_OPERATION_NON_TRANSPOSE,
|
||||
1.0,
|
||||
@ -355,25 +497,179 @@ static void example_spmv(const CSRMatrix *A) {
|
||||
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
|
||||
}
|
||||
|
||||
S64 t0 = now_ns();
|
||||
for (int i = 0; i < g_spmv_runs; i += 1) {
|
||||
sparse_status_t st = mkl_sparse_d_mv(
|
||||
SPARSE_OPERATION_NON_TRANSPOSE,
|
||||
1.0,
|
||||
H,
|
||||
descr,
|
||||
x,
|
||||
0.0,
|
||||
y
|
||||
);
|
||||
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
|
||||
}
|
||||
S64 t1 = now_ns();
|
||||
|
||||
S64 elapsed_ns = t1-t0;
|
||||
|
||||
out.rows = A->rows;
|
||||
out.cols = A->cols;
|
||||
out.NNZ = A->nnz;
|
||||
out.SpMVRuns = g_spmv_runs;
|
||||
out.SpMVTotalNs = elapsed_ns;
|
||||
out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
|
||||
|
||||
printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0));
|
||||
|
||||
F64 avg_ms = (F64)out.SpMVAvgNs/1e6;
|
||||
printf("Average time for SpMV: %.3f ms \n", avg_ms);
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
|
||||
mkl_sparse_destroy(H);
|
||||
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
int main() {
|
||||
//const char *thermal2Path = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx";
|
||||
const char *bigPath = "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx";
|
||||
printf("Reading market matrix %s \n", bigPath);
|
||||
static DenseTiming timeDenseMatmul(int inRows) {
|
||||
DenseTiming out = {0};
|
||||
|
||||
MKL_INT rows = inRows;
|
||||
MKL_INT cols = inRows;
|
||||
|
||||
out.DenseRows = rows;
|
||||
out.DenseCols = cols;
|
||||
|
||||
S64 byteReq = rows*cols*sizeof(F64);
|
||||
F64 GBreq = (F64)byteReq/(1024.0 * 1024.0 * 1024.0);
|
||||
printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq);
|
||||
|
||||
MKL_INT n = rows;
|
||||
|
||||
// Row-major dense matrices: C = A * B
|
||||
F64 *left = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
|
||||
F64 *right = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
|
||||
F64 *outMat = (F64 *)xmalloc((size_t)n * (size_t)n * sizeof(F64));
|
||||
|
||||
// Fill deterministically
|
||||
for (MKL_INT i = 0; i < n * n; i++) {
|
||||
left[i] = (F64)((i % 13) + 1) * 0.1;
|
||||
right[i] = (F64)((i % 17) + 1) * 0.1;
|
||||
outMat[i] = 0.0;
|
||||
}
|
||||
|
||||
// Warmup
|
||||
cblas_dgemm(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
n, n, n,
|
||||
1.0,
|
||||
left, n,
|
||||
right, n,
|
||||
0.0,
|
||||
outMat, n
|
||||
);
|
||||
|
||||
S64 t0 = now_ns();
|
||||
|
||||
for (int i = 0; i < g_dense_runs; i += 1) {
|
||||
cblas_dgemm(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
n, n, n,
|
||||
1.0,
|
||||
left, n,
|
||||
right, n,
|
||||
0.0,
|
||||
outMat, n
|
||||
);
|
||||
}
|
||||
|
||||
S64 t1 = now_ns();
|
||||
S64 elapsed_ns = t1 - t0;
|
||||
|
||||
out.DenseRuns = g_dense_runs;
|
||||
out.DenseTotalNs = elapsed_ns;
|
||||
out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
|
||||
|
||||
printf("Dense matmul done for %d runs. C[0] = %.6g\n",
|
||||
g_dense_runs, outMat[0]);
|
||||
|
||||
F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
|
||||
printf("Average time for dense matmul: %.3f ms\n", avg_ms);
|
||||
|
||||
free(left);
|
||||
free(right);
|
||||
free(outMat);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Timing doSpMVTimings(const char *path) {
|
||||
printf("Reading market matrix %s \n", path);
|
||||
CSRMatrix A;
|
||||
A = read_matrix_market_to_csr(bigPath);
|
||||
A = read_matrix_market_to_csr(path);
|
||||
printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz);
|
||||
example_spmv(&A);
|
||||
|
||||
Timing out = timeSpMV(&A);
|
||||
matrix_label_from_path(path, out.label, sizeof(out.label));
|
||||
free_csr(&A);
|
||||
printf("Finished matrix %s \n", out.label);
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main() {
|
||||
|
||||
S32 numPaths = 4;
|
||||
int denseRows[] = {2048, 4096, 4096*2, 4096*4};
|
||||
|
||||
const char *paths[] = {
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"
|
||||
};
|
||||
|
||||
// Sanity check for threading.
|
||||
// Single thread gave avg 0.9 ms and 16 threads gave 0.14 msg avg
|
||||
#if 0
|
||||
{
|
||||
printf("Single thread:\n");
|
||||
mkl_set_num_threads(1);
|
||||
doTimings(paths[0]);
|
||||
|
||||
printf("16 threads:\n");
|
||||
mkl_set_num_threads(16);
|
||||
doTimings(paths[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
{
|
||||
mkl_set_num_threads(16); // pick your core count
|
||||
mkl_set_dynamic(0); // disable MKL changing thread count dynamically
|
||||
printf("MKL max threads: %d\n", mkl_get_max_threads());
|
||||
}
|
||||
|
||||
Timing timings[4];
|
||||
for (S32 i = 0; i < numPaths; i += 1) {
|
||||
timings[i] = doSpMVTimings(paths[i]);
|
||||
DenseTiming dense = timeDenseMatmul(denseRows[i]);
|
||||
timings[i].DenseRuns = dense.DenseRuns;
|
||||
timings[i].DenseRows = dense.DenseRows;
|
||||
timings[i].DenseCols = dense.DenseCols;
|
||||
timings[i].DenseTotalNs = dense.DenseTotalNs;
|
||||
timings[i].DenseAvgNs = dense.DenseAvgNs;
|
||||
}
|
||||
|
||||
timings_write_json_file("cSparseResults.json", timings, numPaths);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user