added clangd support
This commit is contained in:
parent
a30a500564
commit
10f92af6cf
29
build.bat
29
build.bat
@ -12,23 +12,26 @@ if "%SRC%"=="" (
|
||||
|
||||
set OUT=%~n1.exe
|
||||
|
||||
set CFLAGS="-O3" "-march=native" "-ffast-math" "-fopenmp" "-Wall" "-I%MKL_ROOT%\include"
|
||||
set LDFLAGS=-L%MKL_ROOT%\lib -L%COMPILER_ROOT%\lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -llibiomp5md
|
||||
|
||||
pushd build
|
||||
del /F /Q *
|
||||
clang ../src/%SRC% -o %OUT% ^
|
||||
-O3 ^
|
||||
-march=native ^
|
||||
-ffast-math ^
|
||||
-fopenmp ^
|
||||
-I"%MKL_ROOT%\include" ^
|
||||
-L"%MKL_ROOT%\lib" ^
|
||||
-L"%COMPILER_ROOT%\lib" ^
|
||||
-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core ^
|
||||
-llibiomp5md ^
|
||||
-Wall
|
||||
|
||||
clang ../src/%SRC% -o %OUT% %CFLAGS% %LDFLAGS%
|
||||
|
||||
popd
|
||||
|
||||
@rem Generate a file for clangd to understand the include files and compiler flags
|
||||
(
|
||||
echo -xc
|
||||
echo -std=c11
|
||||
for %%f in (%CFLAGS%) do echo %%~f
|
||||
) > compile_flags.txt
|
||||
|
||||
echo Build complete: build/%OUT%
|
||||
|
||||
set PATH=%PATH%;E:\lib\intel_mkl\mkl\2025.3\bin
|
||||
set PATH=%PATH%;E:\lib\intel_mkl\compiler\2025.3\bin
|
||||
set PATH=%PATH%;%MKL_ROOT%\bin
|
||||
set PATH=%PATH%;%COMPILER_ROOT%\bin
|
||||
|
||||
endlocal
|
||||
|
||||
8
compile_flags.txt
Normal file
8
compile_flags.txt
Normal file
@ -0,0 +1,8 @@
|
||||
-xc
|
||||
-std=c11
|
||||
-O3
|
||||
-march=native
|
||||
-ffast-math
|
||||
-fopenmp
|
||||
-Wall
|
||||
-IE:\lib\intel_mkl\mkl\2025.3\include
|
||||
187
src/main.c
187
src/main.c
@ -1,8 +1,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
@ -76,27 +76,30 @@ static void panic(const char *msg) {
|
||||
|
||||
/*
 * Allocate n bytes with malloc().
 *
 * When malloc() returns NULL, panic("out of memory") is called before the
 * pointer is returned. The caller owns the returned memory and releases it
 * with free().
 */
static void *xmalloc(size_t n) {
  void *p = malloc(n);
  if (!p) {
    panic("out of memory");
  }
  return p;
}
|
||||
|
||||
/*
 * Allocate a zero-initialized array of `count` elements of `size` bytes each
 * with calloc().
 *
 * When calloc() returns NULL, panic("out of memory") is called before the
 * pointer is returned. The caller owns the returned memory and releases it
 * with free().
 */
static void *xcalloc(size_t count, size_t size) {
  void *p = calloc(count, size);
  if (!p) {
    panic("out of memory");
  }
  return p;
}
|
||||
|
||||
static void matrix_label_from_path(const char *path, char *out, size_t out_size)
|
||||
{
|
||||
static void matrix_label_from_path(const char *path, char *out,
|
||||
size_t out_size) {
|
||||
const char *base = path;
|
||||
|
||||
const char *s1 = strrchr(path, '/');
|
||||
const char *s2 = strrchr(path, '\\');
|
||||
|
||||
if (s1 && s1 >= base) base = s1 + 1;
|
||||
if (s2 && s2 >= base) base = s2 + 1;
|
||||
if (s1 && s1 >= base)
|
||||
base = s1 + 1;
|
||||
if (s2 && s2 >= base)
|
||||
base = s2 + 1;
|
||||
|
||||
strncpy_s(out, out_size, base, _TRUNCATE);
|
||||
|
||||
@ -106,8 +109,7 @@ static void matrix_label_from_path(const char *path, char *out, size_t out_size)
|
||||
}
|
||||
}
|
||||
|
||||
static S64 now_ns(void)
|
||||
{
|
||||
static S64 now_ns(void) {
|
||||
static LARGE_INTEGER freq;
|
||||
static B32 initialized = 0;
|
||||
|
||||
@ -122,9 +124,8 @@ static S64 now_ns(void)
|
||||
return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
|
||||
}
|
||||
|
||||
static void timing_write_json(FILE *fp, const Timing *t)
|
||||
{
|
||||
fprintf(fp,
|
||||
static void timing_write_json(FILE *fp, const Timing *t) {
|
||||
fprintf(fp,
|
||||
"{"
|
||||
"\"label\":\"%s\","
|
||||
"\"rows\":%d,"
|
||||
@ -139,26 +140,14 @@ fprintf(fp,
|
||||
"\"dense_total_ns\":%lld,"
|
||||
"\"dense_avg_ns\":%lld"
|
||||
"}",
|
||||
t->label,
|
||||
t->rows,
|
||||
t->cols,
|
||||
t->NNZ,
|
||||
t->SpMVRuns,
|
||||
(long long)t->SpMVTotalNs,
|
||||
(long long)t->SpMVAvgNs,
|
||||
t->DenseRows,
|
||||
t->DenseCols,
|
||||
t->DenseRuns,
|
||||
(long long)t->DenseTotalNs,
|
||||
(long long)t->DenseAvgNs
|
||||
);
|
||||
t->label, t->rows, t->cols, t->NNZ, t->SpMVRuns,
|
||||
(long long)t->SpMVTotalNs, (long long)t->SpMVAvgNs, t->DenseRows,
|
||||
t->DenseCols, t->DenseRuns, (long long)t->DenseTotalNs,
|
||||
(long long)t->DenseAvgNs);
|
||||
}
|
||||
|
||||
static void timings_write_json_file(
|
||||
const char *path,
|
||||
const Timing *timings,
|
||||
int count)
|
||||
{
|
||||
static void timings_write_json_file(const char *path, const Timing *timings,
|
||||
int count) {
|
||||
FILE *fp = NULL;
|
||||
|
||||
if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
|
||||
@ -166,8 +155,7 @@ static void timings_write_json_file(
|
||||
|
||||
fprintf(fp, "[\n");
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
for (int i = 0; i < count; i++) {
|
||||
fprintf(fp, " ");
|
||||
timing_write_json(fp, &timings[i]);
|
||||
|
||||
@ -183,21 +171,25 @@ static void timings_write_json_file(
|
||||
}
|
||||
|
||||
/*
 * Skip leading whitespace in a NUL-terminated string.
 *
 * Returns a pointer into `s` at the first character for which isspace() is
 * false, or at the terminating NUL when `s` consists only of whitespace.
 * The string itself is not modified.
 */
static char *trim_left(char *s) {
  /* Cast to unsigned char: passing a negative char to isspace() is undefined. */
  while (*s && isspace((unsigned char)*s)) {
    s++;
  }
  return s;
}
|
||||
|
||||
static inline int triplet_cmp(const void *a, const void *b) {
|
||||
const Triplet *x = (const Triplet *)a;
|
||||
const Triplet *y = (const Triplet *)b;
|
||||
if (x->i < y->i) return -1;
|
||||
if (x->i > y->i) return 1;
|
||||
if (x->j < y->j) return -1;
|
||||
if (x->j > y->j) return 1;
|
||||
if (x->i < y->i)
|
||||
return -1;
|
||||
if (x->i > y->i)
|
||||
return 1;
|
||||
if (x->j < y->j)
|
||||
return -1;
|
||||
if (x->j > y->j)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Exact-semantics "fast" version of the slow Matrix Market reader.
|
||||
// Keeps:
|
||||
// - general / symmetric handling
|
||||
@ -232,14 +224,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
|
||||
char symmetry[64];
|
||||
|
||||
int scanned = sscanf_s(
|
||||
line,
|
||||
"%63s %63s %63s %63s %63s",
|
||||
banner, (unsigned)_countof(banner),
|
||||
object, (unsigned)_countof(object),
|
||||
format, (unsigned)_countof(format),
|
||||
field, (unsigned)_countof(field),
|
||||
symmetry, (unsigned)_countof(symmetry)
|
||||
);
|
||||
line, "%63s %63s %63s %63s %63s", banner, (unsigned)_countof(banner),
|
||||
object, (unsigned)_countof(object), format, (unsigned)_countof(format),
|
||||
field, (unsigned)_countof(field), symmetry, (unsigned)_countof(symmetry));
|
||||
|
||||
if (scanned != 5) {
|
||||
fclose(fp);
|
||||
@ -433,7 +420,8 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
|
||||
}
|
||||
|
||||
static void free_csr(CSRMatrix *A) {
|
||||
if (!A) return;
|
||||
if (!A)
|
||||
return;
|
||||
free(A->row_ptr);
|
||||
free(A->col_ind);
|
||||
free(A->values);
|
||||
@ -451,16 +439,10 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
|
||||
// oneMKL CSR creation takes row_start and row_end arrays.
|
||||
// With standard CSR row_ptr, these are row_ptr[i] and row_ptr[i+1].
|
||||
sparse_status_t st = mkl_sparse_d_create_csr(
|
||||
&H,
|
||||
SPARSE_INDEX_BASE_ZERO,
|
||||
A->rows,
|
||||
A->cols,
|
||||
A->row_ptr,
|
||||
A->row_ptr + 1,
|
||||
A->col_ind,
|
||||
A->values
|
||||
);
|
||||
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_create_csr failed");
|
||||
&H, SPARSE_INDEX_BASE_ZERO, A->rows, A->cols, A->row_ptr, A->row_ptr + 1,
|
||||
A->col_ind, A->values);
|
||||
if (st != SPARSE_STATUS_SUCCESS)
|
||||
panic("mkl_sparse_d_create_csr failed");
|
||||
|
||||
return H;
|
||||
}
|
||||
@ -481,38 +463,27 @@ Timing timeSpMV(const CSRMatrix *A) {
|
||||
double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
|
||||
double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));
|
||||
|
||||
for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0;
|
||||
for (MKL_INT i = 0; i < A->cols; i++)
|
||||
x[i] = 1.0;
|
||||
|
||||
// warmup
|
||||
for(int i = 0; i < 2; i += 1) {
|
||||
sparse_status_t st = mkl_sparse_d_mv(
|
||||
SPARSE_OPERATION_NON_TRANSPOSE,
|
||||
1.0,
|
||||
H,
|
||||
descr,
|
||||
x,
|
||||
0.0,
|
||||
y
|
||||
);
|
||||
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
|
||||
for (int i = 0; i < 2; i += 1) {
|
||||
sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
|
||||
descr, x, 0.0, y);
|
||||
if (st != SPARSE_STATUS_SUCCESS)
|
||||
panic("mkl_sparse_d_mv failed");
|
||||
}
|
||||
|
||||
S64 t0 = now_ns();
|
||||
for (int i = 0; i < g_spmv_runs; i += 1) {
|
||||
sparse_status_t st = mkl_sparse_d_mv(
|
||||
SPARSE_OPERATION_NON_TRANSPOSE,
|
||||
1.0,
|
||||
H,
|
||||
descr,
|
||||
x,
|
||||
0.0,
|
||||
y
|
||||
);
|
||||
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
|
||||
sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
|
||||
descr, x, 0.0, y);
|
||||
if (st != SPARSE_STATUS_SUCCESS)
|
||||
panic("mkl_sparse_d_mv failed");
|
||||
}
|
||||
S64 t1 = now_ns();
|
||||
|
||||
S64 elapsed_ns = t1-t0;
|
||||
S64 elapsed_ns = t1 - t0;
|
||||
|
||||
out.rows = A->rows;
|
||||
out.cols = A->cols;
|
||||
@ -521,18 +492,17 @@ Timing timeSpMV(const CSRMatrix *A) {
|
||||
out.SpMVTotalNs = elapsed_ns;
|
||||
out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
|
||||
|
||||
printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0));
|
||||
printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs,
|
||||
(A->rows > 0 ? y[0] : 0.0));
|
||||
|
||||
F64 avg_ms = (F64)out.SpMVAvgNs/1e6;
|
||||
F64 avg_ms = (F64)out.SpMVAvgNs / 1e6;
|
||||
printf("Average time for SpMV: %.3f ms \n", avg_ms);
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
|
||||
|
||||
mkl_sparse_destroy(H);
|
||||
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -545,9 +515,10 @@ static DenseTiming timeDenseMatmul(int inRows) {
|
||||
out.DenseRows = rows;
|
||||
out.DenseCols = cols;
|
||||
|
||||
S64 byteReq = rows*cols*sizeof(F64);
|
||||
F64 GBreq = (F64)byteReq/(1024.0 * 1024.0 * 1024.0);
|
||||
printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq);
|
||||
S64 byteReq = rows * cols * sizeof(F64);
|
||||
F64 GBreq = (F64)byteReq / (1024.0 * 1024.0 * 1024.0);
|
||||
printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows,
|
||||
GBreq);
|
||||
|
||||
MKL_INT n = rows;
|
||||
|
||||
@ -564,32 +535,14 @@ static DenseTiming timeDenseMatmul(int inRows) {
|
||||
}
|
||||
|
||||
// Warmup
|
||||
cblas_dgemm(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
n, n, n,
|
||||
1.0,
|
||||
left, n,
|
||||
right, n,
|
||||
0.0,
|
||||
outMat, n
|
||||
);
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left, n,
|
||||
right, n, 0.0, outMat, n);
|
||||
|
||||
S64 t0 = now_ns();
|
||||
|
||||
for (int i = 0; i < g_dense_runs; i += 1) {
|
||||
cblas_dgemm(
|
||||
CblasRowMajor,
|
||||
CblasNoTrans,
|
||||
CblasNoTrans,
|
||||
n, n, n,
|
||||
1.0,
|
||||
left, n,
|
||||
right, n,
|
||||
0.0,
|
||||
outMat, n
|
||||
);
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left,
|
||||
n, right, n, 0.0, outMat, n);
|
||||
}
|
||||
|
||||
S64 t1 = now_ns();
|
||||
@ -599,8 +552,8 @@ static DenseTiming timeDenseMatmul(int inRows) {
|
||||
out.DenseTotalNs = elapsed_ns;
|
||||
out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
|
||||
|
||||
printf("Dense matmul done for %d runs. C[0] = %.6g\n",
|
||||
g_dense_runs, outMat[0]);
|
||||
printf("Dense matmul done for %d runs. C[0] = %.6g\n", g_dense_runs,
|
||||
outMat[0]);
|
||||
|
||||
F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
|
||||
printf("Average time for dense matmul: %.3f ms\n", avg_ms);
|
||||
@ -616,7 +569,8 @@ Timing doSpMVTimings(const char *path) {
|
||||
printf("Reading market matrix %s \n", path);
|
||||
CSRMatrix A;
|
||||
A = read_matrix_market_to_csr(path);
|
||||
printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz);
|
||||
printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols,
|
||||
A.nnz);
|
||||
Timing out = timeSpMV(&A);
|
||||
matrix_label_from_path(path, out.label, sizeof(out.label));
|
||||
free_csr(&A);
|
||||
@ -624,19 +578,16 @@ Timing doSpMVTimings(const char *path) {
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main() {
|
||||
|
||||
S32 numPaths = 4;
|
||||
int denseRows[] = {1024, 2048, 4096, 4096*2};
|
||||
int denseRows[] = {1024, 2048, 4096, 4096 * 2};
|
||||
|
||||
const char *paths[] = {
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"
|
||||
};
|
||||
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"};
|
||||
|
||||
// Sanity check for threading.
|
||||
// Single thread gave avg 0.9 ms and 16 threads gave 0.14 ms avg
|
||||
|
||||
Loading…
Reference in New Issue
Block a user