Added clangd support (build script now generates compile_flags.txt; C source reformatted)

This commit is contained in:
antonl 2026-03-18 09:48:11 +01:00
parent a30a500564
commit 10f92af6cf
3 changed files with 412 additions and 450 deletions

View File

@ -12,23 +12,26 @@ if "%SRC%"=="" (
set OUT=%~n1.exe set OUT=%~n1.exe
set CFLAGS="-O3" "-march=native" "-ffast-math" "-fopenmp" "-Wall" "-I%MKL_ROOT%\include"
set LDFLAGS=-L%MKL_ROOT%\lib -L%COMPILER_ROOT%\lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -llibiomp5md
pushd build pushd build
del /F /Q * del /F /Q *
clang ../src/%SRC% -o %OUT% ^
-O3 ^ clang ../src/%SRC% -o %OUT% %CFLAGS% %LDFLAGS%
-march=native ^
-ffast-math ^
-fopenmp ^
-I"%MKL_ROOT%\include" ^
-L"%MKL_ROOT%\lib" ^
-L"%COMPILER_ROOT%\lib" ^
-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core ^
-llibiomp5md ^
-Wall
popd popd
@rem Generate compile_flags.txt so clangd can resolve the include paths and compiler flags
(
echo -xc
echo -std=c11
for %%f in (%CFLAGS%) do echo %%~f
) > compile_flags.txt
echo Build complete: build/%OUT% echo Build complete: build/%OUT%
set PATH=%PATH%;E:\lib\intel_mkl\mkl\2025.3\bin set PATH=%PATH%;%MKL_ROOT%\bin
set PATH=%PATH%;E:\lib\intel_mkl\compiler\2025.3\bin set PATH=%PATH%;%COMPILER_ROOT%\bin
endlocal

8
compile_flags.txt Normal file
View File

@ -0,0 +1,8 @@
-xc
-std=c11
-O3
-march=native
-ffast-math
-fopenmp
-Wall
-IE:\lib\intel_mkl\mkl\2025.3\include

View File

@ -1,8 +1,8 @@
#include <stdlib.h> #include <ctype.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include <string.h> #include <string.h>
#include <ctype.h>
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#include <windows.h> #include <windows.h>
@ -76,7 +76,8 @@ static void panic(const char *msg) {
static void *xmalloc(size_t n) { static void *xmalloc(size_t n) {
void *p = malloc(n); void *p = malloc(n);
if (!p) panic("out of memory"); if (!p)
panic("out of memory");
return p; return p;
} }
@ -88,15 +89,17 @@ static void *xcalloc(size_t count, size_t size) {
return p; return p;
} }
static void matrix_label_from_path(const char *path, char *out, size_t out_size) static void matrix_label_from_path(const char *path, char *out,
{ size_t out_size) {
const char *base = path; const char *base = path;
const char *s1 = strrchr(path, '/'); const char *s1 = strrchr(path, '/');
const char *s2 = strrchr(path, '\\'); const char *s2 = strrchr(path, '\\');
if (s1 && s1 >= base) base = s1 + 1; if (s1 && s1 >= base)
if (s2 && s2 >= base) base = s2 + 1; base = s1 + 1;
if (s2 && s2 >= base)
base = s2 + 1;
strncpy_s(out, out_size, base, _TRUNCATE); strncpy_s(out, out_size, base, _TRUNCATE);
@ -106,8 +109,7 @@ static void matrix_label_from_path(const char *path, char *out, size_t out_size)
} }
} }
static S64 now_ns(void) static S64 now_ns(void) {
{
static LARGE_INTEGER freq; static LARGE_INTEGER freq;
static B32 initialized = 0; static B32 initialized = 0;
@ -122,8 +124,7 @@ static S64 now_ns(void)
return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart); return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
} }
static void timing_write_json(FILE *fp, const Timing *t) static void timing_write_json(FILE *fp, const Timing *t) {
{
fprintf(fp, fprintf(fp,
"{" "{"
"\"label\":\"%s\"," "\"label\":\"%s\","
@ -139,26 +140,14 @@ fprintf(fp,
"\"dense_total_ns\":%lld," "\"dense_total_ns\":%lld,"
"\"dense_avg_ns\":%lld" "\"dense_avg_ns\":%lld"
"}", "}",
t->label, t->label, t->rows, t->cols, t->NNZ, t->SpMVRuns,
t->rows, (long long)t->SpMVTotalNs, (long long)t->SpMVAvgNs, t->DenseRows,
t->cols, t->DenseCols, t->DenseRuns, (long long)t->DenseTotalNs,
t->NNZ, (long long)t->DenseAvgNs);
t->SpMVRuns,
(long long)t->SpMVTotalNs,
(long long)t->SpMVAvgNs,
t->DenseRows,
t->DenseCols,
t->DenseRuns,
(long long)t->DenseTotalNs,
(long long)t->DenseAvgNs
);
} }
static void timings_write_json_file( static void timings_write_json_file(const char *path, const Timing *timings,
const char *path, int count) {
const Timing *timings,
int count)
{
FILE *fp = NULL; FILE *fp = NULL;
if (fopen_s(&fp, path, "w") != 0 || fp == NULL) if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
@ -166,8 +155,7 @@ static void timings_write_json_file(
fprintf(fp, "[\n"); fprintf(fp, "[\n");
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++) {
{
fprintf(fp, " "); fprintf(fp, " ");
timing_write_json(fp, &timings[i]); timing_write_json(fp, &timings[i]);
@ -183,21 +171,25 @@ static void timings_write_json_file(
} }
static char *trim_left(char *s) { static char *trim_left(char *s) {
while (*s && isspace((unsigned char)*s)) s++; while (*s && isspace((unsigned char)*s))
s++;
return s; return s;
} }
static inline int triplet_cmp(const void *a, const void *b) { static inline int triplet_cmp(const void *a, const void *b) {
const Triplet *x = (const Triplet *)a; const Triplet *x = (const Triplet *)a;
const Triplet *y = (const Triplet *)b; const Triplet *y = (const Triplet *)b;
if (x->i < y->i) return -1; if (x->i < y->i)
if (x->i > y->i) return 1; return -1;
if (x->j < y->j) return -1; if (x->i > y->i)
if (x->j > y->j) return 1; return 1;
if (x->j < y->j)
return -1;
if (x->j > y->j)
return 1;
return 0; return 0;
} }
// Exact-semantics "fast" version of the slow Matrix Market reader. // Exact-semantics "fast" version of the slow Matrix Market reader.
// Keeps: // Keeps:
// - general / symmetric handling // - general / symmetric handling
@ -232,14 +224,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
char symmetry[64]; char symmetry[64];
int scanned = sscanf_s( int scanned = sscanf_s(
line, line, "%63s %63s %63s %63s %63s", banner, (unsigned)_countof(banner),
"%63s %63s %63s %63s %63s", object, (unsigned)_countof(object), format, (unsigned)_countof(format),
banner, (unsigned)_countof(banner), field, (unsigned)_countof(field), symmetry, (unsigned)_countof(symmetry));
object, (unsigned)_countof(object),
format, (unsigned)_countof(format),
field, (unsigned)_countof(field),
symmetry, (unsigned)_countof(symmetry)
);
if (scanned != 5) { if (scanned != 5) {
fclose(fp); fclose(fp);
@ -433,7 +420,8 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
} }
static void free_csr(CSRMatrix *A) { static void free_csr(CSRMatrix *A) {
if (!A) return; if (!A)
return;
free(A->row_ptr); free(A->row_ptr);
free(A->col_ind); free(A->col_ind);
free(A->values); free(A->values);
@ -451,16 +439,10 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
// oneMKL CSR creation takes row_start and row_end arrays. // oneMKL CSR creation takes row_start and row_end arrays.
// With standard CSR row_ptr, these are row_ptr[i] and row_ptr[i+1]. // With standard CSR row_ptr, these are row_ptr[i] and row_ptr[i+1].
sparse_status_t st = mkl_sparse_d_create_csr( sparse_status_t st = mkl_sparse_d_create_csr(
&H, &H, SPARSE_INDEX_BASE_ZERO, A->rows, A->cols, A->row_ptr, A->row_ptr + 1,
SPARSE_INDEX_BASE_ZERO, A->col_ind, A->values);
A->rows, if (st != SPARSE_STATUS_SUCCESS)
A->cols, panic("mkl_sparse_d_create_csr failed");
A->row_ptr,
A->row_ptr + 1,
A->col_ind,
A->values
);
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_create_csr failed");
return H; return H;
} }
@ -481,34 +463,23 @@ Timing timeSpMV(const CSRMatrix *A) {
double *x = (double *)xmalloc((size_t)A->cols * sizeof(double)); double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
double *y = (double *)xcalloc((size_t)A->rows, sizeof(double)); double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));
for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0; for (MKL_INT i = 0; i < A->cols; i++)
x[i] = 1.0;
// warmup // warmup
for (int i = 0; i < 2; i += 1) { for (int i = 0; i < 2; i += 1) {
sparse_status_t st = mkl_sparse_d_mv( sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
SPARSE_OPERATION_NON_TRANSPOSE, descr, x, 0.0, y);
1.0, if (st != SPARSE_STATUS_SUCCESS)
H, panic("mkl_sparse_d_mv failed");
descr,
x,
0.0,
y
);
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
} }
S64 t0 = now_ns(); S64 t0 = now_ns();
for (int i = 0; i < g_spmv_runs; i += 1) { for (int i = 0; i < g_spmv_runs; i += 1) {
sparse_status_t st = mkl_sparse_d_mv( sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
SPARSE_OPERATION_NON_TRANSPOSE, descr, x, 0.0, y);
1.0, if (st != SPARSE_STATUS_SUCCESS)
H, panic("mkl_sparse_d_mv failed");
descr,
x,
0.0,
y
);
if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
} }
S64 t1 = now_ns(); S64 t1 = now_ns();
@ -521,7 +492,8 @@ Timing timeSpMV(const CSRMatrix *A) {
out.SpMVTotalNs = elapsed_ns; out.SpMVTotalNs = elapsed_ns;
out.SpMVAvgNs = elapsed_ns / g_spmv_runs; out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0)); printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs,
(A->rows > 0 ? y[0] : 0.0));
F64 avg_ms = (F64)out.SpMVAvgNs / 1e6; F64 avg_ms = (F64)out.SpMVAvgNs / 1e6;
printf("Average time for SpMV: %.3f ms \n", avg_ms); printf("Average time for SpMV: %.3f ms \n", avg_ms);
@ -529,10 +501,8 @@ Timing timeSpMV(const CSRMatrix *A) {
free(x); free(x);
free(y); free(y);
mkl_sparse_destroy(H); mkl_sparse_destroy(H);
return out; return out;
} }
@ -547,7 +517,8 @@ static DenseTiming timeDenseMatmul(int inRows) {
S64 byteReq = rows * cols * sizeof(F64); S64 byteReq = rows * cols * sizeof(F64);
F64 GBreq = (F64)byteReq / (1024.0 * 1024.0 * 1024.0); F64 GBreq = (F64)byteReq / (1024.0 * 1024.0 * 1024.0);
printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq); printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows,
GBreq);
MKL_INT n = rows; MKL_INT n = rows;
@ -564,32 +535,14 @@ static DenseTiming timeDenseMatmul(int inRows) {
} }
// Warmup // Warmup
cblas_dgemm( cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left, n,
CblasRowMajor, right, n, 0.0, outMat, n);
CblasNoTrans,
CblasNoTrans,
n, n, n,
1.0,
left, n,
right, n,
0.0,
outMat, n
);
S64 t0 = now_ns(); S64 t0 = now_ns();
for (int i = 0; i < g_dense_runs; i += 1) { for (int i = 0; i < g_dense_runs; i += 1) {
cblas_dgemm( cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left,
CblasRowMajor, n, right, n, 0.0, outMat, n);
CblasNoTrans,
CblasNoTrans,
n, n, n,
1.0,
left, n,
right, n,
0.0,
outMat, n
);
} }
S64 t1 = now_ns(); S64 t1 = now_ns();
@ -599,8 +552,8 @@ static DenseTiming timeDenseMatmul(int inRows) {
out.DenseTotalNs = elapsed_ns; out.DenseTotalNs = elapsed_ns;
out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs; out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
printf("Dense matmul done for %d runs. C[0] = %.6g\n", printf("Dense matmul done for %d runs. C[0] = %.6g\n", g_dense_runs,
g_dense_runs, outMat[0]); outMat[0]);
F64 avg_ms = (F64)out.DenseAvgNs / 1e6; F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
printf("Average time for dense matmul: %.3f ms\n", avg_ms); printf("Average time for dense matmul: %.3f ms\n", avg_ms);
@ -616,7 +569,8 @@ Timing doSpMVTimings(const char *path) {
printf("Reading market matrix %s \n", path); printf("Reading market matrix %s \n", path);
CSRMatrix A; CSRMatrix A;
A = read_matrix_market_to_csr(path); A = read_matrix_market_to_csr(path);
printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz); printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols,
A.nnz);
Timing out = timeSpMV(&A); Timing out = timeSpMV(&A);
matrix_label_from_path(path, out.label, sizeof(out.label)); matrix_label_from_path(path, out.label, sizeof(out.label));
free_csr(&A); free_csr(&A);
@ -624,8 +578,6 @@ Timing doSpMVTimings(const char *path) {
return out; return out;
} }
int main() { int main() {
S32 numPaths = 4; S32 numPaths = 4;
@ -635,8 +587,7 @@ int main() {
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx", "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx", "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx", "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
"E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx" "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"};
};
// Sanity check for threading. // Sanity check for threading.
// Single thread gave avg 0.9 ms and 16 threads gave 0.14 ms avg // Single thread gave avg 0.9 ms and 16 threads gave 0.14 ms avg