added clangd support

2026-03-18 09:48:11 +01:00 · 2026-03-18 09:48:11 +01:00 · 10f92af6cf
commit 10f92af6cf
parent a30a500564
3 changed files with 412 additions and 450 deletions
--- a/build.bat
+++ b/build.bat
@ -12,23 +12,26 @@ if "%SRC%"=="" (
 set OUT=%~n1.exe
 set CFLAGS="-O3" "-march=native" "-ffast-math" "-fopenmp" "-Wall" "-I%MKL_ROOT%\include"
 set LDFLAGS=-L%MKL_ROOT%\lib -L%COMPILER_ROOT%\lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -llibiomp5md
 pushd build
 del /F /Q *
-clang ../src/%SRC% -o %OUT% ^
+
-    -O3 ^
+clang ../src/%SRC% -o %OUT% %CFLAGS% %LDFLAGS%
    -march=native ^
    -ffast-math ^
    -fopenmp ^
    -I"%MKL_ROOT%\include" ^
    -L"%MKL_ROOT%\lib" ^
    -L"%COMPILER_ROOT%\lib" ^
    -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core ^
    -llibiomp5md ^
    -Wall
 popd
@rem Generate a file for clangd to understand the include files and compiler flags
 (
  echo -xc
  echo -std=c11
  for %%f in (%CFLAGS%) do echo %%~f
 ) > compile_flags.txt
 echo Build complete: build/%OUT%
-set PATH=%PATH%;E:\lib\intel_mkl\mkl\2025.3\bin
+set PATH=%PATH%;%MKL_ROOT%\bin
-set PATH=%PATH%;E:\lib\intel_mkl\compiler\2025.3\bin
+set PATH=%PATH%;%COMPILER_ROOT%\bin
 endlocal
--- a/compile_flags.txt
+++ b/compile_flags.txt
@ -0,0 +1,8 @@
 -xc
 -std=c11
 -O3
 -march=native
 -ffast-math
 -fopenmp
 -Wall
 -IE:\lib\intel_mkl\mkl\2025.3\include
--- a/src/main.c
+++ b/src/main.c
@ -1,8 +1,8 @@
-#include <stdlib.h>
+#include <ctype.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
@ -76,7 +76,8 @@ static void panic(const char *msg) {
 static void *xmalloc(size_t n) {
  void *p = malloc(n);
-    if (!p) panic("out of memory");
+  if (!p)
    panic("out of memory");
  return p;
 }
@ -88,15 +89,17 @@ static void *xcalloc(size_t count, size_t size) {
  return p;
 }
-static void matrix_label_from_path(const char *path, char *out, size_t out_size)
+static void matrix_label_from_path(const char *path, char *out,
-{
+                                   size_t out_size) {
  const char *base = path;
  const char *s1 = strrchr(path, '/');
  const char *s2 = strrchr(path, '\\');
-    if (s1 && s1 >= base) base = s1 + 1;
+  if (s1 && s1 >= base)
-    if (s2 && s2 >= base) base = s2 + 1;
+    base = s1 + 1;
  if (s2 && s2 >= base)
    base = s2 + 1;
  strncpy_s(out, out_size, base, _TRUNCATE);
@ -106,8 +109,7 @@ static void matrix_label_from_path(const char *path, char *out, size_t out_size)
  }
 }
-static S64 now_ns(void)
+static S64 now_ns(void) {
 {
  static LARGE_INTEGER freq;
  static B32 initialized = 0;
@ -122,8 +124,7 @@ static S64 now_ns(void)
  return (S64)((counter.QuadPart * 1000000000LL) / freq.QuadPart);
 }
-static void timing_write_json(FILE *fp, const Timing *t)
+static void timing_write_json(FILE *fp, const Timing *t) {
 {
  fprintf(fp,
          "{"
          "\"label\":\"%s\","
@ -139,26 +140,14 @@ fprintf(fp,
          "\"dense_total_ns\":%lld,"
          "\"dense_avg_ns\":%lld"
          "}",
-        t->label,
+          t->label, t->rows, t->cols, t->NNZ, t->SpMVRuns,
-        t->rows,
+          (long long)t->SpMVTotalNs, (long long)t->SpMVAvgNs, t->DenseRows,
-        t->cols,
+          t->DenseCols, t->DenseRuns, (long long)t->DenseTotalNs,
-        t->NNZ,
+          (long long)t->DenseAvgNs);
        t->SpMVRuns,
        (long long)t->SpMVTotalNs,
        (long long)t->SpMVAvgNs,
        t->DenseRows,
        t->DenseCols,
        t->DenseRuns,
        (long long)t->DenseTotalNs,
        (long long)t->DenseAvgNs
    );
 }
-static void timings_write_json_file(
+static void timings_write_json_file(const char *path, const Timing *timings,
-    const char *path,
+                                    int count) {
    const Timing *timings,
    int count)
 {
  FILE *fp = NULL;
  if (fopen_s(&fp, path, "w") != 0 || fp == NULL)
@ -166,8 +155,7 @@ static void timings_write_json_file(
  fprintf(fp, "[\n");
-    for (int i = 0; i < count; i++)
+  for (int i = 0; i < count; i++) {
    {
    fprintf(fp, "  ");
    timing_write_json(fp, &timings[i]);
@ -183,21 +171,25 @@ static void timings_write_json_file(
 }
 static char *trim_left(char *s) {
-    while (*s && isspace((unsigned char)*s)) s++;
+  while (*s && isspace((unsigned char)*s))
    s++;
  return s;
 }
 static inline int triplet_cmp(const void *a, const void *b) {
  const Triplet *x = (const Triplet *)a;
  const Triplet *y = (const Triplet *)b;
-    if (x->i < y->i) return -1;
+  if (x->i < y->i)
-    if (x->i > y->i) return 1;
+    return -1;
-    if (x->j < y->j) return -1;
+  if (x->i > y->i)
-    if (x->j > y->j) return 1;
+    return 1;
  if (x->j < y->j)
    return -1;
  if (x->j > y->j)
    return 1;
  return 0;
 }
 // Exact-semantics "fast" version of the slow Matrix Market reader.
 // Keeps:
 //   - general / symmetric handling
@ -232,14 +224,9 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
  char symmetry[64];
  int scanned = sscanf_s(
-        line,
+      line, "%63s %63s %63s %63s %63s", banner, (unsigned)_countof(banner),
-        "%63s %63s %63s %63s %63s",
+      object, (unsigned)_countof(object), format, (unsigned)_countof(format),
-        banner,   (unsigned)_countof(banner),
+      field, (unsigned)_countof(field), symmetry, (unsigned)_countof(symmetry));
        object,   (unsigned)_countof(object),
        format,   (unsigned)_countof(format),
        field,    (unsigned)_countof(field),
        symmetry, (unsigned)_countof(symmetry)
    );
  if (scanned != 5) {
    fclose(fp);
@ -433,7 +420,8 @@ CSRMatrix read_matrix_market_to_csr(const char *path) {
 }
 static void free_csr(CSRMatrix *A) {
-    if (!A) return;
+  if (!A)
    return;
  free(A->row_ptr);
  free(A->col_ind);
  free(A->values);
@ -451,16 +439,10 @@ static sparse_matrix_t csr_to_mkl_handle(const CSRMatrix *A) {
  // oneMKL CSR creation takes row_start and row_end arrays.
  // With standard CSR row_ptr, these are row_ptr[i] and row_ptr[i+1].
  sparse_status_t st = mkl_sparse_d_create_csr(
-        &H,
+      &H, SPARSE_INDEX_BASE_ZERO, A->rows, A->cols, A->row_ptr, A->row_ptr + 1,
-        SPARSE_INDEX_BASE_ZERO,
+      A->col_ind, A->values);
-        A->rows,
+  if (st != SPARSE_STATUS_SUCCESS)
-        A->cols,
+    panic("mkl_sparse_d_create_csr failed");
        A->row_ptr,
        A->row_ptr + 1,
        A->col_ind,
        A->values
    );
    if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_create_csr failed");
  return H;
 }
@ -481,34 +463,23 @@ Timing timeSpMV(const CSRMatrix *A) {
  double *x = (double *)xmalloc((size_t)A->cols * sizeof(double));
  double *y = (double *)xcalloc((size_t)A->rows, sizeof(double));
-    for (MKL_INT i = 0; i < A->cols; i++) x[i] = 1.0;
+  for (MKL_INT i = 0; i < A->cols; i++)
    x[i] = 1.0;
  // warmup
  for (int i = 0; i < 2; i += 1) {
-    sparse_status_t st = mkl_sparse_d_mv(
+    sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
-        SPARSE_OPERATION_NON_TRANSPOSE,
+                                         descr, x, 0.0, y);
-        1.0,
+    if (st != SPARSE_STATUS_SUCCESS)
-        H,
+      panic("mkl_sparse_d_mv failed");
        descr,
        x,
        0.0,
        y
    );
    if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
  }
  S64 t0 = now_ns();
  for (int i = 0; i < g_spmv_runs; i += 1) {
-    sparse_status_t st = mkl_sparse_d_mv(
+    sparse_status_t st = mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, 1.0, H,
-        SPARSE_OPERATION_NON_TRANSPOSE,
+                                         descr, x, 0.0, y);
-        1.0,
+    if (st != SPARSE_STATUS_SUCCESS)
-        H,
+      panic("mkl_sparse_d_mv failed");
        descr,
        x,
        0.0,
        y
    );
    if (st != SPARSE_STATUS_SUCCESS) panic("mkl_sparse_d_mv failed");
  }
  S64 t1 = now_ns();
@ -521,7 +492,8 @@ Timing timeSpMV(const CSRMatrix *A) {
  out.SpMVTotalNs = elapsed_ns;
  out.SpMVAvgNs = elapsed_ns / g_spmv_runs;
-    printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs, (A->rows > 0 ? y[0] : 0.0));
+  printf("SpMV done for %d runs. y[0] = %.6g\n", g_spmv_runs,
         (A->rows > 0 ? y[0] : 0.0));
  F64 avg_ms = (F64)out.SpMVAvgNs / 1e6;
  printf("Average time for SpMV: %.3f ms \n", avg_ms);
@ -529,10 +501,8 @@ Timing timeSpMV(const CSRMatrix *A) {
  free(x);
  free(y);
  mkl_sparse_destroy(H);
  return out;
 }
@ -547,7 +517,8 @@ static DenseTiming timeDenseMatmul(int inRows) {
  S64 byteReq = rows * cols * sizeof(F64);
  F64 GBreq = (F64)byteReq / (1024.0 * 1024.0 * 1024.0);
-  printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows, GBreq);
+  printf("Matrix of size %i x %i requires %.2f GB of memory \n", inRows, inRows,
         GBreq);
  MKL_INT n = rows;
@ -564,32 +535,14 @@ static DenseTiming timeDenseMatmul(int inRows) {
  }
  // Warmup
-    cblas_dgemm(
+  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left, n,
-        CblasRowMajor,
+              right, n, 0.0, outMat, n);
        CblasNoTrans,
        CblasNoTrans,
        n, n, n,
        1.0,
        left, n,
        right, n,
        0.0,
        outMat, n
    );
  S64 t0 = now_ns();
  for (int i = 0; i < g_dense_runs; i += 1) {
-        cblas_dgemm(
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, left,
-            CblasRowMajor,
+                n, right, n, 0.0, outMat, n);
            CblasNoTrans,
            CblasNoTrans,
            n, n, n,
            1.0,
            left, n,
            right, n,
            0.0,
            outMat, n
        );
  }
  S64 t1 = now_ns();
@ -599,8 +552,8 @@ static DenseTiming timeDenseMatmul(int inRows) {
  out.DenseTotalNs = elapsed_ns;
  out.DenseAvgNs = elapsed_ns / (S64)g_dense_runs;
-    printf("Dense matmul done for %d runs. C[0] = %.6g\n",
+  printf("Dense matmul done for %d runs. C[0] = %.6g\n", g_dense_runs,
-           g_dense_runs, outMat[0]);
+         outMat[0]);
  F64 avg_ms = (F64)out.DenseAvgNs / 1e6;
  printf("Average time for dense matmul: %.3f ms\n", avg_ms);
@ -616,7 +569,8 @@ Timing doSpMVTimings(const char *path) {
  printf("Reading market matrix %s \n", path);
  CSRMatrix A;
  A = read_matrix_market_to_csr(path);
-  printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols, A.nnz);
+  printf("Read matrix with size %d x %d and nnz = %d \n", A.rows, A.cols,
         A.nnz);
  Timing out = timeSpMV(&A);
  matrix_label_from_path(path, out.label, sizeof(out.label));
  free_csr(&A);
@ -624,8 +578,6 @@ Timing doSpMVTimings(const char *path) {
  return out;
 }
 int main() {
  S32 numPaths = 4;
@ -635,8 +587,7 @@ int main() {
      "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\FEM_3D_thermal2.mtx",
      "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\ldoor.mtx",
      "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\Cube_Coup_dt0.mtx",
-    "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"
+      "E:\\dev\\go_matmul_perf\\suitesparse_test_matrices\\nlpkkt200.mtx"};
  };
 // Sanity check for threading.
 // Single thread gave avg 0.9 ms and 16 threads gave 0.14 ms avg