su2code · LwhJesse · May 18, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -77,6 +77,18 @@ class CPreconditioner {
 template <class ScalarType>
 CPreconditioner<ScalarType>::~CPreconditioner() {}
 
+/*!
+ * \class CIdentityPreconditioner
+ * \brief No-op preconditioner used when Krylov solvers run without preconditioning.
+ */
+template <class ScalarType>
+class CIdentityPreconditioner final : public CPreconditioner<ScalarType> {
+ public:
+  inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override { v = u; }
+
+  inline bool IsIdentity() const override { return true; }
+};
+
 /*!
  * \class CJacobiPreconditioner
  * \brief Specialization of preconditioner that uses CSysMatrix class.
@@ -332,6 +344,9 @@ CPreconditioner<ScalarType>* CPreconditioner<ScalarType>::Create(ENUM_LINEAR_SOL
   CPreconditioner<ScalarType>* prec = nullptr;
 
   switch (kind) {
+    case NO_PRECONDITIONER:
+      prec = new CIdentityPreconditioner<ScalarType>();
+      break;
     case JACOBI:
       prec = new CJacobiPreconditioner<ScalarType>(jacobian, geometry, config);
       break;

diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -108,6 +108,16 @@ struct CSysMatrixComms {
                        MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX);
 };
 
+/*!
+ * \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+struct CudaSpMVResources;
+
+/*!
+ * \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path.
+ */
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources);
+
 /*!
  * \class CSysMatrix
  * \ingroup SpLinSys
@@ -148,8 +158,10 @@ class CSysMatrix {
   ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
+  ScalarType* d_invM = nullptr;   /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */
   bool useCuda = false;           /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
                                      Mainly used to conditionally free GPU memory in the class destructor. */
+  mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -924,6 +936,13 @@ class CSysMatrix {
    */
   void BuildJacobiPreconditioner();
 
+  /*!
+   * \brief Build the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void BuildJacobiPreconditionerGPU();
+
   /*!
    * \brief Multiply CSysVector by the preconditioner
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
@@ -934,6 +953,14 @@ class CSysMatrix {
   void ComputeJacobiPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                                    const CConfig* config) const;
 
+  /*!
+   * \brief Apply the Jacobi preconditioner on the GPU/device side.
+   * \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
+   *       The actual implementation belongs in CSysMatrixGPU.cu.
+   */
+  void ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
+                                      CGeometry* geometry, const CConfig* config) const;
+
   /*!
    * \brief Build the ILU preconditioner.
    */

diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
@@ -304,6 +304,16 @@ class CSysSolve {
                                       ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
                                       unsigned long custom_m) const;
 
+  /*!
+   * \brief CUDA/GPU implementation of the Flexible GMRES solver.
+   * \note This helper exists so the public solver interface can remain unchanged while the
+   *       implementation is dispatched internally based on the runtime CUDA setting.
+   *       The actual implementation lives in CSysSolveGPU.cu.
+   */
+  unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec,
+                                     const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
+                                     bool monitoring, const CConfig* config) const;
+
   /*!
    * \brief Creates the inner solver for nested preconditioning if the settings allow it.
    * \returns True if the inner solver can be used.

diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
@@ -240,6 +240,45 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
    */
   void GPUSetVal(ScalarType val, bool trigger = true) const;
 
+  /*!
+   * \brief Copy another vector into this vector on the device.
+   * \note This is an explicit GPU helper for Krylov solver implementations that keep the
+   *       iteration state resident on the device. The implementation belongs in
+   *       CSysVectorGPU.cu.
+   * \param[in] src - Source vector.
+   */
+  void GPUCopy(const CSysVector& src) const;
+
+  /*!
+   * \brief Scale this vector on the device.
+   * \note Explicit GPU helper for solver-side vector operations.
+   * \param[in] alpha - Scalar multiplier.
+   */
+  void GPUScale(ScalarType alpha) const;
+
+  /*!
+   * \brief Perform the AXPY operation on the device: this := this + alpha * x.
+   * \note Explicit GPU helper for solver-side vector operations.
+   * \param[in] alpha - Scalar multiplier.
+   * \param[in] x - Input vector.
+   */
+  void GPUAxpy(ScalarType alpha, const CSysVector& x) const;
+
+  /*!
+   * \brief Dot product between this vector and another vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \param[in] other - Input vector.
+   * \return Dot product result.
+   */
+  ScalarType GPUDot(const CSysVector& other) const;
+
+  /*!
+   * \brief L2 norm of this vector on the device.
+   * \note Explicit GPU helper for solver-side reductions.
+   * \return L2 norm result.
+   */
+  ScalarType GPUNorm() const;
+
   /*!
    * \brief return device pointer that points to the CSysVector values in GPU memory
    */

diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -25,6 +25,11 @@
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
+
+#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
+
 #include<cuda_runtime.h>
 #include<iostream>
 
@@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 
 #define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+
+#endif  // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
@@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC {
   PASTIX_ILU=10,  /*!< \brief PaStiX ILU(k) preconditioner. */
   PASTIX_LU_P,    /*!< \brief PaStiX LU as preconditioner. */
   PASTIX_LDLT_P,  /*!< \brief PaStiX LDLT as preconditioner. */
+  NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */
 };
 static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Map = {
   MakePair("JACOBI", JACOBI)
@@ -2526,6 +2527,7 @@ static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Ma
   MakePair("PASTIX_ILU", PASTIX_ILU)
   MakePair("PASTIX_LU", PASTIX_LU_P)
   MakePair("PASTIX_LDLT", PASTIX_LDLT_P)
+  MakePair("NONE", NO_PRECONDITIONER)
 };
 
 /*!

diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -54,6 +54,7 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
   col_ind_ilu = nullptr;
 
   invM = nullptr;
+  d_invM = nullptr;
 
 #ifdef USE_MKL
   MatrixMatrixProductJitter = nullptr;
@@ -67,6 +68,8 @@ template <class ScalarType>
 CSysMatrix<ScalarType>::~CSysMatrix() {
   SU2_ZONE_SCOPED
 
+  ReleaseCudaSpMVResources(spmv_resources);
+
   delete[] omp_partitions;
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
@@ -76,6 +79,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
     GPUMemoryAllocation::gpu_free(d_matrix);
     GPUMemoryAllocation::gpu_free(d_row_ptr);
     GPUMemoryAllocation::gpu_free(d_col_ind);
+    GPUMemoryAllocation::gpu_free(d_invM);
   }
 
 #ifdef USE_MKL
@@ -86,6 +90,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
 #endif
 }
 
+#ifndef HAVE_CUDA
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; }
+#endif
+
 template <class ScalarType>
 void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar,
                                         unsigned short neqn, bool EdgeConnect, CGeometry* geometry,
@@ -196,6 +204,10 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
 
   if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn);
 
+  if (useCuda && diag_needed) {
+    d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
+  }
+
   /*--- Thread parallel initialization. ---*/
 
   int num_threads = omp_get_max_threads();
@@ -672,6 +684,19 @@ void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>& v
 template <class ScalarType>
 void CSysMatrix<ScalarType>::BuildJacobiPreconditioner() {
   SU2_ZONE_SCOPED
+
+  if (useCuda) {
+#ifdef HAVE_CUDA
+    BuildJacobiPreconditionerGPU();
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
@@ -684,6 +709,19 @@ void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<Scalar
                                                          CSysVector<ScalarType>& prod, CGeometry* geometry,
                                                          const CConfig* config) const {
   SU2_ZONE_SCOPED
+
+  if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
+    ComputeJacobiPreconditionerGPU(vec, prod, geometry, config);
+    return;
+#else
+    SU2_MPI::Error(
+        "\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
+        "enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
+#endif
+  }
+
   /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/
   SU2_OMP_BARRIER
   SU2_OMP_FOR_DYN(omp_heavy_size)

diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() {
   }
 }
 
+struct CudaSpMVResources {
+  cusparseHandle_t handle = nullptr;
+  cusparseConstSpMatDescr_t mat = nullptr;
+  size_t buffer_size = 0;
+  void* buffer = nullptr;
+};
+
+void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) {
+  if (resources == nullptr) {
+    return;
+  }
+
+  if (resources->buffer != nullptr) {
+    gpuErrChk(cudaFree(resources->buffer));
+    resources->buffer = nullptr;
+    resources->buffer_size = 0;
+  }
+
+  if (resources->mat != nullptr) {
+    cusparseErrChk(cusparseDestroySpMat(resources->mat));
+    resources->mat = nullptr;
+  }
+
+  if (resources->handle != nullptr) {
+    cusparseErrChk(cusparseDestroy(resources->handle));
+    resources->handle = nullptr;
+  }
+
+  delete resources;
+  resources = nullptr;
+}
+
 template <class ScalarType>
 constexpr cudaDataType GetCudaDataType() {
   if constexpr (std::is_same<ScalarType, float>::value) {
@@ -83,8 +115,6 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   ScalarType* d_vec = vec.GetDevicePointer();
   ScalarType* d_prod = prod.GetDevicePointer();
 
-  vec.HtDTransfer();
-
   const auto indexType = GetCusparseIndexType();
   const auto valueType = GetCudaDataType<ScalarType>();
 
@@ -100,42 +130,56 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   const ScalarType alpha = 1.0;
   const ScalarType beta = 0.0;
 
-  cusparseHandle_t handle = nullptr;
-  cusparseConstSpMatDescr_t matA = nullptr;
-  cusparseDnVecDescr_t vecX = nullptr;
-  cusparseDnVecDescr_t vecY = nullptr;
+  if (spmv_resources == nullptr) {
+    spmv_resources = new CudaSpMVResources;
 
-  cusparseErrChk(cusparseCreate(&handle));
+    cusparseErrChk(cusparseCreate(&spmv_resources->handle));
 
-  cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
-                                        indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));
+    cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr,
+                                          d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO,
+                                          valueType, CUSPARSE_ORDER_ROW));
+  }
+
+  cusparseDnVecDescr_t vecX = nullptr;
+  cusparseDnVecDescr_t vecY = nullptr;
 
   cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
   cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));
 
-  size_t bufferSize = 0;
-  void* dBuffer = nullptr;
+  size_t required_buffer_size = 0;
 
-  cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
-                                         valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));
+  cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
+                                         spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1,
+                                         &required_buffer_size));
 
-  if (bufferSize > 0) {
-    gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
-  }
+  if (required_buffer_size > spmv_resources->buffer_size) {
+    if (spmv_resources->buffer != nullptr) {
+      gpuErrChk(cudaFree(spmv_resources->buffer));
+    }
 
-  cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
-                              CUSPARSE_SPMV_BSR_ALG1, dBuffer));
+    if (required_buffer_size > 0) {
+      gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size));
+    } else {
+      spmv_resources->buffer = nullptr;
+    }
 
-  if (dBuffer != nullptr) {
-    gpuErrChk(cudaFree(dBuffer));
+    spmv_resources->buffer_size = required_buffer_size;
   }
 
+  cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat,
+                              vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer));
+
   cusparseErrChk(cusparseDestroyDnVec(vecY));
   cusparseErrChk(cusparseDestroyDnVec(vecX));
-  cusparseErrChk(cusparseDestroySpMat(matA));
-  cusparseErrChk(cusparseDestroy(handle));
-
-  prod.DtHTransfer();
 }
-
-template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template void CSysMatrix<su2mixedfloat>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<su2mixedfloat>::GPUMatrixVectorProduct(const CSysVector<su2mixedfloat>& vec,
+                                                                CSysVector<su2mixedfloat>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+
+#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
+template void CSysMatrix<passivedouble>::HtDTransfer(bool trigger) const;
+template void CSysMatrix<passivedouble>::GPUMatrixVectorProduct(const CSysVector<passivedouble>& vec,
+                                                                CSysVector<passivedouble>& prod, CGeometry* geometry,
+                                                                const CConfig* config) const;
+#endif