Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Common/include/linear_algebra/CPreconditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ class CPreconditioner {
template <class ScalarType>
CPreconditioner<ScalarType>::~CPreconditioner() {}

/*!
* \class CIdentityPreconditioner
* \brief No-op preconditioner used when Krylov solvers run without preconditioning.
*/
template <class ScalarType>
class CIdentityPreconditioner final : public CPreconditioner<ScalarType> {
public:
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override { v = u; }

inline bool IsIdentity() const override { return true; }
};

/*!
* \class CJacobiPreconditioner
* \brief Specialization of preconditioner that uses CSysMatrix class.
Expand Down Expand Up @@ -332,6 +344,9 @@ CPreconditioner<ScalarType>* CPreconditioner<ScalarType>::Create(ENUM_LINEAR_SOL
CPreconditioner<ScalarType>* prec = nullptr;

switch (kind) {
case NO_PRECONDITIONER:
prec = new CIdentityPreconditioner<ScalarType>();
break;
case JACOBI:
prec = new CJacobiPreconditioner<ScalarType>(jacobian, geometry, config);
break;
Expand Down
27 changes: 27 additions & 0 deletions Common/include/linear_algebra/CSysMatrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ struct CSysMatrixComms {
MPI_QUANTITIES commType = MPI_QUANTITIES::SOLUTION_MATRIX);
};

/*!
* \brief Opaque storage for CUDA/cuSPARSE resources used by the GPU SpMV path.
*/
struct CudaSpMVResources;

/*!
* \brief Release cached CUDA/cuSPARSE resources used by the GPU SpMV path.
*/
void ReleaseCudaSpMVResources(CudaSpMVResources*& resources);

/*!
* \class CSysMatrix
* \ingroup SpLinSys
Expand Down Expand Up @@ -148,8 +158,10 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
ScalarType* d_invM = nullptr; /*!< \brief Device inverse diagonal blocks for the Jacobi preconditioner. */
bool useCuda = false; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
mutable CudaSpMVResources* spmv_resources = nullptr; /*!< \brief Cached cuSPARSE resources for GPU SpMV. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
Expand Down Expand Up @@ -924,6 +936,13 @@ class CSysMatrix {
*/
void BuildJacobiPreconditioner();

/*!
* \brief Build the Jacobi preconditioner on the GPU/device side.
* \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
* The actual implementation belongs in CSysMatrixGPU.cu.
*/
void BuildJacobiPreconditionerGPU();

/*!
* \brief Multiply CSysVector by the preconditioner
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
Expand All @@ -934,6 +953,14 @@ class CSysMatrix {
void ComputeJacobiPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
const CConfig* config) const;

/*!
* \brief Apply the Jacobi preconditioner on the GPU/device side.
* \note This helper is intended as the implementation hook for GPU-resident Krylov solvers.
* The actual implementation belongs in CSysMatrixGPU.cu.
*/
void ComputeJacobiPreconditionerGPU(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the ILU preconditioner.
*/
Expand Down
10 changes: 10 additions & 0 deletions Common/include/linear_algebra/CSysSolve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,16 @@ class CSysSolve {
ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
unsigned long custom_m) const;

/*!
* \brief CUDA/GPU implementation of the Flexible GMRES solver.
* \note This helper exists so the public solver interface can remain unchanged while the
* implementation is dispatched internally based on the runtime CUDA setting.
* The actual implementation lives in CSysSolveGPU.cu.
*/
unsigned long FGMRES_LinSolver_GPU(const VectorType& b, VectorType& x, const ProductType& mat_vec,
const PrecondType& precond, ScalarType tol, unsigned long m, ScalarType& residual,
bool monitoring, const CConfig* config) const;

/*!
* \brief Creates the inner solver for nested preconditioning if the settings allow it.
* \returns True if the inner solver can be used.
Expand Down
39 changes: 39 additions & 0 deletions Common/include/linear_algebra/CSysVector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,45 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
*/
void GPUSetVal(ScalarType val, bool trigger = true) const;

/*!
* \brief Copy another vector into this vector on the device.
* \note This is an explicit GPU helper for Krylov solver implementations that keep the
* iteration state resident on the device. The implementation belongs in
* CSysVectorGPU.cu.
* \param[in] src - Source vector.
*/
void GPUCopy(const CSysVector& src) const;

/*!
* \brief Scale this vector on the device.
* \note Explicit GPU helper for solver-side vector operations.
* \param[in] alpha - Scalar multiplier.
*/
void GPUScale(ScalarType alpha) const;

/*!
* \brief Perform the AXPY operation on the device: this := this + alpha * x.
* \note Explicit GPU helper for solver-side vector operations.
* \param[in] alpha - Scalar multiplier.
* \param[in] x - Input vector.
*/
void GPUAxpy(ScalarType alpha, const CSysVector& x) const;

/*!
* \brief Dot product between this vector and another vector on the device.
* \note Explicit GPU helper for solver-side reductions.
* \param[in] other - Input vector.
* \return Dot product result.
*/
ScalarType GPUDot(const CSysVector& other) const;

/*!
* \brief L2 norm of this vector on the device.
* \note Explicit GPU helper for solver-side reductions.
* \return L2 norm result.
*/
ScalarType GPUNorm() const;

/*!
* \brief return device pointer that points to the CSysVector values in GPU memory
*/
Expand Down
7 changes: 7 additions & 0 deletions Common/include/linear_algebra/GPUComms.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#ifndef SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
#define SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH

#include<cuda_runtime.h>
#include<iostream>

Expand All @@ -51,3 +56,5 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
}

#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

#endif // SU2_COMMON_LINEAR_ALGEBRA_GPUCOMMS_CUH
2 changes: 2 additions & 0 deletions Common/include/option_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2517,6 +2517,7 @@ enum ENUM_LINEAR_SOLVER_PREC {
PASTIX_ILU=10, /*!< \brief PaStiX ILU(k) preconditioner. */
PASTIX_LU_P, /*!< \brief PaStiX LU as preconditioner. */
PASTIX_LDLT_P, /*!< \brief PaStiX LDLT as preconditioner. */
NO_PRECONDITIONER=20, /*!< \brief No preconditioner. */
};
static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Map = {
MakePair("JACOBI", JACOBI)
Expand All @@ -2526,6 +2527,7 @@ static const MapType<std::string, ENUM_LINEAR_SOLVER_PREC> Linear_Solver_Prec_Ma
MakePair("PASTIX_ILU", PASTIX_ILU)
MakePair("PASTIX_LU", PASTIX_LU_P)
MakePair("PASTIX_LDLT", PASTIX_LDLT_P)
MakePair("NONE", NO_PRECONDITIONER)
};

/*!
Expand Down
38 changes: 38 additions & 0 deletions Common/src/linear_algebra/CSysMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::G
col_ind_ilu = nullptr;

invM = nullptr;
d_invM = nullptr;

#ifdef USE_MKL
MatrixMatrixProductJitter = nullptr;
Expand All @@ -67,6 +68,8 @@ template <class ScalarType>
CSysMatrix<ScalarType>::~CSysMatrix() {
SU2_ZONE_SCOPED

ReleaseCudaSpMVResources(spmv_resources);

delete[] omp_partitions;
MemoryAllocation::aligned_free(ILU_matrix);
MemoryAllocation::aligned_free(matrix);
Expand All @@ -76,6 +79,7 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
GPUMemoryAllocation::gpu_free(d_matrix);
GPUMemoryAllocation::gpu_free(d_row_ptr);
GPUMemoryAllocation::gpu_free(d_col_ind);
GPUMemoryAllocation::gpu_free(d_invM);
}

#ifdef USE_MKL
Expand All @@ -86,6 +90,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
#endif
}

#ifndef HAVE_CUDA
void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) { resources = nullptr; }
#endif

template <class ScalarType>
void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npointdomain, unsigned short nvar,
unsigned short neqn, bool EdgeConnect, CGeometry* geometry,
Expand Down Expand Up @@ -196,6 +204,10 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi

if (diag_needed) allocAndInit(invM, nPointDomain * nVar * nEqn);

if (useCuda && diag_needed) {
d_invM = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nPointDomain * nVar * nEqn * sizeof(ScalarType));
}

/*--- Thread parallel initialization. ---*/

int num_threads = omp_get_max_threads();
Expand Down Expand Up @@ -672,6 +684,19 @@ void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>& v
template <class ScalarType>
void CSysMatrix<ScalarType>::BuildJacobiPreconditioner() {
SU2_ZONE_SCOPED

if (useCuda) {
#ifdef HAVE_CUDA
BuildJacobiPreconditionerGPU();
return;
#else
SU2_MPI::Error(
"\nError in building Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
"enabled in Meson to access GPU Functions",
CURRENT_FUNCTION);
#endif
}

/*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/
SU2_OMP_FOR_DYN(omp_heavy_size)
for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
Expand All @@ -684,6 +709,19 @@ void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<Scalar
CSysVector<ScalarType>& prod, CGeometry* geometry,
const CConfig* config) const {
SU2_ZONE_SCOPED

if (config->GetCUDA()) {
#ifdef HAVE_CUDA
ComputeJacobiPreconditionerGPU(vec, prod, geometry, config);
return;
#else
SU2_MPI::Error(
"\nError in applying Jacobi preconditioner\nENABLE_CUDA is set to YES\nPlease compile with CUDA options "
"enabled in Meson to access GPU Functions",
CURRENT_FUNCTION);
#endif
}

/*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/
SU2_OMP_BARRIER
SU2_OMP_FOR_DYN(omp_heavy_size)
Expand Down
96 changes: 70 additions & 26 deletions Common/src/linear_algebra/CSysMatrixGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,38 @@ inline cusparseIndexType_t GetCusparseIndexType() {
}
}

struct CudaSpMVResources {
cusparseHandle_t handle = nullptr;
cusparseConstSpMatDescr_t mat = nullptr;
size_t buffer_size = 0;
void* buffer = nullptr;
};

void ReleaseCudaSpMVResources(CudaSpMVResources*& resources) {
if (resources == nullptr) {
return;
}

if (resources->buffer != nullptr) {
gpuErrChk(cudaFree(resources->buffer));
resources->buffer = nullptr;
resources->buffer_size = 0;
}

if (resources->mat != nullptr) {
cusparseErrChk(cusparseDestroySpMat(resources->mat));
resources->mat = nullptr;
}

if (resources->handle != nullptr) {
cusparseErrChk(cusparseDestroy(resources->handle));
resources->handle = nullptr;
}

delete resources;
resources = nullptr;
}

template <class ScalarType>
constexpr cudaDataType GetCudaDataType() {
if constexpr (std::is_same<ScalarType, float>::value) {
Expand Down Expand Up @@ -83,8 +115,6 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
ScalarType* d_vec = vec.GetDevicePointer();
ScalarType* d_prod = prod.GetDevicePointer();

vec.HtDTransfer();

const auto indexType = GetCusparseIndexType();
const auto valueType = GetCudaDataType<ScalarType>();

Expand All @@ -100,42 +130,56 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
const ScalarType alpha = 1.0;
const ScalarType beta = 0.0;

cusparseHandle_t handle = nullptr;
cusparseConstSpMatDescr_t matA = nullptr;
cusparseDnVecDescr_t vecX = nullptr;
cusparseDnVecDescr_t vecY = nullptr;
if (spmv_resources == nullptr) {
spmv_resources = new CudaSpMVResources;

cusparseErrChk(cusparseCreate(&handle));
cusparseErrChk(cusparseCreate(&spmv_resources->handle));

cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));
cusparseErrChk(cusparseCreateConstBsr(&spmv_resources->mat, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr,
d_col_ind, d_matrix, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO,
valueType, CUSPARSE_ORDER_ROW));
}

cusparseDnVecDescr_t vecX = nullptr;
cusparseDnVecDescr_t vecY = nullptr;

cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));

size_t bufferSize = 0;
void* dBuffer = nullptr;
size_t required_buffer_size = 0;

cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));
cusparseErrChk(cusparseSpMV_bufferSize(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
spmv_resources->mat, vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1,
&required_buffer_size));

if (bufferSize > 0) {
gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
}
if (required_buffer_size > spmv_resources->buffer_size) {
if (spmv_resources->buffer != nullptr) {
gpuErrChk(cudaFree(spmv_resources->buffer));
}

cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
CUSPARSE_SPMV_BSR_ALG1, dBuffer));
if (required_buffer_size > 0) {
gpuErrChk(cudaMalloc(&spmv_resources->buffer, required_buffer_size));
} else {
spmv_resources->buffer = nullptr;
}

if (dBuffer != nullptr) {
gpuErrChk(cudaFree(dBuffer));
spmv_resources->buffer_size = required_buffer_size;
}

cusparseErrChk(cusparseSpMV(spmv_resources->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmv_resources->mat,
vecX, &beta, vecY, valueType, CUSPARSE_SPMV_BSR_ALG1, spmv_resources->buffer));

cusparseErrChk(cusparseDestroyDnVec(vecY));
cusparseErrChk(cusparseDestroyDnVec(vecX));
cusparseErrChk(cusparseDestroySpMat(matA));
cusparseErrChk(cusparseDestroy(handle));

prod.DtHTransfer();
}

template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
template void CSysMatrix<su2mixedfloat>::HtDTransfer(bool trigger) const;
template void CSysMatrix<su2mixedfloat>::GPUMatrixVectorProduct(const CSysVector<su2mixedfloat>& vec,
CSysVector<su2mixedfloat>& prod, CGeometry* geometry,
const CConfig* config) const;

#if defined(USE_MIXED_PRECISION) && !defined(USE_SINGLE_PRECISION)
template void CSysMatrix<passivedouble>::HtDTransfer(bool trigger) const;
template void CSysMatrix<passivedouble>::GPUMatrixVectorProduct(const CSysVector<passivedouble>& vec,
CSysVector<passivedouble>& prod, CGeometry* geometry,
const CConfig* config) const;
#endif
Loading
Loading