diff --git a/bin/cuda/gpu_benchmark.cu b/bin/cuda/gpu_benchmark.cu index fe459e0c..2c234530 100644 --- a/bin/cuda/gpu_benchmark.cu +++ b/bin/cuda/gpu_benchmark.cu @@ -1,10 +1,11 @@ -#include "gpu_benchmark.h" - #include "kernels.h" + #include +#include #include #include +#include // The macro wraps any CUDA API call #define CUDA_CHECK(ans) \ @@ -26,69 +27,10 @@ float getElapsedTime(const cudaEvent_t &gpu_start, cudaEvent_t &gpu_stop) { return gpu_elapsed_time / 1000.0f; } -// Function to run the GPU benchmark with no time limit -void runBenchmark(long max_work) { - uint32_t n = 256 * 256; - uint64_t m = max_work * 16384 / n; - - unsigned long long int *d_count; - curandState *d_state; - CUDA_CHECK(cudaMalloc((void **)&d_count, 256 * sizeof(unsigned long long int))); - CUDA_CHECK(cudaMalloc((void **)&d_state, n * sizeof(curandState))); - CUDA_CHECK(cudaMemset(d_count, 0, 256 * sizeof(unsigned long long int))); - - // set up timing stuff - cudaEvent_t gpu_start, gpu_stop; - CUDA_CHECK(cudaEventCreate(&gpu_start)); - CUDA_CHECK(cudaEventCreate(&gpu_stop)); - - // set kernel - dim3 gridSize = 256; - dim3 blockSize = 256; - setup_kernel<<>>(d_state); - - // monte carlo kernel - CUDA_CHECK(cudaEventRecord(gpu_start, 0)); - monte_carlo_kernel<<>>(d_state, d_count, m); - CUDA_CHECK(cudaDeviceSynchronize()); - - float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop); - CUDA_CHECK(cudaEventDestroy(gpu_start)); - CUDA_CHECK(cudaEventDestroy(gpu_stop)); - - // Allocate device output array - unsigned long long int *d_out = nullptr; - CUDA_CHECK(cudaMalloc((void **)&d_out, sizeof(unsigned long long int))); - - // Request and allocate temporary storage - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256)); - CUDA_CHECK(cudaMalloc((void **)&d_temp_storage, temp_storage_bytes)); - - // Run - CUDA_CHECK(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256)); - - // copy results back to the host - unsigned long long int h_count = 0; - CUDA_CHECK(cudaMemcpy(&h_count, d_out, sizeof(unsigned long long int), cudaMemcpyDeviceToHost)); - - // display results and timings for gpu - float pi = h_count * 4.0 / (n * m); - std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n"; - std::cout << "Benchmark completed!" << std::endl; - - CUDA_CHECK(cudaFree(d_count)); - CUDA_CHECK(cudaFree(d_state)); - CUDA_CHECK(cudaFree(d_out)); - CUDA_CHECK(cudaFree(d_temp_storage)); -} - // Function to run the GPU benchmark for a specified time -void runBenchmarkTime(long max_work, int runtime_in_seconds) { - +void runBenchmarkTime(long max_work, std::optional runtime_in_seconds) { uint32_t n = 256 * 256; - uint64_t m = max_work * 16384 / n; + uint64_t m = (max_work + n - 1) / n; // allocate memory unsigned long long int *d_count; @@ -108,13 +50,21 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) { setup_kernel<<>>(d_state); + // monte carlo kernel CUDA_CHECK(cudaEventRecord(gpu_start, 0)); + int iteration = 0; - // Run the workload loop until the specified runtime is reached - while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) { + if (!runtime_in_seconds.has_value()) { monte_carlo_kernel<<>>(d_state, d_count, m); - CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing + CUDA_CHECK(cudaDeviceSynchronize()); iteration++; + } else { + // Run the workload loop until the specified runtime is reached + while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) { + monte_carlo_kernel<<>>(d_state, d_count, m); + CUDA_CHECK(cudaDeviceSynchronize()); // Ensure the kernel has finished executing + iteration++; + } } float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop); @@ -149,6 +99,8 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) { CUDA_CHECK(cudaFree(d_temp_storage)); } +void runBenchmark(long max_work) { runBenchmarkTime(max_work, std::nullopt); } + int main(int argc, char *argv[]) { // Check for the correct number of command line arguments if (argc == 2) { diff --git a/bin/cuda/gpu_benchmark.h b/bin/cuda/gpu_benchmark.h deleted file mode 100644 index 0b485cca..00000000 --- a/bin/cuda/gpu_benchmark.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef GPU_BENCHMARK_H -#define GPU_BENCHMARK_H - -#include -#include - -void runBenchmark(int max_work); -void runBenchmarkTime(int max_work, int runtime_in_seconds); - -#endif // GPU_BENCHMARK_H diff --git a/bin/hip/gpu_benchmark.hip b/bin/hip/gpu_benchmark.hip index 93c15ff8..4c3f84b1 100644 --- a/bin/hip/gpu_benchmark.hip +++ b/bin/hip/gpu_benchmark.hip @@ -3,6 +3,7 @@ #include #include +#include #define HIP_CHECK(expression) \ { \ @@ -24,69 +25,10 @@ float getElapsedTime(const hipEvent_t &gpu_start, hipEvent_t &gpu_stop) { return gpu_elapsed_time / 1000.0f; } -// Function to run the GPU benchmark with no time limit -void runBenchmark(long max_work) { - uint32_t n = 256 * 256; - uint64_t m = max_work * 16384 / n; - - unsigned long long int *d_count; - hiprandState *d_state; - HIP_CHECK(hipMalloc((void **)&d_count, 256 * sizeof(unsigned long long int))); - HIP_CHECK(hipMalloc((void **)&d_state, n * sizeof(hiprandState))); - HIP_CHECK(hipMemset(d_count, 0, 256 * sizeof(unsigned long long int))); - - // set up timing stuff - hipEvent_t gpu_start, gpu_stop; - HIP_CHECK(hipEventCreate(&gpu_start)); - HIP_CHECK(hipEventCreate(&gpu_stop)); - - // set kernel - dim3 gridSize = 256; - dim3 blockSize = 256; - setup_kernel<<>>(d_state); - - // monte carlo kernel - HIP_CHECK(hipEventRecord(gpu_start, 0)); - monte_carlo_kernel<<>>(d_state, d_count, m); - HIP_CHECK(hipDeviceSynchronize()); - - float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop); - HIP_CHECK(hipEventDestroy(gpu_start)); - HIP_CHECK(hipEventDestroy(gpu_stop)); - - // Allocate device output array - unsigned long long int *d_out = nullptr; - HIP_CHECK(hipMalloc((void **)&d_out, sizeof(unsigned long long int))); - - // Request and allocate temporary storage - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256)); - HIP_CHECK(hipMalloc((void **)&d_temp_storage, temp_storage_bytes)); - - // Run - HIP_CHECK(hipcub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_count, d_out, 256)); - - // copy results back to the host - unsigned long long int h_count = 0; - HIP_CHECK(hipMemcpy(&h_count, d_out, sizeof(unsigned long long int), hipMemcpyDeviceToHost)); - - // display results and timings for gpu - float pi = h_count * 4.0 / (n * m); - std::cout << "Approximate pi calculated on GPU is: " << pi << " and calculation took " << gpu_elapsed_time << "s\n"; - std::cout << "Benchmark completed!" << std::endl; - - HIP_CHECK(hipFree(d_count)); - HIP_CHECK(hipFree(d_state)); - HIP_CHECK(hipFree(d_out)); - HIP_CHECK(hipFree(d_temp_storage)); -} - // Function to run the GPU benchmark for a specified time -void runBenchmarkTime(long max_work, int runtime_in_seconds) { - +void runBenchmarkTime(long max_work, std::optional runtime_in_seconds) { uint32_t n = 256 * 256; - uint64_t m = max_work * 16384 / n; + uint64_t m = (max_work + n - 1) / n; // allocate memory unsigned long long int *d_count; @@ -99,7 +41,6 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) { hipEvent_t gpu_start, gpu_stop; HIP_CHECK(hipEventCreate(&gpu_start)); HIP_CHECK(hipEventCreate(&gpu_stop)); - HIP_CHECK(hipEventRecord(gpu_start, 0)); // set kernel dim3 gridSize = 256; @@ -107,12 +48,20 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) { setup_kernel<<>>(d_state); + HIP_CHECK(hipEventRecord(gpu_start, 0)); int iteration = 0; - // Run the workload loop until the specified runtime is reached - while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) { + + if (!runtime_in_seconds.has_value()) { monte_carlo_kernel<<>>(d_state, d_count, m); - HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing + HIP_CHECK(hipDeviceSynchronize()); iteration++; + } else { + // Run the workload loop until the specified runtime is reached + while (getElapsedTime(gpu_start, gpu_stop) < runtime_in_seconds) { + monte_carlo_kernel<<>>(d_state, d_count, m); + HIP_CHECK(hipDeviceSynchronize()); // Ensure the kernel has finished executing + iteration++; + } } float gpu_elapsed_time = getElapsedTime(gpu_start, gpu_stop); @@ -147,6 +96,10 @@ void runBenchmarkTime(long max_work, int runtime_in_seconds) { HIP_CHECK(hipFree(d_temp_storage)); } +void runBenchmark(long max_work) { + runBenchmarkTime(max_work, std::nullopt); +} + int main(int argc, char *argv[]) { // Check for the correct number of command line arguments if (argc == 2) { diff --git a/bin/wfbench b/bin/wfbench index 90980e46..9b8b7f19 100755 --- a/bin/wfbench +++ b/bin/wfbench @@ -20,6 +20,7 @@ import json import logging import pandas as pd import psutil +import shutil from io import StringIO from filelock import FileLock @@ -230,11 +231,21 @@ class GPUBenchmark: @staticmethod def get_available_gpus(): - proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, _ = proc.communicate() - df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ") - return df[df["utilization.gpu"] <= 5].index.to_list() + if shutil.which("nvidia-smi") is not None: + proc = subprocess.Popen(["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv"], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, _ = proc.communicate() + df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=" ") + return df[df["utilization.gpu"] <= 5].index.to_list() + elif shutil.which("amd-smi") is not None: + proc = subprocess.Popen(["amd-smi", "monitor", "-u", "--csv"], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, _ = proc.communicate() + df = pd.read_csv(StringIO(stdout.decode("utf-8")), sep=",") + return df[df["gfx"] <= 5].index.to_list() + else: + log_error("No supported GPU monitoring tool found.") + return [] def __init__(self): self.work = None @@ -250,7 +261,7 @@ class GPUBenchmark: log_debug(f"GPU benchmark instantiated for device {self.device}") def set_work(self, work: int): - self.work = work + self.work = 1000000 * work def set_time(self, duration: float): self.duration = duration @@ -261,13 +272,20 @@ class GPUBenchmark: if self.duration is not None: log_debug(f"Running GPU benchmark for {self.duration} seconds") - gpu_prog = [ - f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"] + if shutil.which("nvidia-smi") is not None: + gpu_prog = [ + f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"] + else: + gpu_prog = [ + f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work} {self.duration}"] else: log_debug(f"Running GPU benchmark for {self.work} units of work") - gpu_prog = [ - f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"] - + if shutil.which("nvidia-smi") is not None: + gpu_prog = [ + f"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"] + else: + gpu_prog = [ + f"HIP_DEVICE_ORDER=PCI_BUS_ID HIP_VISIBLE_DEVICES={self.device} {this_dir.joinpath('./gpu_benchmark')} {self.work}"] p = subprocess.Popen(gpu_prog, shell=True) return ProcessHandle(p)