Skip to content

Commit f5f50cd

Browse files
authored
Merge pull request #8 from pravirkr/fdmt_gpu
GPU code for FDMT
2 parents 6931117 + bcfdcad commit f5f50cd

35 files changed

+1822
-536
lines changed

.clang-format

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Language: Cpp
66
AccessModifierOffset: -4
77
AlignConsecutiveAssignments:
88
Enabled: true
9+
BinPackParameters: false
910
DerivePointerAlignment: false
1011
PackConstructorInitializers: CurrentLine
1112
PointerAlignment: Left

.clang-tidy

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -24,35 +24,34 @@ Checks: >
2424
-readability-isolate-declaration,
2525
-readability-magic-numbers,
2626
27-
WarningsAsErrors: ''
27+
WarningsAsErrors: ""
2828
HeaderFilterRegex: ".*"
29-
FormatStyle: 'file'
29+
FormatStyle: "file"
3030
CheckOptions:
31-
- { key: readability-identifier-naming.ClassCase, value: CamelCase }
32-
- { key: readability-identifier-naming.ClassMemberCase, value: lower_case }
33-
- { key: readability-identifier-naming.EnumCase, value: CamelCase }
34-
- { key: readability-identifier-naming.FunctionCase, value: lower_case }
35-
- { key: readability-identifier-naming.MacroDefinitionCase, value: UPPER_CASE }
36-
- { key: readability-identifier-naming.MemberCase, value: lower_case }
37-
- { key: readability-identifier-naming.PrivateMemberPrefix, value: m_ }
38-
- { key: readability-identifier-naming.ProtectedMemberPrefix, value: m_ }
39-
- { key: readability-identifier-naming.NamespaceCase, value: lower_case }
40-
- { key: readability-identifier-naming.ParameterCase, value: lower_case }
41-
- { key: readability-identifier-naming.TypeAliasCase, value: CamelCase }
42-
- { key: readability-identifier-naming.TypedefCase, value: CamelCase }
43-
- { key: readability-identifier-naming.VariableCase, value: lower_case }
44-
- { key: readability-identifier-naming.StaticVariableCase, value: lower_case }
45-
- { key: readability-identifier-naming.ConstexprVariableCase, value: CamelCase }
46-
- { key: readability-identifier-naming.ConstexprVariablePrefix, value: k }
47-
- { key: readability-identifier-naming.EnumConstantCase, value: CamelCase }
48-
- { key: readability-identifier-naming.EnumConstantPrefix, value: k }
49-
- { key: readability-identifier-naming.GlobalConstantCase, value: CamelCase }
50-
- { key: readability-identifier-naming.GlobalConstantPrefix, value: k }
51-
- { key: readability-identifier-naming.MemberConstantCase, value: CamelCase }
52-
- { key: readability-identifier-naming.MemberConstantPrefix, value: k }
53-
- { key: readability-identifier-naming.StaticConstantCase, value: CamelCase }
54-
- { key: readability-identifier-naming.StaticConstantPrefix, value: k }
55-
- { key: readability-identifier-naming.StructCase, value: CamelCase }
56-
- { key: readability-identifier-naming.TemplateParameterCase, value: CamelCase }
57-
...
58-
31+
readability-identifier-naming.ClassCase: "CamelCase"
32+
readability-identifier-naming.ClassMemberCase: "lower_case"
33+
readability-identifier-naming.EnumCase: "CamelCase"
34+
readability-identifier-naming.FunctionCase: "lower_case"
35+
readability-identifier-naming.MacroDefinitionCase: "UPPER_CASE"
36+
readability-identifier-naming.MemberCase: "lower_case"
37+
readability-identifier-naming.PrivateMemberPrefix: "m_"
38+
readability-identifier-naming.ProtectedMemberPrefix: "m_"
39+
readability-identifier-naming.NamespaceCase: "lower_case"
40+
readability-identifier-naming.ParameterCase: "lower_case"
41+
readability-identifier-naming.TypeAliasCase: "CamelCase"
42+
readability-identifier-naming.TypedefCase: "CamelCase"
43+
readability-identifier-naming.VariableCase: "lower_case"
44+
readability-identifier-naming.StaticVariableCase: "lower_case"
45+
readability-identifier-naming.ConstexprVariableCase: "CamelCase"
46+
readability-identifier-naming.ConstexprVariablePrefix: "k"
47+
readability-identifier-naming.EnumConstantCase: "CamelCase"
48+
readability-identifier-naming.EnumConstantPrefix: "k"
49+
readability-identifier-naming.GlobalConstantCase: "CamelCase"
50+
readability-identifier-naming.GlobalConstantPrefix: "k"
51+
readability-identifier-naming.MemberConstantCase: "CamelCase"
52+
readability-identifier-naming.MemberConstantPrefix: "k"
53+
readability-identifier-naming.StaticConstantCase: "CamelCase"
54+
readability-identifier-naming.StaticConstantPrefix: "k"
55+
readability-identifier-naming.StructCase: "CamelCase"
56+
readability-identifier-naming.TemplateParameterCase: "CamelCase"
57+
readability-function-cognitive-complexity.Threshold: "30"

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ jobs:
1313
strategy:
1414
fail-fast: false
1515
matrix:
16-
os: [ubuntu-latest]
16+
os: [ubuntu-22.04]
1717
build_type: [Debug]
18-
compiler: [{cpp: g++-13, code-cov: true, gcov: gcov-13}, {cpp: clang++-15}]
18+
compiler: [{cpp: g++-12, code-cov: true, gcov: gcov-12}, {cpp: clang++-15}]
1919
steps:
2020
- uses: actions/checkout@v4
2121
- name: Set reusable strings

CMakeLists.txt

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cmake_minimum_required(VERSION 3.15)
1+
cmake_minimum_required(VERSION 3.18)
22
cmake_policy(SET CMP0148 OLD)
33

44
project(
@@ -13,25 +13,43 @@ option(BUILD_PYTHON "Build Python bindings" ON)
1313
option(BUILD_TESTING "Build tests" OFF)
1414
option(BUILD_BENCHMARKS "Build benchmarks" OFF)
1515
option(CODE_COVERAGE "Enable coverage reporting" OFF)
16-
option(ENABLE_FAST_MATH "Enable fast math flags" ON)
1716

1817
set(CMAKE_CXX_STANDARD 17)
1918
set(CMAKE_CXX_STANDARD_REQUIRED ON)
2019
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
20+
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)
2121
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake)
2222

23-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
24-
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
25-
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG -march=native")
26-
27-
if(ENABLE_FAST_MATH)
28-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
29-
endif()
23+
set(CMAKE_CXX_FLAGS "-Wall -Wextra")
24+
set(CMAKE_CXX_FLAGS_DEBUG "-g")
25+
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native -ffast-math")
3026

3127
if(NOT CMAKE_BUILD_TYPE)
3228
set(CMAKE_BUILD_TYPE Release)
3329
endif()
3430

31+
# Optional CUDA support: probe for a CUDA compiler without aborting the
# configure step, and enable the language only when one is available.
include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
    enable_language(CUDA)
    # C++17 is required for CUDA sources below, and nvcc only accepts
    # -std=c++17 from CUDA 11.0 onwards. Reject older toolkits here with a
    # clear message instead of failing later on the unsupported standard.
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
        message(FATAL_ERROR "Found CUDA ${CMAKE_CUDA_COMPILER_VERSION}. Need CUDA 11+")
    else()
        message(STATUS "Found CUDA ${CMAKE_CUDA_COMPILER_VERSION}.")
    endif()

    set(CMAKE_CUDA_STANDARD 17)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
    # "-c" is deliberately not listed: CMake's CUDA compile rule already
    # passes the compile-only switch itself.
    # -Xcompiler forwards the warning flags to the host compiler.
    set(CMAKE_CUDA_FLAGS "-Xcompiler=-Wall,-Wextra --expt-extended-lambda")
    set(CMAKE_CUDA_FLAGS_DEBUG "-g")
    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -use_fast_math -Xcompiler=-march=native")
    # Put include directories directly on the nvcc command line instead of
    # routing them through a response file.
    set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
else()
    message(STATUS "Not building gpu code. No CUDA compiler found...")
endif()
52+
3553
# Initialize some default paths
3654
include(GNUInstallDirs)
3755
include(cmake/CPM.cmake)
@@ -45,6 +63,7 @@ endif()
4563

4664
# Build the C++ library
4765
set(LIBRARY_NAME dmt)
66+
set(PYLIBRARY_NAME dmtlib)
4867
add_subdirectory(lib)
4968
# Build the Python bindings
5069
if(BUILD_PYTHON)

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ pip install -U git+https://github.com/pravirkr/dmt
2323
## Usage
2424

2525
```python
26-
from dmt.libdmt import FDMT
26+
from dmtlib import FDMTCPU
2727

2828
frb = np.ones((nchans, nsamps), dtype=np.float32)
29-
thefdmt = FDMT(f_min, f_max, nchans, nsamps, tsamp, dt_max=dt_max, dt_min=0, dt_step=1)
29+
thefdmt = FDMTCPU(f_min, f_max, nchans, nsamps, tsamp, dt_max=dt_max, dt_min=0, dt_step=1)
3030
dmt_transform = thefdmt.execute(frb.astype(np.float32))
3131
```
3232

@@ -36,6 +36,6 @@ dmt_transform = thefdmt.execute(frb.astype(np.float32))
3636
f_min = 704.0, f_max = 1216.0, nchans = 4096, tsamp = 0.00008192, dt_max = 2048, nsamps = n;
3737
nthreads = 1, 8;
3838
```
39-
![](bench/results/bench_m1.png)
39+
![](bench/results/bench.png)
4040

4141

bench/CMakeLists.txt

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
CPMAddPackage(
22
NAME benchmark
3-
VERSION 1.8.3
3+
VERSION 1.8.4
44
GITHUB_REPOSITORY google/benchmark
55
OPTIONS "BENCHMARK_ENABLE_WERROR OFF" "BENCHMARK_FORCE_WERROR OFF" "BENCHMARK_ENABLE_TESTING OFF"
66
)
77

88
set(TARGET_BENCHMARKS ${PROJECT_NAME}_bench)
9-
file(GLOB TEST_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
9+
file(GLOB BENCH_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
1010

11-
add_executable(${TARGET_BENCHMARKS} ${TEST_SOURCES})
11+
# Add the CUDA benchmark sources only when a CUDA compiler is available
# (CMAKE_CUDA_COMPILER is set).
if(CMAKE_CUDA_COMPILER)
    file(GLOB BENCH_SOURCES_CUDA ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
    list(APPEND BENCH_SOURCES ${BENCH_SOURCES_CUDA})
endif()

# Single benchmark executable built from every collected source file.
add_executable(${TARGET_BENCHMARKS} ${BENCH_SOURCES})
# PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps the include path correct
# when this project is built as a subdirectory of another one. PRIVATE
# because nothing links against the benchmark executable.
target_include_directories(${TARGET_BENCHMARKS} PRIVATE ${PROJECT_SOURCE_DIR}/include)
13-
target_link_libraries(${TARGET_BENCHMARKS} PRIVATE ${LIBRARY_NAME} benchmark::benchmark)
19+
target_link_libraries(${TARGET_BENCHMARKS} PRIVATE ${LIBRARY_NAME} benchmark::benchmark_main)

bench/bench_plots.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
"/BM_|_|/",
2323
expand=True,
2424
).apply(
25-
lambda x: ["_".join([x[1], x[2]]), "_".join([x[3], x[4]]), x[5]],
25+
lambda x: ["_".join([x[1], x[2]]), x[3], x[4]],
2626
axis=1,
2727
result_type="expand",
2828
)
@@ -48,8 +48,8 @@
4848
ax.set_title(benchmark)
4949
ax.set_xscale("log")
5050
ax.set_yscale("log")
51-
ax.set_xlabel("n")
52-
ax.set_ylabel("Time (s)")
51+
ax.set_xlabel("nsamples")
52+
ax.set_ylabel("Time (ms)")
5353
ax.legend()
5454

5555
fig.tight_layout()

bench/fdmt_b.cpp renamed to bench/fdmt_cpu_b.cpp

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <random>
66
#include <vector>
77

8-
class FDMTFixture : public benchmark::Fixture {
8+
class FDMTCPUFixture : public benchmark::Fixture {
99
public:
1010
void SetUp(const ::benchmark::State& state) override {
1111
f_min = 704.0F;
@@ -34,13 +34,22 @@ class FDMTFixture : public benchmark::Fixture {
3434
size_t nsamps{};
3535
};
3636

37-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_plan_seq_cpu)(benchmark::State& state) {
37+
// Sequential construction benchmark: times building an FDMTCPU object from
// the fixture parameters (presumably plan and buffer setup, going by the
// benchmark name -- confirm against the FDMTCPU constructor).
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_planBuffer_seq)
(benchmark::State& state) {
    // set_num_threads is called on the class, not on an instance, so a
    // thread count left behind by another benchmark could otherwise carry
    // over. Pin it to 1 here, as the other *_seq benchmarks do.
    FDMTCPU::set_num_threads(1);
    for (auto _ : state) {
        FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
    }
}
43+
44+
// Parallel counterpart of the construction benchmark: requests 8 threads
// once, outside the timed loop, then times building an FDMTCPU object from
// the fixture parameters on every iteration.
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_planBuffer_par)
(benchmark::State& state) {
    FDMTCPU::set_num_threads(8);
    for (auto _ : state) {
        FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
    }
}
4251

43-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_initialise_seq_cpu)
52+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_initialise_seq)
4453
(benchmark::State& state) {
4554
FDMTCPU::set_num_threads(1);
4655
FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
@@ -52,11 +61,12 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_initialise_seq_cpu)
5261
const auto state_size = plan.state_shape[0][3] * plan.state_shape[0][4];
5362
std::vector<float> state_init(state_size, 0.0F);
5463
for (auto _ : state) {
55-
fdmt.initialise(waterfall.data(), state_init.data());
64+
fdmt.initialise(waterfall.data(), waterfall.size(), state_init.data(),
65+
state_init.size());
5666
}
5767
}
5868

59-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_initialise_par_cpu)
69+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_initialise_par)
6070
(benchmark::State& state) {
6171
FDMTCPU::set_num_threads(8);
6272
FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
@@ -68,11 +78,12 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_initialise_par_cpu)
6878
const auto state_size = plan.state_shape[0][3] * plan.state_shape[0][4];
6979
std::vector<float> state_init(state_size, 0.0F);
7080
for (auto _ : state) {
71-
fdmt.initialise(waterfall.data(), state_init.data());
81+
fdmt.initialise(waterfall.data(), waterfall.size(), state_init.data(),
82+
state_init.size());
7283
}
7384
}
7485

75-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_execute_seq_cpu)
86+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_execute_seq)
7687
(benchmark::State& state) {
7788
FDMTCPU::set_num_threads(1);
7889
FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
@@ -87,7 +98,7 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_execute_seq_cpu)
8798
}
8899
}
89100

90-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_execute_par_cpu)
101+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_execute_par)
91102
(benchmark::State& state) {
92103
FDMTCPU::set_num_threads(8);
93104
FDMTCPU fdmt(f_min, f_max, nchans, nsamps, tsamp, dt_max);
@@ -102,7 +113,7 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_execute_par_cpu)
102113
}
103114
}
104115

105-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_overall_seq_cpu)
116+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_overall_seq)
106117
(benchmark::State& state) {
107118
std::random_device rd;
108119
std::mt19937 gen(rd());
@@ -119,7 +130,7 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_overall_seq_cpu)
119130
}
120131
}
121132

122-
BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_overall_par_cpu)
133+
BENCHMARK_DEFINE_F(FDMTCPUFixture, BM_fdmt_overall_par)
123134
(benchmark::State& state) {
124135
std::random_device rd;
125136
std::mt19937 gen(rd());
@@ -139,26 +150,29 @@ BENCHMARK_DEFINE_F(FDMTFixture, BM_fdmt_overall_par_cpu)
139150
constexpr size_t kMinNsamps = 1 << 11;
140151
constexpr size_t kMaxNsamps = 1 << 16;
141152

142-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_plan_seq_cpu)
153+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_planBuffer_seq)
154+
->RangeMultiplier(2)
155+
->Range(kMinNsamps, kMaxNsamps);
156+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_planBuffer_par)
143157
->RangeMultiplier(2)
144158
->Range(kMinNsamps, kMaxNsamps);
145-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_initialise_seq_cpu)
159+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_initialise_seq)
146160
->RangeMultiplier(2)
147161
->Range(kMinNsamps, kMaxNsamps);
148-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_initialise_par_cpu)
162+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_initialise_par)
149163
->RangeMultiplier(2)
150164
->Range(kMinNsamps, kMaxNsamps);
151-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_execute_seq_cpu)
165+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_execute_seq)
152166
->RangeMultiplier(2)
153167
->Range(kMinNsamps, kMaxNsamps);
154-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_execute_par_cpu)
168+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_execute_par)
155169
->RangeMultiplier(2)
156170
->Range(kMinNsamps, kMaxNsamps);
157-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_overall_seq_cpu)
171+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_overall_seq)
158172
->RangeMultiplier(2)
159173
->Range(kMinNsamps, kMaxNsamps);
160-
BENCHMARK_REGISTER_F(FDMTFixture, BM_fdmt_overall_par_cpu)
174+
BENCHMARK_REGISTER_F(FDMTCPUFixture, BM_fdmt_overall_par)
161175
->RangeMultiplier(2)
162176
->Range(kMinNsamps, kMaxNsamps);
163177

164-
BENCHMARK_MAIN();
178+
// BENCHMARK_MAIN();

0 commit comments

Comments
 (0)