ttg 1.0.0
Template Task Graph (TTG): flowgraph-based programming model for high-performance distributed-memory algorithms
devicefunc.h
// SPDX-License-Identifier: BSD-3-Clause
#ifndef TTG_PARSEC_DEVICEFUNC_H
#define TTG_PARSEC_DEVICEFUNC_H

#include "ttg/parsec/task.h"
#include "ttg/parsec/parsec-ext.h" // TTG_PARSEC_FLOW_ACCESS_TMP
#include <parsec.h>
#include <parsec/mca/device/device_gpu.h>

namespace ttg_parsec {
  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is>
    bool register_device_memory(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
      static_assert(I < MAX_PARAM_COUNT,
                    "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. "
                    "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG.");
      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      auto *caller = parsec_ttg_caller;
      assert(nullptr != caller->dev_ptr);
      parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
      parsec_flow_t *flows = caller->dev_ptr->flows;

      auto& view = std::get<I>(views);
      bool is_current = false;

      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      /* TODO: check whether the device is current */

      if (nullptr != data) {
        auto access = PARSEC_FLOW_ACCESS_RW;
        if constexpr (std::is_const_v<view_type>) {
          // keep the flow at RW if it was RW to make sure we pull the data back out eventually
          access = PARSEC_FLOW_ACCESS_READ;
        } else if (view.scope() == ttg::scope::Allocate) {
          access = PARSEC_FLOW_ACCESS_WRITE;
        }

        /* build the flow */
        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0};

        gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
        gpu_task->flow[I] = &flows[I];

        /* set the input data copy, parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
        //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
        assert(nullptr != data->device_copies[0]->original);
        caller->parsec_task.data[I].data_in = data->device_copies[0];
        caller->parsec_task.data[I].source_repo_entry = NULL;

      } else {
        /* ignore the flow */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                 .flow_flags = 0,
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0};
        gpu_task->flow[I] = &flows[I];
        gpu_task->flow_nb_elts[I] = 0; // size in bytes
        caller->parsec_task.data[I].data_in = nullptr;
      }

      if constexpr (sizeof...(Is) > 0) {
        is_current |= register_device_memory(views, std::index_sequence<Is...>{});
      }
      return is_current;
    }
  } // namespace detail

  /* Takes a tuple of ttg::Views or ttg::buffers and registers them
   * with the currently executing task. Returns true if all memory
   * is current on the target device, false if transfers are required. */
  template<typename... Views>
  bool register_device_memory(std::tuple<Views&...> &views) {
    bool is_current = true;
    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("register_device_memory called inside a non-gpu task!");
    }

    if constexpr (sizeof...(Views) > 0) {
      is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
    }

    /* reset all entries in the current task */
    for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
    }

    return is_current;
  }

  // templated to break circular dependency with fwd.h
  template<typename T, std::size_t N>
  bool register_device_memory(const ttg::span<T, N>& span)
  {

    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("register_device_memory called inside a non-gpu task!");
    }

    uint8_t i; // only limited number of flows
    auto *caller = detail::parsec_ttg_caller;
    assert(nullptr != caller->dev_ptr);
    parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
    parsec_flow_t *flows = caller->dev_ptr->flows;

    bool is_current = false;
    for (i = 0; i < span.size(); ++i) {
      parsec_data_t* data = span[i].impl_data;
      ttg::scope scope = span[i].scope;
      bool is_const = span[i].is_const;
      bool is_scratch = span[i].is_scratch;

      if (nullptr != data) {
        auto access = PARSEC_FLOW_ACCESS_RW;
        if (is_const) {
          access = PARSEC_FLOW_ACCESS_READ;
        } else if (ttg::scope::Allocate == scope) {
          access = PARSEC_FLOW_ACCESS_WRITE;
        }

        if (is_scratch) {
          /* mark the flow as temporary so we can discard it easily */
          access |= TTG_PARSEC_FLOW_ACCESS_TMP;
        }

        /* build the flow */
        /* TODO: we can probably remove the initialization here
         * because that's been done on task creation */
        flows[i] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = i,
                                 .flow_datatype_mask = ~0};

        gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes
        gpu_task->flow[i] = &flows[i];

        /* set the input data copy, parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
        //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
        assert(nullptr != data->device_copies[0]->original);
        caller->parsec_task.data[i].data_in = data->device_copies[0];
        caller->parsec_task.data[i].source_repo_entry = NULL;
        // sanity check: we cannot sync in something that does not exist
        if (scope == ttg::scope::SyncIn && data->device_copies[0]->version > 0) {
#ifndef NDEBUG
          // have to lock the data to avoid a race condition with the GPU manager
          parsec_atomic_lock(&data->lock);
          if (scope == ttg::scope::SyncIn && data->device_copies[0]->version > 0) {
            // TODO: this assert would be nice to have but it's still failing spuriously
            //       due to a race with PaRSEC.
            //assert(data->device_copies[0]->device_private != NULL);
          }
          parsec_atomic_unlock(&data->lock);
#endif
        }

      } else {
        /* ignore the flow */
        /* TODO: we can probably remove the initialization here
         * because that's been done on task creation */
        flows[i] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                 .flow_flags = 0,
                                 .flow_index = i,
                                 .flow_datatype_mask = ~0};
        gpu_task->flow[i] = &flows[i];
        gpu_task->flow_nb_elts[i] = 0; // size in bytes
        caller->parsec_task.data[i].data_in = nullptr;
      }
    }

    /* reset all remaining entries in the current task */
    for (; i < MAX_PARAM_COUNT; ++i) {
      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
    }
    // we cannot allow the calling thread to submit kernels so say we're not ready
    return is_current;
  }

  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is, bool DeviceAvail = false>
    void mark_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      auto& view = std::get<I>(views);

      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;

      /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */
      if (nullptr != data && data->owner_device != 0) {
        parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
        if (nullptr == data->device_copies[0]->device_private) {
          assert(nullptr != data->device_copies[0]->alloc_cb);
          data->device_copies[0]->alloc_cb(data->device_copies[0], 0);
        }

        int ret = device_module->memcpy_async(device_module, stream,
                                              data->device_copies[0]->device_private,
                                              data->device_copies[data->owner_device]->device_private,
                                              data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
        if (ret != PARSEC_SUCCESS) throw std::runtime_error("Failed to copy data from device to host!");
      }
      if constexpr (sizeof...(Is) > 0) {
        // recursion
        mark_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  void mark_device_out(std::tuple<Buffer&...> &b) {

    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("mark_device_out may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("mark_device_out called inside a non-gpu task!");
    }

    detail::mark_device_out(b, std::index_sequence_for<Buffer...>{});
  }

  namespace detail {

    template<typename... Views, std::size_t I, std::size_t... Is>
    void post_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;

      if constexpr (!std::is_const_v<view_type>) {
        auto& view = std::get<I>(views);

        /* get_parsec_data is overloaded for buffer and devicescratch */
        parsec_data_t* data = detail::get_parsec_data(view);
        data->device_copies[0]->version = data->device_copies[data->owner_device]->version;
        parsec_data_transfer_ownership_to_copy(data, 0, PARSEC_FLOW_ACCESS_READ);
      }

      if constexpr (sizeof...(Is) > 0) {
        // recursion
        post_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  void post_device_out(std::tuple<Buffer&...> &b) {
    detail::post_device_out(b, std::index_sequence_for<Buffer...>{});
  }

  template<typename T>
  parsec_data_t* buffer_data(T&& buffer) {
    using view_type = std::remove_reference_t<T>;
    return detail::get_parsec_data(buffer);
  }

} // namespace ttg_parsec

#endif // TTG_PARSEC_DEVICEFUNC_H
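
Usage sketch (not part of this header): the snippet below illustrates how register_device_memory might be invoked from inside a device task, where detail::parsec_ttg_caller is set. The function name stage_inputs, the buffer names, and the buffer header path are assumptions for illustration only.

// Illustrative sketch, not a verbatim TTG example.
#include <tuple>
#include "ttg/parsec/buffer.h"     // assumed header for ttg_parsec::Buffer
#include "ttg/parsec/devicefunc.h"

// Must run inside a device task. buf_b is passed as const, so its flow is
// registered read-only (PARSEC_FLOW_ACCESS_READ) and is never written back;
// buf_a keeps the default read-write access.
bool stage_inputs(ttg_parsec::Buffer<double> &buf_a,
                  const ttg_parsec::Buffer<double> &buf_b) {
  auto views = std::tie(buf_a, buf_b);
  // true: all copies are already valid on the target device;
  // false: the runtime still has transfers to perform.
  return ttg_parsec::register_device_memory(views);
}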