devicefunc.h
#ifndef TTG_PARSEC_DEVICEFUNC_H
#define TTG_PARSEC_DEVICEFUNC_H

#include "ttg/parsec/task.h"
#include <parsec.h>
#include <parsec/mca/device/device_gpu.h>

namespace ttg_parsec {
  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is>
    bool register_device_memory(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
      static_assert(I < MAX_PARAM_COUNT,
                    "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. "
                    "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG.");
      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
      assert(nullptr != caller->dev_ptr);
      parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
      parsec_flow_t *flows = caller->dev_ptr->flows;

      auto& view = std::get<I>(views);
      bool is_current = false;
      static_assert(ttg::meta::is_buffer_v<view_type> || ttg::meta::is_devicescratch_v<view_type>);
      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      /* TODO: check whether the device is current */

      if (nullptr != data) {
        auto access = PARSEC_FLOW_ACCESS_RW;
        if constexpr (std::is_const_v<view_type>) {
          // keep the flow at RW if it was RW to make sure we pull the data back out eventually
          access = PARSEC_FLOW_ACCESS_READ;
        } else if constexpr (ttg::meta::is_devicescratch_v<view_type>) {
          if (view.scope() == ttg::scope::Allocate) {
            access = PARSEC_FLOW_ACCESS_WRITE;
          }
        }

        /* build the flow */
        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0 };

        gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
        gpu_task->flow[I] = &flows[I];

        /* set the input data copy; parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
        //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
        assert(nullptr != data->device_copies[0]->original);
        caller->parsec_task.data[I].data_in = data->device_copies[0];
        caller->parsec_task.data[I].source_repo_entry = NULL;

      } else {
        /* ignore the flow */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                 .flow_flags = 0,
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0 };
        gpu_task->flow[I] = &flows[I];
        gpu_task->flow_nb_elts[I] = 0; // size in bytes
        caller->parsec_task.data[I].data_in = nullptr;
      }

      if constexpr (sizeof...(Is) > 0) {
        is_current |= register_device_memory(views, std::index_sequence<Is...>{});
      }
      return is_current;
    }
  } // namespace detail

  /* Takes a tuple of ttg::Views or ttg::Buffers and registers them
   * with the currently executing task. Returns true if all memory
   * is current on the target device, false if transfers are required. */
  template<typename... Views>
  bool register_device_memory(std::tuple<Views&...> &views) {
    bool is_current = true;
    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("register_device_memory called inside a non-gpu task!");
    }

    if constexpr (sizeof...(Views) > 0) {
      is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
    }

    /* reset all remaining entries in the current task */
    for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
    }

    return is_current;
  }
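  /* A minimal usage sketch for the overload above, assuming ttg::Buffer's
   * element-count constructor and ttg::make_scratch from the wider TTG API;
   * the names `buf`, `host_tmp`, and `tmp` are illustrative only:
   *
   *   ttg::Buffer<double> buf(1024);      // managed host/device buffer
   *   double host_tmp[128];
   *   auto tmp = ttg::make_scratch(host_tmp, ttg::scope::Allocate, 128);
   *
   *   auto views = std::tie(buf, tmp);    // std::tuple<Views&...>
   *   bool is_current = register_device_memory(views);
   *   // false: transfers are still pending and the runtime must stage the
   *   // data onto the device before the task's kernel may run
   */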

  // templated to break circular dependency with fwd.h
  template<typename T, std::size_t N>
  bool register_device_memory(const ttg::span<T, N>& span)
  {
    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("register_device_memory called inside a non-gpu task!");
    }

    uint8_t i; // only a limited number of flows is supported
    parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
    assert(nullptr != caller->dev_ptr);
    parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
    parsec_flow_t *flows = caller->dev_ptr->flows;

    bool is_current = false;
    for (i = 0; i < span.size(); ++i) {
      /* the span elements carry the parsec_data_t pointer directly */
      parsec_data_t* data = span[i].impl_data;
      ttg::scope scope = span[i].scope;
      bool is_const = span[i].is_const;
      bool is_scratch = span[i].is_scratch;

      if (nullptr != data) {
        auto access = PARSEC_FLOW_ACCESS_RW;
        if (ttg::scope::Allocate == scope) {
          access = PARSEC_FLOW_ACCESS_WRITE;
        } else if (is_const) {
          access = PARSEC_FLOW_ACCESS_READ;
        }

        if (is_scratch) {
          /* mark the flow as temporary so we can discard it easily */
          access |= TTG_PARSEC_FLOW_ACCESS_TMP;
        }

        /* build the flow */
        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
        flows[i] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = i,
                                 .flow_datatype_mask = ~0 };

        gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes
        gpu_task->flow[i] = &flows[i];

        /* set the input data copy; parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
        assert(nullptr != data->device_copies[0]->original);
        caller->parsec_task.data[i].data_in = data->device_copies[0];
        caller->parsec_task.data[i].source_repo_entry = NULL;

      } else {
        /* ignore the flow */
        flows[i] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                 .flow_flags = 0,
                                 .flow_index = i,
                                 .flow_datatype_mask = ~0 };
        gpu_task->flow[i] = &flows[i];
        gpu_task->flow_nb_elts[i] = 0; // size in bytes
        caller->parsec_task.data[i].data_in = nullptr;
      }
    }

    /* reset all remaining entries in the current task */
    for (; i < MAX_PARAM_COUNT; ++i) {
      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
    }
    // we cannot allow the calling thread to submit kernels, so report that we are not ready
    return is_current;
  }
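  /* Note: this overload reads four fields from each span element instead of
   * taking the buffers themselves. The element type is defined elsewhere in
   * TTG; inferred from the member accesses above, its shape is roughly
   * (hypothetical name and layout):
   *
   *   struct device_data_descriptor {
   *     parsec_data_t *impl_data;  // PaRSEC data object; nullptr disables the flow
   *     ttg::scope     scope;      // scope::Allocate -> write-only access
   *     bool           is_const;   // true -> read-only flow
   *     bool           is_scratch; // true -> flow tagged TTG_PARSEC_FLOW_ACCESS_TMP
   *   };
   */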

  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is, bool DeviceAvail = false>
    void mark_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      auto& view = std::get<I>(views);

      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;

      /* enqueue the device-to-host transfer into the compute stream so that the
       * task completes only once both the compute and the transfer are done */
      if (data->owner_device != 0) {
        parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
        int ret = device_module->memcpy_async(device_module, stream,
                                              data->device_copies[0]->device_private,
                                              data->device_copies[data->owner_device]->device_private,
                                              data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
        assert(ret == PARSEC_SUCCESS);
      }
      if constexpr (sizeof...(Is) > 0) {
        // recursion
        mark_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  void mark_device_out(std::tuple<Buffer&...> &b) {

    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("mark_device_out may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("mark_device_out called inside a non-gpu task!");
    }

    detail::mark_device_out(b, std::index_sequence_for<Buffer...>{});
  }

  namespace detail {

    template<typename... Views, std::size_t I, std::size_t... Is>
    void post_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;

      if constexpr (!std::is_const_v<view_type>) {
        auto& view = std::get<I>(views);

        /* get_parsec_data is overloaded for buffer and devicescratch */
        parsec_data_t* data = detail::get_parsec_data(view);
        data->device_copies[0]->version = data->device_copies[data->owner_device]->version;
        parsec_data_transfer_ownership_to_copy(data, 0, PARSEC_FLOW_ACCESS_READ);
      }

      if constexpr (sizeof...(Is) > 0) {
        // recursion
        post_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  void post_device_out(std::tuple<Buffer&...> &b) {
    detail::post_device_out(b, std::index_sequence_for<Buffer...>{});
  }
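  /* A condensed lifecycle sketch of how a device task might drive these
   * helpers in sequence; kernel launch and coroutine machinery are elided
   * and the name `buf` is illustrative:
   *
   *   ttg::Buffer<double> buf(1024);
   *   auto views = std::tie(buf);
   *   register_device_memory(views); // build flows, request staging to device
   *   // ... kernel runs on the device copy ...
   *   mark_device_out(views);        // enqueue D2H copy on the compute stream
   *   // ... once the stream has drained ...
   *   post_device_out(views);        // bump host copy version, return ownership
   */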

  template<typename T>
  parsec_data_t* buffer_data(T&& buffer) {
    using view_type = std::remove_reference_t<T>;
    static_assert(ttg::meta::is_buffer_v<view_type> || ttg::meta::is_devicescratch_v<view_type>);
    return detail::get_parsec_data(buffer);
  }
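  /* A brief sketch of buffer_data, which exposes the underlying parsec_data_t
   * of a ttg::Buffer or ttg::devicescratch; `buf` is illustrative and the
   * byte count assumes ttg::Buffer's element-count constructor:
   *
   *   ttg::Buffer<double> buf(32);
   *   parsec_data_t *pd = buffer_data(buf);
   *   assert(pd->nb_elts == 32 * sizeof(double)); // nb_elts counts bytes
   */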

} // namespace ttg_parsec

#endif // TTG_PARSEC_DEVICEFUNC_H