devicefunc.h
#ifndef TTG_PARSEC_DEVICEFUNC_H
#define TTG_PARSEC_DEVICEFUNC_H

#if defined(TTG_HAVE_CUDART)
#include <cuda.h>
#endif

#include "ttg/parsec/task.h"
#include <parsec.h>
#include <parsec/mca/device/device_gpu.h>

#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
#include <parsec/mca/device/cuda/device_cuda.h>
#elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
#include <parsec/mca/device/hip/device_hip.h>
#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT

namespace ttg_parsec {
  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is>
    inline bool register_device_memory(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
      static_assert(I < MAX_PARAM_COUNT,
                    "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. "
                    "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG.");
      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller;
      assert(nullptr != caller->dev_ptr);
      parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
      parsec_flow_t *flows = caller->dev_ptr->flows;

      auto& view = std::get<I>(views);
      bool is_current = false;
      static_assert(ttg::meta::is_buffer_v<view_type> || ttg::meta::is_devicescratch_v<view_type>);
      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      /* TODO: check whether the device is current */

      if (nullptr != data) {
        auto access = PARSEC_FLOW_ACCESS_RW;
        if constexpr (std::is_const_v<view_type>) {
          /* const views are only read on the device */
          access = PARSEC_FLOW_ACCESS_READ;
        } else if constexpr (ttg::meta::is_devicescratch_v<view_type>) {
          if (view.scope() == ttg::scope::Allocate) {
            access = PARSEC_FLOW_ACCESS_WRITE;
          }
        }
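        /* Summary of the mapping above: a mutable Buffer is treated as
         * read-write, a const view as read-only, and a devicescratch with
         * scope Allocate as write-only (its prior content need not be
         * transferred to the device). */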

        /* build the flow */
        /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_SYM_INOUT,
                                 .flow_flags = static_cast<uint8_t>(access),
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0};

        gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
        gpu_task->flow[I] = &flows[I];

        /* set the input data copy, parsec will take care of the transfer
         * and the buffer will look at the parsec_data_t for the current pointer */
        //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
        assert(nullptr != data->device_copies[0]->original);
        caller->parsec_task.data[I].data_in = data->device_copies[0];
        caller->parsec_task.data[I].source_repo_entry = NULL;

      } else {
        /* ignore the flow */
        flows[I] = parsec_flow_t{.name = nullptr,
                                 .sym_type = PARSEC_FLOW_ACCESS_NONE,
                                 .flow_flags = 0,
                                 .flow_index = I,
                                 .flow_datatype_mask = ~0};
        gpu_task->flow[I] = &flows[I];
        gpu_task->flow_nb_elts[I] = 0; // size in bytes
        caller->parsec_task.data[I].data_in = nullptr;
      }

      if constexpr (sizeof...(Is) > 0) {
        is_current |= register_device_memory(views, std::index_sequence<Is...>{});
      }
      return is_current;
    }
  } // namespace detail

  /* Takes a tuple of ttg::Views or ttg::Buffers and registers them
   * with the currently executing task. Returns true if all memory
   * is current on the target device, false if transfers are required. */
  template<typename... Views>
  inline bool register_device_memory(std::tuple<Views&...> &views) {
    bool is_current = true;
    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("register_device_memory called inside a non-gpu task!");
    }

    if constexpr (sizeof...(Views) > 0) {
      is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
    }

    /* reset all remaining entries in the current task */
    for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
      detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
      detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i];
      detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
    }

    return is_current;
  }
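
  /* Usage sketch (illustrative, not part of this header; `a` and `tmp` are
   * hypothetical handles created by the application inside a device task):
   *
   *   ttg::Buffer<double> a = ...;           // managed host/device buffer
   *   ttg::devicescratch<double> tmp = ...;  // device-only temporary
   *   auto views = std::tie(a, tmp);
   *   bool is_current = ttg_parsec::register_device_memory(views);
   *   // a false return means PaRSEC still has to stage data to the device
   *   // before the kernel may run
   */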

  namespace detail {
    template<typename... Views, std::size_t I, std::size_t... Is, bool DeviceAvail = false>
    inline void mark_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
      auto& view = std::get<I>(views);

      /* get_parsec_data is overloaded for buffer and devicescratch */
      parsec_data_t* data = detail::get_parsec_data(view);
      parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;

      /* enqueue the transfer into the compute stream so that the task
       * completes only once both the computation and the transfer are done */
      if (data->owner_device != 0) {
        parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
        device_module->memcpy_async(device_module, stream,
                                    data->device_copies[0]->device_private,
                                    data->device_copies[data->owner_device]->device_private,
                                    data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
      }
      if constexpr (sizeof...(Is) > 0) {
        // recursion
        mark_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  inline void mark_device_out(std::tuple<Buffer&...> &b) {

    if (nullptr == detail::parsec_ttg_caller) {
      throw std::runtime_error("mark_device_out may only be invoked from inside a task!");
    }

    if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
      throw std::runtime_error("mark_device_out called inside a non-gpu task!");
    }

    detail::mark_device_out(b, std::index_sequence_for<Buffer...>{});
  }
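
  /* Usage sketch (illustrative; `result` is a hypothetical buffer that a
   * device kernel has just written):
   *
   *   auto outs = std::tie(result);
   *   ttg_parsec::mark_device_out(outs);
   *   // the device-to-host copy is enqueued on the task's compute stream,
   *   // so the data reach the host copy before the task is considered done
   */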

  namespace detail {

    template<typename... Views, std::size_t I, std::size_t... Is>
    inline void post_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {

      using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;

      if constexpr (!std::is_const_v<view_type>) {
        auto& view = std::get<I>(views);

        /* get_parsec_data is overloaded for buffer and devicescratch */
        parsec_data_t* data = detail::get_parsec_data(view);
        data->device_copies[0]->version = data->device_copies[data->owner_device]->version;
        parsec_data_transfer_ownership_to_copy(data, 0, PARSEC_FLOW_ACCESS_READ);
      }

      if constexpr (sizeof...(Is) > 0) {
        // recursion
        post_device_out(views, std::index_sequence<Is...>{});
      }
    }
  } // namespace detail

  template<typename... Buffer>
  inline void post_device_out(std::tuple<Buffer&...> &b) {
    detail::post_device_out(b, std::index_sequence_for<Buffer...>{});
  }
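
  /* Usage sketch (illustrative; `result` is a hypothetical buffer): call
   * after the device kernel has completed and the data were brought back,
   * to promote the host copy to the current version without enqueuing
   * another transfer:
   *
   *   auto outs = std::tie(result);
   *   ttg_parsec::post_device_out(outs);
   */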

} // namespace ttg_parsec

#endif // TTG_PARSEC_DEVICEFUNC_H