devicefunc.h
Go to the documentation of this file.
1 #ifndef TTG_PARSEC_DEVICEFUNC_H
2 #define TTG_PARSEC_DEVICEFUNC_H
3 
4 #if defined(TTG_HAVE_CUDART)
5 #include <cuda.h>
6 #endif
7 
8 #include "ttg/parsec/task.h"
9 #include <parsec.h>
10 #include <parsec/mca/device/device_gpu.h>
11 
12 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
13 #include <parsec/mca/device/cuda/device_cuda.h>
14 #elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
15 #include <parsec/mca/device/hip/device_hip.h>
16 #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT
17 
18 namespace ttg_parsec {
19  namespace detail {
20  template<typename... Views, std::size_t I, std::size_t... Is>
21  inline bool register_device_memory(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
22  static_assert(I < MAX_PARAM_COUNT,
23  "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. "
24  "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG.");
25  using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
27  assert(nullptr != caller->dev_ptr);
28  parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task;
29  parsec_flow_t *flows = caller->dev_ptr->flows;
30 
31  auto& view = std::get<I>(views);
32  bool is_current = false;
33  static_assert(ttg::meta::is_buffer_v<view_type> || ttg::meta::is_devicescratch_v<view_type>);
34  /* get_parsec_data is overloaded for buffer and devicescratch */
35  parsec_data_t* data = detail::get_parsec_data(view);
36  /* TODO: check whether the device is current */
37 
38  auto access = PARSEC_FLOW_ACCESS_RW;
39  if constexpr (std::is_const_v<view_type>) {
40  // keep the flow at RW if it was RW to make sure we pull the data back out eventually
41  //if (flows[I].flow_flags != PARSEC_FLOW_ACCESS_RW) {
42  access = PARSEC_FLOW_ACCESS_READ;
43  //}
44  } else if constexpr (ttg::meta::is_devicescratch_v<view_type>) {
45  if (view.scope() == ttg::scope::Allocate) {
46  access = PARSEC_FLOW_ACCESS_WRITE;
47  }
48  }
49 
50  //std::cout << "register_device_memory task " << detail::parsec_ttg_caller << " data " << I << " "
51  // << data << " size " << data->nb_elts << std::endl;
52 
53  /* build the flow */
54  /* TODO: reuse the flows of the task class? How can we control the sync direction then? */
55  flows[I] = parsec_flow_t{.name = nullptr,
56  .sym_type = PARSEC_SYM_INOUT,
57  .flow_flags = static_cast<uint8_t>(access),
58  .flow_index = I,
59  .flow_datatype_mask = ~0 };
60 
61  gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes
62  gpu_task->flow[I] = &flows[I];
63 
64  /* set the input data copy, parsec will take care of the transfer
65  * and the buffer will look at the parsec_data_t for the current pointer */
66  //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device];
67  assert(nullptr != data->device_copies[0]->original);
68  caller->parsec_task.data[I].data_in = data->device_copies[0];
69  caller->parsec_task.data[I].source_repo_entry = NULL;
70 
71  if constexpr (sizeof...(Is) > 0) {
72  is_current |= register_device_memory(views, std::index_sequence<Is...>{});
73  }
74  return is_current;
75  }
76  } // namespace detail
77 
78  /* Takes a tuple of ttg::Views or ttg::buffers and register them
79  * with the currently executing task. Returns true if all memory
80  * is current on the target device, false if transfers are required. */
81  template<typename... Views>
82  inline bool register_device_memory(std::tuple<Views&...> &views) {
83  bool is_current = true;
84  if (nullptr == detail::parsec_ttg_caller) {
85  throw std::runtime_error("register_device_memory may only be invoked from inside a task!");
86  }
87 
88  if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
89  throw std::runtime_error("register_device_memory called inside a non-gpu task!");
90  }
91 
92  if constexpr (sizeof...(Views) > 0) {
93  is_current = detail::register_device_memory(views, std::index_sequence_for<Views...>{});
94  }
95 
96  /* reset all entries in the current task */
97  for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) {
98  detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr;
99  detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE;
100  detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i;
102  detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0;
103  }
104 
105  return is_current;
106  }
107 
108  namespace detail {
109  template<typename... Views, std::size_t I, std::size_t... Is, bool DeviceAvail = false>
110  inline void mark_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
111 
112  using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
113  auto& view = std::get<I>(views);
114 
115  /* get_parsec_data is overloaded for buffer and devicescratch */
116  parsec_data_t* data = detail::get_parsec_data(view);
117  parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task;
118  parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream;
119 
120  /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */
121  parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device;
122  device_module->memcpy_async(device_module, stream,
123  data->device_copies[0]->device_private,
124  data->device_copies[data->owner_device]->device_private,
125  data->nb_elts, parsec_device_gpu_transfer_direction_d2h);
126 
127  if constexpr (sizeof...(Is) > 0) {
128  // recursion
129  mark_device_out(views, std::index_sequence<Is...>{});
130  }
131  }
132  } // namespace detail
133 
134  template<typename... Buffer>
135  inline void mark_device_out(std::tuple<Buffer&...> &b) {
136 
137  if (nullptr == detail::parsec_ttg_caller) {
138  throw std::runtime_error("mark_device_out may only be invoked from inside a task!");
139  }
140 
141  if (nullptr == detail::parsec_ttg_caller->dev_ptr) {
142  throw std::runtime_error("mark_device_out called inside a non-gpu task!");
143  }
144 
145  detail::mark_device_out(b, std::index_sequence_for<Buffer...>{});
146  }
147 
148  namespace detail {
149 
150  template<typename... Views, std::size_t I, std::size_t... Is>
151  inline void post_device_out(std::tuple<Views&...> &views, std::index_sequence<I, Is...>) {
152 
153  using view_type = std::remove_reference_t<std::tuple_element_t<I, std::tuple<Views&...>>>;
154 
155  if constexpr (!std::is_const_v<view_type>) {
156  auto& view = std::get<I>(views);
157 
158  /* get_parsec_data is overloaded for buffer and devicescratch */
159  parsec_data_t* data = detail::get_parsec_data(view);
160  data->device_copies[0]->version = data->device_copies[data->owner_device]->version;
161  parsec_data_transfer_ownership_to_copy(data, 0, PARSEC_FLOW_ACCESS_READ);
162  }
163 
164  if constexpr (sizeof...(Is) > 0) {
165  // recursion
166  post_device_out(views, std::index_sequence<Is...>{});
167  }
168  }
169  } // namespace detail
170  template<typename... Buffer>
171  inline void post_device_out(std::tuple<Buffer&...> &b) {
172  detail::post_device_out(b, std::index_sequence_for<Buffer...>{});
173  }
174 
175 } // namespace ttg_parsec
176 
177 #endif // TTG_PARSEC_DEVICEFUNC_H
constexpr auto data(C &c) -> decltype(c.data())
Definition: span.h:189
void mark_device_out(std::tuple< Views &... > &views, std::index_sequence< I, Is... >)
Definition: devicefunc.h:110
bool register_device_memory(std::tuple< Views &... > &views, std::index_sequence< I, Is... >)
Definition: devicefunc.h:21
void post_device_out(std::tuple< Views &... > &views, std::index_sequence< I, Is... >)
Definition: devicefunc.h:151
parsec_data_t * get_parsec_data(const ttg_parsec::Buffer< T, A > &db)
Definition: buffer.h:393
thread_local parsec_ttg_task_base_t * parsec_ttg_caller
Definition: thread_local.h:12
this contains PaRSEC-based TTG functionality
Definition: fwd.h:18
void post_device_out(std::tuple< Buffer &... > &b)
Definition: devicefunc.h:171
bool register_device_memory(std::tuple< Views &... > &views)
Definition: devicefunc.h:82
void mark_device_out(std::tuple< Buffer &... > &b)
Definition: devicefunc.h:135
TTG_IMPL_NS::Buffer< T, Allocator > Buffer
Definition: buffer.h:9
parsec_gpu_task_t * gpu_task
Definition: task.h:14
parsec_gpu_exec_stream_t * stream
Definition: task.h:16
parsec_flow_t * flows
Definition: task.h:15
parsec_device_gpu_module_t * device
Definition: task.h:17