Line data Source code
1 : //@HEADER
2 : // ************************************************************************
3 : //
4 : // Kokkos v. 4.0
5 : // Copyright (2022) National Technology & Engineering
6 : // Solutions of Sandia, LLC (NTESS).
7 : //
8 : // Under the terms of Contract DE-NA0003525 with NTESS,
9 : // the U.S. Government retains certain rights in this software.
10 : //
11 : // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12 : // See https://kokkos.org/LICENSE for license information.
13 : // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 : //
15 : //@HEADER
16 :
17 : #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18 : #include <Kokkos_Macros.hpp>
19 : static_assert(false,
20 : "Including non-public Kokkos header files is not allowed.");
21 : #endif
22 : #ifndef KOKKOS_EXECPOLICY_HPP
23 : #define KOKKOS_EXECPOLICY_HPP
24 :
25 : #include <Kokkos_Core_fwd.hpp>
26 : #include <impl/Kokkos_Traits.hpp>
27 : #include <impl/Kokkos_Error.hpp>
28 : #include <impl/Kokkos_AnalyzePolicy.hpp>
29 : #include <Kokkos_Concepts.hpp>
30 : #include <Kokkos_TypeInfo.hpp>
31 : #ifndef KOKKOS_ENABLE_IMPL_TYPEINFO
32 : #include <typeinfo>
33 : #endif
34 : #include <limits>
35 :
36 : //----------------------------------------------------------------------------
37 :
38 : namespace Kokkos {
39 :
// Empty tag types, one per parallel dispatch pattern (for / scan / reduce).
// NOTE(review): their consumers are not visible in this file; presumably they
// identify the dispatch pattern at compile time -- confirm against callers.
40 : struct ParallelForTag {};
41 : struct ParallelScanTag {};
42 : struct ParallelReduceTag {};
43 :
/// Strongly typed wrapper for the chunk size passed to an execution policy.
struct ChunkSize {
  /// The wrapped chunk size, stored exactly as supplied.
  int value;

  /// Wraps \p value_; explicit, so a plain int does not convert silently.
  explicit ChunkSize(int value_) : value{value_} {}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
  /// Deprecated implicit conversion from int, kept only for legacy callers.
  template <typename T = void>
  KOKKOS_DEPRECATED_WITH_COMMENT("ChunkSize should be constructed explicitly.")
  ChunkSize(int value_) : value{value_} {}
#endif
};
53 :
54 : /** \brief Execution policy for work over a range of an integral type.
55 : *
56 : * Valid template argument options:
57 : *
58 : * With a specified execution space:
59 : * < ExecSpace , WorkTag , { IntConst | IntType } >
60 : * < ExecSpace , WorkTag , void >
61 : * < ExecSpace , { IntConst | IntType } , void >
62 : * < ExecSpace , void , void >
63 : *
64 : * With the default execution space:
65 : * < WorkTag , { IntConst | IntType } , void >
66 : * < WorkTag , void , void >
67 : * < { IntConst | IntType } , void , void >
68 : * < void , void , void >
69 : *
70 : * IntType is a fundamental integral type
71 : * IntConst is an Impl::integral_constant< IntType , Blocking >
72 : *
73 : * Blocking is the granularity of partitioning the range among threads.
74 : */
75 : template <class... Properties>
76 3708 : class RangePolicy : public Impl::PolicyTraits<Properties...> {
77 : public:
78 : using traits = Impl::PolicyTraits<Properties...>;
79 :
80 : private:
81 : typename traits::execution_space m_space;
82 : typename traits::index_type m_begin;
83 : typename traits::index_type m_end;
84 : typename traits::index_type m_granularity;
85 : typename traits::index_type m_granularity_mask;
86 :
87 : template <class... OtherProperties>
88 : friend class RangePolicy;
89 :
90 : public:
91 : //! Tag this class as an execution policy
92 : using execution_policy = RangePolicy<Properties...>;
93 : using member_type = typename traits::index_type;
94 : using index_type = typename traits::index_type;
95 :
96 2472 : KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const {
97 1236 : return m_space;
98 : }
99 0 : KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
100 0 : KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
101 :
102 : // TODO: find a better workaround for Clangs weird instantiation order
103 : // This thing is here because of an instantiation error, where the RangePolicy
104 : // is inserted into FunctorValue Traits, which tries decltype on the operator.
105 : // It tries to do this even though the first argument of parallel for clearly
106 : // doesn't match.
107 : void operator()(const int&) const {}
108 :
109 : template <class... OtherProperties>
110 : RangePolicy(const RangePolicy<OtherProperties...>& p)
111 : : traits(p), // base class may contain data such as desired occupancy
112 : m_space(p.m_space),
113 : m_begin(p.m_begin),
114 : m_end(p.m_end),
115 : m_granularity(p.m_granularity),
116 : m_granularity_mask(p.m_granularity_mask) {}
117 :
118 : inline RangePolicy()
119 : : m_space(),
120 : m_begin(0),
121 : m_end(0),
122 : m_granularity(0),
123 : m_granularity_mask(0) {}
124 :
125 : /** \brief Total range */
126 : template <typename IndexType1, typename IndexType2,
127 : std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
128 : std::is_convertible_v<IndexType2, member_type>),
129 : bool> = false>
130 1236 : inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end)
131 2472 : : RangePolicy(typename traits::execution_space(), work_begin, work_end) {}
132 :
133 : /** \brief Total range */
134 : template <typename IndexType1, typename IndexType2,
135 : std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
136 : std::is_convertible_v<IndexType2, member_type>),
137 : bool> = false>
138 1236 : inline RangePolicy(const typename traits::execution_space& work_space,
139 : const IndexType1 work_begin, const IndexType2 work_end)
140 1236 : : m_space(work_space),
141 1236 : m_begin(work_begin),
142 1236 : m_end(work_end),
143 1236 : m_granularity(0),
144 1236 : m_granularity_mask(0) {
145 1236 : check_conversion_safety(work_begin);
146 1236 : check_conversion_safety(work_end);
147 1236 : check_bounds_validity();
148 1236 : set_auto_chunk_size();
149 1236 : }
150 :
151 : template <typename IndexType1, typename IndexType2,
152 : std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
153 : std::is_convertible_v<IndexType2, member_type>),
154 : bool> = false>
155 : RangePolicy(const typename traits::execution_space& work_space,
156 : const IndexType1 work_begin, const IndexType2 work_end,
157 : const ChunkSize chunk_size)
158 : : m_space(work_space),
159 : m_begin(work_begin),
160 : m_end(work_end),
161 : m_granularity(0),
162 : m_granularity_mask(0) {
163 : check_conversion_safety(work_begin);
164 : check_conversion_safety(work_end);
165 : check_bounds_validity();
166 : set_chunk_size(chunk_size.value);
167 : }
168 :
169 : /** \brief Total range */
170 : template <typename IndexType1, typename IndexType2, typename... Args,
171 : std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
172 : std::is_convertible_v<IndexType2, member_type>),
173 : bool> = false>
174 : RangePolicy(const IndexType1 work_begin, const IndexType2 work_end,
175 : const ChunkSize chunk_size)
176 : : RangePolicy(typename traits::execution_space(), work_begin, work_end,
177 : chunk_size) {}
178 :
179 : public:
180 : #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
181 : KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead")
182 : inline void set(ChunkSize chunksize) {
183 : m_granularity = chunksize.value;
184 : m_granularity_mask = m_granularity - 1;
185 : }
186 : #endif
187 :
188 : public:
189 : /** \brief return chunk_size */
190 : inline member_type chunk_size() const { return m_granularity; }
191 :
192 : /** \brief set chunk_size to a discrete value*/
193 : inline RangePolicy& set_chunk_size(int chunk_size) {
194 : m_granularity = chunk_size;
195 : m_granularity_mask = m_granularity - 1;
196 : return *this;
197 : }
198 :
199 : private:
200 : /** \brief finalize chunk_size if it was set to AUTO*/
201 1236 : inline void set_auto_chunk_size() {
202 : #ifdef KOKKOS_ENABLE_SYCL
203 : if (std::is_same_v<typename traits::execution_space, Kokkos::SYCL>) {
204 : // chunk_size <=1 lets the compiler choose the workgroup size when
205 : // launching kernels
206 : m_granularity = 1;
207 : m_granularity_mask = 0;
208 : return;
209 : }
210 : #endif
211 1236 : auto concurrency = static_cast<int64_t>(m_space.concurrency());
212 1236 : if (concurrency == 0) concurrency = 1;
213 :
214 1236 : if (m_granularity > 0) {
215 0 : if (!Impl::is_integral_power_of_two(m_granularity))
216 : Kokkos::abort("RangePolicy blocking granularity must be power of two");
217 : }
218 :
219 1920 : int64_t new_chunk_size = 1;
220 1920 : while (new_chunk_size * 100 * concurrency <
221 1920 : static_cast<int64_t>(m_end - m_begin))
222 684 : new_chunk_size *= 2;
223 1236 : if (new_chunk_size < 128) {
224 : new_chunk_size = 1;
225 3272 : while ((new_chunk_size * 40 * concurrency <
226 3272 : static_cast<int64_t>(m_end - m_begin)) &&
227 : (new_chunk_size < 128))
228 2052 : new_chunk_size *= 2;
229 : }
230 1236 : m_granularity = new_chunk_size;
231 1236 : m_granularity_mask = m_granularity - 1;
232 1236 : }
233 :
234 2028 : void check_bounds_validity() {
235 2028 : if (m_end < m_begin) {
236 0 : std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" +
237 : std::to_string(m_begin) +
238 : ") is greater than the upper bound (" +
239 : std::to_string(m_end) + ").\n";
240 : #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
241 : Kokkos::abort(msg.c_str());
242 : #endif
243 0 : m_begin = 0;
244 0 : m_end = 0;
245 : #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
246 0 : Kokkos::Impl::log_warning(msg);
247 : #endif
248 0 : }
249 2028 : }
250 :
251 : // To be replaced with std::in_range (c++20)
252 : template <typename IndexType>
253 3264 : static void check_conversion_safety([[maybe_unused]] const IndexType bound) {
254 : // Checking that the round-trip conversion preserves input index value
255 : if constexpr (std::is_convertible_v<member_type, IndexType>) {
256 : #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \
257 : defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS)
258 : bool warn = false;
259 :
260 : if constexpr (std::is_arithmetic_v<member_type> &&
261 : (std::is_signed_v<IndexType> !=
262 : std::is_signed_v<member_type>)) {
263 : // check signed to unsigned
264 : if constexpr (std::is_signed_v<IndexType>)
265 : warn |= (bound < static_cast<IndexType>(
266 : std::numeric_limits<member_type>::min()));
267 :
268 : // check unsigned to signed
269 : if constexpr (std::is_signed_v<member_type>)
270 : warn |= (bound > static_cast<IndexType>(
271 : std::numeric_limits<member_type>::max()));
272 : }
273 :
274 : // check narrowing
275 144 : warn |=
276 : (static_cast<IndexType>(static_cast<member_type>(bound)) != bound);
277 :
278 3120 : if (warn) {
279 0 : std::string msg =
280 : "Kokkos::RangePolicy bound type error: an unsafe implicit "
281 : "conversion is performed on a bound (" +
282 : std::to_string(bound) +
283 : "), which may not preserve its original value.\n";
284 :
285 : #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
286 : Kokkos::abort(msg.c_str());
287 : #endif
288 :
289 : #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
290 0 : Kokkos::Impl::log_warning(msg);
291 : #endif
292 0 : }
293 : #endif
294 : }
295 3120 : }
296 :
297 : public:
298 : /** \brief Subrange for a partition's rank and size.
299 : *
300 : * Typically used to partition a range over a group of threads.
301 : */
302 : struct WorkRange {
303 : using work_tag = typename RangePolicy<Properties...>::work_tag;
304 : using member_type = typename RangePolicy<Properties...>::member_type;
305 :
306 : KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
307 : KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
308 :
309 : /** \brief Subrange for a partition's rank and size.
310 : *
311 : * Typically used to partition a range over a group of threads.
312 : */
313 : KOKKOS_INLINE_FUNCTION
314 : WorkRange(const RangePolicy& range, const int part_rank,
315 : const int part_size)
316 : : m_begin(0), m_end(0) {
317 : if (part_size) {
318 : // Split evenly among partitions, then round up to the granularity.
319 : const member_type work_part =
320 : ((((range.end() - range.begin()) + (part_size - 1)) / part_size) +
321 : range.m_granularity_mask) &
322 : ~member_type(range.m_granularity_mask);
323 :
324 : m_begin = range.begin() + work_part * part_rank;
325 : m_end = m_begin + work_part;
326 :
327 : if (range.end() < m_begin) m_begin = range.end();
328 : if (range.end() < m_end) m_end = range.end();
329 : }
330 : }
331 :
332 : private:
333 : member_type m_begin;
334 : member_type m_end;
335 : WorkRange();
336 : WorkRange& operator=(const WorkRange&);
337 : };
338 : };
339 :
// Class template argument deduction guides for RangePolicy.
// Calls without an execution space argument, or with a DefaultExecutionSpace
// argument, deduce RangePolicy<>; any other type ES satisfying
// is_execution_space_v deduces RangePolicy<ES>.
340 : RangePolicy() -> RangePolicy<>;
341 :
// No execution space argument: deduce RangePolicy<>.
342 : RangePolicy(int64_t, int64_t) -> RangePolicy<>;
343 : RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>;
344 :
// DefaultExecutionSpace argument: also deduce RangePolicy<>.
345 : RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>;
346 : RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&)
347 : -> RangePolicy<>;
348 :
// Any other execution space type: deduce RangePolicy<ES>.
349 : template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
350 : RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy<ES>;
351 :
352 : template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
353 : RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy<ES>;
354 :
355 : } // namespace Kokkos
356 :
357 : //----------------------------------------------------------------------------
358 : //----------------------------------------------------------------------------
359 :
360 : namespace Kokkos {
361 :
362 : namespace Impl {
363 :
// Interface of the team policy implementation, parameterized on the execution
// space in addition to the policy properties. Only declarations appear here;
// no member is defined in this view.
// NOTE(review): presumably each backend supplies its own specialization that
// provides these members -- confirm against the backend headers.
364 : template <class ExecSpace, class... Properties>
365 : class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> {
366 : private:
367 : using traits = Impl::PolicyTraits<Properties...>;
368 :
369 : public:
370 : using index_type = typename traits::index_type;
371 :
372 : //----------------------------------------
373 : /** \brief Query maximum team size for a given functor.
374 : *
375 : * This size takes into account execution space concurrency limitations and
376 : * scratch memory space limitations for reductions, team reduce/scan, and
377 : * team shared memory.
378 : *
379 : * This function only works for single-operator functors.
380 : * With multi-operator functors it cannot be determined
381 : * which operator will be called.
382 : */
383 : template <class FunctorType>
384 : static int team_size_max(const FunctorType&);
385 :
386 : /** \brief Query recommended team size for a given functor.
387 : *
388 : * This size takes into account execution space concurrency limitations and
389 : * scratch memory space limitations for reductions, team reduce/scan, and
390 : * team shared memory.
391 : *
392 : * This function only works for single-operator functors.
393 : * With multi-operator functors it cannot be determined
394 : * which operator will be called.
395 : */
396 : template <class FunctorType>
397 : static int team_size_recommended(const FunctorType&);
398 :
// Overloads that additionally take an int; the non-static one names it
// `vector_length`.
// NOTE(review): a static and a non-static overload that differ only in
// `const int&` versus `const int` are both declared here -- confirm intent.
399 : template <class FunctorType>
400 : static int team_size_recommended(const FunctorType&, const int&);
401 :
402 : template <class FunctorType>
403 : int team_size_recommended(const FunctorType& functor,
404 : const int vector_length);
405 :
406 : //----------------------------------------
407 : /** \brief Construct policy with the given instance of the execution space */
408 : TeamPolicyInternal(const typename traits::execution_space&,
409 : int league_size_request, int team_size_request,
410 : int vector_length_request = 1);
411 :
// Same as above, with Kokkos::AUTO_t in place of an explicit team size.
412 : TeamPolicyInternal(const typename traits::execution_space&,
413 : int league_size_request, const Kokkos::AUTO_t&,
414 : int vector_length_request = 1);
415 :
416 : /** \brief Construct policy with the default instance of the execution space
417 : */
418 : TeamPolicyInternal(int league_size_request, int team_size_request,
419 : int vector_length_request = 1);
420 :
// Same as above, with Kokkos::AUTO_t in place of an explicit team size.
421 : TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t&,
422 : int vector_length_request = 1);
423 :
424 : /* TeamPolicyInternal( int league_size_request , int team_size_request );
425 :
426 : TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
427 :
428 : /** \brief The actual league size (number of teams) of the policy.
429 : *
430 : * This may be smaller than the requested league size due to limitations
431 : * of the execution space.
432 : */
433 : KOKKOS_INLINE_FUNCTION int league_size() const;
434 :
435 : /** \brief The actual team size (number of threads per team) of the policy.
436 : *
437 : * This may be smaller than the requested team size due to limitations
438 : * of the execution space.
439 : */
440 : KOKKOS_INLINE_FUNCTION int team_size() const;
441 :
442 : /** \brief Whether the policy has an automatically determined team size
443 : */
444 : inline bool impl_auto_team_size() const;
445 : /** \brief Whether the policy has an automatically determined vector length
446 : */
447 : inline bool impl_auto_vector_length() const;
448 :
// NOTE(review): presumably the largest vector length the execution space
// accepts -- the definition is not visible here, TODO confirm.
449 : static int vector_length_max();
450 :
// Vector length of this policy; semantics are set by the implementation.
451 : KOKKOS_INLINE_FUNCTION int impl_vector_length() const;
452 :
// Chunk size accessor and setter; the setter returns a TeamPolicyInternal&
// (presumably *this, so calls can be chained -- TODO confirm).
453 : inline typename traits::index_type chunk_size() const;
454 :
455 : inline TeamPolicyInternal& set_chunk_size(int chunk_size);
456 :
457 : /** \brief Parallel execution of a functor calls the functor once with
458 : * each member of the execution policy.
459 : */
460 : struct member_type {
461 : /** \brief Handle to the currently executing team shared scratch memory */
462 : KOKKOS_INLINE_FUNCTION
463 : typename traits::execution_space::scratch_memory_space team_shmem() const;
464 :
465 : /** \brief Rank of this team within the league of teams */
466 : KOKKOS_INLINE_FUNCTION int league_rank() const;
467 :
468 : /** \brief Number of teams in the league */
469 : KOKKOS_INLINE_FUNCTION int league_size() const;
470 :
471 : /** \brief Rank of this thread within this team */
472 : KOKKOS_INLINE_FUNCTION int team_rank() const;
473 :
474 : /** \brief Number of threads in this team */
475 : KOKKOS_INLINE_FUNCTION int team_size() const;
476 :
477 : /** \brief Barrier among the threads of this team */
478 : KOKKOS_INLINE_FUNCTION void team_barrier() const;
479 :
480 : /** \brief Intra-team reduction. Returns join of all values of the team
481 : * members. */
482 : template <class JoinOp>
483 : KOKKOS_INLINE_FUNCTION typename JoinOp::value_type team_reduce(
484 : const typename JoinOp::value_type, const JoinOp&) const;
485 :
486 : /** \brief Intra-team exclusive prefix sum with team_rank() ordering.
487 : *
488 : * The highest rank thread can compute the reduction total as
489 : * reduction_total = dev.team_scan( value ) + value ;
490 : */
491 : template <typename Type>
492 : KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const;
493 :
494 : /** \brief Intra-team exclusive prefix sum with team_rank() ordering
495 : * with intra-team non-deterministic ordering accumulation.
496 : *
497 : * The global inter-team accumulation value will, at the end of the
498 : * league's parallel execution, be the scan's total.
499 : * Parallel execution ordering of the league's teams is non-deterministic.
500 : * As such the base value for each team's scan operation is similarly
501 : * non-deterministic.
502 : */
503 : template <typename Type>
504 : KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
505 : Type* const global_accum) const;
506 : };
507 : };
508 :
// Carries a per-team size_t amount; ScratchRequest reads `value` as its
// per_team size. The constructor is only declared here.
// NOTE(review): presumably the constructor stores `arg` in `value` -- the
// definition is not visible in this file.
509 : struct PerTeamValue {
510 : size_t value;
511 : PerTeamValue(size_t arg);
512 : };
513 :
// Carries a per-thread size_t amount; ScratchRequest reads `value` as its
// per_thread size. The constructor is only declared here.
// NOTE(review): presumably the constructor stores `arg` in `value` -- the
// definition is not visible in this file.
514 : struct PerThreadValue {
515 : size_t value;
516 : PerThreadValue(size_t arg);
517 : };
518 :
/// Extracts a vector length from the leading argument of an argument list:
/// the argument itself when \p iType is integral, otherwise 1.
///
/// A single member selects the result with `if constexpr`. Constraining two
/// non-template members with `std::enable_if_t` on the class template
/// parameter is not SFINAE: one of the declarations is ill-formed for every
/// \p iType, so the struct could never be instantiated.
///
/// \tparam iType type of the leading argument
/// \tparam Args  types of the trailing arguments, which are ignored
template <class iType, class... Args>
struct ExtractVectorLength {
  /// \return \p val (as iType) for integral iType, otherwise int 1.
  static inline std::conditional_t<std::is_integral_v<iType>, iType, int> value(
      [[maybe_unused]] iType val, Args...) {
    if constexpr (std::is_integral_v<iType>) {
      return val;
    } else {
      return 1;
    }
  }
};
530 :
/// Returns the leading argument when it is integral, otherwise 1.
/// Trailing arguments are accepted and ignored.
///
/// \return \p val with type iType for integral iType; int 1 otherwise.
template <class iType, class... Args>
inline std::conditional_t<std::is_integral_v<iType>, iType, int>
extract_vector_length([[maybe_unused]] iType val, Args...) {
  if constexpr (std::is_integral_v<iType>) {
    return val;
  } else {
    return 1;
  }
}
542 :
543 : } // namespace Impl
544 :
// Factory functions returning the Impl::PerTeamValue / Impl::PerThreadValue
// wrappers accepted by ScratchRequest and TeamPolicy::set_scratch_size.
// NOTE(review): presumably they wrap `arg` unchanged -- defined elsewhere.
545 : Impl::PerTeamValue PerTeam(const size_t& arg);
546 : Impl::PerThreadValue PerThread(const size_t& arg);
547 :
548 : struct ScratchRequest {
549 : int level;
550 :
551 : size_t per_team;
552 : size_t per_thread;
553 :
554 : inline ScratchRequest(const int& level_,
555 : const Impl::PerTeamValue& team_value) {
556 : level = level_;
557 : per_team = team_value.value;
558 : per_thread = 0;
559 : }
560 :
561 : inline ScratchRequest(const int& level_,
562 : const Impl::PerThreadValue& thread_value) {
563 : level = level_;
564 : per_team = 0;
565 : per_thread = thread_value.value;
566 : }
567 :
568 : inline ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value,
569 : const Impl::PerThreadValue& thread_value) {
570 : level = level_;
571 : per_team = team_value.value;
572 : per_thread = thread_value.value;
573 : }
574 :
575 : inline ScratchRequest(const int& level_,
576 : const Impl::PerThreadValue& thread_value,
577 : const Impl::PerTeamValue& team_value) {
578 : level = level_;
579 : per_team = team_value.value;
580 : per_thread = thread_value.value;
581 : }
582 : };
583 :
// Validation hook called by every TeamPolicy::set_scratch_size overload before
// it forwards to the internal policy. Only declared in this file.
584 : // Causes abnormal program termination if level is not `0` or `1`
585 : void team_policy_check_valid_storage_level_argument(int level);
586 :
587 : /** \brief Execution policy for parallel work over a league of teams of
588 : * threads.
589 : *
590 : * The work functor is called for each thread of each team such that
591 : * the team's member threads are guaranteed to be concurrent.
592 : *
593 : * The team's threads have access to team shared scratch memory and
594 : * team collective operations.
595 : *
596 : * If the WorkTag is non-void then the first calling argument of the
597 : * work functor's parentheses operator is 'const WorkTag &'.
598 : * This allows a functor to have multiple work member functions.
599 : *
600 : * Order of template arguments does not matter, since the implementation
601 : * uses variadic templates. Each and any of the template arguments can
602 : * be omitted.
603 : *
604 : * Possible Template arguments and their default values:
 *   - ExecutionSpace (DefaultExecutionSpace): where to execute code.
 *     Must be enabled.
 *   - WorkTag (none): tag which is used as the first argument for the
 *     functor operator.
 *   - Schedule<Type> (Schedule<Static>): scheduling policy
 *     (Dynamic or Static).
 *   - IndexType<Type> (IndexType<ExecutionSpace::size_type>): integer index
 *     type used to iterate over the index space.
 *   - LaunchBounds<unsigned,unsigned>: launch bounds for CUDA compilation;
 *     the default of LaunchBounds<0,0> indicates no launch bounds specified.
612 : */
613 : template <class... Properties>
614 : class TeamPolicy
615 : : public Impl::TeamPolicyInternal<
616 : typename Impl::PolicyTraits<Properties...>::execution_space,
617 : Properties...> {
618 : using internal_policy = Impl::TeamPolicyInternal<
619 : typename Impl::PolicyTraits<Properties...>::execution_space,
620 : Properties...>;
621 :
622 : template <class... OtherProperties>
623 : friend class TeamPolicy;
624 :
625 : public:
626 : using traits = Impl::PolicyTraits<Properties...>;
627 :
628 : using execution_policy = TeamPolicy<Properties...>;
629 :
630 : TeamPolicy() : internal_policy(0, AUTO) {}
631 :
632 : /** \brief Construct policy with the given instance of the execution space */
633 : TeamPolicy(const typename traits::execution_space& space_,
634 : int league_size_request, int team_size_request,
635 : int vector_length_request = 1)
636 : : internal_policy(space_, league_size_request, team_size_request,
637 : vector_length_request) {}
638 :
639 : TeamPolicy(const typename traits::execution_space& space_,
640 : int league_size_request, const Kokkos::AUTO_t&,
641 : int vector_length_request = 1)
642 : : internal_policy(space_, league_size_request, Kokkos::AUTO(),
643 : vector_length_request) {}
644 :
645 : TeamPolicy(const typename traits::execution_space& space_,
646 : int league_size_request, const Kokkos::AUTO_t&,
647 : const Kokkos::AUTO_t&)
648 : : internal_policy(space_, league_size_request, Kokkos::AUTO(),
649 : Kokkos::AUTO()) {}
650 : TeamPolicy(const typename traits::execution_space& space_,
651 : int league_size_request, const int team_size_request,
652 : const Kokkos::AUTO_t&)
653 : : internal_policy(space_, league_size_request, team_size_request,
654 : Kokkos::AUTO()) {}
655 : /** \brief Construct policy with the default instance of the execution space
656 : */
657 : TeamPolicy(int league_size_request, int team_size_request,
658 : int vector_length_request = 1)
659 : : internal_policy(league_size_request, team_size_request,
660 : vector_length_request) {}
661 :
662 : TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,
663 : int vector_length_request = 1)
664 : : internal_policy(league_size_request, Kokkos::AUTO(),
665 : vector_length_request) {}
666 :
667 : TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,
668 : const Kokkos::AUTO_t&)
669 : : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {}
670 : TeamPolicy(int league_size_request, const int team_size_request,
671 : const Kokkos::AUTO_t&)
672 : : internal_policy(league_size_request, team_size_request,
673 : Kokkos::AUTO()) {}
674 :
675 : template <class... OtherProperties>
676 : TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) {
677 : // Cannot call converting constructor in the member initializer list because
678 : // it is not a direct base.
679 : internal_policy::traits::operator=(p);
680 : }
681 :
682 : private:
683 : TeamPolicy(const internal_policy& p) : internal_policy(p) {}
684 :
685 : public:
686 : inline TeamPolicy& set_chunk_size(int chunk) {
687 : static_assert(
688 : std::is_same_v<decltype(internal_policy::set_chunk_size(chunk)),
689 : internal_policy&>,
690 : "internal set_chunk_size should return a reference");
691 : return static_cast<TeamPolicy&>(internal_policy::set_chunk_size(chunk));
692 : }
693 :
694 : inline TeamPolicy& set_scratch_size(const int& level,
695 : const Impl::PerTeamValue& per_team) {
696 : static_assert(std::is_same_v<decltype(internal_policy::set_scratch_size(
697 : level, per_team)),
698 : internal_policy&>,
699 : "internal set_chunk_size should return a reference");
700 :
701 : team_policy_check_valid_storage_level_argument(level);
702 : return static_cast<TeamPolicy&>(
703 : internal_policy::set_scratch_size(level, per_team));
704 : }
705 : inline TeamPolicy& set_scratch_size(const int& level,
706 : const Impl::PerThreadValue& per_thread) {
707 : team_policy_check_valid_storage_level_argument(level);
708 : return static_cast<TeamPolicy&>(
709 : internal_policy::set_scratch_size(level, per_thread));
710 : }
711 : inline TeamPolicy& set_scratch_size(const int& level,
712 : const Impl::PerTeamValue& per_team,
713 : const Impl::PerThreadValue& per_thread) {
714 : team_policy_check_valid_storage_level_argument(level);
715 : return static_cast<TeamPolicy&>(
716 : internal_policy::set_scratch_size(level, per_team, per_thread));
717 : }
718 : inline TeamPolicy& set_scratch_size(const int& level,
719 : const Impl::PerThreadValue& per_thread,
720 : const Impl::PerTeamValue& per_team) {
721 : team_policy_check_valid_storage_level_argument(level);
722 : return static_cast<TeamPolicy&>(
723 : internal_policy::set_scratch_size(level, per_team, per_thread));
724 : }
725 : };
726 :
// Class template argument deduction guides for TeamPolicy, one per public
// constructor signature (league size, then team size or AUTO, then an
// optional vector length or AUTO).
727 : // Execution space not provided deduces to TeamPolicy<>
728 :
729 : TeamPolicy() -> TeamPolicy<>;
730 :
731 : TeamPolicy(int, int) -> TeamPolicy<>;
732 : TeamPolicy(int, int, int) -> TeamPolicy<>;
733 : TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>;
734 : TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>;
735 : TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>;
736 : TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>;
737 :
738 : // DefaultExecutionSpace deduces to TeamPolicy<>
739 :
740 : TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>;
741 : TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>;
742 : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&)
743 : -> TeamPolicy<>;
744 : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int)
745 : -> TeamPolicy<>;
746 : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&,
747 : Kokkos::AUTO_t const&) -> TeamPolicy<>;
748 : TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&)
749 : -> TeamPolicy<>;
750 :
751 : // ES != DefaultExecutionSpace deduces to TeamPolicy<ES>
// Each guide below is enabled only when is_execution_space_v<ES> holds.
752 :
753 : template <typename ES,
754 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
755 : TeamPolicy(ES const&, int, int) -> TeamPolicy<ES>;
756 :
757 : template <typename ES,
758 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
759 : TeamPolicy(ES const&, int, int, int) -> TeamPolicy<ES>;
760 :
761 : template <typename ES,
762 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
763 : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>;
764 :
765 : template <typename ES,
766 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
767 : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy<ES>;
768 :
769 : template <typename ES,
770 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
771 : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)
772 : -> TeamPolicy<ES>;
773 :
774 : template <typename ES,
775 : typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
776 : TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>;
777 :
778 : namespace Impl {
779 :
780 : template <typename iType, class TeamMemberType>
781 : struct TeamThreadRangeBoundariesStruct {
782 : private:
783 : KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,
784 : const iType& arg_end,
785 : const iType& arg_rank,
786 : const iType& arg_size) {
787 : return arg_begin +
788 : ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
789 : }
790 :
791 : KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,
792 : const iType& arg_end,
793 : const iType& arg_rank,
794 : const iType& arg_size) {
795 : const iType end_ =
796 : arg_begin +
797 : ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
798 : return end_ < arg_end ? end_ : arg_end;
799 : }
800 :
801 : public:
802 : using index_type = iType;
803 : const iType start;
804 : const iType end;
805 : enum { increment = 1 };
806 : const TeamMemberType& thread;
807 :
808 : KOKKOS_INLINE_FUNCTION
809 : TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,
810 : const iType& arg_end)
811 : : start(
812 : ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
813 : end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
814 : thread(arg_thread) {}
815 :
816 : KOKKOS_INLINE_FUNCTION
817 : TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,
818 : const iType& arg_begin, const iType& arg_end)
819 : : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
820 : arg_thread.team_size())),
821 : end(iend(arg_begin, arg_end, arg_thread.team_rank(),
822 : arg_thread.team_size())),
823 : thread(arg_thread) {}
824 : };
825 :
826 : template <typename iType, class TeamMemberType>
827 : struct TeamVectorRangeBoundariesStruct {
828 : private:
829 : KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,
830 : const iType& arg_end,
831 : const iType& arg_rank,
832 : const iType& arg_size) {
833 : return arg_begin +
834 : ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
835 : }
836 :
837 : KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,
838 : const iType& arg_end,
839 : const iType& arg_rank,
840 : const iType& arg_size) {
841 : const iType end_ =
842 : arg_begin +
843 : ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
844 : return end_ < arg_end ? end_ : arg_end;
845 : }
846 :
847 : public:
848 : using index_type = iType;
849 : const iType start;
850 : const iType end;
851 : enum { increment = 1 };
852 : const TeamMemberType& thread;
853 :
854 : KOKKOS_INLINE_FUNCTION
855 : TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,
856 : const iType& arg_end)
857 : : start(
858 : ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
859 : end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
860 : thread(arg_thread) {}
861 :
862 : KOKKOS_INLINE_FUNCTION
863 : TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,
864 : const iType& arg_begin, const iType& arg_end)
865 : : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
866 : arg_thread.team_size())),
867 : end(iend(arg_begin, arg_end, arg_thread.team_rank(),
868 : arg_thread.team_size())),
869 : thread(arg_thread) {}
870 : };
871 :
872 : template <typename iType, class TeamMemberType>
873 : struct ThreadVectorRangeBoundariesStruct {
874 : using index_type = iType;
875 : const index_type start;
876 : const index_type end;
877 : enum { increment = 1 };
878 :
879 : KOKKOS_INLINE_FUNCTION
880 : constexpr ThreadVectorRangeBoundariesStruct(const TeamMemberType,
881 : const index_type& count) noexcept
882 : : start(static_cast<index_type>(0)), end(count) {}
883 :
884 : KOKKOS_INLINE_FUNCTION
885 : constexpr ThreadVectorRangeBoundariesStruct(
886 : const TeamMemberType, const index_type& arg_begin,
887 : const index_type& arg_end) noexcept
888 : : start(static_cast<index_type>(arg_begin)), end(arg_end) {}
889 : };
890 :
891 : template <class TeamMemberType>
892 : struct ThreadSingleStruct {
893 : const TeamMemberType& team_member;
894 : KOKKOS_INLINE_FUNCTION
895 : ThreadSingleStruct(const TeamMemberType& team_member_)
896 : : team_member(team_member_) {}
897 : };
898 :
899 : template <class TeamMemberType>
900 : struct VectorSingleStruct {
901 : const TeamMemberType& team_member;
902 : KOKKOS_INLINE_FUNCTION
903 : VectorSingleStruct(const TeamMemberType& team_member_)
904 : : team_member(team_member_) {}
905 : };
906 :
907 : } // namespace Impl
908 :
909 : /** \brief Execution policy for parallel work over a threads within a team.
910 : *
911 : * The range is split over all threads in a team. The Mapping scheme depends on
912 : * the architecture. This policy is used together with a parallel pattern as a
913 : * nested layer within a kernel launched with the TeamPolicy. This variant
914 : * expects a single count. So the range is (0,count].
915 : */
916 : template <typename iType, class TeamMemberType, class _never_use_this_overload>
917 : KOKKOS_INLINE_FUNCTION_DELETED
918 : Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
919 : TeamThreadRange(const TeamMemberType&, const iType& count) = delete;
920 :
921 : /** \brief Execution policy for parallel work over a threads within a team.
922 : *
923 : * The range is split over all threads in a team. The Mapping scheme depends on
924 : * the architecture. This policy is used together with a parallel pattern as a
925 : * nested layer within a kernel launched with the TeamPolicy. This variant
926 : * expects a begin and end. So the range is (begin,end].
927 : */
928 : template <typename iType1, typename iType2, class TeamMemberType,
929 : class _never_use_this_overload>
930 : KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
931 : std::common_type_t<iType1, iType2>, TeamMemberType>
932 : TeamThreadRange(const TeamMemberType&, const iType1& begin,
933 : const iType2& end) = delete;
934 :
935 : /** \brief Execution policy for parallel work over a threads within a team.
936 : *
937 : * The range is split over all threads in a team. The Mapping scheme depends on
938 : * the architecture. This policy is used together with a parallel pattern as a
939 : * nested layer within a kernel launched with the TeamPolicy. This variant
940 : * expects a single count. So the range is (0,count].
941 : */
942 : template <typename iType, class TeamMemberType, class _never_use_this_overload>
943 : KOKKOS_INLINE_FUNCTION_DELETED
944 : Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
945 : TeamVectorRange(const TeamMemberType&, const iType& count) = delete;
946 :
947 : /** \brief Execution policy for parallel work over a threads within a team.
948 : *
949 : * The range is split over all threads in a team. The Mapping scheme depends on
950 : * the architecture. This policy is used together with a parallel pattern as a
951 : * nested layer within a kernel launched with the TeamPolicy. This variant
952 : * expects a begin and end. So the range is (begin,end].
953 : */
954 : template <typename iType1, typename iType2, class TeamMemberType,
955 : class _never_use_this_overload>
956 : KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
957 : std::common_type_t<iType1, iType2>, TeamMemberType>
958 : TeamVectorRange(const TeamMemberType&, const iType1& begin,
959 : const iType2& end) = delete;
960 :
961 : /** \brief Execution policy for a vector parallel loop.
962 : *
963 : * The range is split over all vector lanes in a thread. The Mapping scheme
964 : * depends on the architecture. This policy is used together with a parallel
965 : * pattern as a nested layer within a kernel launched with the TeamPolicy. This
966 : * variant expects a single count. So the range is (0,count].
967 : */
968 : template <typename iType, class TeamMemberType, class _never_use_this_overload>
969 : KOKKOS_INLINE_FUNCTION_DELETED
970 : Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType>
971 : ThreadVectorRange(const TeamMemberType&, const iType& count) = delete;
972 :
973 : template <typename iType1, typename iType2, class TeamMemberType,
974 : class _never_use_this_overload>
975 : KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct<
976 : std::common_type_t<iType1, iType2>, TeamMemberType>
977 : ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin,
978 : const iType2& arg_end) = delete;
979 :
980 : namespace Impl {
981 :
// Strongly typed two-state flags describing a nested multi-dimensional team
// loop. Each is backed by bool: the "Not..." enumerator is false, the other
// one is true.
enum class TeamMDRangeLastNestLevel : bool {
  NotLastNestLevel = false,
  LastNestLevel    = true
};
enum class TeamMDRangeParThread : bool {
  NotParThread = false,
  ParThread    = true
};
enum class TeamMDRangeParVector : bool {
  NotParVector = false,
  ParVector    = true
};
enum class TeamMDRangeThreadAndVector : bool {
  NotBoth = false,
  Both    = true
};

// Declared here, defined elsewhere.
template <class Rank, TeamMDRangeThreadAndVector ThreadAndVector>
struct HostBasedNestLevel;

// Declared here, defined elsewhere.
template <class Rank, TeamMDRangeThreadAndVector ThreadAndVector>
struct AcceleratorBasedNestLevel;

// ThreadAndVectorNestLevel determines on which nested level parallelization
// happens.
//   - Rank is Kokkos::Rank<TotalNestLevel, Iter>
//     - TotalNestLevel is the total number of loop nests
//     - Iter is whether to go forward or backward through ranks (i.e. the
//       iteration order for MDRangePolicy)
//   - ThreadAndVector determines whether both vector and thread parallelism
//     are in use
template <class Rank, class ExecSpace,
          TeamMDRangeThreadAndVector ThreadAndVector>
struct ThreadAndVectorNestLevel;

// Empty tag passed in place of a reduction value when a nested MD loop is run
// without a reduction (see the parallel_for overloads for the MDRange
// policies).
struct NoReductionTag {};
1006 :
1007 : template <typename Rank, typename TeamMDPolicy, typename Lambda,
1008 : typename ReductionValueType>
1009 : KOKKOS_INLINE_FUNCTION void md_parallel_impl(TeamMDPolicy const& policy,
1010 : Lambda const& lambda,
1011 : ReductionValueType&& val);
1012 : } // namespace Impl
1013 :
1014 : template <typename Rank, typename TeamHandle>
1015 : struct TeamThreadMDRange;
1016 :
1017 : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
1018 : struct TeamThreadMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
1019 : using NestLevelType = int;
1020 : using BoundaryType = int;
1021 : using TeamHandleType = TeamHandle;
1022 : using ExecutionSpace = typename TeamHandleType::execution_space;
1023 : using ArrayLayout = typename ExecutionSpace::array_layout;
1024 :
1025 : static constexpr NestLevelType total_nest_level =
1026 : Rank<N, OuterDir, InnerDir>::rank;
1027 : static constexpr Iterate iter = OuterDir;
1028 : static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;
1029 : static constexpr auto par_vector = Impl::TeamMDRangeParVector::NotParVector;
1030 :
1031 : static constexpr Iterate direction =
1032 : OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector<
1033 : ArrayLayout>::outer_iteration_pattern
1034 : : iter;
1035 :
1036 : template <class... Args>
1037 : KOKKOS_FUNCTION TeamThreadMDRange(TeamHandleType const& team_, Args&&... args)
1038 : : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
1039 : static_assert(sizeof...(Args) == total_nest_level);
1040 : }
1041 :
1042 : TeamHandleType const& team;
1043 : BoundaryType boundaries[total_nest_level];
1044 : };
1045 :
1046 : template <typename TeamHandle, typename... Args>
1047 : KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...)
1048 : -> TeamThreadMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
1049 :
1050 : template <typename Rank, typename TeamHandle>
1051 : struct ThreadVectorMDRange;
1052 :
1053 : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
1054 : struct ThreadVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
1055 : using NestLevelType = int;
1056 : using BoundaryType = int;
1057 : using TeamHandleType = TeamHandle;
1058 : using ExecutionSpace = typename TeamHandleType::execution_space;
1059 : using ArrayLayout = typename ExecutionSpace::array_layout;
1060 :
1061 : static constexpr NestLevelType total_nest_level =
1062 : Rank<N, OuterDir, InnerDir>::rank;
1063 : static constexpr Iterate iter = OuterDir;
1064 : static constexpr auto par_thread = Impl::TeamMDRangeParThread::NotParThread;
1065 : static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
1066 :
1067 : static constexpr Iterate direction =
1068 : OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector<
1069 : ArrayLayout>::outer_iteration_pattern
1070 : : iter;
1071 :
1072 : template <class... Args>
1073 : KOKKOS_INLINE_FUNCTION ThreadVectorMDRange(TeamHandleType const& team_,
1074 : Args&&... args)
1075 : : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
1076 : static_assert(sizeof...(Args) == total_nest_level);
1077 : }
1078 :
1079 : TeamHandleType const& team;
1080 : BoundaryType boundaries[total_nest_level];
1081 : };
1082 :
1083 : template <typename TeamHandle, typename... Args>
1084 : KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...)
1085 : -> ThreadVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
1086 :
1087 : template <typename Rank, typename TeamHandle>
1088 : struct TeamVectorMDRange;
1089 :
1090 : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
1091 : struct TeamVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
1092 : using NestLevelType = int;
1093 : using BoundaryType = int;
1094 : using TeamHandleType = TeamHandle;
1095 : using ExecutionSpace = typename TeamHandleType::execution_space;
1096 : using ArrayLayout = typename ExecutionSpace::array_layout;
1097 :
1098 : static constexpr NestLevelType total_nest_level =
1099 : Rank<N, OuterDir, InnerDir>::rank;
1100 : static constexpr Iterate iter = OuterDir;
1101 : static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;
1102 : static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
1103 :
1104 : static constexpr Iterate direction =
1105 : iter == Iterate::Default ? Impl::layout_iterate_type_selector<
1106 : ArrayLayout>::outer_iteration_pattern
1107 : : iter;
1108 :
1109 : template <class... Args>
1110 : KOKKOS_INLINE_FUNCTION TeamVectorMDRange(TeamHandleType const& team_,
1111 : Args&&... args)
1112 : : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
1113 : static_assert(sizeof...(Args) == total_nest_level);
1114 : }
1115 :
1116 : TeamHandleType const& team;
1117 : BoundaryType boundaries[total_nest_level];
1118 : };
1119 :
1120 : template <typename TeamHandle, typename... Args>
1121 : KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...)
1122 : -> TeamVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
1123 :
1124 : template <typename Rank, typename TeamHandle, typename Lambda,
1125 : typename ReducerValueType>
1126 : KOKKOS_INLINE_FUNCTION void parallel_reduce(
1127 : TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1128 : ReducerValueType& val) {
1129 : static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
1130 : !std::is_array_v<ReducerValueType> &&
1131 : !std::is_pointer_v<ReducerValueType> &&
1132 : !Kokkos::is_reducer_v<ReducerValueType>,
1133 : "Only scalar return types are allowed!");
1134 :
1135 : val = ReducerValueType{};
1136 : Impl::md_parallel_impl<Rank>(policy, lambda, val);
1137 : policy.team.team_reduce(
1138 : Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
1139 : }
1140 :
1141 : template <typename Rank, typename TeamHandle, typename Lambda>
1142 : KOKKOS_INLINE_FUNCTION void parallel_for(
1143 : TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1144 : Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1145 : }
1146 :
1147 : template <typename Rank, typename TeamHandle, typename Lambda,
1148 : typename ReducerValueType>
1149 : KOKKOS_INLINE_FUNCTION void parallel_reduce(
1150 : ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1151 : ReducerValueType& val) {
1152 : static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
1153 : !std::is_array_v<ReducerValueType> &&
1154 : !std::is_pointer_v<ReducerValueType> &&
1155 : !Kokkos::is_reducer_v<ReducerValueType>,
1156 : "Only a scalar return types are allowed!");
1157 :
1158 : val = ReducerValueType{};
1159 : Impl::md_parallel_impl<Rank>(policy, lambda, val);
1160 : if constexpr (false
1161 : #ifdef KOKKOS_ENABLE_CUDA
1162 : || std::is_same_v<typename TeamHandle::execution_space,
1163 : Kokkos::Cuda>
1164 : #elif defined(KOKKOS_ENABLE_HIP)
1165 : || std::is_same_v<typename TeamHandle::execution_space,
1166 : Kokkos::HIP>
1167 : #elif defined(KOKKOS_ENABLE_SYCL)
1168 : || std::is_same_v<typename TeamHandle::execution_space,
1169 : Kokkos::SYCL>
1170 : #endif
1171 : )
1172 : policy.team.vector_reduce(
1173 : Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
1174 : val});
1175 : }
1176 :
1177 : template <typename Rank, typename TeamHandle, typename Lambda>
1178 : KOKKOS_INLINE_FUNCTION void parallel_for(
1179 : ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1180 : Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1181 : }
1182 :
1183 : template <typename Rank, typename TeamHandle, typename Lambda,
1184 : typename ReducerValueType>
1185 : KOKKOS_INLINE_FUNCTION void parallel_reduce(
1186 : TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1187 : ReducerValueType& val) {
1188 : static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
1189 : !std::is_array_v<ReducerValueType> &&
1190 : !std::is_pointer_v<ReducerValueType> &&
1191 : !Kokkos::is_reducer_v<ReducerValueType>,
1192 : "Only a scalar return types are allowed!");
1193 :
1194 : val = ReducerValueType{};
1195 : Impl::md_parallel_impl<Rank>(policy, lambda, val);
1196 : if constexpr (false
1197 : #ifdef KOKKOS_ENABLE_CUDA
1198 : || std::is_same_v<typename TeamHandle::execution_space,
1199 : Kokkos::Cuda>
1200 : #elif defined(KOKKOS_ENABLE_HIP)
1201 : || std::is_same_v<typename TeamHandle::execution_space,
1202 : Kokkos::HIP>
1203 : #elif defined(KOKKOS_ENABLE_SYCL)
1204 : || std::is_same_v<typename TeamHandle::execution_space,
1205 : Kokkos::SYCL>
1206 : #endif
1207 : )
1208 : policy.team.vector_reduce(
1209 : Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
1210 : val});
1211 : policy.team.team_reduce(
1212 : Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
1213 : }
1214 :
1215 : template <typename Rank, typename TeamHandle, typename Lambda>
1216 : KOKKOS_INLINE_FUNCTION void parallel_for(
1217 : TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1218 : Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1219 : }
1220 :
1221 : namespace Impl {
1222 :
/** \brief Resolves the name under which a parallel construct is reported.
 *
 * A non-empty user label is used as is. For an empty label a default name is
 * built from the functor type, followed by "/" and the work tag type when the
 * tag is not void. The third template parameter selects between the two
 * forms and defaults to "TagType is not void".
 *
 * The label is held by reference and never copied, so the string passed to
 * the constructor must outlive this object.
 */
template <typename FunctorType, typename TagType,
          bool HasTag = !std::is_void_v<TagType>>
struct ParallelConstructName;

// Tagged form: the default name is "<functor type>/<tag type>".
template <typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, true> {
  ParallelConstructName(std::string const& label) : label_ref(label) {
    // The default name is only built when it will actually be used.
    if (label.empty()) {
#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO
      default_name =
          std::string(TypeInfo<std::remove_const_t<FunctorType>>::name()) +
          "/" + std::string(TypeInfo<TagType>::name());
#else
      default_name = std::string(typeid(FunctorType).name()) + "/" +
                     typeid(TagType).name();
#endif
    }
  }
  // Returns the user label if it is non-empty, otherwise the default name.
  // Const so the name can be read through a const object or reference.
  std::string const& get() const {
    return (label_ref.empty()) ? default_name : label_ref;
  }
  std::string const& label_ref;
  std::string default_name;
};

// Untagged form (void work tag): the default name is the functor type only.
template <typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, false> {
  ParallelConstructName(std::string const& label) : label_ref(label) {
    // The default name is only built when it will actually be used.
    if (label.empty()) {
#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO
      default_name = TypeInfo<std::remove_const_t<FunctorType>>::name();
#else
      default_name = typeid(FunctorType).name();
#endif
    }
  }
  // Returns the user label if it is non-empty, otherwise the default name.
  // Const so the name can be read through a const object or reference.
  std::string const& get() const {
    return (label_ref.empty()) ? default_name : label_ref;
  }
  std::string const& label_ref;
  std::string default_name;
};
1265 :
1266 : } // namespace Impl
1267 :
1268 : } // namespace Kokkos
1269 :
1270 : namespace Kokkos {
1271 :
1272 : namespace Impl {
1273 :
1274 : template <class PatternTag, class... Args>
1275 : struct PatternImplSpecializationFromTag;
1276 :
1277 : template <class... Args>
1278 : struct PatternImplSpecializationFromTag<Kokkos::ParallelForTag, Args...>
1279 : : type_identity<ParallelFor<Args...>> {};
1280 :
1281 : template <class... Args>
1282 : struct PatternImplSpecializationFromTag<Kokkos::ParallelReduceTag, Args...>
1283 : : type_identity<ParallelReduce<Args...>> {};
1284 :
1285 : template <class... Args>
1286 : struct PatternImplSpecializationFromTag<Kokkos::ParallelScanTag, Args...>
1287 : : type_identity<ParallelScan<Args...>> {};
1288 :
1289 : template <class PatternImpl>
1290 : struct PatternTagFromImplSpecialization;
1291 :
1292 : template <class... Args>
1293 : struct PatternTagFromImplSpecialization<ParallelFor<Args...>>
1294 : : type_identity<ParallelForTag> {};
1295 :
1296 : template <class... Args>
1297 : struct PatternTagFromImplSpecialization<ParallelReduce<Args...>>
1298 : : type_identity<ParallelReduceTag> {};
1299 :
1300 : template <class... Args>
1301 : struct PatternTagFromImplSpecialization<ParallelScan<Args...>>
1302 : : type_identity<ParallelScanTag> {};
1303 :
1304 : } // end namespace Impl
1305 :
1306 : } // namespace Kokkos
1307 : #endif /* #define KOKKOS_EXECPOLICY_HPP */
|