LCOV - code coverage report
Current view: top level - build/_deps/kokkos-src/core/src - Kokkos_ExecPolicy.hpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 38 56 67.9 %
Date: 2026-02-16 14:39:39 Functions: 7 15 46.7 %

          Line data    Source code
       1             : //@HEADER
       2             : // ************************************************************************
       3             : //
       4             : //                        Kokkos v. 4.0
       5             : //       Copyright (2022) National Technology & Engineering
       6             : //               Solutions of Sandia, LLC (NTESS).
       7             : //
       8             : // Under the terms of Contract DE-NA0003525 with NTESS,
       9             : // the U.S. Government retains certain rights in this software.
      10             : //
      11             : // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
      12             : // See https://kokkos.org/LICENSE for license information.
      13             : // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      14             : //
      15             : //@HEADER
      16             : 
      17             : #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
      18             : #include <Kokkos_Macros.hpp>
      19             : static_assert(false,
      20             :               "Including non-public Kokkos header files is not allowed.");
      21             : #endif
      22             : #ifndef KOKKOS_EXECPOLICY_HPP
      23             : #define KOKKOS_EXECPOLICY_HPP
      24             : 
      25             : #include <Kokkos_Core_fwd.hpp>
      26             : #include <impl/Kokkos_Traits.hpp>
      27             : #include <impl/Kokkos_Error.hpp>
      28             : #include <impl/Kokkos_AnalyzePolicy.hpp>
      29             : #include <Kokkos_Concepts.hpp>
      30             : #include <Kokkos_TypeInfo.hpp>
      31             : #ifndef KOKKOS_ENABLE_IMPL_TYPEINFO
      32             : #include <typeinfo>
      33             : #endif
      34             : #include <limits>
      35             : 
      36             : //----------------------------------------------------------------------------
      37             : 
      38             : namespace Kokkos {
      39             : 
      40             : struct ParallelForTag {};
      41             : struct ParallelScanTag {};
      42             : struct ParallelReduceTag {};
      43             : 
      44             : struct ChunkSize {
      45             :   int value;
      46             :   explicit ChunkSize(int value_) : value(value_) {}
      47             : #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
      48             :   template <typename T = void>
      49             :   KOKKOS_DEPRECATED_WITH_COMMENT("ChunkSize should be constructed explicitly.")
      50             :   ChunkSize(int value_) : value(value_) {}
      51             : #endif
      52             : };
      53             : 
      54             : /** \brief  Execution policy for work over a range of an integral type.
      55             :  *
      56             :  * Valid template argument options:
      57             :  *
      58             :  *  With a specified execution space:
      59             :  *    < ExecSpace , WorkTag , { IntConst | IntType } >
      60             :  *    < ExecSpace , WorkTag , void >
      61             :  *    < ExecSpace , { IntConst | IntType } , void >
      62             :  *    < ExecSpace , void , void >
      63             :  *
      64             :  *  With the default execution space:
      65             :  *    < WorkTag , { IntConst | IntType } , void >
      66             :  *    < WorkTag , void , void >
      67             :  *    < { IntConst | IntType } , void , void >
      68             :  *    < void , void , void >
      69             :  *
      70             :  *  IntType  is a fundamental integral type
      71             :  *  IntConst is an Impl::integral_constant< IntType , Blocking >
      72             :  *
      73             :  *  Blocking is the granularity of partitioning the range among threads.
      74             :  */
      75             : template <class... Properties>
      76        3708 : class RangePolicy : public Impl::PolicyTraits<Properties...> {
      77             :  public:
      78             :   using traits = Impl::PolicyTraits<Properties...>;
      79             : 
      80             :  private:
      81             :   typename traits::execution_space m_space;
      82             :   typename traits::index_type m_begin;
      83             :   typename traits::index_type m_end;
      84             :   typename traits::index_type m_granularity;
      85             :   typename traits::index_type m_granularity_mask;
      86             : 
      87             :   template <class... OtherProperties>
      88             :   friend class RangePolicy;
      89             : 
      90             :  public:
      91             :   //! Tag this class as an execution policy
      92             :   using execution_policy = RangePolicy<Properties...>;
      93             :   using member_type      = typename traits::index_type;
      94             :   using index_type       = typename traits::index_type;
      95             : 
      96        2472 :   KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const {
      97        1236 :     return m_space;
      98             :   }
      99           0 :   KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
     100           0 :   KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
     101             : 
     102             :   // TODO: find a better workaround for Clangs weird instantiation order
     103             :   // This thing is here because of an instantiation error, where the RangePolicy
     104             :   // is inserted into FunctorValue Traits, which tries decltype on the operator.
     105             :   // It tries to do this even though the first argument of parallel for clearly
     106             :   // doesn't match.
     107             :   void operator()(const int&) const {}
     108             : 
     109             :   template <class... OtherProperties>
     110             :   RangePolicy(const RangePolicy<OtherProperties...>& p)
     111             :       : traits(p),  // base class may contain data such as desired occupancy
     112             :         m_space(p.m_space),
     113             :         m_begin(p.m_begin),
     114             :         m_end(p.m_end),
     115             :         m_granularity(p.m_granularity),
     116             :         m_granularity_mask(p.m_granularity_mask) {}
     117             : 
     118             :   inline RangePolicy()
     119             :       : m_space(),
     120             :         m_begin(0),
     121             :         m_end(0),
     122             :         m_granularity(0),
     123             :         m_granularity_mask(0) {}
     124             : 
     125             :   /** \brief  Total range */
     126             :   template <typename IndexType1, typename IndexType2,
     127             :             std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
     128             :                               std::is_convertible_v<IndexType2, member_type>),
     129             :                              bool> = false>
     130        1236 :   inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end)
     131        2472 :       : RangePolicy(typename traits::execution_space(), work_begin, work_end) {}
     132             : 
     133             :   /** \brief  Total range */
     134             :   template <typename IndexType1, typename IndexType2,
     135             :             std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
     136             :                               std::is_convertible_v<IndexType2, member_type>),
     137             :                              bool> = false>
     138        1236 :   inline RangePolicy(const typename traits::execution_space& work_space,
     139             :                      const IndexType1 work_begin, const IndexType2 work_end)
     140        1236 :       : m_space(work_space),
     141        1236 :         m_begin(work_begin),
     142        1236 :         m_end(work_end),
     143        1236 :         m_granularity(0),
     144        1236 :         m_granularity_mask(0) {
     145        1236 :     check_conversion_safety(work_begin);
     146        1236 :     check_conversion_safety(work_end);
     147        1236 :     check_bounds_validity();
     148        1236 :     set_auto_chunk_size();
     149        1236 :   }
     150             : 
     151             :   template <typename IndexType1, typename IndexType2,
     152             :             std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
     153             :                               std::is_convertible_v<IndexType2, member_type>),
     154             :                              bool> = false>
     155             :   RangePolicy(const typename traits::execution_space& work_space,
     156             :               const IndexType1 work_begin, const IndexType2 work_end,
     157             :               const ChunkSize chunk_size)
     158             :       : m_space(work_space),
     159             :         m_begin(work_begin),
     160             :         m_end(work_end),
     161             :         m_granularity(0),
     162             :         m_granularity_mask(0) {
     163             :     check_conversion_safety(work_begin);
     164             :     check_conversion_safety(work_end);
     165             :     check_bounds_validity();
     166             :     set_chunk_size(chunk_size.value);
     167             :   }
     168             : 
     169             :   /** \brief  Total range */
     170             :   template <typename IndexType1, typename IndexType2, typename... Args,
     171             :             std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
     172             :                               std::is_convertible_v<IndexType2, member_type>),
     173             :                              bool> = false>
     174             :   RangePolicy(const IndexType1 work_begin, const IndexType2 work_end,
     175             :               const ChunkSize chunk_size)
     176             :       : RangePolicy(typename traits::execution_space(), work_begin, work_end,
     177             :                     chunk_size) {}
     178             : 
     179             :  public:
     180             : #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
     181             :   KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead")
     182             :   inline void set(ChunkSize chunksize) {
     183             :     m_granularity      = chunksize.value;
     184             :     m_granularity_mask = m_granularity - 1;
     185             :   }
     186             : #endif
     187             : 
     188             :  public:
     189             :   /** \brief return chunk_size */
     190             :   inline member_type chunk_size() const { return m_granularity; }
     191             : 
     192             :   /** \brief set chunk_size to a discrete value*/
     193             :   inline RangePolicy& set_chunk_size(int chunk_size) {
     194             :     m_granularity      = chunk_size;
     195             :     m_granularity_mask = m_granularity - 1;
     196             :     return *this;
     197             :   }
     198             : 
     199             :  private:
     200             :   /** \brief finalize chunk_size if it was set to AUTO*/
     201        1236 :   inline void set_auto_chunk_size() {
     202             : #ifdef KOKKOS_ENABLE_SYCL
     203             :     if (std::is_same_v<typename traits::execution_space, Kokkos::SYCL>) {
     204             :       // chunk_size <=1 lets the compiler choose the workgroup size when
     205             :       // launching kernels
     206             :       m_granularity      = 1;
     207             :       m_granularity_mask = 0;
     208             :       return;
     209             :     }
     210             : #endif
     211        1236 :     auto concurrency = static_cast<int64_t>(m_space.concurrency());
     212        1236 :     if (concurrency == 0) concurrency = 1;
     213             : 
     214        1236 :     if (m_granularity > 0) {
     215           0 :       if (!Impl::is_integral_power_of_two(m_granularity))
     216             :         Kokkos::abort("RangePolicy blocking granularity must be power of two");
     217             :     }
     218             : 
     219        1920 :     int64_t new_chunk_size = 1;
     220        1920 :     while (new_chunk_size * 100 * concurrency <
     221        1920 :            static_cast<int64_t>(m_end - m_begin))
     222         684 :       new_chunk_size *= 2;
     223        1236 :     if (new_chunk_size < 128) {
     224             :       new_chunk_size = 1;
     225        3272 :       while ((new_chunk_size * 40 * concurrency <
     226        3272 :               static_cast<int64_t>(m_end - m_begin)) &&
     227             :              (new_chunk_size < 128))
     228        2052 :         new_chunk_size *= 2;
     229             :     }
     230        1236 :     m_granularity      = new_chunk_size;
     231        1236 :     m_granularity_mask = m_granularity - 1;
     232        1236 :   }
     233             : 
     234        2028 :   void check_bounds_validity() {
     235        2028 :     if (m_end < m_begin) {
     236           0 :       std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" +
     237             :                         std::to_string(m_begin) +
     238             :                         ") is greater than the upper bound (" +
     239             :                         std::to_string(m_end) + ").\n";
     240             : #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
     241             :       Kokkos::abort(msg.c_str());
     242             : #endif
     243           0 :       m_begin = 0;
     244           0 :       m_end   = 0;
     245             : #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
     246           0 :       Kokkos::Impl::log_warning(msg);
     247             : #endif
     248           0 :     }
     249        2028 :   }
     250             : 
     251             :   // To be replaced with std::in_range (c++20)
     252             :   template <typename IndexType>
     253        3264 :   static void check_conversion_safety([[maybe_unused]] const IndexType bound) {
     254             :     // Checking that the round-trip conversion preserves input index value
     255             :     if constexpr (std::is_convertible_v<member_type, IndexType>) {
     256             : #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \
     257             :     defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS)
     258             :       bool warn = false;
     259             : 
     260             :       if constexpr (std::is_arithmetic_v<member_type> &&
     261             :                     (std::is_signed_v<IndexType> !=
     262             :                      std::is_signed_v<member_type>)) {
     263             :         // check signed to unsigned
     264             :         if constexpr (std::is_signed_v<IndexType>)
     265             :           warn |= (bound < static_cast<IndexType>(
     266             :                                std::numeric_limits<member_type>::min()));
     267             : 
     268             :         // check unsigned to signed
     269             :         if constexpr (std::is_signed_v<member_type>)
     270             :           warn |= (bound > static_cast<IndexType>(
     271             :                                std::numeric_limits<member_type>::max()));
     272             :       }
     273             : 
     274             :       // check narrowing
     275         144 :       warn |=
     276             :           (static_cast<IndexType>(static_cast<member_type>(bound)) != bound);
     277             : 
     278        3120 :       if (warn) {
     279           0 :         std::string msg =
     280             :             "Kokkos::RangePolicy bound type error: an unsafe implicit "
     281             :             "conversion is performed on a bound (" +
     282             :             std::to_string(bound) +
     283             :             "), which may not preserve its original value.\n";
     284             : 
     285             : #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
     286             :         Kokkos::abort(msg.c_str());
     287             : #endif
     288             : 
     289             : #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
     290           0 :         Kokkos::Impl::log_warning(msg);
     291             : #endif
     292           0 :       }
     293             : #endif
     294             :     }
     295        3120 :   }
     296             : 
     297             :  public:
     298             :   /** \brief  Subrange for a partition's rank and size.
     299             :    *
     300             :    *  Typically used to partition a range over a group of threads.
     301             :    */
     302             :   struct WorkRange {
     303             :     using work_tag    = typename RangePolicy<Properties...>::work_tag;
     304             :     using member_type = typename RangePolicy<Properties...>::member_type;
     305             : 
     306             :     KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
     307             :     KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
     308             : 
     309             :     /** \brief  Subrange for a partition's rank and size.
     310             :      *
     311             :      *  Typically used to partition a range over a group of threads.
     312             :      */
     313             :     KOKKOS_INLINE_FUNCTION
     314             :     WorkRange(const RangePolicy& range, const int part_rank,
     315             :               const int part_size)
     316             :         : m_begin(0), m_end(0) {
     317             :       if (part_size) {
     318             :         // Split evenly among partitions, then round up to the granularity.
     319             :         const member_type work_part =
     320             :             ((((range.end() - range.begin()) + (part_size - 1)) / part_size) +
     321             :              range.m_granularity_mask) &
     322             :             ~member_type(range.m_granularity_mask);
     323             : 
     324             :         m_begin = range.begin() + work_part * part_rank;
     325             :         m_end   = m_begin + work_part;
     326             : 
     327             :         if (range.end() < m_begin) m_begin = range.end();
     328             :         if (range.end() < m_end) m_end = range.end();
     329             :       }
     330             :     }
     331             : 
     332             :    private:
     333             :     member_type m_begin;
     334             :     member_type m_end;
     335             :     WorkRange();
     336             :     WorkRange& operator=(const WorkRange&);
     337             :   };
     338             : };
     339             : 
     340             : RangePolicy() -> RangePolicy<>;
     341             : 
     342             : RangePolicy(int64_t, int64_t) -> RangePolicy<>;
     343             : RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>;
     344             : 
     345             : RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>;
     346             : RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&)
     347             :     -> RangePolicy<>;
     348             : 
     349             : template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
     350             : RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy<ES>;
     351             : 
     352             : template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
     353             : RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy<ES>;
     354             : 
     355             : }  // namespace Kokkos
     356             : 
     357             : //----------------------------------------------------------------------------
     358             : //----------------------------------------------------------------------------
     359             : 
     360             : namespace Kokkos {
     361             : 
     362             : namespace Impl {
     363             : 
     364             : template <class ExecSpace, class... Properties>
     365             : class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> {
     366             :  private:
     367             :   using traits = Impl::PolicyTraits<Properties...>;
     368             : 
     369             :  public:
     370             :   using index_type = typename traits::index_type;
     371             : 
     372             :   //----------------------------------------
     373             :   /** \brief  Query maximum team size for a given functor.
     374             :    *
     375             :    *  This size takes into account execution space concurrency limitations and
     376             :    *  scratch memory space limitations for reductions, team reduce/scan, and
     377             :    *  team shared memory.
     378             :    *
     379             :    *  This function only works for single-operator functors.
     380             :    *  With multi-operator functors it cannot be determined
     381             :    *  which operator will be called.
     382             :    */
     383             :   template <class FunctorType>
     384             :   static int team_size_max(const FunctorType&);
     385             : 
     386             :   /** \brief  Query recommended team size for a given functor.
     387             :    *
     388             :    *  This size takes into account execution space concurrency limitations and
     389             :    *  scratch memory space limitations for reductions, team reduce/scan, and
     390             :    *  team shared memory.
     391             :    *
     392             :    *  This function only works for single-operator functors.
     393             :    *  With multi-operator functors it cannot be determined
     394             :    *  which operator will be called.
     395             :    */
     396             :   template <class FunctorType>
     397             :   static int team_size_recommended(const FunctorType&);
     398             : 
     399             :   template <class FunctorType>
     400             :   static int team_size_recommended(const FunctorType&, const int&);
     401             : 
     402             :   template <class FunctorType>
     403             :   int team_size_recommended(const FunctorType& functor,
     404             :                             const int vector_length);
     405             : 
     406             :   //----------------------------------------
     407             :   /** \brief  Construct policy with the given instance of the execution space */
     408             :   TeamPolicyInternal(const typename traits::execution_space&,
     409             :                      int league_size_request, int team_size_request,
     410             :                      int vector_length_request = 1);
     411             : 
     412             :   TeamPolicyInternal(const typename traits::execution_space&,
     413             :                      int league_size_request, const Kokkos::AUTO_t&,
     414             :                      int vector_length_request = 1);
     415             : 
     416             :   /** \brief  Construct policy with the default instance of the execution space
     417             :    */
     418             :   TeamPolicyInternal(int league_size_request, int team_size_request,
     419             :                      int vector_length_request = 1);
     420             : 
     421             :   TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t&,
     422             :                      int vector_length_request = 1);
     423             : 
     424             :   /*  TeamPolicyInternal( int league_size_request , int team_size_request );
     425             : 
     426             :     TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
     427             : 
     428             :   /** \brief  The actual league size (number of teams) of the policy.
     429             :    *
     430             :    *  This may be smaller than the requested league size due to limitations
     431             :    *  of the execution space.
     432             :    */
     433             :   KOKKOS_INLINE_FUNCTION int league_size() const;
     434             : 
     435             :   /** \brief  The actual team size (number of threads per team) of the policy.
     436             :    *
     437             :    *  This may be smaller than the requested team size due to limitations
     438             :    *  of the execution space.
     439             :    */
     440             :   KOKKOS_INLINE_FUNCTION int team_size() const;
     441             : 
     442             :   /** \brief Whether the policy has an automatically determined team size
     443             :    */
     444             :   inline bool impl_auto_team_size() const;
     445             :   /** \brief Whether the policy has an automatically determined vector length
     446             :    */
     447             :   inline bool impl_auto_vector_length() const;
     448             : 
     449             :   static int vector_length_max();
     450             : 
     451             :   KOKKOS_INLINE_FUNCTION int impl_vector_length() const;
     452             : 
     453             :   inline typename traits::index_type chunk_size() const;
     454             : 
     455             :   inline TeamPolicyInternal& set_chunk_size(int chunk_size);
     456             : 
     457             :   /** \brief  Parallel execution of a functor calls the functor once with
     458             :    *          each member of the execution policy.
     459             :    */
     460             :   struct member_type {
     461             :     /** \brief  Handle to the currently executing team shared scratch memory */
     462             :     KOKKOS_INLINE_FUNCTION
     463             :     typename traits::execution_space::scratch_memory_space team_shmem() const;
     464             : 
     465             :     /** \brief  Rank of this team within the league of teams */
     466             :     KOKKOS_INLINE_FUNCTION int league_rank() const;
     467             : 
     468             :     /** \brief  Number of teams in the league */
     469             :     KOKKOS_INLINE_FUNCTION int league_size() const;
     470             : 
     471             :     /** \brief  Rank of this thread within this team */
     472             :     KOKKOS_INLINE_FUNCTION int team_rank() const;
     473             : 
     474             :     /** \brief  Number of threads in this team */
     475             :     KOKKOS_INLINE_FUNCTION int team_size() const;
     476             : 
     477             :     /** \brief  Barrier among the threads of this team */
     478             :     KOKKOS_INLINE_FUNCTION void team_barrier() const;
     479             : 
     480             :     /** \brief  Intra-team reduction. Returns join of all values of the team
     481             :      * members. */
     482             :     template <class JoinOp>
     483             :     KOKKOS_INLINE_FUNCTION typename JoinOp::value_type team_reduce(
     484             :         const typename JoinOp::value_type, const JoinOp&) const;
     485             : 
     486             :     /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
     487             :      *
     488             :      *  The highest rank thread can compute the reduction total as
     489             :      *    reduction_total = dev.team_scan( value ) + value ;
     490             :      */
     491             :     template <typename Type>
     492             :     KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const;
     493             : 
     494             :     /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
     495             :      *          with intra-team non-deterministic ordering accumulation.
     496             :      *
     497             :      *  The global inter-team accumulation value will, at the end of the
     498             :      *  league's parallel execution, be the scan's total.
     499             :      *  Parallel execution ordering of the league's teams is non-deterministic.
     500             :      *  As such the base value for each team's scan operation is similarly
     501             :      *  non-deterministic.
     502             :      */
     503             :     template <typename Type>
     504             :     KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
     505             :                                           Type* const global_accum) const;
     506             :   };
     507             : };
     508             : 
     509             : struct PerTeamValue {
     510             :   size_t value;
     511             :   PerTeamValue(size_t arg);
     512             : };
     513             : 
     514             : struct PerThreadValue {
     515             :   size_t value;
     516             :   PerThreadValue(size_t arg);
     517             : };
     518             : 
     519             : template <class iType, class... Args>
     520             : struct ExtractVectorLength {
     521             :   static inline iType value(
     522             :       std::enable_if_t<std::is_integral_v<iType>, iType> val, Args...) {
     523             :     return val;
     524             :   }
     525             :   static inline std::enable_if_t<!std::is_integral_v<iType>, int> value(
     526             :       std::enable_if_t<!std::is_integral_v<iType>, iType>, Args...) {
     527             :     return 1;
     528             :   }
     529             : };
     530             : 
     531             : template <class iType, class... Args>
     532             : inline std::enable_if_t<std::is_integral_v<iType>, iType> extract_vector_length(
     533             :     iType val, Args...) {
     534             :   return val;
     535             : }
     536             : 
     537             : template <class iType, class... Args>
     538             : inline std::enable_if_t<!std::is_integral_v<iType>, int> extract_vector_length(
     539             :     iType, Args...) {
     540             :   return 1;
     541             : }
     542             : 
     543             : }  // namespace Impl
     544             : 
     545             : Impl::PerTeamValue PerTeam(const size_t& arg);
     546             : Impl::PerThreadValue PerThread(const size_t& arg);
     547             : 
     548             : struct ScratchRequest {  // Bundles a scratch level with a per-team and a per-thread amount.
     549             :   int level;  // Stored as given; these constructors perform no range check.
     550             : 
     551             :   size_t per_team;  // Copied from PerTeamValue::value, or 0 when none is passed.
     552             :   size_t per_thread;  // Copied from PerThreadValue::value, or 0 when none is passed.
     553             : 
     554             :   inline ScratchRequest(const int& level_,  // Per-team amount only; per_thread becomes 0.
     555             :                         const Impl::PerTeamValue& team_value) {
     556             :     level      = level_;
     557             :     per_team   = team_value.value;
     558             :     per_thread = 0;
     559             :   }
     560             : 
     561             :   inline ScratchRequest(const int& level_,  // Per-thread amount only; per_team becomes 0.
     562             :                         const Impl::PerThreadValue& thread_value) {
     563             :     level      = level_;
     564             :     per_team   = 0;
     565             :     per_thread = thread_value.value;
     566             :   }
     567             : 
     568             :   inline ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value,  // Both amounts, team first.
     569             :                         const Impl::PerThreadValue& thread_value) {
     570             :     level      = level_;
     571             :     per_team   = team_value.value;
     572             :     per_thread = thread_value.value;
     573             :   }
     574             : 
     575             :   inline ScratchRequest(const int& level_,  // Both amounts, thread first; stores the same fields as the overload above.
     576             :                         const Impl::PerThreadValue& thread_value,
     577             :                         const Impl::PerTeamValue& team_value) {
     578             :     level      = level_;
     579             :     per_team   = team_value.value;
     580             :     per_thread = thread_value.value;
     581             :   }
     582             : };
     583             : 
     584             : // Causes abnormal program termination if level is not `0` or `1`
     585             : void team_policy_check_valid_storage_level_argument(int level);
     586             : 
     587             : /** \brief  Execution policy for parallel work over a league of teams of
     588             :  * threads.
     589             :  *
     590             :  *  The work functor is called for each thread of each team such that
     591             :  *  the team's member threads are guaranteed to be concurrent.
     592             :  *
     593             :  *  The team's threads have access to team shared scratch memory and
     594             :  *  team collective operations.
     595             :  *
     596             :  *  If the WorkTag is non-void then the first calling argument of the
     597             :  *  work functor's parentheses operator is 'const WorkTag &'.
     598             :  *  This allows a functor to have multiple work member functions.
     599             :  *
     600             :  *  Order of template arguments does not matter, since the implementation
     601             :  *  uses variadic templates. Each and any of the template arguments can
     602             :  *  be omitted.
     603             :  *
     604             :  *  Possible template arguments and their default values:
     605             :  *    ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be
     606             :  *      enabled.
     607             :  *    WorkTag (none): tag used as the first argument of the functor operator.
     608             :  *    Schedule<Type> (Schedule<Static>): scheduling policy (Dynamic or Static).
     609             :  *    IndexType<Type> (IndexType<ExecutionSpace::size_type>): integer index
     610             :  *      type used to iterate over the index space.
     611             :  *    LaunchBounds<unsigned,unsigned> (LaunchBounds<0,0>, meaning no launch bounds specified): launch bounds for CUDA compilation.
     612             :  */
     613             : template <class... Properties>
     614             : class TeamPolicy  // Public policy type; derives from Impl::TeamPolicyInternal parameterized on the traits' execution space.
     615             :     : public Impl::TeamPolicyInternal<
     616             :           typename Impl::PolicyTraits<Properties...>::execution_space,
     617             :           Properties...> {
     618             :   using internal_policy = Impl::TeamPolicyInternal<  // Shorthand for the direct base class.
     619             :       typename Impl::PolicyTraits<Properties...>::execution_space,
     620             :       Properties...>;
     621             : 
     622             :   template <class... OtherProperties>
     623             :   friend class TeamPolicy;  // Every other TeamPolicy instantiation may access private members.
     624             : 
     625             :  public:
     626             :   using traits = Impl::PolicyTraits<Properties...>;
     627             : 
     628             :   using execution_policy = TeamPolicy<Properties...>;
     629             : 
     630             :   TeamPolicy() : internal_policy(0, AUTO) {}  // Default: forwards (0, AUTO), the league-size and team-size slots of the constructors below.
     631             : 
     632             :   /** \brief  Construct policy with the given instance of the execution space */
     633             :   TeamPolicy(const typename traits::execution_space& space_,
     634             :              int league_size_request, int team_size_request,
     635             :              int vector_length_request = 1)
     636             :       : internal_policy(space_, league_size_request, team_size_request,
     637             :                         vector_length_request) {}
     638             : 
     639             :   TeamPolicy(const typename traits::execution_space& space_,  // AUTO team size, explicit vector length.
     640             :              int league_size_request, const Kokkos::AUTO_t&,
     641             :              int vector_length_request = 1)
     642             :       : internal_policy(space_, league_size_request, Kokkos::AUTO(),
     643             :                         vector_length_request) {}
     644             : 
     645             :   TeamPolicy(const typename traits::execution_space& space_,  // AUTO team size and AUTO vector length.
     646             :              int league_size_request, const Kokkos::AUTO_t&,
     647             :              const Kokkos::AUTO_t&)
     648             :       : internal_policy(space_, league_size_request, Kokkos::AUTO(),
     649             :                         Kokkos::AUTO()) {}
     650             :   TeamPolicy(const typename traits::execution_space& space_,  // Explicit team size, AUTO vector length.
     651             :              int league_size_request, const int team_size_request,
     652             :              const Kokkos::AUTO_t&)
     653             :       : internal_policy(space_, league_size_request, team_size_request,
     654             :                         Kokkos::AUTO()) {}
     655             :   /** \brief  Construct policy with the default instance of the execution space
     656             :    */
     657             :   TeamPolicy(int league_size_request, int team_size_request,
     658             :              int vector_length_request = 1)
     659             :       : internal_policy(league_size_request, team_size_request,
     660             :                         vector_length_request) {}
     661             : 
     662             :   TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,  // AUTO team size, explicit vector length.
     663             :              int vector_length_request = 1)
     664             :       : internal_policy(league_size_request, Kokkos::AUTO(),
     665             :                         vector_length_request) {}
     666             : 
     667             :   TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,  // AUTO team size and AUTO vector length.
     668             :              const Kokkos::AUTO_t&)
     669             :       : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {}
     670             :   TeamPolicy(int league_size_request, const int team_size_request,  // Explicit team size, AUTO vector length.
     671             :              const Kokkos::AUTO_t&)
     672             :       : internal_policy(league_size_request, team_size_request,
     673             :                         Kokkos::AUTO()) {}
     674             : 
     675             :   template <class... OtherProperties>
     676             :   TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) {  // Converting constructor. NOTE(review): p is taken by value, so the source policy is copied first - confirm intended.
     677             :     // Cannot call converting constructor in the member initializer list because
     678             :     // it is not a direct base.
     679             :     internal_policy::traits::operator=(p);
     680             :   }
     681             : 
     682             :  private:
     683             :   TeamPolicy(const internal_policy& p) : internal_policy(p) {}  // Wraps an already-built internal policy by copying it into the base.
     684             : 
     685             :  public:
     686             :   inline TeamPolicy& set_chunk_size(int chunk) {  // Forwards to the base and re-types the returned reference as TeamPolicy&.
     687             :     static_assert(
     688             :         std::is_same_v<decltype(internal_policy::set_chunk_size(chunk)),
     689             :                        internal_policy&>,
     690             :         "internal set_chunk_size should return a reference");
     691             :     return static_cast<TeamPolicy&>(internal_policy::set_chunk_size(chunk));
     692             :   }
     693             : 
     694             :   inline TeamPolicy& set_scratch_size(const int& level,  // Per-team amount only.
     695             :                                       const Impl::PerTeamValue& per_team) {
     696             :     static_assert(std::is_same_v<decltype(internal_policy::set_scratch_size(
     697             :                                      level, per_team)),
     698             :                                  internal_policy&>,
     699             :                   "internal set_chunk_size should return a reference");  // NOTE(review): the message names set_chunk_size although the check is on set_scratch_size.
     700             : 
     701             :     team_policy_check_valid_storage_level_argument(level);  // Per its declaration comment: terminates unless level is 0 or 1.
     702             :     return static_cast<TeamPolicy&>(
     703             :         internal_policy::set_scratch_size(level, per_team));
     704             :   }
     705             :   inline TeamPolicy& set_scratch_size(const int& level,  // Per-thread amount only.
     706             :                                       const Impl::PerThreadValue& per_thread) {
     707             :     team_policy_check_valid_storage_level_argument(level);
     708             :     return static_cast<TeamPolicy&>(
     709             :         internal_policy::set_scratch_size(level, per_thread));
     710             :   }
     711             :   inline TeamPolicy& set_scratch_size(const int& level,  // Both amounts, team first.
     712             :                                       const Impl::PerTeamValue& per_team,
     713             :                                       const Impl::PerThreadValue& per_thread) {
     714             :     team_policy_check_valid_storage_level_argument(level);
     715             :     return static_cast<TeamPolicy&>(
     716             :         internal_policy::set_scratch_size(level, per_team, per_thread));
     717             :   }
     718             :   inline TeamPolicy& set_scratch_size(const int& level,  // Both amounts, thread first; forwarded in (per_team, per_thread) order.
     719             :                                       const Impl::PerThreadValue& per_thread,
     720             :                                       const Impl::PerTeamValue& per_team) {
     721             :     team_policy_check_valid_storage_level_argument(level);
     722             :     return static_cast<TeamPolicy&>(
     723             :         internal_policy::set_scratch_size(level, per_team, per_thread));
     724             :   }
     725             : };
     726             : 
     727             : // Execution space not provided deduces to TeamPolicy<>
     728             : 
     729             : TeamPolicy() -> TeamPolicy<>;
     730             : 
     731             : TeamPolicy(int, int) -> TeamPolicy<>;
     732             : TeamPolicy(int, int, int) -> TeamPolicy<>;
     733             : TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>;
     734             : TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>;
     735             : TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>;
     736             : TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>;
     737             : 
     738             : // DefaultExecutionSpace deduces to TeamPolicy<>
     739             : 
     740             : TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>;
     741             : TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>;
     742             : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&)
     743             :     -> TeamPolicy<>;
     744             : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int)
     745             :     -> TeamPolicy<>;
     746             : TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&,
     747             :            Kokkos::AUTO_t const&) -> TeamPolicy<>;
     748             : TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&)
     749             :     -> TeamPolicy<>;
     750             : 
     751             : // ES != DefaultExecutionSpace deduces to TeamPolicy<ES>
     752             : 
     753             : template <typename ES,
     754             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     755             : TeamPolicy(ES const&, int, int) -> TeamPolicy<ES>;
     756             : 
     757             : template <typename ES,
     758             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     759             : TeamPolicy(ES const&, int, int, int) -> TeamPolicy<ES>;
     760             : 
     761             : template <typename ES,
     762             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     763             : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>;
     764             : 
     765             : template <typename ES,
     766             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     767             : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy<ES>;
     768             : 
     769             : template <typename ES,
     770             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     771             : TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)
     772             :     -> TeamPolicy<ES>;
     773             : 
     774             : template <typename ES,
     775             :           typename = std::enable_if_t<Kokkos::is_execution_space_v<ES>>>
     776             : TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy<ES>;
     777             : 
     778             : namespace Impl {
     779             : 
     780             : template <typename iType, class TeamMemberType>
     781             : struct TeamThreadRangeBoundariesStruct {  // One team member's start/end slice of an index range, derived from its team rank and the team size.
     782             :  private:
     783             :   KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,  // Returns begin + chunk * rank, with chunk = (end - begin + size - 1) / size.
     784             :                                              const iType& arg_end,
     785             :                                              const iType& arg_rank,
     786             :                                              const iType& arg_size) {
     787             :     return arg_begin +
     788             :            ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
     789             :   }
     790             : 
     791             :   KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,  // Returns begin + chunk * (rank + 1), clamped so it never exceeds arg_end.
     792             :                                            const iType& arg_end,
     793             :                                            const iType& arg_rank,
     794             :                                            const iType& arg_size) {
     795             :     const iType end_ =
     796             :         arg_begin +
     797             :         ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
     798             :     return end_ < arg_end ? end_ : arg_end;
     799             :   }
     800             : 
     801             :  public:
     802             :   using index_type = iType;
     803             :   const iType start;  // Result of ibegin for this member.
     804             :   const iType end;  // Result of iend for this member.
     805             :   enum { increment = 1 };
     806             :   const TeamMemberType& thread;  // Reference member: the team handle must outlive this object.
     807             : 
     808             :   KOKKOS_INLINE_FUNCTION
     809             :   TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,  // Range beginning at 0; rank and size come from team_rank()/team_size().
     810             :                                   const iType& arg_end)
     811             :       : start(
     812             :             ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
     813             :         end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
     814             :         thread(arg_thread) {}
     815             : 
     816             :   KOKKOS_INLINE_FUNCTION
     817             :   TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,  // Range with an explicit begin.
     818             :                                   const iType& arg_begin, const iType& arg_end)
     819             :       : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
     820             :                      arg_thread.team_size())),
     821             :         end(iend(arg_begin, arg_end, arg_thread.team_rank(),
     822             :                  arg_thread.team_size())),
     823             :         thread(arg_thread) {}
     824             : };
     825             : 
     826             : template <typename iType, class TeamMemberType>
     827             : struct TeamVectorRangeBoundariesStruct {  // Same members and arithmetic as TeamThreadRangeBoundariesStruct, as a distinct type.
     828             :  private:
     829             :   KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,  // Returns begin + chunk * rank, with chunk = (end - begin + size - 1) / size.
     830             :                                              const iType& arg_end,
     831             :                                              const iType& arg_rank,
     832             :                                              const iType& arg_size) {
     833             :     return arg_begin +
     834             :            ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
     835             :   }
     836             : 
     837             :   KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,  // Returns begin + chunk * (rank + 1), clamped so it never exceeds arg_end.
     838             :                                            const iType& arg_end,
     839             :                                            const iType& arg_rank,
     840             :                                            const iType& arg_size) {
     841             :     const iType end_ =
     842             :         arg_begin +
     843             :         ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
     844             :     return end_ < arg_end ? end_ : arg_end;
     845             :   }
     846             : 
     847             :  public:
     848             :   using index_type = iType;
     849             :   const iType start;  // Result of ibegin for this member.
     850             :   const iType end;  // Result of iend for this member.
     851             :   enum { increment = 1 };
     852             :   const TeamMemberType& thread;  // Reference member: the team handle must outlive this object.
     853             : 
     854             :   KOKKOS_INLINE_FUNCTION
     855             :   TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,  // Range beginning at 0; rank and size come from team_rank()/team_size().
     856             :                                   const iType& arg_end)
     857             :       : start(
     858             :             ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
     859             :         end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
     860             :         thread(arg_thread) {}
     861             : 
     862             :   KOKKOS_INLINE_FUNCTION
     863             :   TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,  // Range with an explicit begin.
     864             :                                   const iType& arg_begin, const iType& arg_end)
     865             :       : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
     866             :                      arg_thread.team_size())),
     867             :         end(iend(arg_begin, arg_end, arg_thread.team_rank(),
     868             :                  arg_thread.team_size())),
     869             :         thread(arg_thread) {}
     870             : };
     871             : 
     872             : template <typename iType, class TeamMemberType>
     873             : struct ThreadVectorRangeBoundariesStruct {  // Plain start/end pair; no per-rank split is computed in this struct.
     874             :   using index_type = iType;
     875             :   const index_type start;
     876             :   const index_type end;
     877             :   enum { increment = 1 };
     878             : 
     879             :   KOKKOS_INLINE_FUNCTION
     880             :   constexpr ThreadVectorRangeBoundariesStruct(const TeamMemberType,  // The team handle is unnamed, unused and passed by value.
     881             :                                               const index_type& count) noexcept
     882             :       : start(static_cast<index_type>(0)), end(count) {}  // start = 0, end = count.
     883             : 
     884             :   KOKKOS_INLINE_FUNCTION
     885             :   constexpr ThreadVectorRangeBoundariesStruct(
     886             :       const TeamMemberType, const index_type& arg_begin,
     887             :       const index_type& arg_end) noexcept
     888             :       : start(static_cast<index_type>(arg_begin)), end(arg_end) {}  // Both bounds are stored exactly as given.
     889             : };
     890             : 
     891             : template <class TeamMemberType>
     892             : struct ThreadSingleStruct {  // Thin wrapper that only carries a reference to a team member.
     893             :   const TeamMemberType& team_member;  // Reference member: the wrapped handle must outlive this object.
     894             :   KOKKOS_INLINE_FUNCTION
     895             :   ThreadSingleStruct(const TeamMemberType& team_member_)  // Binds the reference; nothing is copied.
     896             :       : team_member(team_member_) {}
     897             : };
     898             : 
     899             : template <class TeamMemberType>
     900             : struct VectorSingleStruct {  // Thin wrapper that only carries a reference to a team member; same layout as ThreadSingleStruct but a distinct type.
     901             :   const TeamMemberType& team_member;  // Reference member: the wrapped handle must outlive this object.
     902             :   KOKKOS_INLINE_FUNCTION
     903             :   VectorSingleStruct(const TeamMemberType& team_member_)  // Binds the reference; nothing is copied.
     904             :       : team_member(team_member_) {}
     905             : };
     906             : 
     907             : }  // namespace Impl
     908             : 
     909             : /** \brief  Execution policy for parallel work over the threads within a team.
     910             :  *
     911             :  *  The range is split over all threads in a team. The mapping scheme depends on
     912             :  * the architecture. This policy is used together with a parallel pattern as a
     913             :  * nested layer within a kernel launched with the TeamPolicy. This variant
     914             :  * expects a single count, so the range is [0,count).
     915             :  */
     916             : template <typename iType, class TeamMemberType, class _never_use_this_overload>
     917             : KOKKOS_INLINE_FUNCTION_DELETED
     918             :     Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
     919             :     TeamThreadRange(const TeamMemberType&, const iType& count) = delete;
     920             : 
     921             : /** \brief  Execution policy for parallel work over the threads within a team.
     922             :  *
     923             :  *  The range is split over all threads in a team. The mapping scheme depends on
     924             :  * the architecture. This policy is used together with a parallel pattern as a
     925             :  * nested layer within a kernel launched with the TeamPolicy. This variant
     926             :  * expects a begin and an end, so the range is [begin,end).
     927             :  */
     928             : template <typename iType1, typename iType2, class TeamMemberType,
     929             :           class _never_use_this_overload>
     930             : KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
     931             :     std::common_type_t<iType1, iType2>, TeamMemberType>
     932             : TeamThreadRange(const TeamMemberType&, const iType1& begin,
     933             :                 const iType2& end) = delete;
     934             : 
     935             : /** \brief  Execution policy for parallel work over the threads within a team.
     936             :  *
     937             :  *  The range is split over all threads in a team. The mapping scheme depends on
     938             :  * the architecture. This policy is used together with a parallel pattern as a
     939             :  * nested layer within a kernel launched with the TeamPolicy. This variant
     940             :  * expects a single count, so the range is [0,count).
     941             :  */
     942             : template <typename iType, class TeamMemberType, class _never_use_this_overload>
     943             : KOKKOS_INLINE_FUNCTION_DELETED
     944             :     Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
     945             :     TeamVectorRange(const TeamMemberType&, const iType& count) = delete;
     946             : 
     947             : /** \brief  Execution policy for parallel work over the threads within a team.
     948             :  *
     949             :  *  The range is split over all threads in a team. The mapping scheme depends on
     950             :  * the architecture. This policy is used together with a parallel pattern as a
     951             :  * nested layer within a kernel launched with the TeamPolicy. This variant
     952             :  * expects a begin and an end, so the range is [begin,end).
     953             :  */
     954             : template <typename iType1, typename iType2, class TeamMemberType,
     955             :           class _never_use_this_overload>
     956             : KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
     957             :     std::common_type_t<iType1, iType2>, TeamMemberType>
     958             : TeamVectorRange(const TeamMemberType&, const iType1& begin,
     959             :                 const iType2& end) = delete;
     960             : 
     961             : /** \brief  Execution policy for a vector parallel loop.
     962             :  *
     963             :  *  The range is split over all vector lanes in a thread. The mapping scheme
     964             :  * depends on the architecture. This policy is used together with a parallel
     965             :  * pattern as a nested layer within a kernel launched with the TeamPolicy. This
     966             :  * variant expects a single count, so the range is [0,count).
     967             :  */
     968             : template <typename iType, class TeamMemberType, class _never_use_this_overload>
     969             : KOKKOS_INLINE_FUNCTION_DELETED
     970             :     Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType>
     971             :     ThreadVectorRange(const TeamMemberType&, const iType& count) = delete;
     972             : 
     973             : template <typename iType1, typename iType2, class TeamMemberType,
     974             :           class _never_use_this_overload>
     975             : KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct<
     976             :     std::common_type_t<iType1, iType2>, TeamMemberType>
     977             : ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin,
     978             :                   const iType2& arg_end) = delete;
     979             : 
     980             : namespace Impl {
     981             : 
     982             : enum class TeamMDRangeLastNestLevel : bool { NotLastNestLevel, LastNestLevel };
     983             : enum class TeamMDRangeParThread : bool { NotParThread, ParThread };
     984             : enum class TeamMDRangeParVector : bool { NotParVector, ParVector };
     985             : enum class TeamMDRangeThreadAndVector : bool { NotBoth, Both };
     986             : 
     987             : template <typename Rank, TeamMDRangeThreadAndVector ThreadAndVector>
     988             : struct HostBasedNestLevel;
     989             : 
     990             : template <typename Rank, TeamMDRangeThreadAndVector ThreadAndVector>
     991             : struct AcceleratorBasedNestLevel;
     992             : 
     993             : // ThreadAndVectorNestLevel determines on which nested level parallelization
     994             : // happens.
     995             : //   - Rank is Kokkos::Rank<TotalNestLevel, Iter>
     996             : //     - TotalNestLevel is the total number of loop nests
     997             : //     - Iter is whether to go forward or backward through ranks (i.e. the
     998             : //       iteration order for MDRangePolicy)
     999             : //   - ThreadAndVector determines whether both vector and thread parallelism is
    1000             : //     in use
    1001             : template <typename Rank, typename ExecSpace,
    1002             :           TeamMDRangeThreadAndVector ThreadAndVector>
    1003             : struct ThreadAndVectorNestLevel;
    1004             : 
    1005             : struct NoReductionTag {};
    1006             : 
    1007             : template <typename Rank, typename TeamMDPolicy, typename Lambda,
    1008             :           typename ReductionValueType>
    1009             : KOKKOS_INLINE_FUNCTION void md_parallel_impl(TeamMDPolicy const& policy,
    1010             :                                              Lambda const& lambda,
    1011             :                                              ReductionValueType&& val);
    1012             : }  // namespace Impl
    1013             : 
    1014             : template <typename Rank, typename TeamHandle>
    1015             : struct TeamThreadMDRange;
    1016             : 
    1017             : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
    1018             : struct TeamThreadMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {  // Specialization for Rank<...>: a team handle plus one int bound per nest level.
    1019             :   using NestLevelType  = int;
    1020             :   using BoundaryType   = int;  // Every bound is stored as int (see the static_cast in the constructor).
    1021             :   using TeamHandleType = TeamHandle;
    1022             :   using ExecutionSpace = typename TeamHandleType::execution_space;
    1023             :   using ArrayLayout    = typename ExecutionSpace::array_layout;
    1024             : 
    1025             :   static constexpr NestLevelType total_nest_level =  // Number of bounds the constructor must receive.
    1026             :       Rank<N, OuterDir, InnerDir>::rank;
    1027             :   static constexpr Iterate iter    = OuterDir;
    1028             :   static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;  // This policy is flagged ParThread ...
    1029             :   static constexpr auto par_vector = Impl::TeamMDRangeParVector::NotParVector;  // ... and NotParVector.
    1030             : 
    1031             :   static constexpr Iterate direction =  // OuterDir, unless it is Iterate::Default, in which case the layout's outer_iteration_pattern is used.
    1032             :       OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector<
    1033             :                                          ArrayLayout>::outer_iteration_pattern
    1034             :                                    : iter;
    1035             : 
    1036             :   template <class... Args>
    1037             :   KOKKOS_FUNCTION TeamThreadMDRange(TeamHandleType const& team_, Args&&... args)  // Takes the team handle followed by one bound per nest level.
    1038             :       : team(team_), boundaries{static_cast<BoundaryType>(args)...} {  // Each bound is converted to int here.
    1039             :     static_assert(sizeof...(Args) == total_nest_level);  // Compile-time check that the bound count matches the rank.
    1040             :   }
    1041             : 
    1042             :   TeamHandleType const& team;  // Reference member: the team handle must outlive this object.
    1043             :   BoundaryType boundaries[total_nest_level];
    1044             : };
    1045             : 
    1046             : template <typename TeamHandle, typename... Args>
    1047             : KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...)
    1048             :     -> TeamThreadMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
    1049             : 
    1050             : template <typename Rank, typename TeamHandle>
    1051             : struct ThreadVectorMDRange;
    1052             : 
    1053             : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
    1054             : struct ThreadVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
    1055             :   using NestLevelType  = int;
    1056             :   using BoundaryType   = int;
    1057             :   using TeamHandleType = TeamHandle;
    1058             :   using ExecutionSpace = typename TeamHandleType::execution_space;
    1059             :   using ArrayLayout    = typename ExecutionSpace::array_layout;
    1060             : 
    1061             :   static constexpr NestLevelType total_nest_level =
    1062             :       Rank<N, OuterDir, InnerDir>::rank;
    1063             :   static constexpr Iterate iter    = OuterDir;
    1064             :   static constexpr auto par_thread = Impl::TeamMDRangeParThread::NotParThread;
    1065             :   static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
    1066             : 
    1067             :   static constexpr Iterate direction =
    1068             :       OuterDir == Iterate::Default ? Impl::layout_iterate_type_selector<
    1069             :                                          ArrayLayout>::outer_iteration_pattern
    1070             :                                    : iter;
    1071             : 
    1072             :   template <class... Args>
    1073             :   KOKKOS_INLINE_FUNCTION ThreadVectorMDRange(TeamHandleType const& team_,
    1074             :                                              Args&&... args)
    1075             :       : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
    1076             :     static_assert(sizeof...(Args) == total_nest_level);
    1077             :   }
    1078             : 
    1079             :   TeamHandleType const& team;
    1080             :   BoundaryType boundaries[total_nest_level];
    1081             : };
    1082             : 
    1083             : template <typename TeamHandle, typename... Args>
    1084             : KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...)
    1085             :     -> ThreadVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
    1086             : 
    1087             : template <typename Rank, typename TeamHandle>
    1088             : struct TeamVectorMDRange;
    1089             : 
    1090             : template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
    1091             : struct TeamVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
    1092             :   using NestLevelType  = int;
    1093             :   using BoundaryType   = int;
    1094             :   using TeamHandleType = TeamHandle;
    1095             :   using ExecutionSpace = typename TeamHandleType::execution_space;
    1096             :   using ArrayLayout    = typename ExecutionSpace::array_layout;
    1097             : 
    1098             :   static constexpr NestLevelType total_nest_level =
    1099             :       Rank<N, OuterDir, InnerDir>::rank;
    1100             :   static constexpr Iterate iter    = OuterDir;
    1101             :   static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;
    1102             :   static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
    1103             : 
    1104             :   static constexpr Iterate direction =
    1105             :       iter == Iterate::Default ? Impl::layout_iterate_type_selector<
    1106             :                                      ArrayLayout>::outer_iteration_pattern
    1107             :                                : iter;
    1108             : 
    1109             :   template <class... Args>
    1110             :   KOKKOS_INLINE_FUNCTION TeamVectorMDRange(TeamHandleType const& team_,
    1111             :                                            Args&&... args)
    1112             :       : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
    1113             :     static_assert(sizeof...(Args) == total_nest_level);
    1114             :   }
    1115             : 
    1116             :   TeamHandleType const& team;
    1117             :   BoundaryType boundaries[total_nest_level];
    1118             : };
    1119             : 
    1120             : template <typename TeamHandle, typename... Args>
    1121             : KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...)
    1122             :     -> TeamVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
    1123             : 
    1124             : template <typename Rank, typename TeamHandle, typename Lambda,
    1125             :           typename ReducerValueType>
    1126             : KOKKOS_INLINE_FUNCTION void parallel_reduce(
    1127             :     TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
    1128             :     ReducerValueType& val) {
    1129             :   static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
    1130             :                 !std::is_array_v<ReducerValueType> &&
    1131             :                     !std::is_pointer_v<ReducerValueType> &&
    1132             :                     !Kokkos::is_reducer_v<ReducerValueType>,
    1133             :                 "Only scalar return types are allowed!");
    1134             : 
    1135             :   val = ReducerValueType{};
    1136             :   Impl::md_parallel_impl<Rank>(policy, lambda, val);
    1137             :   policy.team.team_reduce(
    1138             :       Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
    1139             : }
    1140             : 
    1141             : template <typename Rank, typename TeamHandle, typename Lambda>
    1142             : KOKKOS_INLINE_FUNCTION void parallel_for(
    1143             :     TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
    1144             :   Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
    1145             : }
    1146             : 
    1147             : template <typename Rank, typename TeamHandle, typename Lambda,
    1148             :           typename ReducerValueType>
    1149             : KOKKOS_INLINE_FUNCTION void parallel_reduce(
    1150             :     ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
    1151             :     ReducerValueType& val) {
    1152             :   static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
    1153             :                 !std::is_array_v<ReducerValueType> &&
    1154             :                     !std::is_pointer_v<ReducerValueType> &&
    1155             :                     !Kokkos::is_reducer_v<ReducerValueType>,
    1156             :                 "Only a scalar return types are allowed!");
    1157             : 
    1158             :   val = ReducerValueType{};
    1159             :   Impl::md_parallel_impl<Rank>(policy, lambda, val);
    1160             :   if constexpr (false
    1161             : #ifdef KOKKOS_ENABLE_CUDA
    1162             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1163             :                                   Kokkos::Cuda>
    1164             : #elif defined(KOKKOS_ENABLE_HIP)
    1165             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1166             :                                   Kokkos::HIP>
    1167             : #elif defined(KOKKOS_ENABLE_SYCL)
    1168             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1169             :                                   Kokkos::SYCL>
    1170             : #endif
    1171             :   )
    1172             :     policy.team.vector_reduce(
    1173             :         Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
    1174             :             val});
    1175             : }
    1176             : 
    1177             : template <typename Rank, typename TeamHandle, typename Lambda>
    1178             : KOKKOS_INLINE_FUNCTION void parallel_for(
    1179             :     ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
    1180             :   Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
    1181             : }
    1182             : 
    1183             : template <typename Rank, typename TeamHandle, typename Lambda,
    1184             :           typename ReducerValueType>
    1185             : KOKKOS_INLINE_FUNCTION void parallel_reduce(
    1186             :     TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
    1187             :     ReducerValueType& val) {
    1188             :   static_assert(/*!Kokkos::is_view_v<ReducerValueType> &&*/
    1189             :                 !std::is_array_v<ReducerValueType> &&
    1190             :                     !std::is_pointer_v<ReducerValueType> &&
    1191             :                     !Kokkos::is_reducer_v<ReducerValueType>,
    1192             :                 "Only a scalar return types are allowed!");
    1193             : 
    1194             :   val = ReducerValueType{};
    1195             :   Impl::md_parallel_impl<Rank>(policy, lambda, val);
    1196             :   if constexpr (false
    1197             : #ifdef KOKKOS_ENABLE_CUDA
    1198             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1199             :                                   Kokkos::Cuda>
    1200             : #elif defined(KOKKOS_ENABLE_HIP)
    1201             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1202             :                                   Kokkos::HIP>
    1203             : #elif defined(KOKKOS_ENABLE_SYCL)
    1204             :                 || std::is_same_v<typename TeamHandle::execution_space,
    1205             :                                   Kokkos::SYCL>
    1206             : #endif
    1207             :   )
    1208             :     policy.team.vector_reduce(
    1209             :         Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
    1210             :             val});
    1211             :   policy.team.team_reduce(
    1212             :       Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
    1213             : }
    1214             : 
    1215             : template <typename Rank, typename TeamHandle, typename Lambda>
    1216             : KOKKOS_INLINE_FUNCTION void parallel_for(
    1217             :     TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
    1218             :   Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
    1219             : }
    1220             : 
    1221             : namespace Impl {
    1222             : 
    1223             : template <typename FunctorType, typename TagType,
    1224             :           bool HasTag = !std::is_void_v<TagType>>
    1225             : struct ParallelConstructName;
    1226             : 
    1227             : template <typename FunctorType, typename TagType>
    1228             : struct ParallelConstructName<FunctorType, TagType, true> {
    1229             :   ParallelConstructName(std::string const& label) : label_ref(label) {
    1230             :     if (label.empty()) {
    1231             : #ifdef KOKKOS_ENABLE_IMPL_TYPEINFO
    1232             :       default_name =
    1233             :           std::string(TypeInfo<std::remove_const_t<FunctorType>>::name()) +
    1234             :           "/" + std::string(TypeInfo<TagType>::name());
    1235             : #else
    1236             :       default_name = std::string(typeid(FunctorType).name()) + "/" +
    1237             :                      typeid(TagType).name();
    1238             : #endif
    1239             :     }
    1240             :   }
    1241             :   std::string const& get() {
    1242             :     return (label_ref.empty()) ? default_name : label_ref;
    1243             :   }
    1244             :   std::string const& label_ref;
    1245             :   std::string default_name;
    1246             : };
    1247             : 
    1248             : template <typename FunctorType, typename TagType>
    1249           0 : struct ParallelConstructName<FunctorType, TagType, false> {
    1250           0 :   ParallelConstructName(std::string const& label) : label_ref(label) {
    1251           0 :     if (label.empty()) {
    1252             : #ifdef KOKKOS_ENABLE_IMPL_TYPEINFO
    1253           0 :       default_name = TypeInfo<std::remove_const_t<FunctorType>>::name();
    1254             : #else
    1255             :       default_name = typeid(FunctorType).name();
    1256             : #endif
    1257             :     }
    1258           0 :   }
    1259           0 :   std::string const& get() {
    1260           0 :     return (label_ref.empty()) ? default_name : label_ref;
    1261             :   }
    1262             :   std::string const& label_ref;
    1263             :   std::string default_name;
    1264             : };
    1265             : 
    1266             : }  // namespace Impl
    1267             : 
    1268             : }  // namespace Kokkos
    1269             : 
    1270             : namespace Kokkos {
    1271             : 
    1272             : namespace Impl {
    1273             : 
    1274             : template <class PatternTag, class... Args>
    1275             : struct PatternImplSpecializationFromTag;
    1276             : 
    1277             : template <class... Args>
    1278             : struct PatternImplSpecializationFromTag<Kokkos::ParallelForTag, Args...>
    1279             :     : type_identity<ParallelFor<Args...>> {};
    1280             : 
    1281             : template <class... Args>
    1282             : struct PatternImplSpecializationFromTag<Kokkos::ParallelReduceTag, Args...>
    1283             :     : type_identity<ParallelReduce<Args...>> {};
    1284             : 
    1285             : template <class... Args>
    1286             : struct PatternImplSpecializationFromTag<Kokkos::ParallelScanTag, Args...>
    1287             :     : type_identity<ParallelScan<Args...>> {};
    1288             : 
    1289             : template <class PatternImpl>
    1290             : struct PatternTagFromImplSpecialization;
    1291             : 
    1292             : template <class... Args>
    1293             : struct PatternTagFromImplSpecialization<ParallelFor<Args...>>
    1294             :     : type_identity<ParallelForTag> {};
    1295             : 
    1296             : template <class... Args>
    1297             : struct PatternTagFromImplSpecialization<ParallelReduce<Args...>>
    1298             :     : type_identity<ParallelReduceTag> {};
    1299             : 
    1300             : template <class... Args>
    1301             : struct PatternTagFromImplSpecialization<ParallelScan<Args...>>
    1302             :     : type_identity<ParallelScanTag> {};
    1303             : 
    1304             : }  // end namespace Impl
    1305             : 
    1306             : }  // namespace Kokkos
    1307             : #endif /* #define KOKKOS_EXECPOLICY_HPP */

Generated by: LCOV version 1.14