ESP32-CAN/components/eigen/Eigen/src/ThreadPool/CoreThreadPoolDevice.h

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
#define EIGEN_CORE_THREAD_POOL_DEVICE_H

namespace Eigen {

// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
// a ThreadPool. Expressions are recursively split in half until the evaluation cost is less than the threshold for
// delegating the task to a thread.
/*
                 a
                / \
               /   \
              /     \
             /       \
            /         \
           /           \
          /             \
         a               e
        / \             / \
       /   \           /   \
      /     \         /     \
     a       c       e       g
    / \     / \     / \     / \
   /   \   /   \   /   \   /   \
  a     b c     d e     f g     h
*/
// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number
// of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
// primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the
// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.

// Device that evaluates index ranges in parallel on a ThreadPool. A range is split recursively in half; at every
// split the right half is scheduled on the pool and the current task keeps descending into the left half.
struct CoreThreadPoolDevice {
  // Type-erased unit of work handed to ThreadPool::Schedule.
  using Task = std::function<void()>;
  // Binds the device to `pool` (held by reference, so the pool must outlive the device) and stores
  // `threadCostThreshold` as the factor used by calculateLevels(). The threshold must be non-negative.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
      : m_pool(pool) {
    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
    m_costFactor = threadCostThreshold;
  }

  // Computes how many times a range of `size` indices should be split in half.
  //   size: number of indices in the range; it is divided by PacketSize to obtain the number of functor calls.
  //   cost: estimated cost of one functor call; must be non-negative.
  // Returns internal::log2_ceil of the thread count that will be used (presumably ceil(log2(threads))); the
  // callers below create 1 << level leaf tasks.
  template <int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
    eigen_assert(cost >= 0.0f && "cost must be non-negative");
    Index numOps = size / PacketSize;
    // never use more threads than there are operations or than the pool provides
    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
    float totalCost = static_cast<float>(numOps) * cost;
    float idealThreads = totalCost * m_costFactor;
    // if the work is too cheap to keep `actualThreads` busy, shrink the thread count (but never below one)
    if (idealThreads < static_cast<float>(actualThreads)) {
      idealThreads = numext::maxi(idealThreads, 1.0f);
      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
    }
    int maxLevel = internal::log2_ceil(actualThreads);
    return maxLevel;
  }

// MSVC does not like inlining parallelForImpl
#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
#define EIGEN_PARALLEL_FOR_INLINE
#else
#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
#endif

  // One-dimensional worker: evaluates f(i) for i = begin, begin + PacketSize, ... while i < end.
  // While `level` > 0 the range is halved (the midpoint is rounded down to a multiple of PacketSize), the right
  // half [mid, end) is scheduled on the pool with the already decremented level, and this task continues with the
  // left half. `f` and `barrier` are captured by reference, so they must stay alive until every task has called
  // barrier.Notify(); parallelFor() guarantees this by blocking in barrier.Wait().
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
                                                                   Barrier& barrier, int level) {
    while (level > 0) {
      level--;
      Index size = end - begin;
      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
      Index mid = begin + numext::round_down(size >> 1, PacketSize);
      // the lambda copies mid, end and the decremented level by value at this point
      Task right = [this, mid, end, &f, &barrier, level]() {
        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
      };
      m_pool.Schedule(std::move(right));
      end = mid;
    }
    // leaf work: the range that remains after all splits
    for (Index i = begin; i < end; i += PacketSize) f(i);
    // exactly one notification per parallelForImpl invocation
    barrier.Notify();
  }

  // Two-dimensional worker: evaluates f(outer, inner) for every outer in [outerBegin, outerEnd) and every inner in
  // [innerBegin, innerEnd) stepping by PacketSize. While `level` > 0 the outer range is halved as long as it holds
  // more than one index; once it cannot be split further, the inner range is halved instead (midpoint rounded down
  // to a multiple of PacketSize). The right half is scheduled on the pool, the left half stays with this task.
  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
                                                                   int level) {
    while (level > 0) {
      level--;
      Index outerSize = outerEnd - outerBegin;
      if (outerSize > 1) {
        // split along the outer dimension; the inner range is passed on unchanged
        Index outerMid = outerBegin + (outerSize >> 1);
        Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        outerEnd = outerMid;
      } else {
        // at most one outer index left: split along the inner dimension instead
        Index innerSize = innerEnd - innerBegin;
        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
        Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        innerEnd = innerMid;
      }
    }
    // leaf work: the sub-rectangle that remains after all splits
    for (Index outer = outerBegin; outer < outerEnd; outer++)
      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
    // exactly one notification per parallelForImpl invocation
    barrier.Notify();
  }

#undef EIGEN_PARALLEL_FOR_INLINE

  // Evaluates f(i) for i in [begin, end) with stride PacketSize and blocks until all tasks have finished.
  //   cost: estimated cost of one call to f, forwarded to calculateLevels().
  // The barrier is sized 1 << maxLevel, which matches the number of parallelForImpl invocations (the root call plus
  // one per scheduled task), each of which notifies once.
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
    Index size = end - begin;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
    barrier.Wait();
  }

  // Evaluates f(outer, inner) over [outerBegin, outerEnd) x [innerBegin, innerEnd) (inner stride PacketSize) and
  // blocks until all tasks have finished. The split depth is derived from the total number of indices,
  // outerSize * innerSize, and the per-call `cost`.
  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
                                                         Index innerEnd, BinaryFunctor& f, float cost) {
    Index outerSize = outerEnd - outerBegin;
    Index innerSize = innerEnd - innerBegin;
    Index size = outerSize * innerSize;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
    barrier.Wait();
  }

  // Pool on which the right-hand halves of every split are scheduled.
  ThreadPool& m_pool;
  // Factor that calculateLevels() multiplies with the total estimated cost to obtain the ideal thread count. It is
  // set directly from the constructor's threadCostThreshold argument.
  // NOTE(review): presumably this is the reciprocal of the cost of delegating a task to a thread, kept in that form
  // so that a multiplication can be used instead of a floating point division -- confirm.
  float m_costFactor;
};

// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice

namespace internal {

#ifdef EIGEN_PARSED_BY_DOXYGEN
struct Kernel;
#endif

// Compile-time cost estimate for one assignment performed by `Kernel`: the sum of functor_cost<>::Cost of the
// source expression type and of the destination expression type.
template <typename Kernel>
struct cost_helper {
  // evaluator types exposed by the kernel
  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
  // expression types wrapped by those evaluators
  using SrcXprType = typename SrcEvaluatorType::XprType;
  using DstXprType = typename DstEvaluatorType::XprType;
  // combined cost used by the CoreThreadPoolDevice assignment loops as their per-operation cost basis
  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
};

// DefaultTraversal / NoUnrolling on a CoreThreadPoolDevice: every (outer, inner) coefficient is assigned
// individually, with the two-dimensional index space distributed over the device's thread pool.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
  // estimated cost of assigning one coefficient
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;

  // Copy of the kernel that is callable as f(outer, inner) and assigns a single coefficient.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outerIdx, Index innerIdx) {
      this->assignCoeffByOuterInner(outerIdx, innerIdx);
    }
  };

  // Runs the assignment over [0, outerSize) x [0, innerSize) with an inner stride of one coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    constexpr float perCoeffCost = static_cast<float>(XprEvaluationCost);
    const Index innerExtent = kernel.innerSize();
    const Index outerExtent = kernel.outerSize();
    AssignmentFunctor coeffAssigner(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerExtent, 0, innerExtent, coeffAssigner, perCoeffCost);
  }
};

// DefaultTraversal / InnerUnrolling on a CoreThreadPoolDevice: the outer indices are distributed over the thread
// pool and each call handles one outer index through the compile-time unrolled inner copy.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  // estimated cost of assigning one coefficient
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  // compile-time inner size of the destination expression
  static constexpr Index InnerSize = DstXprType::InnerSizeAtCompileTime;

  // Copy of the kernel that is callable as f(outer) and runs the unrolled inner copy for that outer index.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outerIdx) {
      using UnrolledInnerCopy = copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>;
      UnrolledInnerCopy::run(*this, outerIdx);
    }
  };

  // Runs the assignment over the outer range [0, outerSize) with stride one; the cost of one call is the
  // per-coefficient cost multiplied by the compile-time inner size.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    constexpr float perSliceCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    const Index outerExtent = kernel.outerSize();
    AssignmentFunctor sliceAssigner(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerExtent, sliceAssigner, perSliceCost);
  }
};

// InnerVectorizedTraversal / NoUnrolling on a CoreThreadPoolDevice: packets are assigned along the inner
// dimension, with the (outer, inner) index space distributed over the device's thread pool.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
  using PacketType = typename Kernel::PacketType;
  // estimated cost basis taken from cost_helper
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  // number of coefficients per packet; also the inner stride of the parallel loop
  static constexpr Index PacketSize = unpacket_traits<PacketType>::size;
  // alignment traits of the kernel (the functor below always uses Unaligned packet access)
  static constexpr Index SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
  static constexpr Index DstAlignment = Kernel::AssignmentTraits::DstAlignment;

  // Copy of the kernel that is callable as f(outer, inner) and assigns one packet starting at `inner`.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outerIdx, Index innerIdx) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outerIdx, innerIdx);
    }
  };

  // Runs the assignment over [0, outerSize) x [0, innerSize) with an inner stride of PacketSize.
  // NOTE(review): the cost passed here is scaled by innerSize although parallelFor already derives the number of
  // operations from outerSize * innerSize -- confirm that this weighting is intended.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index innerExtent = kernel.innerSize();
    const Index outerExtent = kernel.outerSize();
    const float scaledCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerExtent);
    AssignmentFunctor packetAssigner(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerExtent, 0, innerExtent, packetAssigner,
                                                               scaledCost);
  }
};

// InnerVectorizedTraversal / InnerUnrolling on a CoreThreadPoolDevice: the outer indices are distributed over the
// thread pool and each call handles one outer index through the compile-time unrolled, packet-wise inner copy.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
  using PacketType = typename Kernel::PacketType;
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
                         InnerSize = DstXprType::InnerSizeAtCompileTime;
  // Copy of the kernel that is callable as f(outer); one call covers the inner range [0, InnerSize) of that
  // outer index via the unrolled packet copy.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
    }
  };
  // Runs the assignment over the outer range [0, outerSize); the cost of one call is the per-coefficient cost
  // multiplied by the compile-time inner size.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    AssignmentFunctor functor(kernel);
    // The parallel loop runs over *outer* indices and the functor consumes exactly one outer index per call, so
    // the stride must be 1. Using PacketSize as the stride would skip outer indices in the leaf loop and would
    // require outerSize to be a multiple of PacketSize.
    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
  }
};

// SliceVectorizedTraversal / NoUnrolling on a CoreThreadPoolDevice: for every outer index the leading part of the
// inner range (the largest multiple of PacketSize) is assigned packet-wise, and the remaining coefficients are
// assigned one by one in a second parallel pass.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
  // Copy of the kernel that is callable as f(outer, inner) and assigns one unaligned packet starting at `inner`.
  struct PacketAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
    }
  };
  // Copy of the kernel that is callable as f(outer) and assigns, coefficient by coefficient, the inner indices
  // that do not fit into a whole packet, i.e. [round_down(innerSize, PacketSize), innerSize).
  struct ScalarAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      const Index innerSize = this->innerSize();
      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
    }
  };
  // Runs the packet pass over [0, outerSize) x [0, packetAccessSize) and then, if the inner size is not a multiple
  // of PacketSize, the scalar pass over the outer range.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    const Index innerSize = kernel.innerSize();
    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
    PacketAssignmentFunctor packetFunctor(kernel);
    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
                                                                     packetCost);
    // Without a remainder the scalar functor would do nothing for every outer index, so the second dispatch
    // (and its barrier) is skipped entirely.
    if (packetAccessSize < innerSize) {
      const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
      ScalarAssignmentFunctor scalarFunctor(kernel);
      device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
    }
  }
};

// LinearTraversal / NoUnrolling on a CoreThreadPoolDevice: coefficients are addressed by a single linear index and
// the index range is distributed over the device's thread pool.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
  // estimated cost of assigning one coefficient
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;

  // Copy of the kernel that is callable as f(index) and assigns the coefficient at that linear index.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index linearIdx) {
      this->assignCoeff(linearIdx);
    }
  };

  // Runs the assignment over the linear range [0, size) with stride one.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    constexpr float perCoeffCost = static_cast<float>(XprEvaluationCost);
    const Index coeffCount = kernel.size();
    AssignmentFunctor coeffAssigner(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, coeffCount, coeffAssigner, perCoeffCost);
  }
};

// LinearVectorizedTraversal / NoUnrolling on a CoreThreadPoolDevice: the packet-sized middle section of the linear
// index range is assigned in parallel, while the sections before and after it are handled by the sequential
// unaligned_dense_assignment_loop helpers on the calling thread.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
  using PacketType = typename Kernel::PacketType;
  using Scalar = typename Kernel::Scalar;
  // estimated cost basis taken from cost_helper
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  static constexpr Index RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment;
  // number of coefficients per packet; also the stride of the parallel loop
  static constexpr Index PacketSize = unpacket_traits<PacketType>::size;
  // non-zero when the destination's alignment already meets the requested alignment
  static constexpr Index DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment;
  static constexpr Index DstAlignment =
      packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment : Kernel::AssignmentTraits::DstAlignment;
  static constexpr Index SrcAlignment = Kernel::AssignmentTraits::JointAlignment;

  // Copy of the kernel that is callable as f(index) and assigns one packet starting at that linear index.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index linearIdx) {
      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(linearIdx);
    }
  };

  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
  // sequential helpers for the sections outside the parallel packet range
  using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
  using head_loop =
      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;

  // Splits [0, size) into head [0, packetBegin), packet range [packetBegin, packetEnd) and tail [packetEnd, size),
  // and processes them in that order. The packet range starts at 0 when the destination is already aligned,
  // otherwise at the index returned by first_aligned, and its length is a multiple of PacketSize.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    constexpr float perPacketCost = static_cast<float>(XprEvaluationCost);
    const Index coeffCount = kernel.size();
    const Index packetBegin =
        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), coeffCount);
    const Index packetEnd = packetBegin + numext::round_down(coeffCount - packetBegin, PacketSize);

    head_loop::run(kernel, 0, packetBegin);

    AssignmentFunctor packetAssigner(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(packetBegin, packetEnd, packetAssigner, perPacketCost);

    tail_loop::run(kernel, packetEnd, coeffCount);
  }
};

}  // namespace internal

}  // namespace Eigen

#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H
add eigen 2026-03-20 17:16:20 +00:00			`// This file is part of Eigen, a lightweight C++ template library`
			`// for linear algebra.`
			`//`
			`// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>`
			`//`
			`// This Source Code Form is subject to the terms of the Mozilla`
			`// Public License v. 2.0. If a copy of the MPL was not distributed`
			`// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H`
			`#define EIGEN_CORE_THREAD_POOL_DEVICE_H`

			`namespace Eigen {`

			`// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with`
			`// Threadpool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for`
			`// delegating the task to a thread.`
			`/*`
			`a`
			`/ \`
			`/ \`
			`/ \`
			`/ \`
			`/ \`
			`/ \`
			`/ \`
			`a e`
			`/ \ / \`
			`/ \ / \`
			`/ \ / \`
			`a c e g`
			`/ \ / \ / \ / \`
			`/ \ / \ / \ / \`
			`a b c d e f g h`
			`*/`
			`// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the`
			`// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number`
			`// of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The`
			`// primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the`
			`// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.`

			`struct CoreThreadPoolDevice {`
			`using Task = std::function<void()>;`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)`
			`: m_pool(pool) {`
			`eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");`
			`m_costFactor = threadCostThreshold;`
			`}`

			`template <int PacketSize>`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {`
			`eigen_assert(cost >= 0.0f && "cost must be non-negative");`
			`Index numOps = size / PacketSize;`
			`int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();`
			`float totalCost = static_cast<float>(numOps) * cost;`
			`float idealThreads = totalCost * m_costFactor;`
			`if (idealThreads < static_cast<float>(actualThreads)) {`
			`idealThreads = numext::maxi(idealThreads, 1.0f);`
			`actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));`
			`}`
			`int maxLevel = internal::log2_ceil(actualThreads);`
			`return maxLevel;`
			`}`

			`// MSVC does not like inlining parallelForImpl`
			`#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG`
			`#define EIGEN_PARALLEL_FOR_INLINE`
			`#else`
			`#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE`
			`#endif`

			`template <typename UnaryFunctor, int PacketSize>`
			`EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,`
			`Barrier& barrier, int level) {`
			`while (level > 0) {`
			`level--;`
			`Index size = end - begin;`
			`eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");`
			`Index mid = begin + numext::round_down(size >> 1, PacketSize);`
			`Task right = [this, mid, end, &f, &barrier, level]() {`
			`parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);`
			`};`
			`m_pool.Schedule(std::move(right));`
			`end = mid;`
			`}`
			`for (Index i = begin; i < end; i += PacketSize) f(i);`
			`barrier.Notify();`
			`}`

			`template <typename BinaryFunctor, int PacketSize>`
			`EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,`
			`Index innerEnd, BinaryFunctor& f, Barrier& barrier,`
			`int level) {`
			`while (level > 0) {`
			`level--;`
			`Index outerSize = outerEnd - outerBegin;`
			`if (outerSize > 1) {`
			`Index outerMid = outerBegin + (outerSize >> 1);`
			`Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {`
			`parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);`
			`};`
			`m_pool.Schedule(std::move(right));`
			`outerEnd = outerMid;`
			`} else {`
			`Index innerSize = innerEnd - innerBegin;`
			`eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");`
			`Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);`
			`Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {`
			`parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);`
			`};`
			`m_pool.Schedule(std::move(right));`
			`innerEnd = innerMid;`
			`}`
			`}`
			`for (Index outer = outerBegin; outer < outerEnd; outer++)`
			`for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);`
			`barrier.Notify();`
			`}`

			`#undef EIGEN_PARALLEL_FOR_INLINE`

			`template <typename UnaryFunctor, int PacketSize>`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {`
			`Index size = end - begin;`
			`int maxLevel = calculateLevels<PacketSize>(size, cost);`
			`Barrier barrier(1 << maxLevel);`
			`parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);`
			`barrier.Wait();`
			`}`

			`template <typename BinaryFunctor, int PacketSize>`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,`
			`Index innerEnd, BinaryFunctor& f, float cost) {`
			`Index outerSize = outerEnd - outerBegin;`
			`Index innerSize = innerEnd - innerBegin;`
			`Index size = outerSize * innerSize;`
			`int maxLevel = calculateLevels<PacketSize>(size, cost);`
			`Barrier barrier(1 << maxLevel);`
			`parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);`
			`barrier.Wait();`
			`}`

			`ThreadPool& m_pool;`
			`// costFactor is the cost of delegating a task to a thread`
			`// the inverse is used to avoid a floating point division`
			`float m_costFactor;`
			`};`

			`// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice`

			`namespace internal {`

			`#ifdef EIGEN_PARSED_BY_DOXYGEN`
			`struct Kernel;`
			`#endif`

			`template <typename Kernel>`
			`struct cost_helper {`
			`using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;`
			`using DstEvaluatorType = typename Kernel::DstEvaluatorType;`
			`using SrcXprType = typename SrcEvaluatorType::XprType;`
			`using DstXprType = typename DstEvaluatorType::XprType;`
			`static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {`
			`this->assignCoeffByOuterInner(outer, inner);`
			`}`
			`};`

			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index innerSize = kernel.innerSize();`
			`const Index outerSize = kernel.outerSize();`
			`constexpr float cost = static_cast<float>(XprEvaluationCost);`
			`AssignmentFunctor functor(kernel);`
			`device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);`
			`}`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {`
			`using DstXprType = typename Kernel::DstEvaluatorType::XprType;`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {`
			`copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);`
			`}`
			`};`
			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index outerSize = kernel.outerSize();`
			`AssignmentFunctor functor(kernel);`
			`constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);`
			`device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);`
			`}`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {`
			`using PacketType = typename Kernel::PacketType;`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,`
			`SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,`
			`DstAlignment = Kernel::AssignmentTraits::DstAlignment;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {`
			`this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);`
			`}`
			`};`
			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index innerSize = kernel.innerSize();`
			`const Index outerSize = kernel.outerSize();`
			`const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);`
			`AssignmentFunctor functor(kernel);`
			`device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);`
			`}`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {`
			`using PacketType = typename Kernel::PacketType;`
			`using DstXprType = typename Kernel::DstEvaluatorType::XprType;`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,`
			`SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,`
			`DstAlignment = Kernel::AssignmentTraits::DstAlignment,`
			`InnerSize = DstXprType::InnerSizeAtCompileTime;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {`
			`copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);`
			`}`
			`};`
			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index outerSize = kernel.outerSize();`
			`constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);`
			`AssignmentFunctor functor(kernel);`
			`device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);`
			`}`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {`
			`using Scalar = typename Kernel::Scalar;`
			`using PacketType = typename Kernel::PacketType;`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;`
			`struct PacketAssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {`
			`this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);`
			`}`
			`};`
			`struct ScalarAssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {`
			`const Index innerSize = this->innerSize();`
			`const Index packetAccessSize = numext::round_down(innerSize, PacketSize);`
			`for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);`
			`}`
			`};`
			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index outerSize = kernel.outerSize();`
			`const Index innerSize = kernel.innerSize();`
			`const Index packetAccessSize = numext::round_down(innerSize, PacketSize);`
			`constexpr float packetCost = static_cast<float>(XprEvaluationCost);`
			`const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);`
			`PacketAssignmentFunctor packetFunctor(kernel);`
			`ScalarAssignmentFunctor scalarFunctor(kernel);`
			`device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,`
			`packetCost);`
			`device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);`
			`};`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }`
			`};`
			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index size = kernel.size();`
			`constexpr float cost = static_cast<float>(XprEvaluationCost);`
			`AssignmentFunctor functor(kernel);`
			`device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);`
			`}`
			`};`

			`template <typename Kernel>`
			`struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {`
			`using Scalar = typename Kernel::Scalar;`
			`using PacketType = typename Kernel::PacketType;`
			`static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,`
			`RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,`
			`PacketSize = unpacket_traits<PacketType>::size,`
			`DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,`
			`DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment`
			`: Kernel::AssignmentTraits::DstAlignment,`
			`SrcAlignment = Kernel::AssignmentTraits::JointAlignment;`
			`struct AssignmentFunctor : public Kernel {`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}`
			`EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {`
			`this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);`
			`}`
			`};`
			`static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;`
			`using head_loop =`
			`unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;`
			`using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;`

			`static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {`
			`const Index size = kernel.size();`
			`const Index alignedStart =`
			`DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);`
			`const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);`

			`head_loop::run(kernel, 0, alignedStart);`

			`constexpr float cost = static_cast<float>(XprEvaluationCost);`
			`AssignmentFunctor functor(kernel);`
			`device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);`

			`tail_loop::run(kernel, alignedEnd, size);`
			`}`
			`};`

			`} // namespace internal`

			`} // namespace Eigen`

			`#endif // EIGEN_CORE_THREAD_POOL_DEVICE_H`