llvm
567c077b - [SYCL][NFCI] Don't go through variadic for `parallel_for(range<N>, krn)` (#18019)

Commit
326 days ago
[SYCL][NFCI] Don't go through variadic for `parallel_for(range<N>, krn)` (#18019)

This is a "reduction" overload that just happens to dispatch immediately to the non-reduction range+properties version of `parallel_for`. Going through the simpler overload (unused before this PR) seems to be cheaper. E.g., for

```
template <typename...> struct Name;

template <typename Krn> struct Invoker {
  static void call(void *p, int i) { (*static_cast<Krn *>(p))(i); }
};

void invoke(void (*)(void *, int));

struct Kernel {
  using PointersVariant =
      std::variant<std::int8_t *, std::int16_t *, std::uint8_t *,
                   std::uint16_t *, float *, double *, sycl::half *>;
  PointersVariant lhs;
  PointersVariant rhs;
  std::size_t sz;
  PointersVariant out;

  template <typename T>
  Kernel(T *l, T *r, std::size_t size, T *o)
      : lhs(l), rhs(r), sz(size), out(o) {}

  void operator()(sycl::handler &h) {
    std::visit(
        [&](auto lhs_ptr, auto rhs_ptr, auto dst_ptr) {
          auto L = [=](auto i) { dst_ptr[i] = lhs_ptr[i] + rhs_ptr[i]; };
          using N = Name<decltype(lhs_ptr), decltype(rhs_ptr),
                         decltype(dst_ptr)>;
          h.parallel_for<N>(sz, L);
          invoke(&Invoker<decltype(L)>::call);
        },
        lhs, rhs, out);
  }
};

auto p = &Kernel::operator();
```

I see 10.35s->9.9s improvement for `$ time clang++ -fsycl -c a.cpp -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__`
Parents
Loading