[SYCL][NFCI] Don't go through variadic for `parallel_for(range<N>, krn)` (#18019)
The variadic overload is a "reduction" overload that, for this call,
just happens to dispatch immediately to the non-reduction
range+properties version of `parallel_for`. Going directly through the
simpler overload (unused before this PR) seems to be cheaper to compile.
E.g., for
```
template <typename...> struct Name;
template <typename Krn> struct Invoker {
static void call(void *p, int i) { (*static_cast<Krn *>(p))(i); }
};
void invoke(void (*)(void *, int));
struct Kernel {
using PointersVariant =
std::variant<std::int8_t *, std::int16_t *, std::uint8_t *,
std::uint16_t *, float *, double *, sycl::half *>;
PointersVariant lhs;
PointersVariant rhs;
std::size_t sz;
PointersVariant out;
template <typename T>
Kernel(T *l, T *r, std::size_t size, T *o)
: lhs(l), rhs(r), sz(size), out(o) {}
void operator()(sycl::handler &h) {
std::visit(
[&](auto lhs_ptr, auto rhs_ptr, auto dst_ptr) {
auto L = [=](auto i) { dst_ptr[i] = lhs_ptr[i] + rhs_ptr[i]; };
using N =
Name<decltype(lhs_ptr), decltype(rhs_ptr), decltype(dst_ptr)>;
h.parallel_for<N>(sz, L);
invoke(&Invoker<decltype(L)>::call);
},
lhs, rhs, out);
}
};
auto p = &Kernel::operator();
```
I see a compile-time improvement from 10.35s to 9.9s for
`$ time clang++ -fsycl -c a.cpp -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__`