inductor: also check index_exp when select tiling var (#106765)
When selecting the tiling variable, we currently only consider loads and stores and do not take the index expression into account, which leads to accuracy issues:
before (the index expression ```i1 - 1``` cannot be vectorized):
```
cpp_fused_constant_pad_nd_mul_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
#pragma omp parallel num_threads(40)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L))
{
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = at::vec::Vectorized<int>(static_cast<int>((-1L) + i1));
auto tmp1 = at::vec::Vectorized<int>(static_cast<int>(0));
auto tmp2 = to_float_mask(tmp0 >= tmp1);
auto tmp3 = [&]
{
auto tmp4 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (8L*i1_inner) + (25088L*i0))]; return at::vec::Vectorized<float>::loadu(tmpbuf); })();
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0)));
auto tmp6 = tmp4 * tmp5;
return tmp6;
}
;
auto tmp7 = decltype(tmp3())::blendv(at::vec::Vectorized<float>(0.0), tmp3(), to_float_mask(tmp2));
{ __at_align__ float tmpbuf[16*sizeof(float)/sizeof(float)]; tmp7.store(tmpbuf); for (long i1_inner = 0; i1_inner < 16; i1_inner++) out_ptr0[static_cast<long>(i2 + (8L*i1) + (8L*i1_inner) + (25096L*i0))] = tmpbuf[i1_inner]; }
}
}
#pragma GCC ivdep
for(long i1=static_cast<long>(3136L); i1<static_cast<long>(3137L); i1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = static_cast<long>((-1L) + i1);
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = [&]
{
auto tmp4 = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (25088L*i0))];
auto tmp5 = in_ptr1[static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0))];
auto tmp6 = decltype(tmp4)(tmp4 * tmp5);
return tmp6;
}
;
auto tmp7 = tmp2 ? tmp3() : static_cast<decltype(tmp3())>(0.0);
out_ptr0[static_cast<long>(i2 + (8L*i1) + (25096L*i0))] = tmp7;
}
}
}
}
}
}
```
after:
```
cpp_fused_constant_pad_nd_mul_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
#pragma omp parallel num_threads(40)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3137L); i1+=static_cast<long>(1L))
{
#pragma omp simd simdlen(8)
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = static_cast<long>((-1L) + i1);
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = [&]
{
auto tmp4 = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (25088L*i0))];
auto tmp5 = in_ptr1[static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0))];
auto tmp6 = decltype(tmp4)(tmp4 * tmp5);
return tmp6;
}
;
auto tmp7 = tmp2 ? tmp3() : static_cast<decltype(tmp3())>(0.0);
out_ptr0[static_cast<long>(i2 + (8L*i1) + (25096L*i0))] = tmp7;
}
}
}
}
}
}
''')
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106765
Approved by: https://github.com/jgong5, https://github.com/jansel