inductor: also check index_exp when select tiling var (#106765)
When selecting the tiling variable, we currently only consider loads and stores and do not take the index expression into account, which leads to accuracy issues:
before (the index expression ```i1 - 1``` cannot be vectorized):
```
cpp_fused_constant_pad_nd_mul_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
#pragma omp parallel num_threads(40)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L))
{
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = at::vec::Vectorized<int>(static_cast<int>((-1L) + i1));
auto tmp1 = at::vec::Vectorized<int>(static_cast<int>(0));
auto tmp2 = to_float_mask(tmp0 >= tmp1);
auto tmp3 = [&]
{
auto tmp4 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (8L*i1_inner) + (25088L*i0))]; return at::vec::Vectorized<float>::loadu(tmpbuf); })();
auto tmp5 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0)));
auto tmp6 = tmp4 * tmp5;
return tmp6;
}
;
auto tmp7 = decltype(tmp3())::blendv(at::vec::Vectorized<float>(0.0), tmp3(), to_float_mask(tmp2));
{ __at_align__ float tmpbuf[16*sizeof(float)/sizeof(float)]; tmp7.store(tmpbuf); for (long i1_inner = 0; i1_inner < 16; i1_inner++) out_ptr0[static_cast<long>(i2 + (8L*i1) + (8L*i1_inner) + (25096L*i0))] = tmpbuf[i1_inner]; }
}
}
#pragma GCC ivdep
for(long i1=static_cast<long>(3136L); i1<static_cast<long>(3137L); i1+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = static_cast<long>((-1L) + i1);
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = [&]
{
auto tmp4 = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (25088L*i0))];
auto tmp5 = in_ptr1[static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0))];
auto tmp6 = decltype(tmp4)(tmp4 * tmp5);
return tmp6;
}
;
auto tmp7 = tmp2 ? tmp3() : static_cast<decltype(tmp3())>(0.0);
out_ptr0[static_cast<long>(i2 + (8L*i1) + (25096L*i0))] = tmp7;
}
}
}
}
}
}
```
after:
```
cpp_fused_constant_pad_nd_mul_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0)
{
#pragma omp parallel num_threads(40)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(64L); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3137L); i1+=static_cast<long>(1L))
{
#pragma omp simd simdlen(8)
for(long i2=static_cast<long>(0L); i2<static_cast<long>(8L); i2+=static_cast<long>(1L))
{
auto tmp0 = static_cast<long>((-1L) + i1);
auto tmp1 = static_cast<long>(0);
auto tmp2 = tmp0 >= tmp1;
auto tmp3 = [&]
{
auto tmp4 = in_ptr0[static_cast<long>((-8L) + i2 + (8L*i1) + (25088L*i0))];
auto tmp5 = in_ptr1[static_cast<long>((-1L) + i1 + (3136L*i2) + (25088L*i0))];
auto tmp6 = decltype(tmp4)(tmp4 * tmp5);
return tmp6;
}
;
auto tmp7 = tmp2 ? tmp3() : static_cast<decltype(tmp3())>(0.0);
out_ptr0[static_cast<long>(i2 + (8L*i1) + (25096L*i0))] = tmp7;
}
}
}
}
}
}
''')
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106765
Approved by: https://github.com/jgong5, https://github.com/jansel