inductor: make sure the vec_transpose's tiling stride doesn't depend on out_idx and tiling_idx (#103651)
On the dynamic-shape path of the TIMM model swin_base_patch4_window7_224, there is an accuracy issue in a horizontal reduction that uses vec_transpose:
```
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L))
{
{
#pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={{0}})
float tmp_acc0 = 0;
auto tmp_acc0_vec = at::vec::Vectorized<float>(tmp_acc0);
for(long i2=static_cast<long>(0L); i2<static_cast<long>(128L); i2+=static_cast<long>(16L))
{
float tmp1[16*16] __attribute__ ((aligned (16)));
at::vec::transpose_mxn<float,16,16>(in_ptr1 + static_cast<long>(i2 + (128L*(static_cast<long>((static_cast<long>(i1) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer(i1, 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>(i1) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer(i1, 392L))) + (401408L*i0)), static_cast<long>(((-50176L)*(at::native::div_floor_integer(i1, 392L))) + ((-6272L)*(at::native::div_floor_integer((static_cast<long>(i1) % static_cast<long>(56L)), 7L))) + ((-896L)*(static_cast<long>(at::native::div_floor_integer(i1, 56L)) % static_cast<long>(7L))) + ((-128L)*(static_cast<long>((static_cast<long>(i1) % static_cast<long>(56L))) % static_cast<long>(7L))) + (128L*(static_cast<long>((static_cast<long>((1L + i1)) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer((1L + i1), 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>((1L + i1)) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer((1L + i1), 392L)))), tmp1, 16);
for (long i2_inner = 0; i2_inner < 16; i2_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i1 + (3136L*i2) + (3136L*i2_inner) + (401408L*i0)));
auto tmp2 = at::vec::Vectorized<float>::loadu(tmp1 + static_cast<long>(16L*i2_inner));
auto tmp3 = tmp0 + tmp2;
tmp_acc0_vec = tmp_acc0_vec + tmp3;
}
}
tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (3136L*i0)));
}
}
}
```
The ```ld_src``` argument of ```transpose_mxn``` depends on ```i1```, which is not expected. This PR adds a check to make sure the tiling stride doesn't depend on out_idx (```i2```) or tiling_idx (```i1```).
After this PR, the generated code looks like this:
```
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L))
{
#pragma GCC ivdep
for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L))
{
{
#pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={{0}})
float tmp_acc0 = 0;
auto tmp_acc0_vec = at::vec::Vectorized<float>(tmp_acc0);
for(long i2=static_cast<long>(0L); i2<static_cast<long>(128L); i2+=static_cast<long>(16L))
{
for (long i2_inner = 0; i2_inner < 16; i2_inner++)
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i1 + (3136L*i2) + (3136L*i2_inner) + (401408L*i0)));
auto tmp1 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr1[static_cast<long>(i2 + i2_inner + (128L*(static_cast<long>((static_cast<long>((i1 + i1_inner)) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer((i1 + i1_inner), 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>((i1 + i1_inner)) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer((i1 + i1_inner), 392L))) + (401408L*i0))]; return at::vec::Vectorized<float>::loadu(tmpbuf); })();
auto tmp2 = tmp0 + tmp1;
tmp_acc0_vec = tmp_acc0_vec + tmp2;
}
}
tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (3136L*i0)));
}
}
}
```
How to reproduce this issue:
```
python -m torch.backends.xeon.run_cpu --node_id 0 benchmarks/dynamo/timm_models.py --accuracy --float32 -dcpu --inference -n5 --inductor --dynamic-shapes --only swin_base_patch4_window7_224
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/103651
Approved by: https://github.com/jgong5, https://github.com/jansel