pytorch
b287cb81 - inductor: make the vec_transpose's tiling stride not depend on out_idx and tiling_idx (#103651)

Commit
1 year ago
inductor: make the vec_transpose's tiling stride not depend on out_idx and tiling_idx (#103651) For the TIMM swin_base_patch4_window7_224 dynamic shape path, there is an accuracy issue with horizontal reduction with vec_transpose: ``` #pragma omp for for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L)) { #pragma GCC ivdep for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L)) { { #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={{0}}) float tmp_acc0 = 0; auto tmp_acc0_vec = at::vec::Vectorized<float>(tmp_acc0); for(long i2=static_cast<long>(0L); i2<static_cast<long>(128L); i2+=static_cast<long>(16L)) { float tmp1[16*16] __attribute__ ((aligned (16))); at::vec::transpose_mxn<float,16,16>(in_ptr1 + static_cast<long>(i2 + (128L*(static_cast<long>((static_cast<long>(i1) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer(i1, 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>(i1) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer(i1, 392L))) + (401408L*i0)), static_cast<long>(((-50176L)*(at::native::div_floor_integer(i1, 392L))) + ((-6272L)*(at::native::div_floor_integer((static_cast<long>(i1) % static_cast<long>(56L)), 7L))) + ((-896L)*(static_cast<long>(at::native::div_floor_integer(i1, 56L)) % static_cast<long>(7L))) + ((-128L)*(static_cast<long>((static_cast<long>(i1) % static_cast<long>(56L))) % static_cast<long>(7L))) + (128L*(static_cast<long>((static_cast<long>((1L + i1)) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer((1L + i1), 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>((1L + i1)) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer((1L + i1), 392L)))), tmp1, 16); for (long 
i2_inner = 0; i2_inner < 16; i2_inner++) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i1 + (3136L*i2) + (3136L*i2_inner) + (401408L*i0))); auto tmp2 = at::vec::Vectorized<float>::loadu(tmp1 + static_cast<long>(16L*i2_inner)); auto tmp3 = tmp0 + tmp2; tmp_acc0_vec = tmp_acc0_vec + tmp3; } } tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (3136L*i0))); } } } ``` The ```transpose_mxn```'s ```ld_src``` depends on ```i1```, which is not expected. This PR adds a check to make sure the tiling stride doesn't depend on out_idx (```i2```) and tiling_idx (```i1```). After this PR, the generated code looks like this: ``` #pragma omp for for(long i0=static_cast<long>(0L); i0<static_cast<long>(ks0); i0+=static_cast<long>(1L)) { #pragma GCC ivdep for(long i1=static_cast<long>(0L); i1<static_cast<long>(3136L); i1+=static_cast<long>(16L)) { { #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out = omp_out + omp_in) initializer(omp_priv={{0}}) float tmp_acc0 = 0; auto tmp_acc0_vec = at::vec::Vectorized<float>(tmp_acc0); for(long i2=static_cast<long>(0L); i2<static_cast<long>(128L); i2+=static_cast<long>(16L)) { for (long i2_inner = 0; i2_inner < 16; i2_inner++) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i1 + (3136L*i2) + (3136L*i2_inner) + (401408L*i0))); auto tmp1 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr1[static_cast<long>(i2 + i2_inner + (128L*(static_cast<long>((static_cast<long>((i1 + i1_inner)) % static_cast<long>(56L))) % static_cast<long>(7L))) + (896L*(static_cast<long>(at::native::div_floor_integer((i1 + i1_inner), 56L)) % static_cast<long>(7L))) + (6272L*(at::native::div_floor_integer((static_cast<long>((i1 + i1_inner)) % static_cast<long>(56L)), 7L))) + (50176L*(at::native::div_floor_integer((i1 + i1_inner), 392L))) + (401408L*i0))]; return at::vec::Vectorized<float>::loadu(tmpbuf); })(); auto tmp2 = tmp0 + 
tmp1; tmp_acc0_vec = tmp_acc0_vec + tmp2; } } tmp_acc0_vec.store(out_ptr0 + static_cast<long>(i1 + (3136L*i0))); } } } ``` How to reproduce this issue: ``` python -m torch.backends.xeon.run_cpu --node_id 0 benchmarks/dynamo/timm_models.py --accuracy --float32 -dcpu --inference -n5 --inductor --dynamic-shapes --only swin_base_patch4_window7_224 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/103651 Approved by: https://github.com/jgong5, https://github.com/jansel
Author
Committer
Parents
Loading