[Quant][Inductor] Fix the non contiguous load with uint8 data type (#106958)
**Summary**
Currently, the vectorized load code generation for `non_contiguous` access with the `uint8` data type has an issue in determining the data type. It caused wrong results in the `shufflenet_v2_x1_0` model after we enabled the `cat` quantization recipe.
- Previously, the generated code for the example in this PR was:
```
cpp_fused_clone_view_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const unsigned char* in_ptr0,
float* out_ptr0)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(232L); i0+=static_cast<long>(1L))
{
for(long i1=static_cast<long>(0L); i1<static_cast<long>(784L); i1+=static_cast<long>(16L))
{
auto tmp0 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = flag_to_float_scalar(in_ptr0[static_cast<long>((116L*(static_cast<long>(i0) % static_cast<long>(2L))) + (232L*i1) + (232L*i1_inner) + (at::native::div_floor_integer(i0, 2L)))]); return at::vec::Vectorized<uint8_t>::loadu_one_fourth(tmpbuf); })();
auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
auto tmp3 = tmp1 - tmp2;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1.0));
auto tmp5 = tmp3 * tmp4;
auto tmp6 = tmp5 * tmp4;
auto tmp7 = tmp6.round();
auto tmp8 = tmp7 + tmp2;
auto tmp9 = at::vec::maximum(tmp8, tmp2);
auto tmp10 = at::vec::Vectorized<float>(static_cast<float>(255.0));
auto tmp11 = at::vec::minimum(tmp9, tmp10);
auto tmp12 = at::vec::convert_float_to_uint8(tmp11);
auto tmp13 = at::vec::convert_uint8_to_float(tmp12);
auto tmp14 = tmp13 - tmp2;
auto tmp15 = tmp14 * tmp4;
tmp15.store(out_ptr0 + static_cast<long>(i1 + (784L*i0)));
}
}
}
}
}
''')
```
- After this PR, the generated code is:
```
cpp_fused_clone_view_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const unsigned char* in_ptr0,
float* out_ptr0)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(232L); i0+=static_cast<long>(1L))
{
for(long i1=static_cast<long>(0L); i1<static_cast<long>(784L); i1+=static_cast<long>(16L))
{
auto tmp0 = ([&]() { __at_align__ unsigned char tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>((116L*(static_cast<long>(i0) % static_cast<long>(2L))) + (232L*i1) + (232L*i1_inner) + (at::native::div_floor_integer(i0, 2L)))]; return at::vec::Vectorized<uint8_t>::loadu_one_fourth(tmpbuf); })();
auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
auto tmp3 = tmp1 - tmp2;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1.0));
auto tmp5 = tmp3 * tmp4;
auto tmp6 = tmp5 * tmp4;
auto tmp7 = tmp6.round();
auto tmp8 = tmp7 + tmp2;
auto tmp9 = at::vec::maximum(tmp8, tmp2);
auto tmp10 = at::vec::Vectorized<float>(static_cast<float>(255.0));
auto tmp11 = at::vec::minimum(tmp9, tmp10);
auto tmp12 = at::vec::convert_float_to_uint8(tmp11);
auto tmp13 = at::vec::convert_uint8_to_float(tmp12);
auto tmp14 = tmp13 - tmp2;
auto tmp15 = tmp14 * tmp4;
tmp15.store(out_ptr0 + static_cast<long>(i1 + (784L*i0)));
}
}
}
}
}
''')
```
**Test Plan**
```
clear && python -m pytest test_cpu_repro.py -k test_non_contiguous_load_buf_quant
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106958
Approved by: https://github.com/jgong5, https://github.com/eellison
ghstack dependencies: #106836, #106838