[Quant][Inductor] Fix the non contiguous load with uint8 data type (#106958)
**Summary**
Currently, the vectorized load code generation for `non_contiguous` access with the `uint8` data type has an issue in determining the data type. It caused wrong results in the `shufflenet_v2_x1_0` model after we enabled the `cat` quantization recipe.
- Previously, the generated code for the example in this PR was:
```
cpp_fused_clone_view_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const unsigned char* in_ptr0,
float* out_ptr0)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(232L); i0+=static_cast<long>(1L))
{
for(long i1=static_cast<long>(0L); i1<static_cast<long>(784L); i1+=static_cast<long>(16L))
{
auto tmp0 = ([&]() { __at_align__ float tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = flag_to_float_scalar(in_ptr0[static_cast<long>((116L*(static_cast<long>(i0) % static_cast<long>(2L))) + (232L*i1) + (232L*i1_inner) + (at::native::div_floor_integer(i0, 2L)))]); return at::vec::Vectorized<uint8_t>::loadu_one_fourth(tmpbuf); })();
auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
auto tmp3 = tmp1 - tmp2;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1.0));
auto tmp5 = tmp3 * tmp4;
auto tmp6 = tmp5 * tmp4;
auto tmp7 = tmp6.round();
auto tmp8 = tmp7 + tmp2;
auto tmp9 = at::vec::maximum(tmp8, tmp2);
auto tmp10 = at::vec::Vectorized<float>(static_cast<float>(255.0));
auto tmp11 = at::vec::minimum(tmp9, tmp10);
auto tmp12 = at::vec::convert_float_to_uint8(tmp11);
auto tmp13 = at::vec::convert_uint8_to_float(tmp12);
auto tmp14 = tmp13 - tmp2;
auto tmp15 = tmp14 * tmp4;
tmp15.store(out_ptr0 + static_cast<long>(i1 + (784L*i0)));
}
}
}
}
}
''')
```
- After this PR, the generated code is:
```
cpp_fused_clone_view_0 = async_compile.cpp('''
#include "/tmp/torchinductor_root/i5/ci5uspp363v3ky6jkccllm3bxudy2fkdpqinkqhmpehfihejs7ko.h"
extern "C" void kernel(const unsigned char* in_ptr0,
float* out_ptr0)
{
#pragma omp parallel num_threads(56)
{
{
#pragma omp for
for(long i0=static_cast<long>(0L); i0<static_cast<long>(232L); i0+=static_cast<long>(1L))
{
for(long i1=static_cast<long>(0L); i1<static_cast<long>(784L); i1+=static_cast<long>(16L))
{
auto tmp0 = ([&]() { __at_align__ unsigned char tmpbuf[16]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>((116L*(static_cast<long>(i0) % static_cast<long>(2L))) + (232L*i1) + (232L*i1_inner) + (at::native::div_floor_integer(i0, 2L)))]; return at::vec::Vectorized<uint8_t>::loadu_one_fourth(tmpbuf); })();
auto tmp1 = at::vec::convert_uint8_to_float(tmp0);
auto tmp2 = at::vec::Vectorized<float>(static_cast<float>(0.0));
auto tmp3 = tmp1 - tmp2;
auto tmp4 = at::vec::Vectorized<float>(static_cast<float>(1.0));
auto tmp5 = tmp3 * tmp4;
auto tmp6 = tmp5 * tmp4;
auto tmp7 = tmp6.round();
auto tmp8 = tmp7 + tmp2;
auto tmp9 = at::vec::maximum(tmp8, tmp2);
auto tmp10 = at::vec::Vectorized<float>(static_cast<float>(255.0));
auto tmp11 = at::vec::minimum(tmp9, tmp10);
auto tmp12 = at::vec::convert_float_to_uint8(tmp11);
auto tmp13 = at::vec::convert_uint8_to_float(tmp12);
auto tmp14 = tmp13 - tmp2;
auto tmp15 = tmp14 * tmp4;
tmp15.store(out_ptr0 + static_cast<long>(i1 + (784L*i0)));
}
}
}
}
}
''')
```
**Test Plan**
```
clear && python -m pytest test_cpu_repro.py -k test_non_contiguous_load_buf_quant
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106958
Approved by: https://github.com/jgong5, https://github.com/eellison
ghstack dependencies: #106836, #106838