[inductor][cpp] support non contiguous vectorization codegen (#99966)
Currently, cpp vectorization is supported only when the node has at least one contiguous index. This PR enables cpp vectorization when all indices in the node are non-contiguous; specifically, the innermost index is selected as the tiling index.
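For illustration, here is a hypothetical reproducer (not part of this PR, shapes are made up) in the spirit of GPT-J's `rotate_every_two` rotary-embedding helper: the stride-2 slices make every memory access in the generated kernel non-contiguous, so before this change the kernel fell back to scalar code.
```python
import torch

# Hypothetical sketch (not from the PR): stride-2 slices similar to GPT-J's
# rotate_every_two make every access in the generated kernel non-contiguous.
def rotate_every_two(x):
    x1 = x[..., ::2]    # even lanes: offset 0, stride 2 -> non-contiguous
    x2 = x[..., 1::2]   # odd lanes:  offset 1, stride 2 -> non-contiguous
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

compiled_fn = torch.compile(rotate_every_two)
x = torch.randn(16, 32, 256, dtype=torch.bfloat16)
torch.testing.assert_close(compiled_fn(x), rotate_every_two(x))
```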
### Validation
For end-to-end performance and functionality, both the inference and training model suites were validated with float32 and bfloat16 data types. The results show no performance regression and no new failures compared with the baseline.
### Code
This change allows certain kernels in GPT-J to be vectorized. Below is a snippet showing how the generated code changes.
**Before**
```
{
    #pragma GCC ivdep
    for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
    {
        #pragma GCC ivdep
        for(long i1=static_cast<long>(0L); i1<static_cast<long>(32L); i1+=static_cast<long>(1L))
        {
            auto tmp0 = in_ptr0[static_cast<long>(1L + (2L*i1) + (256L*i0))];
            auto tmp1 = static_cast<float>(tmp0);
            auto tmp2 = decltype(tmp1)(-tmp1);
            auto tmp3 = static_cast<bfloat16>(tmp2);
            out_ptr0[static_cast<long>((2L*i1) + (64L*i0))] = tmp3;
        }
    }
}
{
    #pragma GCC ivdep
    for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
    {
        #pragma GCC ivdep
        for(long i1=static_cast<long>(0L); i1<static_cast<long>(32L); i1+=static_cast<long>(1L))
        {
            auto tmp0 = in_ptr0[static_cast<long>((2L*i1) + (256L*i0))];
            out_ptr1[static_cast<long>((2L*i1) + (64L*i0))] = tmp0;
        }
    }
}
```
**After**
```
{
    #pragma GCC ivdep
    for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
    {
        for(long i1=static_cast<long>(0L); i1<static_cast<long>(32L); i1+=static_cast<long>(16L))
        {
            auto tmp0 = ([&]() { __at_align__ bfloat16 tmpbuf[16 * 2]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>(1L + (2L*i1_inner) + (2L*i1) + (256L*i0))]; return load_bf16_as_float(tmpbuf); })();
            auto tmp1 = (tmp0);
            auto tmp2 = tmp1.neg();
            auto tmp3 = (tmp2);
            { __at_align__ bfloat16 tmpbuf[16*sizeof(float)/sizeof(bfloat16)]; store_float_as_bf16(tmpbuf, tmp3); for (long i1_inner = 0; i1_inner < 16; i1_inner++) out_ptr0[static_cast<long>((2L*i1_inner) + (2L*i1) + (64L*i0))] = tmpbuf[i1_inner]; }
        }
    }
}
{
    #pragma GCC ivdep
    for(long i0=static_cast<long>(0L); i0<static_cast<long>(16L*ks0); i0+=static_cast<long>(1L))
    {
        for(long i1=static_cast<long>(0L); i1<static_cast<long>(32L); i1+=static_cast<long>(16L))
        {
            auto tmp0 = ([&]() { __at_align__ bfloat16 tmpbuf[16 * 2]; for (long i1_inner = 0; i1_inner < 16; i1_inner++) tmpbuf[i1_inner] = in_ptr0[static_cast<long>((2L*i1_inner) + (2L*i1) + (256L*i0))]; return at::vec::Vectorized<bfloat16>::loadu(tmpbuf, 16); })();
            { __at_align__ bfloat16 tmpbuf[16*sizeof(float)/sizeof(bfloat16)]; tmp0.store(tmpbuf, 16); for (long i1_inner = 0; i1_inner < 16; i1_inner++) out_ptr1[static_cast<long>((2L*i1_inner) + (2L*i1) + (64L*i0))] = tmpbuf[i1_inner]; }
        }
    }
}
```
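Because the accesses are strided, the vectorized kernels cannot use a plain vector load or store: the generated code gathers 16 elements into an aligned temporary buffer (`tmpbuf`) with a scalar inner loop, loads the buffer as a vector (`load_bf16_as_float` / `at::vec::Vectorized<bfloat16>::loadu`), applies the vectorized operation (e.g. `neg`), and scatters the result back through another temporary buffer.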
Pull Request resolved: https://github.com/pytorch/pytorch/pull/99966
Approved by: https://github.com/jgong5, https://github.com/jansel