inductor: support masked load for cpu path (#107670)
For max_pooling code:
```
#pragma GCC ivdep
for(long i2=static_cast<long>(0L); i2<static_cast<long>(56L); i2+=static_cast<long>(1L))
{
for(long i3=static_cast<long>(0L); i3<static_cast<long>(64L); i3+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<int>(static_cast<int>((-1L) + (2L*i1)));
auto tmp1 = at::vec::Vectorized<int>(static_cast<int>(0));
auto tmp2 = to_float_mask(tmp0 >= tmp1);
auto tmp3 = at::vec::Vectorized<int>(static_cast<int>(112));
auto tmp4 = to_float_mask(tmp0 < tmp3);
auto tmp5 = tmp2 & tmp4;
auto tmp6 = at::vec::Vectorized<int>(static_cast<int>((-1L) + (2L*i2)));
auto tmp7 = to_float_mask(tmp6 >= tmp1);
auto tmp8 = to_float_mask(tmp6 < tmp3);
auto tmp9 = tmp7 & tmp8;
auto tmp10 = tmp5 & tmp9;
auto tmp11 = [&]
{
auto tmp12 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>((-7232L) + i3 + (128L*i2) + (14336L*i1) + (802816L*i0)), 16);
load
auto tmp13 = cvt_lowp_fp_to_fp32<bfloat16>(tmp12);
return tmp13;
}
;
auto tmp14 = decltype(tmp11())::blendv(at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity()), tmp11(), to_float_mask(tmp10));
```
The index used to load ```tmp12``` may be an incorrect (out-of-bounds) index: for example, with ```i1=0, i2=0, i3=0``` the index is ```-7232L```, which is not a valid index. Because ```tmp11()``` is called regardless of the mask, we may hit a segmentation fault when the load executes. The loaded value can only be read safely when ```tmp10``` (the index-check variable) is true, so this PR adds masked_load support to fix this issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107670
Approved by: https://github.com/jgong5, https://github.com/jansel