TorchDynamo: don't compute index for max_pooling when return_index is false (#89838)
For max_pooling, if return_index is **False**, we don't need to compute the index, so the generated kernel can skip the index bookkeeping and only compute the max value.
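Before (the generated kernel computes index candidates and selects among them, even though only the max value is stored to the output):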
```
extern "C" void kernel(const float* __restrict__ in_ptr0,
float* __restrict__ out_ptr0)
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<3; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3; i2+=1)
{
#pragma GCC ivdep
for(long i3=0; i3<3; i3+=1)
{
{
{
auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp2 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp7 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp12 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp17 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp22 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp27 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp32 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp37 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp1 = static_cast<long>((2*i2) + (14*i1));
auto tmp3 = static_cast<long>(1 + (2*i2) + (14*i1));
auto tmp4 = tmp2 > tmp0;
auto tmp5 = tmp4 ? tmp3 : tmp1;
auto tmp6 = (tmp0 != tmp0) ? tmp0 : std::max(tmp2, tmp0);
auto tmp8 = static_cast<long>(2 + (2*i2) + (14*i1));
auto tmp9 = tmp7 > tmp6;
auto tmp10 = tmp9 ? tmp8 : tmp5;
auto tmp11 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
auto tmp13 = static_cast<long>(7 + (2*i2) + (14*i1));
auto tmp14 = tmp12 > tmp11;
auto tmp15 = tmp14 ? tmp13 : tmp10;
auto tmp16 = (tmp11 != tmp11) ? tmp11 : std::max(tmp12, tmp11);
auto tmp18 = static_cast<long>(8 + (2*i2) + (14*i1));
auto tmp19 = tmp17 > tmp16;
auto tmp20 = tmp19 ? tmp18 : tmp15;
auto tmp21 = (tmp16 != tmp16) ? tmp16 : std::max(tmp17, tmp16);
auto tmp23 = static_cast<long>(9 + (2*i2) + (14*i1));
auto tmp24 = tmp22 > tmp21;
auto tmp25 = tmp24 ? tmp23 : tmp20;
auto tmp26 = (tmp21 != tmp21) ? tmp21 : std::max(tmp22, tmp21);
auto tmp28 = static_cast<long>(14 + (2*i2) + (14*i1));
auto tmp29 = tmp27 > tmp26;
auto tmp30 = tmp29 ? tmp28 : tmp25;
auto tmp31 = (tmp26 != tmp26) ? tmp26 : std::max(tmp27, tmp26);
auto tmp33 = static_cast<long>(15 + (2*i2) + (14*i1));
auto tmp34 = tmp32 > tmp31;
auto tmp35 = tmp34 ? tmp33 : tmp30;
auto tmp36 = (tmp31 != tmp31) ? tmp31 : std::max(tmp32, tmp31);
auto tmp38 = static_cast<long>(16 + (2*i2) + (14*i1));
auto tmp39 = tmp37 > tmp36;
auto tmp40 = tmp39 ? tmp38 : tmp35;
auto tmp41 = (tmp36 != tmp36) ? tmp36 : std::max(tmp37, tmp36);
out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp41;
}
}
}
}
}
}
}
```
After (the generated kernel only computes the running max):
```
extern "C" void kernel(const float* __restrict__ in_ptr0,
float* __restrict__ out_ptr0)
{
#pragma GCC ivdep
for(long i0=0; i0<128; i0+=1)
{
#pragma GCC ivdep
for(long i1=0; i1<3; i1+=1)
{
#pragma GCC ivdep
for(long i2=0; i2<3; i2+=1)
{
#pragma GCC ivdep
for(long i3=0; i3<3; i3+=1)
{
{
{
auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp1 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp3 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp5 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp7 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp9 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp11 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp13 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp15 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
auto tmp2 = (tmp0 != tmp0) ? tmp0 : std::max(tmp1, tmp0);
auto tmp4 = (tmp2 != tmp2) ? tmp2 : std::max(tmp3, tmp2);
auto tmp6 = (tmp4 != tmp4) ? tmp4 : std::max(tmp5, tmp4);
auto tmp8 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
auto tmp10 = (tmp8 != tmp8) ? tmp8 : std::max(tmp9, tmp8);
auto tmp12 = (tmp10 != tmp10) ? tmp10 : std::max(tmp11, tmp10);
auto tmp14 = (tmp12 != tmp12) ? tmp12 : std::max(tmp13, tmp12);
auto tmp16 = (tmp14 != tmp14) ? tmp14 : std::max(tmp15, tmp14);
out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp16;
}
}
}
}
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89838
Approved by: https://github.com/jgong5, https://github.com/jansel