[inductor] Lowering of rngprims philox_rand (#99289)
An example graph with dynamic shapes turned on.
`arg0_1` is the seed, `arg1_1` is the base offset.
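For reference, the graph below corresponds to compiling roughly the following function (a minimal reconstruction from the `# File:` comments embedded in the graph; the exact test body, the config flag used to enable RNG functionalization, and the input size are assumptions):
~~~
import torch
import torch._functorch.config as functorch_config

# Assumption: rand_like is only decomposed into rngprims.philox_rand when
# functionalized RNG is enabled; this flag name may differ across versions.
functorch_config.functionalize_rng_ops = True

@torch.compile(dynamic=True)
def fn(x):
    a = torch.rand_like(x) * x   # first philox_rand call
    a = torch.rand_like(x) * a   # second philox_rand call, uses an advanced offset
    return a

fn(torch.randn(1024, device="cuda"))
~~~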
~~~
===== Forward graph 0 =====
<eval_with_key>.5 class <lambda>(torch.nn.Module):
    def forward(self, arg0_1: i64[], arg1_1: i64[], arg2_1: Sym(s0), arg3_1: f32[s0]):
        # File: /scratch/anijain/work/pytorch/test/inductor/test_torchinductor.py:4605, code: a = torch.rand_like(x) * x
        add: i64[] = torch.ops.aten.add.Tensor(arg1_1, 0)
        philox_rand = torch.ops.rngprims.philox_rand.default([arg2_1], arg0_1, add, None, device(type='cuda', index=0), torch.float32); add = None
        getitem: f32[s0] = philox_rand[0]
        getitem_1: i64[] = philox_rand[1]; philox_rand = None
        add_1: i64[] = torch.ops.aten.add.Tensor(getitem_1, 0); getitem_1 = None
        mul: f32[s0] = torch.ops.aten.mul.Tensor(getitem, arg3_1); getitem = arg3_1 = None
        # File: /scratch/anijain/work/pytorch/test/inductor/test_torchinductor.py:4606, code: a = torch.rand_like(x) * a
        add_2: i64[] = torch.ops.aten.add.Tensor(arg1_1, add_1)
        philox_rand_1 = torch.ops.rngprims.philox_rand.default([arg2_1], arg0_1, add_2, None, device(type='cuda', index=0), torch.float32); arg2_1 = arg0_1 = add_2 = None
        getitem_2: f32[s0] = philox_rand_1[0]
        getitem_3: i64[] = philox_rand_1[1]; philox_rand_1 = None
        add_3: i64[] = torch.ops.aten.add.Tensor(add_1, getitem_3); add_1 = getitem_3 = None
        mul_1: f32[s0] = torch.ops.aten.mul.Tensor(getitem_2, mul); getitem_2 = mul = None
        # No stacktrace found for following nodes
        add_4: i64[] = torch.ops.aten.add.Tensor(arg1_1, add_3); arg1_1 = add_3 = None
        add_5: i64[] = torch.ops.aten.add.Tensor(add_4, 3); add_4 = None
        div: i64[] = torch.ops.aten.div.Tensor_mode(add_5, 4, rounding_mode = 'floor'); add_5 = None
        mul_2: i64[] = torch.ops.aten.mul.Tensor(div, 4); div = None
        return (mul_1, mul_2)
~~~
Note that the output `mul_2` is essentially the total `numel` consumed by the random ops, added to the base offset and rounded up to a multiple of 4.
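The trailing nodes (`add_4` through `mul_2`) implement that offset-advancement arithmetic; below is a minimal Python sketch of the same computation, assuming each `philox_rand` call's reported offset (`getitem_1`, `getitem_3`) equals its `numel`:
~~~
def advance_offset(base_offset: int, per_call_offsets: list[int]) -> int:
    total = sum(per_call_offsets)   # add_1, add_3 in the graph
    raw = base_offset + total       # add_4
    # add_5, div, mul_2: round up to a multiple of 4
    return ((raw + 3) // 4) * 4
~~~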
Pull Request resolved: https://github.com/pytorch/pytorch/pull/99289
Approved by: https://github.com/jansel