TensorIterator unrolling and vectorized load - step 0, 1 (#31974)
Summary:
These are steps 0 and 1 for https://github.com/pytorch/pytorch/issues/31975:
- The old code is moved to namespace `legacy`.
- New `elementwise_kernel` and `launch_kernel` functions are added to namespace `modern`; they only support the 1d contiguous case for now.
- In `gpu_kernel_impl`, dispatch to the new code if the problem is trivially 1d contiguous (see the sketch after this list).
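Below is a minimal sketch of the shape of the new fast path, assuming the structure described above. The `modern::elementwise_kernel` / `modern::launch_kernel` names come from this PR, but the launch configuration, the index-functor interface, and everything else in the snippet are illustrative assumptions rather than the PR's actual code:
```
// Sketch only: a 1d-contiguous fast path where each thread handles a fixed number
// of elements addressed by a flat index. Constants and the functor interface are
// assumed for illustration.
#include <cuda_runtime.h>

namespace modern {

constexpr int num_threads = 64;      // assumed block size
constexpr int thread_work_size = 4;  // assumed unroll factor
constexpr int block_work_size = num_threads * thread_work_size;

// `func_t` is any functor with a __device__ operator()(int) that reads the inputs
// and writes the output at the given flat index.
template <typename func_t>
__global__ void elementwise_kernel(int n, func_t f) {
  int base = blockIdx.x * block_work_size + threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    int idx = base + i * num_threads;
    if (idx < n) {
      f(idx);
    }
  }
}

template <typename func_t>
void launch_kernel(int n, const func_t& f) {
  if (n == 0) return;
  int grid = (n + block_work_size - 1) / block_work_size;
  elementwise_kernel<func_t><<<grid, num_threads>>>(n, f);
}

}  // namespace modern
```
With this shape, the dispatch in `gpu_kernel_impl` amounts to checking that the iterator is trivially 1d contiguous and calling `modern::launch_kernel`, falling back to the `legacy` path otherwise.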
In terms of performance, this PR affects elementwise operators on contiguous tensors. Performance improves slightly (up to 8%) for medium-sized tensors on Volta.
## compiled code
See https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise.ipynb
We can see that, previously, the add kernel compiled to
```
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 71
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
/*0020*/ S2R R0, SR_TID.X ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 73
/*0030*/ S2R R3, SR_CTAID.X ;
/*0040*/ IMAD R0, R3, 0x200, R0 ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 76
/*0050*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x160], PT ;
/*0060*/ @P0 EXIT ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 110
/*0070*/ IMAD R3, R0.reuse, c[0x0][0x194], RZ ;
/*0080*/ IMAD R6, R0, c[0x0][0x198], RZ ;
/*0090*/ IADD3 R4, P0, R3.reuse, c[0x0][0x178], RZ ;
/*00a0*/ IADD3 R2, P1, R6.reuse, c[0x0][0x180], RZ ;
/*00b0*/ LEA.HI.X.SX32 R5, R3, c[0x0][0x17c], 0x1, P0 ;
/*00c0*/ LEA.HI.X.SX32 R3, R6, c[0x0][0x184], 0x1, P1 ;
/*00d0*/ LDG.E.SYS R5, [R4] ;
/*00e0*/ LDG.E.SYS R2, [R2] ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 77
/*00f0*/ IMAD R0, R0, c[0x0][0x190], RZ ;
/*0100*/ IADD3 R6, P0, R0, c[0x0][0x170], RZ ;
/*0110*/ LEA.HI.X.SX32 R7, R0, c[0x0][0x174], 0x1, P0 ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 110
/*0120*/ FFMA R9, R2, c[0x0][0x1a0], R5 ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 170
/*0130*/ STG.E.SYS [R6], R9 ;
//## File "/home/xgao/pytorch-master/aten/src/ATen/native/cuda/Loops.cuh", line 81
/*0140*/ EXIT ;
.L_16826:
/*0150*/ BRA `(.L_16826);
/*0160*/ NOP;
/*0170*/ NOP;
.L_29063:
```
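For reference, the legacy path does roughly the following per element for `out = a + alpha * b`: one element per thread, with byte offsets computed at runtime from strides passed as kernel arguments, which is where the IMAD/IADD3/LEA address arithmetic above comes from. This is a simplified standalone stand-in, not the actual `legacy` code (which goes through TensorIterator's offset calculator and a generic functor):
```
// Simplified stand-in for the legacy strided kernel; the parameter layout is illustrative.
__global__ void legacy_add_kernel(int n, char* out, const char* a, const char* b,
                                  int out_stride, int a_stride, int b_stride,
                                  float alpha) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;  // the single ISETP/EXIT bounds check at the top of the SASS
  // per-thread address arithmetic from runtime strides (the IMAD/IADD3/LEA sequence above)
  float x = *reinterpret_cast<const float*>(a + idx * a_stride);
  float y = *reinterpret_cast<const float*>(b + idx * b_stride);
  // one fused multiply-add per element (the single FFMA above)
  *reinterpret_cast<float*>(out + idx * out_stride) = x + alpha * y;
}
```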
Now it compiles to
```
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 210
/*0000*/ MOV R1, c[0x0][0x28] ;
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
/*0020*/ S2R R6, SR_CTAID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 217
/*0030*/ MOV R7, 0x4 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 208
/*0040*/ S2R R3, SR_TID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 210
/*0050*/ LEA R6, R6, R3, 0x8 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 225
/*0060*/ IADD3 R2, R6.reuse, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 217
/*0070*/ IMAD.WIDE R4, R6.reuse, R7.reuse, c[0x0][0x190] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 225
/*0080*/ IADD3 R3, R6, 0x80, RZ ;
/*0090*/ ISETP.GE.AND P1, PT, R2, c[0x0][0x160], PT ;
/*00a0*/ ISETP.GE.AND P0, PT, R6.reuse, c[0x0][0x160], PT ;
/*00b0*/ ISETP.GE.AND P2, PT, R3, c[0x0][0x160], PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 217
/*00c0*/ IMAD.WIDE R2, R6.reuse, R7, c[0x0][0x188] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 225
/*00d0*/ IADD3 R14, R6, 0xc0, RZ ;
/*00e0*/ ISETP.GE.AND P3, PT, R14, c[0x0][0x160], PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 228
/*00f0*/ @!P1 LDG.E.SYS R11, [R4+0x100] ;
/*0100*/ @!P0 LDG.E.SYS R0, [R2] ;
/*0110*/ @!P0 LDG.E.SYS R9, [R4] ;
/*0120*/ @!P1 LDG.E.SYS R8, [R2+0x100] ;
/*0130*/ @!P2 LDG.E.SYS R10, [R2+0x200] ;
/*0140*/ @!P2 LDG.E.SYS R13, [R4+0x200] ;
/*0150*/ @!P3 LDG.E.SYS R12, [R2+0x300] ;
/*0160*/ @!P3 LDG.E.SYS R15, [R4+0x300] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 245
/*0170*/ IMAD.WIDE R6, R6, R7, c[0x0][0x180] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 191
/*0180*/ FFMA R9, R9, c[0x0][0x168], R0 ;
/*0190*/ FFMA R11, R11, c[0x0][0x168], R8 ;
/*01a0*/ FFMA R13, R13, c[0x0][0x168], R10 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 245
/*01b0*/ @!P0 STG.E.SYS [R6], R9 ;
/*01c0*/ @!P1 STG.E.SYS [R6+0x100], R11 ;
/*01d0*/ @!P2 STG.E.SYS [R6+0x200], R13 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 191
/*01e0*/ FFMA R15, R15, c[0x0][0x168], R12 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 244
/*01f0*/ @P3 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 245
/*0200*/ STG.E.SYS [R6+0x300], R15 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 248
/*0210*/ EXIT ;
.L_727:
/*0220*/ BRA `(.L_727);
/*0230*/ NOP;
/*0240*/ NOP;
/*0250*/ NOP;
/*0260*/ NOP;
/*0270*/ NOP;
.L_32233:
```
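The structure behind this SASS is a 4-way unrolled loop in which every load and store carries its own bounds check, so the compiler emits predicated (`@!P0` … `@!P3`) LDG/STG instructions instead of branches and can issue all of a block's loads up front. Below is a standalone illustration, assuming the 64-thread / 4-elements-per-thread configuration suggested by the index spacing above; the real kernel is the templated `modern::elementwise_kernel`, and the add functor is inlined here for clarity:
```
constexpr int kThreads = 64;  // per-block thread count assumed from the SASS above
constexpr int kUnroll = 4;    // elements per thread

__global__ void add_kernel_unrolled(int n, float* out, const float* a,
                                    const float* b, float alpha) {
  int base = blockIdx.x * kThreads * kUnroll + threadIdx.x;
  #pragma unroll
  for (int i = 0; i < kUnroll; i++) {
    int idx = base + i * kThreads;
    if (idx < n) {                         // becomes a predicate, not a branch
      out[idx] = a[idx] + alpha * b[idx];  // the FFMA per unrolled element
    }
  }
}
```
Keeping several loads in flight per thread before the arithmetic consumes them is presumably where the small speedup measured below comes from.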
## benchmark
The benchmark is for the add kernel on Volta.
See https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-unroll.ipynb
For tensor sizes from 2^20 to 2^30, previously we had
```
1.5.0a0+dedd16b
dedd16b4181cae81e37e978cd3bf24c1ba35ca05
33 µs ± 31.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
48.7 µs ± 75 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
78.9 µs ± 122 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
140 µs ± 51.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
261 µs ± 71.4 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
506 µs ± 159 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
993 µs ± 189 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.96 ms ± 139 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.9 ms ± 955 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.79 ms ± 187 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
Now we have
```
1.5.0a0+b1a239b
b1a239be8d529e89875fe47cd09964ef3a9516ac
30.4 µs ± 18 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
45.2 µs ± 46.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
75 µs ± 476 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
134 µs ± 192 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
253 µs ± 354 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
489 µs ± 138 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
961 µs ± 431 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.91 ms ± 578 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.8 ms ± 88.8 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.57 ms ± 763 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
The new code is slightly faster across all sizes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/31974
Differential Revision: D19450765
Pulled By: ngimel
fbshipit-source-id: 79601bfceb5da84ff87384ba8193793eb4095a2e