Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (#32383)
Summary:
Step 2 of https://github.com/pytorch/pytorch/issues/31975
Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb
```
void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3>)
**ASM:**
.section .text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",progbits
.sectioninfo @"SHI_REGISTERS=20"
.align 128
.global _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_
.type _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,function
.size _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_)
.other _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT"
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
.text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
/*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;
/*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;
/*0020*/ S2R R9, SR_CTAID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177
/*0030*/ S2R R0, SR_TID.X ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294
/*0040*/ IMAD.SHL.U32 R9, R9, 0x100, RZ ;
/*0050*/ IADD3 R5, -R9, c[0x0][0x160], RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
/*0060*/ SHF.R.S32.HI R17, RZ, 0x1f, R9 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296
/*0070*/ ISETP.GE.AND P0, PT, R5, 0x100, PT ;
/*0080*/ @!P0 BRA `(.L_3173) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
/*0090*/ IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ;
/*00a0*/ SHF.L.U64.HI R17, R9, 0x2, R17 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260
/*00b0*/ IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ;
/*00c0*/ IADD3 R2, P1, R12, c[0x0][0x190], RZ ;
/*00d0*/ IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ;
/*00e0*/ IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218
/*00f0*/ IMAD.WIDE R8, R0, 0x10, R8 ;
/*0100*/ IMAD.WIDE R2, R0, 0x10, R2 ;
/*0110*/ LDG.E.128.SYS R8, [R8] ;
/*0120*/ LDG.E.128.SYS R4, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256
/*0130*/ IADD3 R12, P0, R12, c[0x0][0x180], RZ ;
/*0140*/ IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
/*0150*/ IMAD.WIDE R12, R0, 0x10, R12 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
/*0160*/ FFMA R7, R7, c[0x0][0x168], R11 ;
/*0170*/ FFMA R6, R6, c[0x0][0x168], R10 ;
/*0180*/ FFMA R5, R5, c[0x0][0x168], R9 ;
/*0190*/ FFMA R4, R4, c[0x0][0x168], R8 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238
/*01a0*/ STG.E.128.SYS [R12], R4 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301
/*01b0*/ EXIT ;
.L_3173:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*01c0*/ ISETP.GE.AND P0, PT, R0, R5, PT ;
/*01d0*/ BMOV.32.CLEAR RZ, B0 ;
/*01e0*/ BSSY B0, `(.L_3174) ;
/*01f0*/ P0 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*0200*/ IADD3 R3, P1, R9, R0, RZ ;
/*0210*/ LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ;
/*0220*/ LEA R2, P1, R3, c[0x0][0x188], 0x2 ;
/*0230*/ LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ;
/*0240*/ LDG.E.SYS R8, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*0250*/ IADD3 R4, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*0260*/ ISETP.GE.AND P1, PT, R4, R5, PT ;
/*0270*/ P1 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*0280*/ LDG.E.SYS R4, [R2+0x100] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*0290*/ IADD3 R6, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*02a0*/ ISETP.GE.AND P1, PT, R6, R5, PT ;
/*02b0*/ P1 BRA `(.L_3175) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*02c0*/ IADD3 R10, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*02d0*/ LDG.E.SYS R7, [R2+0x200] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*02e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*02f0*/ @!P1 LDG.E.SYS R6, [R2+0x300] ;
.L_3175:
/*0300*/ BSYNC B0 ;
.L_3174:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*0310*/ BMOV.32.CLEAR RZ, B0 ;
/*0320*/ BSSY B0, `(.L_3176) ;
/*0330*/ P0 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*0340*/ IADD3 R3, P1, R9, R0, RZ ;
/*0350*/ LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ;
/*0360*/ LEA R2, P1, R3, c[0x0][0x190], 0x2 ;
/*0370*/ LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ;
/*0380*/ LDG.E.SYS R11, [R2] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*0390*/ IADD3 R10, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*03a0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
/*03b0*/ P1 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*03c0*/ LDG.E.SYS R13, [R2+0x100] ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*03d0*/ IADD3 R10, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*03e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
/*03f0*/ P1 BRA `(.L_3177) ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184
/*0400*/ IADD3 R10, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180
/*0410*/ ISETP.GE.AND P1, PT, R10, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183
/*0420*/ LDG.E.SYS R10, [R2+0x200] ;
/*0430*/ @!P1 LDG.E.SYS R15, [R2+0x300] ;
.L_3177:
/*0440*/ BSYNC B0 ;
.L_3176:
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*0450*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*0460*/ IADD3 R9, P0, R9, R0, RZ ;
/*0470*/ FFMA R11, R11, c[0x0][0x168], R8 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
/*0480*/ IADD3 R14, R0, 0x40, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*0490*/ LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ;
/*04a0*/ LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*04b0*/ ISETP.GE.AND P1, PT, R14, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*04c0*/ LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ;
/*04d0*/ STG.E.SYS [R2], R11 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*04e0*/ P1 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
/*04f0*/ IADD3 R8, R0, 0x80, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
/*0500*/ FFMA R13, R13, c[0x0][0x168], R4 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*0510*/ ISETP.GE.AND P0, PT, R8, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*0520*/ STG.E.SYS [R2+0x100], R13 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*0530*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197
/*0540*/ IADD3 R0, R0, 0xc0, RZ ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196
/*0550*/ FFMA R7, R10, c[0x0][0x168], R7 ;
/*0560*/ FFMA R15, R15, c[0x0][0x168], R6 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*0570*/ ISETP.GE.AND P0, PT, R0, R5, PT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*0580*/ STG.E.SYS [R2+0x200], R7 ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193
/*0590*/ P0 EXIT ;
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196
/*05a0*/ STG.E.SYS [R2+0x300], R15 ;
/*05b0*/ EXIT ;
.L_3178:
/*05c0*/ BRA `(.L_3178);
/*05d0*/ NOP;
/*05e0*/ NOP;
/*05f0*/ NOP;
.L_40898:
```
We can clearly see the `LDG.E.128` in it, which is a result of vectorization.
Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb
Benchmark on P100, dtype `uint8`:
Before:
```
1.4.0a0+a5b4d78
e1d97025eeeddcf083e9bee0c8f6a53168991a71
22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
After:
```
1.4.0a0+a5b4d78
1281cdfd8188fe86241ecaf71d001809d016c3a3
24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.1 µs ± 300 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
Benchmark on P100, dtype `half`:
Before:
```
1.4.0a0+a5b4d78
1c017f0c14c91bd5125ab387a90441b0c0e2f3ad
30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
After:
```
1.4.0a0+a5b4d78
7e50ee27333e7047072d328d03767b4845286356
28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.47 ms ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
cc: csarofeen ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/32383
Differential Revision: D19697455
Pulled By: ngimel
fbshipit-source-id: 0707481c2f334e6634c000b4afd275b2fee8fbe1