pytorch
9c2ed257 - Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (#32383)

Commit

4 years ago

Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (#32383) Summary: Step 2 of https://github.com/pytorch/pytorch/issues/31975 Vectorized memory access is enabled. Generated code: https://github.com/zasdfgbnm/things/blob/master/2020Q1/disassembly-elementwise-vec.ipynb ``` void at::native::modern::elementwise_kernel<4, 64, 4, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3> >(int, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}, at::detail::Array<char*, 3>) **ASM:** .section .text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,"ax",progbits .sectioninfo @"SHI_REGISTERS=20" .align 128 .global _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_ .type _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,function .size _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,(.L_40898 - 
_ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_) .other _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_,@"STO_CUDA_ENTRY STV_DEFAULT" _ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_: .text._ZN2at6native6modern18elementwise_kernelILi4ELi64ELi4EZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE2_clEvEUlffE_NS_6detail5ArrayIPcLi3EEEEEviT2_T3_: //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294 /*0000*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /*0010*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /*0020*/ S2R R9, SR_CTAID.X ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 177 /*0030*/ S2R R0, SR_TID.X ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 294 /*0040*/ IMAD.SHL.U32 R9, R9, 0x100, RZ ; /*0050*/ IADD3 R5, -R9, c[0x0][0x160], RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256 /*0060*/ SHF.R.S32.HI R17, RZ, 0x1f, R9 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 296 /*0070*/ ISETP.GE.AND P0, PT, R5, 0x100, PT ; /*0080*/ @!P0 BRA `(.L_3173) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256 /*0090*/ IMAD.SHL.U32 R12, R9.reuse, 0x4, RZ ; /*00a0*/ SHF.L.U64.HI R17, R9, 0x2, R17 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 260 /*00b0*/ IADD3 R8, P0, R12.reuse, c[0x0][0x188], RZ ; /*00c0*/ IADD3 R2, P1, R12, c[0x0][0x190], RZ ; /*00d0*/ IADD3.X R9, R17.reuse, c[0x0][0x18c], RZ, P0, !PT ; /*00e0*/ IADD3.X R3, R17, c[0x0][0x194], RZ, P1, !PT ; //## File 
"/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 218 /*00f0*/ IMAD.WIDE R8, R0, 0x10, R8 ; /*0100*/ IMAD.WIDE R2, R0, 0x10, R2 ; /*0110*/ LDG.E.128.SYS R8, [R8] ; /*0120*/ LDG.E.128.SYS R4, [R2] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 256 /*0130*/ IADD3 R12, P0, R12, c[0x0][0x180], RZ ; /*0140*/ IADD3.X R13, R17, c[0x0][0x184], RZ, P0, !PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238 /*0150*/ IMAD.WIDE R12, R0, 0x10, R12 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196 /*0160*/ FFMA R7, R7, c[0x0][0x168], R11 ; /*0170*/ FFMA R6, R6, c[0x0][0x168], R10 ; /*0180*/ FFMA R5, R5, c[0x0][0x168], R9 ; /*0190*/ FFMA R4, R4, c[0x0][0x168], R8 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 238 /*01a0*/ STG.E.128.SYS [R12], R4 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 301 /*01b0*/ EXIT ; .L_3173: //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*01c0*/ ISETP.GE.AND P0, PT, R0, R5, PT ; /*01d0*/ BMOV.32.CLEAR RZ, B0 ; /*01e0*/ BSSY B0, `(.L_3174) ; /*01f0*/ P0 BRA `(.L_3175) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*0200*/ IADD3 R3, P1, R9, R0, RZ ; /*0210*/ LEA.HI.X.SX32 R4, R0, R17, 0x1, P1 ; /*0220*/ LEA R2, P1, R3, c[0x0][0x188], 0x2 ; /*0230*/ LEA.HI.X R3, R3, c[0x0][0x18c], R4, 0x2, P1 ; /*0240*/ LDG.E.SYS R8, [R2] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*0250*/ IADD3 R4, R0, 0x40, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*0260*/ ISETP.GE.AND P1, PT, R4, R5, PT ; /*0270*/ P1 BRA `(.L_3175) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*0280*/ LDG.E.SYS R4, [R2+0x100] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*0290*/ 
IADD3 R6, R0, 0x80, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*02a0*/ ISETP.GE.AND P1, PT, R6, R5, PT ; /*02b0*/ P1 BRA `(.L_3175) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*02c0*/ IADD3 R10, R0, 0xc0, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*02d0*/ LDG.E.SYS R7, [R2+0x200] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*02e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*02f0*/ @!P1 LDG.E.SYS R6, [R2+0x300] ; .L_3175: /*0300*/ BSYNC B0 ; .L_3174: //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*0310*/ BMOV.32.CLEAR RZ, B0 ; /*0320*/ BSSY B0, `(.L_3176) ; /*0330*/ P0 BRA `(.L_3177) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*0340*/ IADD3 R3, P1, R9, R0, RZ ; /*0350*/ LEA.HI.X.SX32 R10, R0, R17, 0x1, P1 ; /*0360*/ LEA R2, P1, R3, c[0x0][0x190], 0x2 ; /*0370*/ LEA.HI.X R3, R3, c[0x0][0x194], R10, 0x2, P1 ; /*0380*/ LDG.E.SYS R11, [R2] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*0390*/ IADD3 R10, R0, 0x40, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*03a0*/ ISETP.GE.AND P1, PT, R10, R5, PT ; /*03b0*/ P1 BRA `(.L_3177) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*03c0*/ LDG.E.SYS R13, [R2+0x100] ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*03d0*/ IADD3 R10, R0, 0x80, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*03e0*/ ISETP.GE.AND P1, PT, R10, R5, PT ; /*03f0*/ P1 BRA `(.L_3177) ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 184 /*0400*/ IADD3 R10, R0, 0xc0, RZ ; //## File 
"/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 180 /*0410*/ ISETP.GE.AND P1, PT, R10, R5, PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 183 /*0420*/ LDG.E.SYS R10, [R2+0x200] ; /*0430*/ @!P1 LDG.E.SYS R15, [R2+0x300] ; .L_3177: /*0440*/ BSYNC B0 ; .L_3176: //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*0450*/ P0 EXIT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*0460*/ IADD3 R9, P0, R9, R0, RZ ; /*0470*/ FFMA R11, R11, c[0x0][0x168], R8 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197 /*0480*/ IADD3 R14, R0, 0x40, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*0490*/ LEA.HI.X.SX32 R12, R0, R17, 0x1, P0 ; /*04a0*/ LEA R2, P0, R9.reuse, c[0x0][0x180], 0x2 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*04b0*/ ISETP.GE.AND P1, PT, R14, R5, PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*04c0*/ LEA.HI.X R3, R9, c[0x0][0x184], R12, 0x2, P0 ; /*04d0*/ STG.E.SYS [R2], R11 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*04e0*/ P1 EXIT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197 /*04f0*/ IADD3 R8, R0, 0x80, RZ ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196 /*0500*/ FFMA R13, R13, c[0x0][0x168], R4 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*0510*/ ISETP.GE.AND P0, PT, R8, R5, PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*0520*/ STG.E.SYS [R2+0x100], R13 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*0530*/ P0 EXIT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 197 /*0540*/ IADD3 R0, R0, 0xc0, RZ ; 
//## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/Loops.cuh", line 196 /*0550*/ FFMA R7, R10, c[0x0][0x168], R7 ; /*0560*/ FFMA R15, R15, c[0x0][0x168], R6 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*0570*/ ISETP.GE.AND P0, PT, R0, R5, PT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*0580*/ STG.E.SYS [R2+0x200], R7 ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 193 /*0590*/ P0 EXIT ; //## File "/home/xgao/pytorch/aten/src/ATen/native/cuda/MemoryAccess.cuh", line 196 /*05a0*/ STG.E.SYS [R2+0x300], R15 ; /*05b0*/ EXIT ; .L_3178: /*05c0*/ BRA `(.L_3178); /*05d0*/ NOP; /*05e0*/ NOP; /*05f0*/ NOP; .L_40898: ``` We can clearly see the `LDG.E.128` in it, which is a result of vectorization. Benchmark: https://github.com/zasdfgbnm/things/blob/master/2020Q1/benchmark-vec.ipynb Benchmark on P100, dtype `uint8`: before: ``` 1.4.0a0+a5b4d78 e1d97025eeeddcf083e9bee0c8f6a53168991a71 22.2 µs ± 89.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 34.7 µs ± 38.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 52 µs ± 312 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 86.9 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 154 µs ± 204 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 291 µs ± 668 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 566 µs ± 1.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 1.18 ms ± 1.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 2.29 ms ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 4.4 ms ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` after: ``` 1.4.0a0+a5b4d78 1281cdfd8188fe86241ecaf71d001809d016c3a3 24 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 30.5 µs ± 355 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 43.1 µs ± 300 ns per loop (mean ± std. dev. 
of 7 runs, 10000 loops each) 67.6 µs ± 113 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 116 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 215 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 413 µs ± 791 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 824 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 1.63 ms ± 478 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 3.19 ms ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` Benchmark on P100, dtype `half`: Before: ``` 1.4.0a0+a5b4d78 1c017f0c14c91bd5125ab387a90441b0c0e2f3ad 30.8 µs ± 226 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 43.4 µs ± 164 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 69.1 µs ± 83 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 119 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 224 µs ± 99.1 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 418 µs ± 206 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 865 µs ± 237 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 1.69 ms ± 695 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 3.3 ms ± 527 ns per loop (mean ± std. dev. of 7 runs, 100 loops each) 6.77 ms ± 741 ns per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` After: ``` 1.4.0a0+a5b4d78 7e50ee27333e7047072d328d03767b4845286356 28.9 µs ± 61.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 40.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 63.8 µs ± 350 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 109 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 199 µs ± 157 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each) 380 µs ± 446 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each) 743 µs ± 2.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 1.47 ms ± 1.34 µs per loop (mean ± std. dev. 
of 7 runs, 1000 loops each) 2.91 ms ± 9.17 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 5.8 ms ± 296 ns per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` cc: csarofeen ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/32383 Differential Revision: D19697455 Pulled By: ngimel fbshipit-source-id: 0707481c2f334e6634c000b4afd275b2fee8fbe1

Author

zasdfgbnm

Committer

facebook-github-bot

Parents

4baadd54

pytorch 9c2ed257 - Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (#32383)

pytorch
9c2ed257 - Vectorized memory access in TensorIterator GPU loop for 1d contiguous case (#32383)