Vectorize in-place comparison operators (#33252)
Summary:
Benchmark: (Debian 10, Release build, gcc 8.3, no turbo, Intel(R) Xeon(R) E-2136 CPU @ 3.30GHz)
```python
import timeit
for op in ('gt', 'lt', 'ge', 'le', 'eq', 'ne'):
for dtype in ('torch.float', 'torch.double', 'torch.int16', 'torch.int32', 'torch.int64'):
for n, t in [(10_000, 100000),
(100_000, 10000)]:
print(f'a.{op}_(b), numel() == {n} for {t} times, dtype={dtype}')
print(timeit.timeit(f'a.{op}_(b)', setup=f'import torch; a = torch.arange(1, {n}, dtype={dtype}); b = torch.arange({n}, 1, -1, dtype={dtype})', number=t))
```
Before:
```
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.778998922000028
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6359690249992127
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.0801493119997758
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9360321379990637
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7341018620008981
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6345281440007966
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7396387640001194
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6429641230006382
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7759611700003006
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6672059659995284
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7724312530008319
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6392585769990546
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.7917451840003196
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6455550159989798
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.739991647998977
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6572993859990675
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7627949479992822
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6476544910001394
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7965036850000615
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6780715599998075
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7653547080008138
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6383065829995758
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.7895260240002244
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6508346030004759
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7409299750015634
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6383492870008922
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7620547579990671
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6474270239996258
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.8070051169997896
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6712598600006459
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7627660060006747
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6406353189995571
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.0826010620003217
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9391552950000914
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7427801039993938
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6365172640016681
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7679271510005492
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6453389289999905
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.788032889000533
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6708840760002204
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
1.078837263999958
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.9397531720005645
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
1.1031508050000411
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.9412319389994082
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.7509566959997755
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.638570957000411
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7592877549996047
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6458840529994632
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7984061539991671
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6776346309998189
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.7724407899986545
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.6581534130000364
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.8303323249983805
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.6954390920000151
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.745512373998281
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.6360954970004968
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.7569978400006221
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.6450422030011396
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.7889118379989668
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.6693385389989999
```
After:
```
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2444220920006046
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2031730359994981
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35491806199934217
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3905606850003096
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.16665379499863775
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10095906300011848
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21650469999985944
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18737469400002738
a.gt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35481256200000644
a.gt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.36696120199849247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21976138800164335
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.20275393200063263
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3695997209997586
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39441510399956314
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15657078300137073
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.0992998069996247
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.20425128799979575
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20352934599941364
a.lt_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35883567900054913
a.lt_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39059587599876977
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.21457727400047588
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.18836135499986995
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.35971907199927955
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.3688875009993353
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1576009280015569
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.09524034199966991
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2064543649994448
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.18726435600001423
a.ge_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35351785300008487
a.ge_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.3680737989998306
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2132134399998904
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2140274829998816
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.36539215199991304
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39128020300086064
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15712150600120367
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10149904400168452
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.2103407699996751
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.2134442910009966
a.le_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.35387034300038067
a.le_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.38917528399906587
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2190484450002259
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.2030815980015177
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3710030169986567
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.36419657899932645
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.15986497499943653
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10145393699895067
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21011781599918322
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.20121852699958254
a.eq_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.36681504499938455
a.eq_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.364472848999867
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.float
0.2290963309988001
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.float
0.21674784300012107
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.double
0.3829616689999966
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.double
0.39437660300063726
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int16
0.1661020749997988
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int16
0.10052955100036343
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int32
0.21827425599985872
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int32
0.21522501399886096
a.ne_(b), numel() == 10000 for 100000 times, dtype=torch.int64
0.37058242300008715
a.ne_(b), numel() == 100000 for 10000 times, dtype=torch.int64
0.39304063900090114
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33252
Differential Revision: D20254663
Pulled By: ezyang
fbshipit-source-id: 68b7109ec4359434afbeb96df372e29608f501bb