Add uint8 support for interpolate for CPU images (#90771)
Joint work with @vfdev-5
This PR introduces native uint8 support for `interpolate()`, for `bilinear` ~and `bicubic`~ modes for CPU images (`mode=nearest[_exact]` was already supported).
On a typical torchvision training job on ImageNet, the speedups are ~4X when AVX2 is supported, comparing native uint8 (this PR) vs torchvision's current `Resize()`:
```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)
input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR)
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 4X 2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 2.1X 1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 3X 2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 4X 2.4ms vs 0.6ms
(Note: we removed bicubic support for now)
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 4X 2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 5X 3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 3X 2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 4X 2.8ms vs 0.7ms
```
There is still room for further speed-ups (see TODOs in the code).
#### More benchmark details
with AVX2 support - speedups typically range from 1.5X to 10X. A few edge cases are slower; it is worth investigating why.
<details>
```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)
input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=1 5X 1.1ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=1 5X 1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=1 2.8X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=1 7X 1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=1 5X 1.2ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=1 12X 2.9ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=1 3X 0.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=1 7X 1.8ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=2 2.6X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=2 2.8X 0.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=2 1.7X 0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=2 1.4X 0.3ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=2 2.7X 0.7ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=2 7X 1.6ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=2 1.8X 0.4ms vs 0.2ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=2 4X 1.0ms vs 0.2ms
(1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=1 4X 2.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=1 3.0X 1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=1 3X 1.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=1 4X 2.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=1 4X 2.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=1 7X 4.3ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=1 3X 2.1ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=1 4X 2.6ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=2 2.7X 1.6ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=2 2.6X 1.5ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=2 2.1X 1.2ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=2 1.6X 0.9ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=2 2.8X 1.7ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=2 5X 2.8ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=2 2.3X 1.4ms vs 0.6ms
(1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=2 3X 1.9ms vs 0.6ms
(1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=1 4X 26.6ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=1 4X 23.9ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=1 2.5X 16.8ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=1 5X 33.1ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=1 4X 25.9ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=1 8X 59.6ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=1 1.9X 14.3ms vs 7.4ms
(1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=1 5X 35.4ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=2 2.0X 13.6ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=2 2.2X 14.8ms vs 6.7ms
(1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=2 1.3X 8.8ms vs 6.9ms
(1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=2 1.2X 8.4ms vs 6.8ms
(1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=2 1.8X 12.8ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=2 4X 32.1ms vs 7.2ms
(1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=2 1.4X 10.1ms vs 7.3ms
(1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=2 2.9X 20.9ms vs 7.3ms
(1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=1 1.4X 0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=1 0.7X 0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=1 1.3X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=1 1.4X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=1 2.1X 0.7ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=1 1.3X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=1 1.9X 0.6ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=1 1.0X 0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=2 1.0X 0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=2 0.6X 0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=2 0.8X 0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=2 1.4X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=2 1.4X 0.5ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=2 1.2X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=2 1.2X 0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=2 0.9X 0.3ms vs 0.3ms
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 4X 2.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 2.1X 1.3ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 3X 2.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 4X 2.4ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 4X 2.9ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 5X 3.1ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 3X 2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 4X 2.8ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=2 1.5X 1.0ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=2 1.2X 0.8ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=2 2.3X 1.5ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=2 1.9X 1.2ms vs 0.6ms
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=2 1.6X 1.2ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=2 4X 2.4ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=2 2.4X 1.6ms vs 0.7ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=2 2.8X 1.8ms vs 0.6ms
(1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=1 2.1X 12.8ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=1 0.6X 3.8ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=1 1.2X 7.1ms vs 6.1ms
(1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=1 1.9X 11.0ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=1 2.0X 12.6ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=1 1.0X 6.1ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=1 1.8X 11.3ms vs 6.4ms
(1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=1 0.8X 4.6ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=2 1.6X 9.3ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=2 0.3X 2.0ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=2 1.2X 7.2ms vs 6.0ms
(1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=2 0.3X 1.6ms vs 5.8ms
(1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=2 1.1X 7.1ms vs 6.5ms
(1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=2 0.6X 3.3ms vs 5.9ms
(1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=2 0.9X 5.9ms vs 6.3ms
(1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=2 0.4X 2.4ms vs 5.9ms
```
</details>
without AVX2 support - no significant speed-up, but there are various possible improvements (see TODOs)
<details>
```
AA = antialias
float = uint8->float->interpolate()->round()->clamp()->uint8 (what Resize() currently does)
input_size output_size channels_last AA mode num_threads speed-up float vs uint8 (this PR)
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=1 0.9X 1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=1 0.9X 1.5ms vs 1.6ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=1 0.8X 0.9ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=1 1.5X 1.7ms vs 1.1ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=1 0.9X 1.6ms vs 1.8ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=1 2.1X 3.9ms vs 1.9ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=1 0.8X 1.1ms vs 1.4ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=1 1.7X 2.4ms vs 1.5ms
(1, 3, 64, 64) -> (224, 224) True True bilinear num_threads=2 0.9X 0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224) True False bilinear num_threads=2 0.9X 0.8ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224) False True bilinear num_threads=2 0.9X 0.5ms vs 0.6ms
(1, 3, 64, 64) -> (224, 224) False False bilinear num_threads=2 0.7X 0.5ms vs 0.7ms
(1, 3, 64, 64) -> (224, 224) True True bicubic num_threads=2 0.9X 0.9ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224) True False bicubic num_threads=2 2.1X 2.0ms vs 1.0ms
(1, 3, 64, 64) -> (224, 224) False True bicubic num_threads=2 0.8X 0.6ms vs 0.8ms
(1, 3, 64, 64) -> (224, 224) False False bicubic num_threads=2 1.7X 1.3ms vs 0.8ms
(1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=1 1.0X 3.0ms vs 3.0ms
(1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=1 1.0X 2.8ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=1 1.0X 2.3ms vs 2.2ms
(1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=1 1.4X 3.3ms vs 2.3ms
(1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=1 1.0X 3.5ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=1 1.7X 6.1ms vs 3.5ms
(1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=1 0.9X 2.6ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=1 1.4X 4.2ms vs 2.9ms
(1, 3, 224, 224) -> (270, 268) True True bilinear num_threads=2 1.0X 1.7ms vs 1.7ms
(1, 3, 224, 224) -> (270, 268) True False bilinear num_threads=2 0.9X 1.6ms vs 1.8ms
(1, 3, 224, 224) -> (270, 268) False True bilinear num_threads=2 0.9X 1.3ms vs 1.4ms
(1, 3, 224, 224) -> (270, 268) False False bilinear num_threads=2 0.7X 1.1ms vs 1.6ms
(1, 3, 224, 224) -> (270, 268) True True bicubic num_threads=2 1.0X 2.0ms vs 2.0ms
(1, 3, 224, 224) -> (270, 268) True False bicubic num_threads=2 1.7X 3.2ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268) False True bicubic num_threads=2 0.8X 1.5ms vs 1.9ms
(1, 3, 224, 224) -> (270, 268) False False bicubic num_threads=2 1.2X 2.3ms vs 1.9ms
(1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=1 1.1X 34.7ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=1 1.0X 31.2ms vs 32.4ms
(1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=1 1.0X 23.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=1 1.9X 42.5ms vs 22.7ms
(1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=1 0.9X 33.9ms vs 37.4ms
(1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=1 2.2X 84.0ms vs 37.5ms
(1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=1 1.0X 28.4ms vs 28.8ms
(1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=1 2.0X 56.7ms vs 28.8ms
(1, 3, 256, 256) -> (1024, 1024) True True bilinear num_threads=2 1.1X 17.5ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024) True False bilinear num_threads=2 1.1X 17.7ms vs 16.4ms
(1, 3, 256, 256) -> (1024, 1024) False True bilinear num_threads=2 0.8X 8.8ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024) False False bilinear num_threads=2 1.0X 11.1ms vs 11.4ms
(1, 3, 256, 256) -> (1024, 1024) True True bicubic num_threads=2 1.1X 19.9ms vs 18.8ms
(1, 3, 256, 256) -> (1024, 1024) True False bicubic num_threads=2 2.3X 42.5ms vs 18.7ms
(1, 3, 256, 256) -> (1024, 1024) False True bicubic num_threads=2 1.0X 14.1ms vs 14.5ms
(1, 3, 256, 256) -> (1024, 1024) False False bicubic num_threads=2 2.0X 28.4ms vs 14.5ms
(1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=1 1.0X 0.6ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=1 0.7X 0.3ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=1 0.9X 0.5ms vs 0.6ms
(1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=1 1.7X 0.6ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=1 1.0X 0.8ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=1 1.1X 0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=1 0.9X 0.7ms vs 0.8ms
(1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=1 0.9X 0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64) True True bilinear num_threads=2 1.0X 0.4ms vs 0.4ms
(1, 3, 224, 224) -> (64, 64) True False bilinear num_threads=2 0.8X 0.2ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bilinear num_threads=2 0.9X 0.3ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False False bilinear num_threads=2 1.3X 0.3ms vs 0.2ms
(1, 3, 224, 224) -> (64, 64) True True bicubic num_threads=2 1.0X 0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64) True False bicubic num_threads=2 1.3X 0.4ms vs 0.3ms
(1, 3, 224, 224) -> (64, 64) False True bicubic num_threads=2 0.9X 0.5ms vs 0.5ms
(1, 3, 224, 224) -> (64, 64) False False bicubic num_threads=2 1.2X 0.3ms vs 0.3ms
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=1 0.8X 2.1ms vs 2.5ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=1 0.7X 1.6ms vs 2.4ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=1 1.2X 2.4ms vs 2.1ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=1 1.3X 2.6ms vs 2.0ms
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=1 1.1X 3.4ms vs 3.0ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=1 1.7X 4.8ms vs 2.8ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=1 1.1X 2.9ms vs 2.7ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=1 1.4X 3.5ms vs 2.4ms
(1, 3, 270, 268) -> (224, 224) True True bilinear num_threads=2 0.9X 1.2ms vs 1.3ms
(1, 3, 270, 268) -> (224, 224) True False bilinear num_threads=2 1.3X 1.6ms vs 1.2ms
(1, 3, 270, 268) -> (224, 224) False True bilinear num_threads=2 0.8X 0.9ms vs 1.1ms
(1, 3, 270, 268) -> (224, 224) False False bilinear num_threads=2 1.3X 1.3ms vs 1.0ms
(1, 3, 270, 268) -> (224, 224) True True bicubic num_threads=2 1.4X 2.2ms vs 1.6ms
(1, 3, 270, 268) -> (224, 224) True False bicubic num_threads=2 1.9X 2.8ms vs 1.5ms
(1, 3, 270, 268) -> (224, 224) False True bicubic num_threads=2 0.8X 1.1ms vs 1.4ms
(1, 3, 270, 268) -> (224, 224) False False bicubic num_threads=2 1.7X 2.1ms vs 1.3ms
(1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=1 1.0X 10.0ms vs 9.9ms
(1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=1 0.7X 4.6ms vs 6.2ms
(1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=1 0.9X 9.1ms vs 9.8ms
(1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=1 1.7X 9.4ms vs 5.7ms
(1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=1 1.0X 15.2ms vs 14.8ms
(1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=1 1.0X 7.6ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=1 0.9X 13.3ms vs 14.4ms
(1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=1 0.8X 5.9ms vs 7.0ms
(1, 3, 1024, 1024) -> (256, 256) True True bilinear num_threads=2 1.2X 6.0ms vs 5.2ms
(1, 3, 1024, 1024) -> (256, 256) True False bilinear num_threads=2 0.7X 2.3ms vs 3.2ms
(1, 3, 1024, 1024) -> (256, 256) False True bilinear num_threads=2 1.0X 4.8ms vs 5.0ms
(1, 3, 1024, 1024) -> (256, 256) False False bilinear num_threads=2 0.7X 1.9ms vs 2.9ms
(1, 3, 1024, 1024) -> (256, 256) True True bicubic num_threads=2 1.6X 12.3ms vs 7.5ms
(1, 3, 1024, 1024) -> (256, 256) True False bicubic num_threads=2 1.0X 3.9ms vs 3.9ms
(1, 3, 1024, 1024) -> (256, 256) False True bicubic num_threads=2 1.0X 7.0ms vs 7.3ms
(1, 3, 1024, 1024) -> (256, 256) False False bicubic num_threads=2 0.9X 3.0ms vs 3.5ms
```
</details>
Benchmark code
<details>
```py
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for interpolate operator."""
class InterpolateBenchmark(op_bench.TorchBenchmarkBase):
    """Microbenchmark for torch.nn.functional.interpolate on CPU images.

    With dtype=torch.float it emulates what torchvision's Resize() currently
    does for uint8 images: uint8 -> float -> interpolate -> round -> clamp -> uint8.
    With dtype=torch.uint8 it exercises the native uint8 path.
    """

    def init(self, input_size, output_size, channels_last=False, mode='linear', antialias=False, dtype=torch.float):
        # Random image covering the full uint8 range [0, 255].
        input_image = torch.randint(0, 256, size=input_size, dtype=torch.uint8, device='cpu')
        if channels_last:
            input_image = input_image.contiguous(memory_format=torch.channels_last)
        self.inputs = {
            "input_image": input_image,
            "output_size": output_size,
            "mode": mode,
            "antialias": antialias,
            "dtype": dtype,
        }
        self.set_module_name("interpolate")

    def forward(self, input_image, output_size, mode, antialias, dtype):
        if dtype == torch.float:
            input_image = input_image.float()
        out = torch.nn.functional.interpolate(input_image, size=output_size, mode=mode, align_corners=False, antialias=antialias)
        if dtype == torch.float:
            # Clamp to the valid uint8 range [0, 255]. The previous max=256 was a
            # bug: a value of exactly 256 wraps around to 0 on the uint8 cast,
            # and bicubic interpolation can overshoot past 255 before rounding.
            out = out.round().clamp(min=0, max=255).to(torch.uint8)
def make_config():
    """Build the op_bench config list: for every (big, small) size pair,
    benchmark both the downsampling and the upsampling direction on
    3-channel images, crossed with memory format, mode, and antialias."""
    size_pairs = (
        ((224, 224), (64, 64)),
        ((270, 268), (224, 224)),
        ((256, 256), (1024, 1024)),
    )
    attrs = []
    for hw_a, hw_b in size_pairs:
        attrs.append([(1, 3) + hw_a, hw_b])  # 3 channels, a -> b
        attrs.append([(1, 3) + hw_b, hw_a])  # 3 channels, b -> a
    return op_bench.config_list(
        attr_names=["input_size", "output_size"],
        attrs=attrs,
        cross_product_configs={
            'channels_last': [True, False],
            'mode': ["bilinear", "bicubic"],
            'antialias': [True, False],
            'dtype': [torch.float],
        },
        tags=["short"],
    )
# Register the generated configs with the operator_benchmark harness and run it.
config = make_config()
op_bench.generate_pt_test(config, InterpolateBenchmark)
if __name__ == "__main__":
    op_bench.benchmark_runner.main()
```
```py
import re
import argparse

# Compare two operator_benchmark result files:
# f1 is the baseline run ("main"), f2 is the candidate run ("new").
parser = argparse.ArgumentParser()
parser.add_argument("f1", nargs="?", default="main")
parser.add_argument("f2", nargs="?", default="new")
args = parser.parse_args()
with open(args.f1) as f:
    main = f.readlines()
with open(args.f2) as f:
    new = f.readlines()
out = []
# Walk both result files in lockstep. op_bench output interleaves, per test
# case, a "num_threads=..." line, an "# Input: ..." description line, and a
# "Forward"-prefixed timing line (time in microseconds, last token).
# NOTE(review): assumes both files contain the same test cases in the same
# order — verify if the two runs used different configs.
for main_line, new_line in zip(main, new):
    # num_threads=1 # TODO: remove
    if main_line.startswith("num_threads="):
        # Remember the thread count for the following test cases.
        num_threads = int(main_line.split("=")[-1])
    if main_line.startswith("# Input"):
        # NOTE(review): relies on a "num_threads=" line having appeared
        # earlier in the file; otherwise num_threads is undefined here.
        deets = f"{main_line.strip()}, {num_threads=}"
    if main_line.startswith("Forward"):
        main_time = float(main_line.split()[-1])
        new_time = float(new_line.split()[-1])
        # speed-up of the candidate vs the baseline
        ratio = main_time / new_time
        fmt = ".1f" if ratio < 3 else ".0f"
        improv = f"{ratio:{fmt}}X"
        time_fmt = ",.3f" if new_time < 100 else ",.1f"
        # Strip the op_bench boilerplate down to a compact one-line description.
        deets = deets.strip().replace("# Input: ", "")
        deets = deets.replace(": ", "=")
        deets = deets.replace("input_size=", "")
        deets = deets.replace(", output_size=", " -> ")
        deets = deets.replace("dtype=torch.", "")
        deets = deets.replace("mode=", "")
        deets = deets.replace("antialias=", "")
        deets = deets.replace("channels_last=", "")
        # deets = deets.replace("channels_last=True, ", "")
        split = deets.split(",")
        # size = ','.join(split[:-3])
        # mode, dtype, threads = split[-3:]
        # deets = f"{size:<30} {mode:<15} {dtype:<10} {threads:<15}"
        # Last 5 comma-separated fields are the config; the rest is the size spec.
        size = ','.join(split[:-5])
        channels_last, mode, antialias, dtype, threads= split[-5:]
        deets = f"{size:<33} {channels_last:<7} {antialias:<7} {mode:<10} {threads:<15}"
        # Times are reported in microseconds; print in milliseconds.
        l = f"{deets} {improv:<5} {main_time / 1000:{time_fmt}}ms vs {new_time / 1000:{time_fmt}}ms"
        out.append(l)
def key(s):
    """Sort key for a formatted result line.

    Orders by: upsample-before-downsample, input H/W, output H/W, thread
    count, channel count (descending), then mode. Returns a flat tuple.
    """
    num_threads = (int(re.findall(r"num_threads=(\d+)", s)[0]),)
    # The first two parenthesized groups on the line are the input and
    # output shapes, e.g. "(1, 3, 64, 64) -> (224, 224)".
    # Raw strings are used for the regexes: "\(" in a plain string is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+).
    input_shape, output_shape = re.findall(r"\(.*?\)", s)
    input_shape = input_shape[1:-1]  # remove parentheses
    input_HW = tuple(int(x) for x in input_shape.split(",")[-2:])
    input_C = (-int(input_shape.split(",")[1]),)  # negated: more channels sort first
    output_HW = tuple(int(x) for x in output_shape[1:-1].split(","))
    is_downsample = (output_HW[0] < input_HW[0],)
    if "linear" in s:
        # Note: also matches "bilinear"/"trilinear"; good enough for grouping.
        mode = "linear"
    elif "nearest-exact" in s:
        mode = "nearest-exact"
    else:
        mode = "nearest"
    mode = (mode,)
    return is_downsample + input_HW + output_HW + num_threads + input_C + mode
# Print results sorted by the key above, with a blank line every 8 rows
# (one row per channels_last/antialias/mode combination of a config group).
for i, l in enumerate(sorted(out, key=key)):
    if i % 8 == 0:
        print()
    # if i % 10 == 0 and i % 40 != 0:
    #     print()
    # if i % 40 == 0:
    #     print("-" * 100)
    print(l)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90771
Approved by: https://github.com/peterbell10, https://github.com/ngimel