port fmod from TH to ATen (#24405)
Summary:
https://github.com/pytorch/pytorch/issues/22803
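Performance benchmarks (script below; each reported mean/std is taken over 3 timeit samples of 300 calls each, in seconds):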
import timeit
import torch
import itertools
import statistics
def test_perf(sizes, device, repeat, times):
def _tensor(name, sizes, device):
return '''{0} = torch.rand({1}, device="{2}");'''.format(name, sizes, device)
setup_code = 'import torch;' + _tensor('x', sizes, device) + _tensor('y', sizes, device)
test_code = '''torch.fmod(y, x);'''
if device == "cuda":
test_code = test_code + 'torch.cuda.synchronize()'
result = timeit.repeat(setup = setup_code,stmt = test_code,repeat = repeat,number = times)
mean = statistics.mean(result)
std = statistics.stdev(result)
print('''sizes = {0} std = {1} mean = {2}'''.format(sizes, std, mean))
def test_perf_for_device(device, small, mid, large):
    """Print the device name, then benchmark every 2-D size combination.

    Each ordered pair drawn from ``(small, mid, large)`` is converted to its
    tuple text and handed to ``test_perf`` with 3 samples of 300 calls.
    """
    print(device)
    extents = (small, mid, large)
    # repeat=2 yields the same ordered pairs as product(extents, extents).
    for shape in itertools.product(extents, repeat=2):
        shape_text = str(shape)
        test_perf(shape_text, device, 3, 300)
# Run the sweep on CPU first, then CUDA; the CUDA sweep uses a larger top size.
for device_name, largest in (("cpu", 1000), ("cuda", 10000)):
    test_perf_for_device(device_name, 5, 100, largest)
pytorch:master (baseline):
cpu
sizes = (5, 5) std = 0.0004191587896767566 mean = 0.0052408403377436725
sizes = (5, 100) std = 0.00012129380478190695 mean = 0.006508304664748721
sizes = (5, 1000) std = 0.00018175678335131663 mean = 0.0363664986701527
sizes = (100, 5) std = 0.00034399426107962946 mean = 0.006770268999389373
sizes = (100, 100) std = 0.0006779367543473553 mean = 0.07270567266580959
sizes = (100, 1000) std = 0.01670362224705441 mean = 0.1300258070017056
sizes = (1000, 5) std = 0.010281040640935534 mean = 0.045936293997025736
sizes = (1000, 100) std = 0.012529932966256128 mean = 0.12733882099564653
sizes = (1000, 1000) std = 0.002150238308503937 mean = 1.1608000710014796
cuda
sizes = (5, 5) std = 0.00016137550559233116 mean = 0.014315356330674453
sizes = (5, 100) std = 0.0014720358192929545 mean = 0.015730336332732502
sizes = (5, 10000) std = 0.0017510024071247026 mean = 0.015462367334597124
sizes = (100, 5) std = 0.001569950832690219 mean = 0.015847195667447522
sizes = (100, 100) std = 0.000935629392520788 mean = 0.015551854667137377
sizes = (100, 10000) std = 0.002454919985869727 mean = 0.04476405966367262
sizes = (10000, 5) std = 0.0013192075275361463 mean = 0.015794202001416124
sizes = (10000, 100) std = 0.001418935833245521 mean = 0.04419450566638261
sizes = (10000, 10000) std = 0.0070977799177425 mean = 3.267501967328523
shihongzhi:feature/port_fmod (this PR):
cpu
sizes = (5, 5) std = 0.0003939277361171243 mean = 0.008732202996422226
sizes = (5, 100) std = 7.568185896146914e-05 mean = 0.010897216998273507
sizes = (5, 1000) std = 3.916722355255723e-05 mean = 0.03223436966557832
sizes = (100, 5) std = 0.00016529833171236708 mean = 0.011018406672519632
sizes = (100, 100) std = 0.000155446405937598 mean = 0.055315166668151505
sizes = (100, 1000) std = 0.005295612670839835 mean = 0.09823771333321929
sizes = (1000, 5) std = 5.087993715488194e-05 mean = 0.03315563267096877
sizes = (1000, 100) std = 0.004952377745126246 mean = 0.09605619766807649
sizes = (1000, 1000) std = 0.10362095898303665 mean = 0.9652185496718934
cuda
sizes = (5, 5) std = 5.004076916927963e-05 mean = 0.016851375335439418
sizes = (5, 100) std = 0.0008912925390246038 mean = 0.01788881132476187
sizes = (5, 10000) std = 0.0009701942336158022 mean = 0.018210363331794117
sizes = (100, 5) std = 0.0007897575234315655 mean = 0.017682057005004026
sizes = (100, 100) std = 0.0012395220098068511 mean = 0.016444508665396523
sizes = (100, 10000) std = 0.000957364387413519 mean = 0.016943917328414198
sizes = (10000, 5) std = 0.0011325899538680206 mean = 0.017102815332085203
sizes = (10000, 100) std = 0.0013052748368152663 mean = 0.017058989333842572
sizes = (10000, 10000) std = 0.024267574119715446 mean = 0.30735275766831666
Pull Request resolved: https://github.com/pytorch/pytorch/pull/24405
Differential Revision: D16864196
Pulled By: VitalyFedyunin
fbshipit-source-id: d884cc9e74bb8f4ce2ad8d23c676fa914b26d8fb