[ROCm] enable foreach fastpath
Reverts #46216 now that ROCm is fixed.
Benchmark to verify:
```python
import torch
import torch.optim as optim
import torch.nn as nn
import torchvision
import torch.utils.benchmark as benchmark_utils

# Benchmark setup: build ResNet-101 and run ONE warm-up training step so the
# SGD momentum buffers exist before optimizer.step() is timed in main().
# NOTE(review): removed dead code from the original snippet — a duplicate
# (and entirely unused) `import time`, the unused `Variable` / LR-scheduler
# imports, and an unused `targets` tensor that shadowed nothing but was never
# read.
device = "cuda"  # requires a CUDA/ROCm device; this is a GPU benchmark
model = torchvision.models.resnet.resnet101(pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()

# Single-tensor optimizer (per-parameter loop) ...
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.1)
# ... vs. the multi-tensor (foreach fastpath) variant of the same optimizer.
optimizer_mta = optim._multi_tensor.SGD(model.parameters(), lr=1e-3, momentum=0.1)

running_loss = 0.0
# Random class labels in [0, 5); batch of 128 fake 100x100 RGB images.
target = torch.empty(128, dtype=torch.long, device=device).random_(5)
optimizer.zero_grad()
inputs = torch.rand(128, 3, 100, 100, device=device, requires_grad=True)
outputs = model(inputs)
loss = criterion(outputs, target)
loss.backward()
optimizer.step()  # warm-up step: populates momentum state for both timings
running_loss += loss.item()
def main():
    """Time optimizer.step() for single-tensor vs. multi-tensor SGD.

    Uses torch.utils.benchmark.Timer.blocked_autorange() on the module-level
    `optimizer` and `optimizer_mta`, printing one measurement for each.
    """
    timer = benchmark_utils.Timer(
        stmt="torch.cuda.synchronize();optimizer.step()",
        globals=globals(),
        # Fix: the original passed the literal string "str(optimizer)" (it
        # leaked verbatim into the printed report); evaluate it so the
        # measurement is labeled with the actual optimizer repr.
        label=str(optimizer),
    )
    timer_mta = benchmark_utils.Timer(
        stmt="torch.cuda.synchronize(); optimizer_mta.step()",
        globals=globals(),
        label=str(optimizer_mta),
    )
    # The original wrapped these prints in nested range(1) loops that each ran
    # exactly once; flattened to the single run actually performed.
    print(f"Run: 0\n{'-' * 40}")
    print(f"autorange:\n{timer.blocked_autorange()}\n\n")
    print(f"Run: 0\n{'-' * 40}")
    print(f"autorange:\n{timer_mta.blocked_autorange()}\n\n")


if __name__ == "__main__":
    main()
```
Before revert:
```
Run: 0
----------------------------------------
autorange:
<torch.utils.benchmark.utils.common.Measurement object at 0x7f253e67c910>
str(optimizer)
7.33 ms
1 measurement, 100 runs , 1 thread
Run: 0
----------------------------------------
autorange:
<torch.utils.benchmark.utils.common.Measurement object at 0x7f253e67c510>
str(optimizer_mta)
5.76 ms
1 measurement, 100 runs , 1 thread
```
After revert:
```
Run: 0
----------------------------------------
autorange:
<torch.utils.benchmark.utils.common.Measurement object at 0x7fa2aa15e8d0>
str(optimizer)
7.35 ms
1 measurement, 100 runs , 1 thread
Run: 0
----------------------------------------
autorange:
<torch.utils.benchmark.utils.common.Measurement object at 0x7fa2aa15e4d0>
str(optimizer_mta)
3.53 ms
1 measurement, 100 runs , 1 thread
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74417
Approved by: https://github.com/ngimel