Inductor cpp wrapper: cache the loading of the kernel (#89742)
### Pitch
Cache the loaded kernel function pointer (via a function-local `static`) so that `dlopen`/`dlsym` run only once on the first call instead of on every invocation, reducing per-call overhead.
#### Code before:
```cpp
std::vector<at::Tensor> call_0(std::tuple<at::Tensor&, at::Tensor&> args) {
...
auto kernel_cpp_0_lib = dlopen("/tmp/torchinductor_xxx/yr/cyr3uymlc6pgvnimx3fnynaa4t7ldafeqzhe5zpizmvorisx4hb2.so", RTLD_NOW);
assert(kernel_cpp_0_lib != nullptr);
void (*kernel_cpp_0)(const float*,const float*,float*,float*);
*(void **) (&kernel_cpp_0) = dlsym(kernel_cpp_0_lib, "kernel");
kernel_cpp_0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
...
}
```
#### Code after:
```cpp
template <typename KernelFunc>
KernelFunc load_cpp_kernel(const char* so_filename) {
KernelFunc kernel_cpp;
auto kernel_cpp_lib = dlopen(so_filename, RTLD_NOW);
assert(kernel_cpp_lib != nullptr);
*(void **) (&kernel_cpp) = dlsym(kernel_cpp_lib, "kernel");
return kernel_cpp;
}
std::vector<at::Tensor> call_0(std::tuple<at::Tensor&, at::Tensor&> args) {
...
static auto kernel_cpp_0 = load_cpp_kernel<void (*)(const float*,const float*,float*,float*)>("/tmp/torchinductor_xxx/yr/cyr3uymlc6pgvnimx3fnynaa4t7ldafeqzhe5zpizmvorisx4hb2.so");
kernel_cpp_0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
...
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89742
Approved by: https://github.com/jgong5, https://github.com/desertfire