[AArch64] Add FP8 Neon intrinsics for dot-product (#123613)
This patch adds the following intrinsics:
float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t
vm, fpm_t fpm)
float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, fpm_t fpm)
float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x2_t vdot_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t
vm, fpm_t fpm)
float32x4_t vdotq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, fpm_t fpm)
float32x2_t vdot_lane_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x2_t vdot_laneq_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn,
mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)