[clang] [AArch64] Refactor implementation of FP8 types (NFC) (PR #123604)

Momchil Velikov via cfe-commits Mon, 20 Jan 2025 04:34:20 -0800

https://github.com/momchil-velikov created 
https://github.com/llvm/llvm-project/pull/123604


- The FP8 scalar type (`__mfp8`) was described as a vector type
- The FP8 vector types were described/assumed to have integer element type (the 
element type ought to be `__mfp8`)
- Add support for `m` type specifier (denoting `__mfp8`) in `DecodeTypeFromStr` 
and create builtin function prototypes using that specifier, instead of `int8_t`

Supersedes https://github.com/llvm/llvm-project/pull/118969

>From bb3a44020980d1675675c6a4d502b7a3f8580814 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.veli...@arm.com>
Date: Wed, 18 Dec 2024 15:43:00 +0000
Subject: [PATCH 1/2] Implement NEON FP8 vectors as VectorType

Co-Aurhored-By: Caroline Concatto <caroline.conca...@arm.com>
---
 clang/include/clang/AST/Type.h                |   5 +
 .../clang/Basic/AArch64SVEACLETypes.def       |   2 -
 clang/include/clang/Basic/TargetBuiltins.h    |   4 +-
 clang/lib/AST/ItaniumMangle.cpp               |   5 +
 clang/lib/CodeGen/CGBuiltin.cpp               |   1 +
 clang/lib/CodeGen/CGExpr.cpp                  |  11 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            |   4 +-
 clang/lib/CodeGen/Targets/AArch64.cpp         |   7 +-
 clang/lib/Sema/SemaARM.cpp                    |   2 +
 clang/lib/Sema/SemaExpr.cpp                   |   7 +-
 clang/lib/Sema/SemaType.cpp                   |   3 +-
 .../AArch64/builtin-shufflevector-fp8.c       | 123 +++++++++++
 clang/test/CodeGen/AArch64/fp8-cast.c         | 193 ++++++++++++++++++
 clang/test/CodeGen/arm-mfp8.c                 |  88 ++++----
 .../aarch64-mangle-neon-vectors.cpp           |   7 +
 clang/test/CodeGenCXX/mangle-neon-vectors.cpp |  11 +
 clang/test/Sema/aarch64-fp8-cast.c            | 104 ++++++++++
 clang/test/Sema/arm-mfp8.cpp                  |  34 +--
 clang/utils/TableGen/NeonEmitter.cpp          |  11 +-
 19 files changed, 553 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
 create mode 100644 clang/test/CodeGen/AArch64/fp8-cast.c
 create mode 100644 clang/test/Sema/aarch64-fp8-cast.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aaa..1d9743520654eb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public 
ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const;         // C99 6.2.5p17 (real floating + integer)
@@ -8537,6 +8538,10 @@ inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }
 
+inline bool Type::isMFloat8Type() const {
+  return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
 inline bool Type::isFloat128Type() const {
   return isSpecificBuiltinType(BuiltinType::Float128);
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def 
b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58ee..2dd2754e778d60 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -201,8 +201,6 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", 
SveBoolx4, SveBoolx4T
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
 
 AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, 
MFloat8x8Ty, 8, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, 
MFloat8x16Ty, 16, 8, 1)
 
 #undef SVE_VECTOR_TYPE
 #undef SVE_VECTOR_TYPE_BFLOAT
diff --git a/clang/include/clang/Basic/TargetBuiltins.h 
b/clang/include/clang/Basic/TargetBuiltins.h
index 4dc8b24ed8ae6c..83ef015018f1a1 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -208,7 +208,8 @@ namespace clang {
       Float16,
       Float32,
       Float64,
-      BFloat16
+      BFloat16,
+      MFloat8
     };
 
     NeonTypeFlags(unsigned F) : Flags(F) {}
@@ -230,6 +231,7 @@ namespace clang {
       switch (getEltType()) {
       case Int8:
       case Poly8:
+      case MFloat8:
         return 8;
       case Int16:
       case Float16:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1dd936cf4fb518..9948963d7f44b3 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3919,6 +3919,9 @@ void CXXNameMangler::mangleNeonVectorType(const 
VectorType *T) {
     case BuiltinType::Float:     EltName = "float32_t"; break;
     case BuiltinType::Half:      EltName = "float16_t"; break;
     case BuiltinType::BFloat16:  EltName = "bfloat16_t"; break;
+    case BuiltinType::MFloat8:
+      EltName = "mfloat8_t";
+      break;
     default:
       llvm_unreachable("unexpected Neon vector element type");
     }
@@ -3972,6 +3975,8 @@ static StringRef mangleAArch64VectorBase(const 
BuiltinType *EltType) {
     return "Float64";
   case BuiltinType::BFloat16:
     return "Bfloat16";
+  case BuiltinType::MFloat8:
+    return "Mfloat8";
   default:
     llvm_unreachable("Unexpected vector element base type");
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b80833fd91884d..5fd4c3e34ebeba 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6819,6 +6819,7 @@ static llvm::FixedVectorType *GetNeonType(CodeGenFunction 
*CGF,
   switch (TypeFlags.getEltType()) {
   case NeonTypeFlags::Int8:
   case NeonTypeFlags::Poly8:
+  case NeonTypeFlags::MFloat8:
     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
   case NeonTypeFlags::Int16:
   case NeonTypeFlags::Poly16:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 9a9a8c7f6eae09..eeff77db2d2684 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2414,8 +2414,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, 
LValue Dst,
         Vec = Builder.CreateBitCast(Vec, IRVecTy);
         // iN --> <N x i1>.
       }
-      Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                        Dst.getVectorIdx(), "vecins");
+      llvm::Value *SrcVal = Src.getScalarVal();
+      // Allow inserting `<1 x T>` into an `<N x T>`. It can happen with scalar
+      // types which are mapped to vector LLVM IR types (e.g. for implementing
+      // an ABI).
+      if (auto *EltTy = dyn_cast<llvm::FixedVectorType>(SrcVal->getType());
+          EltTy && EltTy->getNumElements() == 1)
+        SrcVal = Builder.CreateBitCast(SrcVal, EltTy->getElementType());
+      Vec = Builder.CreateInsertElement(Vec, SrcVal, Dst.getVectorIdx(),
+                                        "vecins");
       if (IRStoreTy) {
         // <N x i1> --> <iN>.
         Vec = Builder.CreateBitCast(Vec, IRStoreTy);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp 
b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f493..950b23f4e13b99 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -650,7 +650,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
-                               : ConvertType(VT->getElementType());
+                               : (VT->getElementType()->isMFloat8Type()
+                                      ? llvm::Type::getInt8Ty(getLLVMContext())
+                                      : ConvertType(VT->getElementType()));
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
   }
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp 
b/clang/lib/CodeGen/Targets/AArch64.cpp
index 7db67ecba07c8f..c702e79ff8eb98 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -383,10 +383,6 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType 
Ty, bool IsVariadicFn,
         NSRN = std::min(NSRN + 1, 8u);
       else {
         switch (BT->getKind()) {
-        case BuiltinType::MFloat8x8:
-        case BuiltinType::MFloat8x16:
-          NSRN = std::min(NSRN + 1, 8u);
-          break;
         case BuiltinType::SveBool:
         case BuiltinType::SveCount:
           NPRN = std::min(NPRN + 1, 4u);
@@ -629,8 +625,7 @@ bool 
AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
   // but with the difference that any floating-point type is allowed,
   // including __fp16.
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->isFloatingPoint() || BT->getKind() == BuiltinType::MFloat8x16 ||
-        BT->getKind() == BuiltinType::MFloat8x8)
+    if (BT->isFloatingPoint())
       return true;
   } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
     if (auto Kind = VT->getVectorKind();
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index db418d80e0e09c..2620bbc97ba02a 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -352,6 +352,8 @@ static QualType getNeonEltType(NeonTypeFlags Flags, 
ASTContext &Context,
     return Context.DoubleTy;
   case NeonTypeFlags::BFloat16:
     return Context.BFloat16Ty;
+  case NeonTypeFlags::MFloat8:
+    return Context.MFloat8Ty;
   }
   llvm_unreachable("Invalid NeonTypeFlag!");
 }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index ae40895980d90a..2087e8b251d8c6 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7502,7 +7502,7 @@ static bool breakDownVectorType(QualType type, uint64_t 
&len,
   if (const VectorType *vecType = type->getAs<VectorType>()) {
     len = vecType->getNumElements();
     eltType = vecType->getElementType();
-    assert(eltType->isScalarType());
+    assert(eltType->isScalarType() || eltType->isMFloat8Type());
     return true;
   }
 
@@ -10168,6 +10168,11 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, 
ExprResult &RHS,
     return HLSL().handleVectorBinOpConversion(LHS, RHS, LHSType, RHSType,
                                               IsCompAssign);
 
+  // Any operation with MFloat8 type is only possible with C intrinsics
+  if ((LHSVecType && LHSVecType->getElementType()->isMFloat8Type()) ||
+      (RHSVecType && RHSVecType->getElementType()->isMFloat8Type()))
+    return InvalidOperands(Loc, LHS, RHS);
+
   // AltiVec-style "vector bool op vector bool" combinations are allowed
   // for some operators but not others.
   if (!AllowBothBool && LHSVecType &&
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 2ccf5a8e1d6f31..33d5378944ddbf 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8306,7 +8306,8 @@ static bool isPermittedNeonBaseType(QualType &Ty, 
VectorKind VecKind, Sema &S) {
          BTy->getKind() == BuiltinType::ULongLong ||
          BTy->getKind() == BuiltinType::Float ||
          BTy->getKind() == BuiltinType::Half ||
-         BTy->getKind() == BuiltinType::BFloat16;
+         BTy->getKind() == BuiltinType::BFloat16 ||
+         BTy->getKind() == BuiltinType::MFloat8;
 }
 
 static bool verifyValidIntegerConstantExpr(Sema &S, const ParsedAttr &Attr,
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c 
b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 00000000000000..147ca1d1becc15
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon 
-disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __attribute__((neon_vector_type(8))) __mfp8 mfloat8x8_t;
+typedef __attribute__((neon_vector_type(16))) __mfp8 mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> 
[[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t test_8x8(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8_v(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 
[[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 
[[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 
[[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 
[[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], 
i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], 
i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], 
i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 
[[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], 
i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t test_8x8_v(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> 
[[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 
7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t test_8x16(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 
2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16_v(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 
[[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 
[[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], 
i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], 
i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], 
i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], 
i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], 
i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], 
i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT:    [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX22]]
+// CHECK-NEXT:    [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], 
i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT:    [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT:    [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX25]]
+// CHECK-NEXT:    [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], 
i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT:    [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 
10
+// CHECK-NEXT:    [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX28]]
+// CHECK-NEXT:    [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], 
i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT:    [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 
11
+// CHECK-NEXT:    [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX31]]
+// CHECK-NEXT:    [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], 
i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT:    [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 
12
+// CHECK-NEXT:    [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX34]]
+// CHECK-NEXT:    [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], 
i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT:    [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 
13
+// CHECK-NEXT:    [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX37]]
+// CHECK-NEXT:    [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], 
i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT:    [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 
14
+// CHECK-NEXT:    [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX40]]
+// CHECK-NEXT:    [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], 
i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT:    [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 
15
+// CHECK-NEXT:    [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 
[[SHUF_IDX43]]
+// CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], 
i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t test_8x16_v(mfloat8x16_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-cast.c 
b/clang/test/CodeGen/AArch64/fp8-cast.c
new file mode 100644
index 00000000000000..a9ce31b9e6beab
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-cast.c
@@ -0,0 +1,193 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | 
FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
-disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | 
FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon 
-disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_f813__Mfloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_f8(mfloat8x8_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> 
@_Z11testq_f8_f814__Mfloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+mfloat8x16_t testq_f8_f8(mfloat8x16_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_s8(
+// CHECK-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_s810__Int8x8_t(
+// CHECK-CXX-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_s8(int8x8_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_s8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x i8> 
@_Z10test_s8_f813__Mfloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+int8x8_t test_s8_f8(mfloat8x8_t x) {
+    return (int8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_s8(
+// CHECK-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_s811__Int8x16_t(
+// CHECK-CXX-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+mfloat8x16_t testq_f8_s8(int8x16_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_s8_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <16 x i8> 
@_Z11testq_s8_f814__Mfloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+int8x16_t testq_s8_f8(mfloat8x16_t x) {
+    return (int8x16_t) x;
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f32(
+// CHECK-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8>
+// CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z11test_f8_f3213__Float32x2_t(
+// CHECK-CXX-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8>
+// CHECK-CXX-NEXT:    ret <8 x i8> [[TMP0]]
+//
+mfloat8x8_t test_f8_f32(float32x2_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_f32_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float>
+// CHECK-NEXT:    ret <2 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> 
@_Z11test_f32_f813__Mfloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float>
+// CHECK-CXX-NEXT:    ret <2 x float> [[TMP0]]
+//
+float32x2_t test_f32_f8(mfloat8x8_t x) {
+    return (float32x2_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f32(
+// CHECK-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> 
@_Z12testq_f8_f3213__Float32x4_t(
+// CHECK-CXX-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8>
+// CHECK-CXX-NEXT:    ret <16 x i8> [[TMP0]]
+//
+mfloat8x16_t testq_f8_f32(float32x4_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @testq_f32_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> 
@_Z12testq_f32_f814__Mfloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float>
+// CHECK-CXX-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t testq_f32_f8(mfloat8x16_t x) {
+    return (float32x4_t) x;
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_p128(
+// CHECK-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z13testq_f8_p128o(
+// CHECK-CXX-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8>
+// CHECK-CXX-NEXT:    ret <16 x i8> [[TMP0]]
+//
+mfloat8x16_t testq_f8_p128(poly128_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local i128 @testq_p128_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef i128 
@_Z13testq_p128_f814__Mfloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128
+// CHECK-CXX-NEXT:    ret i128 [[TMP0]]
+//
+poly128_t testq_p128_f8(mfloat8x16_t x) {
+    return (poly128_t) x;
+}
diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c
index bf91066335a25c..9385b537f18b3d 100644
--- a/clang/test/CodeGen/arm-mfp8.c
+++ b/clang/test/CodeGen/arm-mfp8.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
-// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature 
-fp8 -target-feature +neon -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature 
-fp8 -target-feature +neon -o -  -x c++ %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature 
-fp8 -target-feature +neon -disable-O0-optnone -o -        %s | opt -S 
--passes=mem2reg | FileCheck %s --check-prefixes=CHECK-C
+// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature 
-fp8 -target-feature +neon -disable-O0-optnone -o - -x c++ %s | opt -S 
--passes=mem2reg | FileCheck %s --check-prefixes=CHECK-CXX
 
 // REQUIRES: aarch64-registered-target
 
@@ -10,18 +10,12 @@
 // CHECK-C-LABEL: define dso_local <16 x i8> @test_ret_mfloat8x16_t(
 // CHECK-C-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-C-NEXT:  [[ENTRY:.*:]]
-// CHECK-C-NEXT:    [[V_ADDR:%.*]] = alloca <16 x i8>, align 16
-// CHECK-C-NEXT:    store <16 x i8> [[V]], ptr [[V_ADDR]], align 16
-// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16
-// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
+// CHECK-C-NEXT:    ret <16 x i8> [[V]]
 //
-// CHECK-CXX-LABEL: define dso_local <16 x i8> 
@_Z21test_ret_mfloat8x16_tu14__MFloat8x16_t(
+// CHECK-CXX-LABEL: define dso_local <16 x i8> 
@_Z21test_ret_mfloat8x16_t14__Mfloat8x16_t(
 // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[V_ADDR:%.*]] = alloca <16 x i8>, align 16
-// CHECK-CXX-NEXT:    store <16 x i8> [[V]], ptr [[V_ADDR]], align 16
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16
-// CHECK-CXX-NEXT:    ret <16 x i8> [[TMP0]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[V]]
 //
 mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) {
   return v;
@@ -30,18 +24,12 @@ mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) {
 // CHECK-C-LABEL: define dso_local <8 x i8> @test_ret_mfloat8x8_t(
 // CHECK-C-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] {
 // CHECK-C-NEXT:  [[ENTRY:.*:]]
-// CHECK-C-NEXT:    [[V_ADDR:%.*]] = alloca <8 x i8>, align 8
-// CHECK-C-NEXT:    store <8 x i8> [[V]], ptr [[V_ADDR]], align 8
-// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8
-// CHECK-C-NEXT:    ret <8 x i8> [[TMP0]]
+// CHECK-C-NEXT:    ret <8 x i8> [[V]]
 //
-// CHECK-CXX-LABEL: define dso_local <8 x i8> 
@_Z20test_ret_mfloat8x8_tu13__MFloat8x8_t(
+// CHECK-CXX-LABEL: define dso_local <8 x i8> 
@_Z20test_ret_mfloat8x8_t13__Mfloat8x8_t(
 // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[V_ADDR:%.*]] = alloca <8 x i8>, align 8
-// CHECK-CXX-NEXT:    store <8 x i8> [[V]], ptr [[V_ADDR]], align 8
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8
-// CHECK-CXX-NEXT:    ret <8 x i8> [[TMP0]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[V]]
 //
 mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) {
   return v;
@@ -50,28 +38,22 @@ mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) {
 // CHECK-C-LABEL: define dso_local <1 x i8> @func1n(
 // CHECK-C-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] {
 // CHECK-C-NEXT:  [[ENTRY:.*:]]
-// CHECK-C-NEXT:    [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1
 // CHECK-C-NEXT:    [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1
-// CHECK-C-NEXT:    store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1
-// CHECK-C-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1
 // CHECK-C-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], 
ptr [[F1N]], i64 0, i64 2
-// CHECK-C-NEXT:    store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1
+// CHECK-C-NEXT:    store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1
 // CHECK-C-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x 
i8>], ptr [[F1N]], i64 0, i64 2
-// CHECK-C-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1
-// CHECK-C-NEXT:    ret <1 x i8> [[TMP1]]
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1
+// CHECK-C-NEXT:    ret <1 x i8> [[TMP0]]
 //
 // CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z6func1nu6__mfp8(
 // CHECK-CXX-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1
 // CHECK-CXX-NEXT:    [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1
-// CHECK-CXX-NEXT:    store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1
 // CHECK-CXX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x 
i8>], ptr [[F1N]], i64 0, i64 2
-// CHECK-CXX-NEXT:    store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1
+// CHECK-CXX-NEXT:    store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1
 // CHECK-CXX-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x 
i8>], ptr [[F1N]], i64 0, i64 2
-// CHECK-CXX-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1
-// CHECK-CXX-NEXT:    ret <1 x i8> [[TMP1]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1
+// CHECK-CXX-NEXT:    ret <1 x i8> [[TMP0]]
 //
 __mfp8 func1n(__mfp8 mfp8) {
   __mfp8 f1n[10];
@@ -79,7 +61,43 @@ __mfp8 func1n(__mfp8 mfp8) {
   return f1n[2];
 }
 
+// CHECK-C-LABEL: define dso_local <1 x i8> @test_extract_element(
+// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[RETVAL:%.*]] = alloca <1 x i8>, align 1
+// CHECK-C-NEXT:    [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]]
+// CHECK-C-NEXT:    store i8 [[VECEXT]], ptr [[RETVAL]], align 1
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1
+// CHECK-C-NEXT:    ret <1 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <1 x i8> 
@_Z20test_extract_element14__Mfloat8x16_ti(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[RETVAL:%.*]] = alloca <1 x i8>, align 1
+// CHECK-CXX-NEXT:    [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 
[[I]]
+// CHECK-CXX-NEXT:    store i8 [[VECEXT]], ptr [[RETVAL]], align 1
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1
+// CHECK-CXX-NEXT:    ret <1 x i8> [[TMP0]]
+//
+mfloat8_t test_extract_element(mfloat8x16_t x, int i) {
+  return x[i];
+}
 
-
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add 
tests below this line:
-// CHECK: {{.*}}
+// CHECK-C-LABEL: define dso_local <16 x i8> @test_insert_element(
+// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> 
[[V:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8
+// CHECK-C-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 
[[TMP0]], i32 [[I]]
+// CHECK-C-NEXT:    ret <16 x i8> [[VECINS]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> 
@_Z19test_insert_element14__Mfloat8x16_tiu6__mfp8(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> 
[[V:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8
+// CHECK-CXX-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 
[[TMP0]], i32 [[I]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[VECINS]]
+//
+mfloat8x16_t test_insert_element(mfloat8x16_t x, int i, mfloat8_t v) {
+  x[i] = v;
+  return x;
+}
diff --git a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp 
b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp
index 3b4a309327fe61..9b855698f57fdc 100644
--- a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp
+++ b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp
@@ -11,6 +11,7 @@ typedef unsigned short poly16_t;
 typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
+typedef __mfp8 mfloat8_t;
 
 typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
@@ -26,6 +27,8 @@ typedef __attribute__((neon_vector_type(8))) uint16_t 
uint16x8_t;
 typedef __attribute__((neon_vector_type(2))) unsigned int uint32x2_t;
 typedef __attribute__((neon_vector_type(4))) unsigned int uint32x4_t;
 typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t;
+typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t;
 typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
 typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
 typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
@@ -82,3 +85,7 @@ void f21(int64x2_t) {}
 void f22(uint64x2_t) {}
 // CHECK: 13__Float64x2_t
 void f23(float64x2_t) {}
+// CHECK: 13__Mfloat8x8_t
+void f24(mfloat8x8_t) {}
+// CHECK: 14__Mfloat8x16_t
+void f25(mfloat8x16_t) {}
diff --git a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp 
b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
index cb5e40be6a6df2..2139a8ae98caf4 100644
--- a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
+++ b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
@@ -9,6 +9,7 @@ typedef __fp16 float16_t;
 #if defined(__aarch64__)
 typedef unsigned char poly8_t;
 typedef unsigned short poly16_t;
+typedef __mfp8 mfloat8_t;
 #else
 typedef signed char poly8_t;
 typedef short poly16_t;
@@ -29,6 +30,8 @@ typedef __attribute__((neon_vector_type(4))) float16_t 
float16x4_t;
 typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
 #ifdef __aarch64__
 typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t;
+typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t;
 #endif
 typedef __attribute__((neon_polyvector_type(16))) poly8_t  poly8x16_t;
 typedef __attribute__((neon_polyvector_type(8)))  poly16_t poly16x8_t;
@@ -86,3 +89,11 @@ void f11(float64x2_t v) { }
 // CHECK-AARCH64-BF16: 14__Bfloat16x4_t
 void f12(bfloat16x4_t v) {}
 #endif
+
+
+#ifdef __aarch64__
+// CHECK-AARCH64: 13__Mfloat8x8_t
+void f13(mfloat8x8_t v) { }
+// CHECK-AARCH64: 14__Mfloat8x16_t
+void f14(mfloat8x16_t v) { }
+#endif
diff --git a/clang/test/Sema/aarch64-fp8-cast.c 
b/clang/test/Sema/aarch64-fp8-cast.c
new file mode 100644
index 00000000000000..ad25401919b5ad
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-cast.c
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-verify -emit-llvm -o - %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+mfloat8x8_t err_test_f8_f8(mfloat8x16_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f8(mfloat8x8_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' 
(vector of 16 'mfloat8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' 
values) of different size}}
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+mfloat8x8_t err_test_f8_s8(int8x16_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'int8x16_t' (vector of 16 'int8_t' values) 
of different size}}
+}
+
+int8x8_t err_test_s8_f8(mfloat8x16_t x) {
+    return (int8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x8_t' 
(vector of 8 'int8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_s8(int8x8_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' 
(vector of 16 'mfloat8_t' values) and 'int8x8_t' (vector of 8 'int8_t' values) 
of different size}}
+}
+
+int8x16_t err_testq_s8_f8(mfloat8x8_t x) {
+    return (int8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x16_t' 
(vector of 16 'int8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' 
values) of different size}}
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+mfloat8x8_t err_test_f8_f32(float32x4_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'float32x4_t' (vector of 4 'float32_t' 
values) of different size}}
+}
+
+float32x2_t err_test_f32_f8(mfloat8x16_t x) {
+    return (float32x2_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x2_t' 
(vector of 2 'float32_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f32(float32x2_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' 
(vector of 16 'mfloat8_t' values) and 'float32x2_t' (vector of 2 'float32_t' 
values) of different size}}
+}
+
+float32x4_t err_testq_f32_f8(mfloat8x8_t x) {
+    return (float32x4_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x4_t' 
(vector of 4 'float32_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' 
values) of different size}}
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+mfloat8x8_t err_testq_f8_p128(poly128_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned 
__int128') of different size}}
+}
+
+poly128_t err_testq_p128_f8(mfloat8x8_t x) {
+    return (poly128_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned 
__int128') of different size}}
+}
+
+// Bitcast between FP8 and a non-integral type
+mfloat8x8_t err_test_f8_ptr(void *p) {
+    return (mfloat8x8_t) p;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+void *err_test_ptr_f8(mfloat8x8_t v) {
+    return (void *) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+mfloat8x8_t err_test_f8_dbl(double v) {
+    return (mfloat8x8_t) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+double err_test_dbl_f8(mfloat8x8_t v) {
+    return (double) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+struct S {
+    char ch[16];
+};
+
+mfloat8x16_t err_test_f8_agg(struct S s) {
+    return (mfloat8x16_t) s;
+// expected-error@-1 {{operand of type 'struct S' where arithmetic or pointer 
type is required}}
+}
+
+struct S err_test_agg_f8(mfloat8x16_t v) {
+    return (struct S) v;
+// expected-error@-1 {{used type 'struct S' where arithmetic or pointer type 
is required}}
+}
diff --git a/clang/test/Sema/arm-mfp8.cpp b/clang/test/Sema/arm-mfp8.cpp
index be5bc9bb71dbd2..1b4e6791420ec9 100644
--- a/clang/test/Sema/arm-mfp8.cpp
+++ b/clang/test/Sema/arm-mfp8.cpp
@@ -48,17 +48,27 @@ void test_vector_sve(svmfloat8_t a, svuint8_t c) {
 #include <arm_neon.h>
 
 void test_vector(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) {
-  a + b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a - b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a * b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a / b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
+  a + a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a - a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a * a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a / a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
 
-  a + c;  // neon-error {{cannot convert between vector and non-scalar values 
('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
-  a - c;  // neon-error {{cannot convert between vector and non-scalar values 
('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
-  a * c;  // neon-error {{cannot convert between vector and non-scalar values 
('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
-  a / c;  // neon-error {{cannot convert between vector and non-scalar values 
('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
-  c + b;  // neon-error {{cannot convert between vector and non-scalar values 
('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka 
'__MFloat8x16_t'))}}
-  c - b;  // neon-error {{cannot convert between vector and non-scalar values 
('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka 
'__MFloat8x16_t'))}}
-  c * b;  // neon-error {{cannot convert between vector and non-scalar values 
('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka 
'__MFloat8x16_t'))}}
-  c / b;  // neon-error {{cannot convert between vector and non-scalar values 
('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka 
'__MFloat8x16_t'))}}
+  b + b;  // neon-error {{invalid operands to binary expression 
('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}}
+  b - b;  // neon-error {{invalid operands to binary expression 
('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}}
+  b * b;  // neon-error {{invalid operands to binary expression 
('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}}
+  b / b;  // neon-error {{invalid operands to binary expression 
('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}}
+
+  a + b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  a - b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  a * b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  a / b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+
+  a + c;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
+  a - c;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
+  a * c;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
+  a / c;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' 
(vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' 
values))}}
+  c + b;  // neon-error {{invalid operands to binary expression ('uint8x8_t' 
(vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  c - b;  // neon-error {{invalid operands to binary expression ('uint8x8_t' 
(vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  c * b;  // neon-error {{invalid operands to binary expression ('uint8x8_t' 
(vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
+  c / b;  // neon-error {{invalid operands to binary expression ('uint8x8_t' 
(vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' 
values))}}
 }
diff --git a/clang/utils/TableGen/NeonEmitter.cpp 
b/clang/utils/TableGen/NeonEmitter.cpp
index d7d649dd2456d5..e670783fa5286b 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -102,7 +102,7 @@ enum EltType {
   Float32,
   Float64,
   BFloat16,
-  MFloat8 // Not used by Sema or CodeGen in Clang
+  MFloat8
 };
 
 } // end namespace NeonTypeFlags
@@ -2295,9 +2295,7 @@ static void emitNeonTypeDefs(const std::string& types, 
raw_ostream &OS) {
       InIfdef = true;
     }
 
-    if (T.isMFloat8())
-      OS << "typedef __MFloat8x";
-    else if (T.isPoly())
+    if (T.isPoly())
       OS << "typedef __attribute__((neon_polyvector_type(";
     else
       OS << "typedef __attribute__((neon_vector_type(";
@@ -2305,10 +2303,7 @@ static void emitNeonTypeDefs(const std::string& types, 
raw_ostream &OS) {
     Type T2 = T;
     T2.makeScalar();
     OS << T.getNumElements();
-    if (T.isMFloat8())
-      OS << "_t ";
-    else
-      OS << "))) " << T2.str();
+    OS << "))) " << T2.str();
     OS << " " << T.str() << ";\n";
   }
   if (InIfdef)

>From d128c593e26dc2b71c608219c843dab19f2df824 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.veli...@arm.com>
Date: Fri, 6 Dec 2024 13:09:23 +0000
Subject: [PATCH 2/2] [AArch64] Refactor implementation of FP8 types (NFC)

* The FP8 scalar type (`__mfp8`) was described as a vector type
* The FP8 vector types were described/assumed to have
  integer element type (the element type ought to be `__mfp8`),
* Add support for `m` type specifier (denoting `__mfp8`)
  in `DecodeTypeFromStr` and create SVE builtin prototypes using
  the specifier, instead of `int8_t`.

[fixup] Add a comment about special case of mapping FP8 vectors to LLVM vector 
types
---
 .../clang/Basic/AArch64SVEACLETypes.def       | 35 +++++++++----------
 clang/lib/AST/ASTContext.cpp                  | 30 +++++++++-------
 clang/lib/AST/ItaniumMangle.cpp               |  2 +-
 clang/lib/AST/Type.cpp                        |  4 +--
 clang/lib/CodeGen/CodeGenTypes.cpp            | 22 +++++++-----
 clang/lib/CodeGen/Targets/AArch64.cpp         |  7 ++--
 clang/utils/TableGen/SveEmitter.cpp           |  4 +--
 7 files changed, 58 insertions(+), 46 deletions(-)

diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def 
b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 2dd2754e778d60..a408bb0c54057c 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -57,6 +57,11 @@
 //  - IsBF true for vector of brain float elements.
 
//===----------------------------------------------------------------------===//
 
+#ifndef SVE_SCALAR_TYPE
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+  SVE_TYPE(Name, Id, SingletonId)
+#endif
+
 #ifndef SVE_VECTOR_TYPE
 #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
@@ -72,6 +77,11 @@
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, 
NF, false, false, true)
 #endif
 
+#ifndef SVE_VECTOR_TYPE_MFLOAT
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, 
ElBits, NF) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, 
NF, false, false, false)
+#endif
+
 #ifndef SVE_VECTOR_TYPE_FLOAT
 #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, 
ElBits, NF) \
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, 
NF, false, true, false)
@@ -97,16 +107,6 @@
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
-#ifndef AARCH64_VECTOR_TYPE
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
-  SVE_TYPE(Name, Id, SingletonId)
-#endif
-
-#ifndef AARCH64_VECTOR_TYPE_MFLOAT
-#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, 
ElBits, NF) \
-  AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
-#endif
-
 //===- Vector point types -----------------------------------------------===//
 
 SVE_VECTOR_TYPE_INT("__SVInt8_t",  "__SVInt8_t",  SveInt8,  SveInt8Ty, 16,  8, 
1, true)
@@ -125,8 +125,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", 
SveFloat64, SveFloat64Ty
 
 SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, 
SveBFloat16Ty, 8, 16, 1)
 
-// This is a 8 bits opaque type.
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, 
SveMFloat8Ty, 16, 8, 1, false)
+SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, 
SveMFloat8Ty, 16, 8, 1)
 
 //
 // x2
@@ -148,7 +147,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", 
"svfloat64x2_t", SveFloat64x2, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", 
SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, 
SveMFloat8x2Ty, 16, 8, 2, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, 
SveMFloat8x2Ty, 16, 8, 2)
 
 //
 // x3
@@ -170,7 +169,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", 
"svfloat64x3_t", SveFloat64x3, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", 
SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, 
SveMFloat8x3Ty, 16, 8, 3, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, 
SveMFloat8x3Ty, 16, 8, 3)
 
 //
 // x4
@@ -192,7 +191,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", 
"svfloat64x4_t", SveFloat64x4, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", 
SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, 
SveMFloat8x4Ty, 16, 8, 4, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, 
SveMFloat8x4Ty, 16, 8, 4)
 
 SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1)
 SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, 
SveBoolx2Ty, 16, 2)
@@ -200,15 +199,15 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", 
"svboolx4_t", SveBoolx4, SveBoolx4T
 
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
 
-AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
+SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8)
 
 #undef SVE_VECTOR_TYPE
+#undef SVE_VECTOR_TYPE_MFLOAT
 #undef SVE_VECTOR_TYPE_BFLOAT
 #undef SVE_VECTOR_TYPE_FLOAT
 #undef SVE_VECTOR_TYPE_INT
 #undef SVE_PREDICATE_TYPE
 #undef SVE_PREDICATE_TYPE_ALL
 #undef SVE_OPAQUE_TYPE
-#undef AARCH64_VECTOR_TYPE_MFLOAT
-#undef AARCH64_VECTOR_TYPE
+#undef SVE_SCALAR_TYPE
 #undef SVE_TYPE
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 155dbcfcaeed31..51764095a68980 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2269,11 +2269,10 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) 
const {
     Width = 0;                                                                 
\
     Align = 16;                                                                
\
     break;
-#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, 
\
-                                   ElBits, NF)                                 
\
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              
\
   case BuiltinType::Id:                                                        
\
-    Width = NumEls * ElBits * NF;                                              
\
-    Align = NumEls * ElBits;                                                   
\
+    Width = Bits;                                                              
\
+    Align = Bits;                                                              
\
     break;
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size)                                        
\
@@ -4395,15 +4394,14 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType 
*Ty) const {
                                ElBits, NF)                                     
\
   case BuiltinType::Id:                                                        
\
     return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF};
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     
\
+                               ElBits, NF)                                     
\
+  case BuiltinType::Id:                                                        
\
+    return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF};
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) 
\
   case BuiltinType::Id:                                                        
\
     return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF};
-#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, 
\
-                                   ElBits, NF)                                 
\
-  case BuiltinType::Id:                                                        
\
-    return {getIntTypeForBitwidth(ElBits, false),                              
\
-            llvm::ElementCount::getFixed(NumEls), NF};
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
 
 #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF,         
\
@@ -4465,11 +4463,16 @@ QualType ASTContext::getScalableVectorType(QualType 
EltTy, unsigned NumElts,
       EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) {     
\
     return SingletonId;                                                        
\
   }
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     
\
+                               ElBits, NF)                                     
\
+  if (EltTy->isMFloat8Type() && EltTySize == ElBits &&                         
\
+      NumElts == (NumEls * NF) && NumFields == 1) {                            
\
+    return SingletonId;                                                        
\
+  }
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) 
\
   if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1)    
\
     return SingletonId;
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
   } else if (Target->hasRISCVVTypes()) {
     uint64_t EltTySize = getTypeSize(EltTy);
@@ -12354,6 +12357,9 @@ static QualType DecodeTypeFromStr(const char *&Str, 
const ASTContext &Context,
   case 'p':
     Type = Context.getProcessIDType();
     break;
+  case 'm':
+    Type = Context.MFloat8Ty;
+    break;
   }
 
   // If there are modifiers and if we're allowed to parse them, go for it.
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 9948963d7f44b3..49089c0ea3c8ac 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3433,7 +3433,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
     type_name = MangledName;                                                   
\
     Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    
\
     break;
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                
\
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              
\
   case BuiltinType::Id:                                                        
\
     type_name = MangledName;                                                   
\
     Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    
\
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index caa0ac858a1bea..fde0746a175705 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const {
 #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                 
\
   case BuiltinType::Id:                                                        
\
     return true;
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                
\
-  case BuiltinType::Id:                                                        
\
-    return false;
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
     default:
       return false;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp 
b/clang/lib/CodeGen/CodeGenTypes.cpp
index 950b23f4e13b99..405242e97e75cb 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -505,15 +505,18 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   case BuiltinType::Id:
 #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                 
\
   case BuiltinType::Id:
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                
\
-  case BuiltinType::Id:
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
       {
         ASTContext::BuiltinVectorTypeInfo Info =
             Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
-        auto VTy =
-            llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC);
+        // The `__mfp8` type maps to `<1 x i8>` which can't be used to build
+        // a <N x i8> vector type, hence bypass the call to `ConvertType` for
+        // the element type and create the vector type directly.
+        auto *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getLLVMContext())
+                          : ConvertType(Info.ElementType);
+        auto *VTy = llvm::VectorType::get(EltTy, Info.EC);
         switch (Info.NumVectors) {
         default:
           llvm_unreachable("Expected 1, 2, 3 or 4 vectors!");
@@ -529,6 +532,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       }
     case BuiltinType::SveCount:
       return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
+    case BuiltinType::MFloat8:
+      return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1,
+                                   false);
 #define PPC_VECTOR_TYPE(Name, Id, Size) \
     case BuiltinType::Id: \
       ResultType = \
@@ -650,9 +656,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
-                               : (VT->getElementType()->isMFloat8Type()
-                                      ? llvm::Type::getInt8Ty(getLLVMContext())
-                                      : ConvertType(VT->getElementType()));
+                           : VT->getElementType()->isMFloat8Type()
+                               ? llvm::Type::getInt8Ty(getLLVMContext())
+                               : ConvertType(VT->getElementType());
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
   }
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp 
b/clang/lib/CodeGen/Targets/AArch64.cpp
index c702e79ff8eb98..057199c66f5a10 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const 
VectorType *VT) const {
 
     case BuiltinType::SChar:
     case BuiltinType::UChar:
+    case BuiltinType::MFloat8:
       return llvm::ScalableVectorType::get(
           llvm::Type::getInt8Ty(getVMContext()), 16);
 
@@ -776,8 +777,10 @@ bool AArch64ABIInfo::passAsPureScalableType(
     NPred += Info.NumVectors;
   else
     NVec += Info.NumVectors;
-  auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType),
-                                           Info.EC.getKnownMinValue());
+  llvm::Type *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getVMContext())
+                          : CGT.ConvertType(Info.ElementType);
+  auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue());
 
   if (CoerceToSeq.size() + Info.NumVectors > 12)
     return false;
diff --git a/clang/utils/TableGen/SveEmitter.cpp 
b/clang/utils/TableGen/SveEmitter.cpp
index 35477cfc3cf455..caaf3c573ac13a 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -449,7 +449,7 @@ std::string SVEType::builtinBaseType() const {
   case TypeKind::PredicatePattern:
     return "i";
   case TypeKind::Fpm:
-    return "Wi";
+    return "UWi";
   case TypeKind::Predicate:
     return "b";
   case TypeKind::BFloat16:
@@ -457,7 +457,7 @@ std::string SVEType::builtinBaseType() const {
     return "y";
   case TypeKind::MFloat8:
     assert(ElementBitwidth == 8 && "Invalid MFloat8!");
-    return "c";
+    return "m";
   case TypeKind::Float:
     switch (ElementBitwidth) {
     case 16:

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [AArch64] Refactor implementation of FP8 types (NFC) (PR #123604)

Reply via email to