From bc14374b30ec64220dfcb627514d9d37f34df8e4 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Fri, 29 Mar 2019 09:06:53 -0700
Subject: [PATCH 1/7] Support for Arm64 Vector ABI

Extend HFA support to support vectors as well as floating point types.
This requires that the JIT recognize vector types even during crossgen,
so that the ABI is supported consistently.

Fix #16022
---
 src/jit/assertionprop.cpp             |   2 +-
 src/jit/codegenarm64.cpp              |   4 +-
 src/jit/codegenarmarch.cpp            |  15 +-
 src/jit/codegencommon.cpp             |  26 ++-
 src/jit/codegenxarch.cpp              |   4 +-
 src/jit/compiler.cpp                  | 187 +++++++++----------
 src/jit/compiler.h                    | 247 ++++++++++++++++----------
 src/jit/compiler.hpp                  |   6 +-
 src/jit/dll/jit.nativeproj            |   2 +-
 src/jit/flowgraph.cpp                 |   2 +-
 src/jit/gentree.cpp                   |  10 +-
 src/jit/gentree.h                     |   3 +
 src/jit/importer.cpp                  |  10 +-
 src/jit/lclvars.cpp                   |  85 ++++-----
 src/jit/lower.cpp                     |   5 +-
 src/jit/lsra.cpp                      |   2 +-
 src/jit/lsraarmarch.cpp               |   2 +-
 src/jit/lsrabuild.cpp                 |  21 +--
 src/jit/morph.cpp                     | 190 ++++++++++++--------
 src/jit/register_arg_convention.h     |   4 +-
 src/jit/simd.cpp                      |   2 +-
 src/jit/target.h                      |  14 +-
 src/jit/vartype.h                     |  55 +++++-
 src/vm/argdestination.h               |  14 +-
 src/vm/arm64/CallDescrWorkerARM64.asm |  12 +-
 src/vm/arm64/asmconstants.h           |   2 +-
 src/vm/arm64/asmhelpers.asm           |   5 +-
 src/vm/arm64/calldescrworkerarm64.S   |  13 +-
 src/vm/arm64/cgencpu.h                |   2 +-
 src/vm/callhelpers.h                  |   5 +
 src/vm/callingconvention.h            |  51 ++++--
 src/vm/class.cpp                      | 165 ++++++++++++++---
 src/vm/class.h                        |  23 ++-
 src/vm/methodtable.h                  |   3 +
 34 files changed, 773 insertions(+), 420 deletions(-)

diff --git a/src/jit/assertionprop.cpp b/src/jit/assertionprop.cpp
index 1b48491e737e..cd1b90ffa2cb 100644
--- a/src/jit/assertionprop.cpp
+++ b/src/jit/assertionprop.cpp
@@ -75,7 +75,7 @@ void Compiler::optAddCopies()
         // We only add copies for non temp local variables
         // that have a single def and that can possibly be enregistered
 
-        if (varDsc->lvIsTemp || !varDsc->lvSingleDef || !varTypeCanReg(typ))
+        if (varDsc->lvIsTemp || !varDsc->lvSingleDef || !varTypeIsEnregisterable(typ))
         {
             continue;
         }
diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp
index 91bb221b1958..efbb59007c9a 100644
--- a/src/jit/codegenarm64.cpp
+++ b/src/jit/codegenarm64.cpp
@@ -2023,10 +2023,10 @@ void CodeGen::genSimpleReturn(GenTree* treeNode)
     GenTree*  op1        = treeNode->gtGetOp1();
     var_types targetType = treeNode->TypeGet();
 
-    assert(!isStructReturn(treeNode));
+    assert(targetType != TYP_STRUCT);
     assert(targetType != TYP_VOID);
 
-    regNumber retReg = varTypeIsFloating(treeNode) ? REG_FLOATRET : REG_INTRET;
+    regNumber retReg = varTypeUsesFloatArgReg(treeNode) ? REG_FLOATRET : REG_INTRET;
 
     bool movRequired = (op1->gtRegNum != retReg);
 
diff --git a/src/jit/codegenarmarch.cpp b/src/jit/codegenarmarch.cpp
index 55b3f0a7ba95..e50778221bd6 100644
--- a/src/jit/codegenarmarch.cpp
+++ b/src/jit/codegenarmarch.cpp
@@ -2355,7 +2355,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call)
     }
     else
     {
-        assert(!varTypeIsStruct(call));
+        assert(call->gtType != TYP_STRUCT);
 
         if (call->gtType == TYP_REF)
         {
@@ -2509,9 +2509,13 @@ void CodeGen::genCallInstruction(GenTreeCall* call)
                 // TCB in REG_PINVOKE_TCB. fgMorphCall() sets the correct argument registers.
                 returnReg = REG_PINVOKE_TCB;
             }
+            else if (compiler->opts.compUseSoftFP)
+            {
+                returnReg = REG_INTRET;
+            }
             else
 #endif // _TARGET_ARM_
-                if (varTypeIsFloating(returnType) && !compiler->opts.compUseSoftFP)
+                if (varTypeUsesFloatArgReg(returnType))
             {
                 returnReg = REG_FLOATRET;
             }
@@ -3501,8 +3505,13 @@ bool CodeGen::isStructReturn(GenTree* treeNode)
     // For the GT_RET_FILT, the return is always
     // a bool or a void, for the end of a finally block.
     noway_assert(treeNode->OperGet() == GT_RETURN || treeNode->OperGet() == GT_RETFILT);
+    var_types returnType = treeNode->TypeGet();
 
-    return varTypeIsStruct(treeNode);
+#ifdef _TARGET_ARM64_
+    return varTypeIsStruct(returnType) && (compiler->info.compRetNativeType == TYP_STRUCT);
+#else
+    return varTypeIsStruct(returnType);
+#endif
 }
 
 //------------------------------------------------------------------------
diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 0ec2ba74e7e0..2362be5f2485 100644
--- a/src/jit/codegencommon.cpp
+++ b/src/jit/codegencommon.cpp
@@ -3305,7 +3305,7 @@ void CodeGen::genFnPrologCalleeRegArgs(regNumber xtraReg, bool* pXtraRegClobbere
         {
             // A struct might be passed  partially in XMM register for System V calls.
             // So a single arg might use both register files.
-            if (isFloatRegType(regType) != doingFloat)
+            if (emitter::isFloatReg(varDsc->lvArgReg) != doingFloat)
             {
                 continue;
             }
@@ -10158,7 +10158,11 @@ bool Compiler::IsMultiRegReturnedType(CORINFO_CLASS_HANDLE hClass)
     structPassingKind howToReturnStruct;
     var_types         returnType = getReturnTypeForStruct(hClass, &howToReturnStruct);
 
+#ifdef _TARGET_ARM64_
+    return (varTypeIsStruct(returnType) && (howToReturnStruct != SPK_PrimitiveType));
+#else
     return (varTypeIsStruct(returnType));
+#endif
 }
 
 //----------------------------------------------
@@ -10167,11 +10171,7 @@ bool Compiler::IsMultiRegReturnedType(CORINFO_CLASS_HANDLE hClass)
 
 bool Compiler::IsHfa(CORINFO_CLASS_HANDLE hClass)
 {
-#ifdef FEATURE_HFA
-    return varTypeIsFloating(GetHfaType(hClass));
-#else
-    return false;
-#endif
+    return varTypeIsValidHfaType(GetHfaType(hClass));
 }
 
 bool Compiler::IsHfa(GenTree* tree)
@@ -10204,7 +10204,19 @@ var_types Compiler::GetHfaType(CORINFO_CLASS_HANDLE hClass)
     {
 #ifdef FEATURE_HFA
         CorInfoType corType = info.compCompHnd->getHFAType(hClass);
-        if (corType != CORINFO_TYPE_UNDEF)
+#ifdef _TARGET_ARM64_
+        if (corType == CORINFO_TYPE_VALUECLASS)
+        {
+            // This is a vector type.
+            // HVAs are only supported on ARM64, and only for sizes of 8 or 16 bytes.
+            // For 8-byte vectors corType will be returned as CORINFO_TYPE_DOUBLE.
+            result = TYP_SIMD16;
+            // This type may not appear elsewhere, but it will occupy a floating point register.
+            compFloatingPointUsed = true;
+        }
+        else
+#endif // _TARGET_ARM64_
+            if (corType != CORINFO_TYPE_UNDEF)
         {
             result = JITtype2varType(corType);
         }
diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp
index 97d04e7c16d7..a7c01a78f7e7 100644
--- a/src/jit/codegenxarch.cpp
+++ b/src/jit/codegenxarch.cpp
@@ -1133,9 +1133,9 @@ void CodeGen::genStructReturn(GenTree* treeNode)
         unsigned regCount = retTypeDesc.GetReturnRegCount();
         assert(regCount == MAX_RET_REG_COUNT);
 
-        if (varTypeIsEnregisterableStruct(op1))
+        if (varTypeIsEnregisterable(op1))
         {
-            // Right now the only enregistrable structs supported are SIMD vector types.
+            // Right now the only enregisterable structs supported are SIMD vector types.
             assert(varTypeIsSIMD(op1));
             assert(op1->isUsedFromReg());
 
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index e005a77c2900..1cc2f600ed16 100644
--- a/src/jit/compiler.cpp
+++ b/src/jit/compiler.cpp
@@ -573,8 +573,8 @@ bool Compiler::isSingleFloat32Struct(CORINFO_CLASS_HANDLE clsHnd)
 //     of size 'structSize'.
 //     We examine 'clsHnd' to check the GC layout of the struct and
 //     return TYP_REF for structs that simply wrap an object.
-//     If the struct is a one element HFA, we will return the
-//     proper floating point type.
+//     If the struct is a one element HFA/HVA, we will return the
+//     proper floating point or vector type.
 //
 // Arguments:
 //    structSize - the size of the struct type, cannot be zero
@@ -592,13 +592,64 @@ bool Compiler::isSingleFloat32Struct(CORINFO_CLASS_HANDLE clsHnd)
 //    same way as any other 8-byte struct
 //    For ARM32 if we have an HFA struct that wraps a 64-bit double
 //    we will return TYP_DOUBLE.
+//    For vector calling conventions, a vector is considered a "primitive"
+//    type, as it is passed in a single register.
 //
 var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS_HANDLE clsHnd, bool isVarArg)
 {
     assert(structSize != 0);
 
-    var_types useType;
+    var_types useType = TYP_UNKNOWN;
 
+// Start by determining if we have an HFA/HVA with a single element.
+#ifdef FEATURE_HFA
+#if defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
+    // Arm64 Windows VarArg methods arguments will not classify HFA types, they will need to be treated
+    // as if they are not HFA types.
+    if (!isVarArg)
+#endif // defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
+    {
+        switch (structSize)
+        {
+            case 4:
+            case 8:
+#ifdef _TARGET_ARM64_
+            case 16:
+#endif // _TARGET_ARM64_
+            {
+                var_types hfaType;
+#ifdef ARM_SOFTFP
+                // For ARM_SOFTFP, HFA is unsupported so we need to check in another way.
+                // This matters only for size-4 struct because bigger structs would be processed with RetBuf.
+                if (isSingleFloat32Struct(clsHnd))
+                {
+                    hfaType = TYP_FLOAT;
+                }
+#else  // !ARM_SOFTFP
+                hfaType = GetHfaType(clsHnd);
+#endif // ARM_SOFTFP
+                // We're only interested in the case where the struct size is equal to the size of the hfaType.
+                if (varTypeIsValidHfaType(hfaType))
+                {
+                    if (genTypeSize(hfaType) == structSize)
+                    {
+                        useType = hfaType;
+                    }
+                    else
+                    {
+                        return TYP_UNKNOWN;
+                    }
+                }
+            }
+        }
+        if (useType != TYP_UNKNOWN)
+        {
+            return useType;
+        }
+    }
+#endif // FEATURE_HFA
+
+    // Now deal with non-HFA/HVA structs.
     switch (structSize)
     {
         case 1:
@@ -618,15 +669,8 @@ var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS
 
 #ifdef _TARGET_64BIT_
         case 4:
-            if (IsHfa(clsHnd))
-            {
-                // A structSize of 4 with IsHfa, it must be an HFA of one float
-                useType = TYP_FLOAT;
-            }
-            else
-            {
-                useType = TYP_INT;
-            }
+            // We dealt with the one-float HFA above. All other 4-byte structs are handled as INT.
+            useType = TYP_INT;
             break;
 
 #if !defined(_TARGET_XARCH_) || defined(UNIX_AMD64_ABI)
@@ -640,86 +684,13 @@ var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS
 #endif // _TARGET_64BIT_
 
         case TARGET_POINTER_SIZE:
-#ifdef ARM_SOFTFP
-            // For ARM_SOFTFP, HFA is unsupported so we need to check in another way
-            // This matters only for size-4 struct cause bigger structs would be processed with RetBuf
-            if (isSingleFloat32Struct(clsHnd))
-#else // !ARM_SOFTFP
-            if (IsHfa(clsHnd)
-#if defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
-                // Arm64 Windows VarArg methods arguments will not
-                // classify HFA types, they will need to be treated
-                // as if they are not HFA types.
-                && !isVarArg
-#endif // defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
-                )
-#endif // ARM_SOFTFP
-            {
-#ifdef _TARGET_64BIT_
-                var_types hfaType = GetHfaType(clsHnd);
-
-                // A structSize of 8 with IsHfa, we have two possiblities:
-                // An HFA of one double or an HFA of two floats
-                //
-                // Check and exclude the case of an HFA of two floats
-                if (hfaType == TYP_DOUBLE)
-                {
-                    // We have an HFA of one double
-                    useType = TYP_DOUBLE;
-                }
-                else
-                {
-                    assert(hfaType == TYP_FLOAT);
-
-                    // We have an HFA of two floats
-                    // This should be passed or returned in two FP registers
-                    useType = TYP_UNKNOWN;
-                }
-#else  // a 32BIT target
-                // A structSize of 4 with IsHfa, it must be an HFA of one float
-                useType = TYP_FLOAT;
-#endif // _TARGET_64BIT_
-            }
-            else
-            {
-                BYTE gcPtr = 0;
-                // Check if this pointer-sized struct is wrapping a GC object
-                info.compCompHnd->getClassGClayout(clsHnd, &gcPtr);
-                useType = getJitGCType(gcPtr);
-            }
-            break;
-
-#ifdef _TARGET_ARM_
-        case 8:
-            if (IsHfa(clsHnd))
-            {
-                var_types hfaType = GetHfaType(clsHnd);
-
-                // A structSize of 8 with IsHfa, we have two possiblities:
-                // An HFA of one double or an HFA of two floats
-                //
-                // Check and exclude the case of an HFA of two floats
-                if (hfaType == TYP_DOUBLE)
-                {
-                    // We have an HFA of one double
-                    useType = TYP_DOUBLE;
-                }
-                else
-                {
-                    assert(hfaType == TYP_FLOAT);
-
-                    // We have an HFA of two floats
-                    // This should be passed or returned in two FP registers
-                    useType = TYP_UNKNOWN;
-                }
-            }
-            else
-            {
-                // We don't have an HFA
-                useType = TYP_UNKNOWN;
-            }
-            break;
-#endif // _TARGET_ARM_
+        {
+            BYTE gcPtr = 0;
+            // Check if this pointer-sized struct is wrapping a GC object
+            info.compCompHnd->getClassGClayout(clsHnd, &gcPtr);
+            useType = getJitGCType(gcPtr);
+        }
+        break;
 
         default:
             useType = TYP_UNKNOWN;
@@ -802,11 +773,11 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
     else
 #endif // UNIX_AMD64_ABI
 
-        // The largest primitive type is 8 bytes (TYP_DOUBLE)
+        // The largest arg passed in a single register is MAX_PASS_SINGLEREG_BYTES,
         // so we can skip calling getPrimitiveTypeForStruct when we
         // have a struct that is larger than that.
         //
-        if (structSize <= sizeof(double))
+        if (structSize <= MAX_PASS_SINGLEREG_BYTES)
     {
         // We set the "primitive" useType based upon the structSize
         // and also examine the clsHnd to see if it is an HFA of count one
@@ -829,14 +800,21 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
         //
         if (structSize <= MAX_PASS_MULTIREG_BYTES)
         {
-            // Structs that are HFA's are passed by value in multiple registers
-            if (IsHfa(clsHnd)
+            // Structs that are HFA's are passed by value in multiple registers.
+            // Arm64 Windows VarArg methods arguments will not classify HFA types, they will need to be treated
+            // as if they are not HFA types.
+            var_types hfaType;
 #if defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
-                && !isVarArg // Arm64 Windows VarArg methods arguments will not
-                             // classify HFA types, they will need to be treated
-                             // as if they are not HFA types.
-#endif                       // defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
-                )
+            if (isVarArg)
+            {
+                hfaType = TYP_UNDEF;
+            }
+            else
+#endif // defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
+            {
+                hfaType = GetHfaType(clsHnd);
+            }
+            if (varTypeIsValidHfaType(hfaType))
             {
                 // HFA's of count one should have been handled by getPrimitiveTypeForStruct
                 assert(GetHfaCount(clsHnd) >= 2);
@@ -851,7 +829,6 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
             {
 
 #ifdef UNIX_AMD64_ABI
-
                 // The case of (structDesc.eightByteCount == 1) should have already been handled
                 if ((structDesc.eightByteCount > 1) || !structDesc.passedInRegisters)
                 {
@@ -1035,10 +1012,10 @@ var_types Compiler::getReturnTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
     // Check for cases where a small struct is returned in a register
     // via a primitive type.
     //
-    // The largest primitive type is 8 bytes (TYP_DOUBLE)
+    // The largest "primitive type" is MAX_PASS_SINGLEREG_BYTES
     // so we can skip calling getPrimitiveTypeForStruct when we
     // have a struct that is larger than that.
-    if (canReturnInRegister && (useType == TYP_UNKNOWN) && (structSize <= sizeof(double)))
+    if (canReturnInRegister && (useType == TYP_UNKNOWN) && (structSize <= MAX_PASS_SINGLEREG_BYTES))
     {
         // We set the "primitive" useType based upon the structSize
         // and also examine the clsHnd to see if it is an HFA of count one
@@ -1070,7 +1047,7 @@ var_types Compiler::getReturnTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
     // because when HFA are enabled, normally we would use two FP registers to pass or return it
     //
     // But if we don't have support for multiple register return types, we have to change this.
-    // Since we what we have an 8-byte struct (float + float)  we change useType to TYP_I_IMPL
+    // Since what we have is an 8-byte struct (float + float)  we change useType to TYP_I_IMPL
     // so that the struct is returned instead using an 8-byte integer register.
     //
     if ((FEATURE_MULTIREG_RET == 0) && (useType == TYP_UNKNOWN) && (structSize == (2 * sizeof(float))) && IsHfa(clsHnd))
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index c734b5a96fff..751020603c3f 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -134,6 +134,61 @@ const unsigned FLG_CCTOR = (CORINFO_FLG_CONSTRUCTOR | CORINFO_FLG_STATIC);
 const int BAD_STK_OFFS = 0xBAADF00D; // for LclVarDsc::lvStkOffs
 #endif
 
+//------------------------------------------------------------------------
+// HFA info shared by LclVarDsc and fgArgTabEntry
+//------------------------------------------------------------------------
+#ifdef FEATURE_HFA
+enum HfaElemKind : unsigned int
+{
+    HFA_ELEM_NONE,
+    HFA_ELEM_FLOAT,
+    HFA_ELEM_DOUBLE,
+    HFA_ELEM_SIMD16
+};
+inline bool IsHfa(HfaElemKind kind)
+{
+    return kind != HFA_ELEM_NONE;
+}
+inline var_types HfaTypeFromElemKind(HfaElemKind kind)
+{
+    switch (kind)
+    {
+        case HFA_ELEM_FLOAT:
+            return TYP_FLOAT;
+        case HFA_ELEM_DOUBLE:
+            return TYP_DOUBLE;
+#ifdef FEATURE_SIMD
+        case HFA_ELEM_SIMD16:
+            return TYP_SIMD16;
+#endif
+        case HFA_ELEM_NONE:
+            return TYP_UNDEF;
+        default:
+            assert(!"Invalid HfaElemKind");
+            return TYP_UNDEF;
+    }
+}
+inline HfaElemKind HfaElemKindFromType(var_types type)
+{
+    switch (type)
+    {
+        case TYP_FLOAT:
+            return HFA_ELEM_FLOAT;
+        case TYP_DOUBLE:
+            return HFA_ELEM_DOUBLE;
+#ifdef FEATURE_SIMD
+        case TYP_SIMD16:
+            return HFA_ELEM_SIMD16;
+#endif
+        case TYP_UNDEF:
+            return HFA_ELEM_NONE;
+        default:
+            assert(!"Invalid HFA Type");
+            return HFA_ELEM_NONE;
+    }
+}
+#endif // FEATURE_HFA
+
 // The following holds the Local var info (scope information)
 typedef const char* VarName; // Actual ASCII string
 struct VarScopeDsc
@@ -595,11 +650,8 @@ class LclVarDsc
     unsigned char lvIsMultiRegRet : 1; // true if this is a multireg LclVar struct assigned from a multireg call
 
 #ifdef FEATURE_HFA
-    unsigned char _lvIsHfa : 1;          // Is this a struct variable who's class handle is an HFA type
-    unsigned char _lvIsHfaRegArg : 1;    // Is this a HFA argument variable?    // TODO-CLEANUP: Remove this and replace
-                                         // with (lvIsRegArg && lvIsHfa())
-    unsigned char _lvHfaTypeIsFloat : 1; // Is the HFA type float or double?
-#endif                                   // FEATURE_HFA
+    HfaElemKind _lvHfaElemKind : 2; // What kind of an HFA this is (HFA_ELEM_NONE if it is not an HFA).
+#endif                              // FEATURE_HFA
 
 #ifdef DEBUG
     // TODO-Cleanup: See the note on lvSize() - this flag is only in use by asserts that are checking for struct
@@ -666,70 +718,57 @@ class LclVarDsc
     bool lvIsHfa() const
     {
 #ifdef FEATURE_HFA
-        return _lvIsHfa;
+        return IsHfa(_lvHfaElemKind);
 #else
         return false;
 #endif
     }
 
-    void lvSetIsHfa()
-    {
-#ifdef FEATURE_HFA
-        _lvIsHfa = true;
-#endif
-    }
-
     bool lvIsHfaRegArg() const
     {
 #ifdef FEATURE_HFA
-        return _lvIsHfaRegArg;
+        return lvIsRegArg && lvIsHfa();
 #else
         return false;
 #endif
     }
 
-    void lvSetIsHfaRegArg(bool value = true)
-    {
-#ifdef FEATURE_HFA
-        _lvIsHfaRegArg = value;
-#endif
-    }
-
-    bool lvHfaTypeIsFloat() const
-    {
-#ifdef FEATURE_HFA
-        return _lvHfaTypeIsFloat;
-#else
-        return false;
-#endif
-    }
-
-    void lvSetHfaTypeIsFloat(bool value)
-    {
-#ifdef FEATURE_HFA
-        _lvHfaTypeIsFloat = value;
-#endif
-    }
-
-    // on Arm64 - Returns 1-4 indicating the number of register slots used by the HFA
-    // on Arm32 - Returns the total number of single FP register slots used by the HFA, max is 8
+    //------------------------------------------------------------------------------
+    // lvHfaSlots: Get the number of slots used by an HFA local
+    //
+    // Return Value:
+    //    On Arm64 - Returns 1-4 indicating the number of register slots used by the HFA
+    //    On Arm32 - Returns the total number of single FP register slots used by the HFA, max is 8
     //
     unsigned lvHfaSlots() const
     {
         assert(lvIsHfa());
         assert(varTypeIsStruct(lvType));
+        unsigned slots = 0;
 #ifdef _TARGET_ARM_
-        return lvExactSize / sizeof(float);
-#else  //  _TARGET_ARM64_
-        if (lvHfaTypeIsFloat())
-        {
-            return lvExactSize / sizeof(float);
-        }
-        else
+        slots = lvExactSize / sizeof(float);
+        assert(slots <= 8);
+#elif defined(_TARGET_ARM64_)
+        switch (_lvHfaElemKind)
         {
-            return lvExactSize / sizeof(double);
+            case HFA_ELEM_NONE:
+                assert(!"lvHfaSlots called for non-HFA");
+                break;
+            case HFA_ELEM_FLOAT:
+                slots = lvExactSize >> 2;
+                break;
+            case HFA_ELEM_DOUBLE:
+                slots = lvExactSize >> 3;
+                break;
+            case HFA_ELEM_SIMD16:
+                slots = lvExactSize >> 4;
+                break;
+            default:
+                unreached();
         }
+        assert(slots <= 4);
 #endif //  _TARGET_ARM64_
+        return slots;
     }
 
     // lvIsMultiRegArgOrRet()
@@ -750,7 +789,7 @@ class LclVarDsc
     regNumberSmall _lvOtherReg; // Used for "upper half" of long var.
 #endif                          // !defined(_TARGET_64BIT_)
 
-    regNumberSmall _lvArgReg; // The register in which this argument is passed.
+    regNumberSmall _lvArgReg; // The (first) register in which this argument is passed.
 
 #if FEATURE_MULTIREG_ARGS
     regNumberSmall _lvOtherArgReg; // Used for the second part of the struct passed in a register.
@@ -1030,14 +1069,21 @@ class LclVarDsc
     {
         return isFloatRegType(lvType) || lvIsHfaRegArg();
     }
+
     var_types GetHfaType() const
     {
-        return lvIsHfa() ? (lvHfaTypeIsFloat() ? TYP_FLOAT : TYP_DOUBLE) : TYP_UNDEF;
+#ifdef FEATURE_HFA
+        assert(lvIsHfa());
+        return HfaTypeFromElemKind(_lvHfaElemKind);
+#endif // FEATURE_HFA
+        return TYP_UNDEF;
     }
+
     void SetHfaType(var_types type)
     {
-        assert(varTypeIsFloating(type));
-        lvSetHfaTypeIsFloat(type == TYP_FLOAT);
+#ifdef FEATURE_HFA
+        _lvHfaElemKind = HfaElemKindFromType(type);
+#endif // FEATURE_HFA
     }
 
     var_types lvaArgType();
@@ -1487,8 +1533,7 @@ struct fgArgTabEntry
     bool _isSplit : 1; // True when this argument is split between the registers and OutArg area
 #endif                 // FEATURE_ARG_SPLIT
 #ifdef FEATURE_HFA
-    bool _isHfaArg : 1;    // True when the argument is an HFA type.
-    bool _isDoubleHfa : 1; // True when the argument is an HFA, with an element type of DOUBLE.
+    HfaElemKind _hfaElemKind : 2; // What kind of an HFA this is (HFA_ELEM_NONE if it is not an HFA).
 #endif
 
     bool isLateArg()
@@ -1569,7 +1614,7 @@ struct fgArgTabEntry
     bool getIsHfaArg()
     {
 #ifdef FEATURE_HFA
-        return _isHfaArg;
+        return IsHfa(_hfaElemKind);
 #else
         return false;
 #endif
@@ -1579,23 +1624,22 @@ struct fgArgTabEntry
     bool getIsHfaRegArg()
     {
 #ifdef FEATURE_HFA
-        return _isHfaArg && isPassedInRegisters();
+        return IsHfa(_hfaElemKind) && isPassedInRegisters();
 #else
         return false;
 #endif
     }
 
-    __declspec(property(get = getHfaType)) var_types hfaType;
-    var_types getHfaType()
+    __declspec(property(get = GetHfaType)) var_types hfaType;
+    var_types GetHfaType()
     {
 #ifdef FEATURE_HFA
-        return _isHfaArg ? (_isDoubleHfa ? TYP_DOUBLE : TYP_FLOAT) : TYP_UNDEF;
-#else
+        return HfaTypeFromElemKind(_hfaElemKind);
+#endif // FEATURE_HFA
         return TYP_UNDEF;
-#endif
     }
 
-    void setHfaType(var_types type, unsigned hfaSlots)
+    void SetHfaType(var_types type, unsigned hfaSlots)
     {
 #ifdef FEATURE_HFA
         if (type != TYP_UNDEF)
@@ -1607,29 +1651,33 @@ struct fgArgTabEntry
             // Note that hfaSlots is the number of registers we will use. For ARM, that is twice
             // the number of "double registers".
             unsigned numHfaRegs = hfaSlots;
-            if (isPassedInRegisters())
-            {
 #ifdef _TARGET_ARM_
-                if (type == TYP_DOUBLE)
-                {
-                    // Must be an even number of registers.
-                    assert((numRegs & 1) == 0);
-                    numHfaRegs = hfaSlots / 2;
-                }
+            if (type == TYP_DOUBLE)
+            {
+                // Must be an even number of registers.
+                assert((numRegs & 1) == 0);
+                numHfaRegs = hfaSlots / 2;
+            }
 #endif // _TARGET_ARM_
-                if (_isHfaArg)
+
+            if (!isHfaArg)
+            {
+                // We haven't previously set this; do so now.
+                _hfaElemKind = HfaElemKindFromType(type);
+                if (isPassedInRegisters())
                 {
-                    // This should already be set correctly.
-                    assert(numRegs == numHfaRegs);
-                    assert(_isDoubleHfa == (type == TYP_DOUBLE));
+                    numRegs = numHfaRegs;
                 }
-                else
+            }
+            else
+            {
+                // We've already set this; ensure that it's consistent.
+                if (isPassedInRegisters())
                 {
-                    numRegs = numHfaRegs;
+                    assert(numRegs == numHfaRegs);
                 }
+                assert(type == HfaTypeFromElemKind(_hfaElemKind));
             }
-            _isDoubleHfa = (type == TYP_DOUBLE);
-            _isHfaArg    = true;
         }
 #endif // FEATURE_HFA
     }
@@ -1701,22 +1749,30 @@ struct fgArgTabEntry
     {
         unsigned size = getSlotCount();
 #ifdef FEATURE_HFA
-#ifdef _TARGET_ARM_
-        // We counted the number of regs, but if they are DOUBLE hfa regs we have to double the size.
-        if (isHfaRegArg && (hfaType == TYP_DOUBLE))
+        if (isHfaRegArg)
         {
-            assert(!isSplit);
-            size <<= 1;
-        }
+#ifdef _TARGET_ARM_
+            // We counted the number of regs, but if they are DOUBLE hfa regs we have to double the size.
+            if (hfaType == TYP_DOUBLE)
+            {
+                assert(!isSplit);
+                size <<= 1;
+            }
 #elif defined(_TARGET_ARM64_)
-        // We counted the number of regs, but if they are FLOAT hfa regs we have to halve the size.
-        if (isHfaRegArg && (hfaType == TYP_FLOAT))
-        {
-            // Round up in case of odd HFA count.
-            size = (size + 1) >> 1;
-        }
+            // We counted the number of regs, but if they are FLOAT hfa regs we have to halve the size,
+            // or if they are SIMD16 vector hfa regs we have to double the size.
+            if (hfaType == TYP_FLOAT)
+            {
+                // Round up in case of odd HFA count.
+                size = (size + 1) >> 1;
+            }
+            else if (hfaType == TYP_SIMD16)
+            {
+                size <<= 1;
+            }
 #endif // _TARGET_ARM64_
-#endif
+        }
+#endif // FEATURE_HFA
         return size;
     }
 
@@ -7614,6 +7670,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     // Should we support SIMD intrinsics?
     bool featureSIMD;
 
+    // Should we recognize SIMD types?
+    // We always do this on ARM64 to support HVA types.
+    bool supportSIMDTypes()
+    {
+#ifdef _TARGET_ARM64_
+        return true;
+#else
+        return featureSIMD;
+#endif
+    }
+
     // Have we identified any SIMD types?
     // This is currently used by struct promotion to avoid getting type information for a struct
     // field to see if it is a SIMD type, if we haven't seen any SIMD types or operations in
diff --git a/src/jit/compiler.hpp b/src/jit/compiler.hpp
index 901a58ec9e35..0b30114f6768 100644
--- a/src/jit/compiler.hpp
+++ b/src/jit/compiler.hpp
@@ -2919,7 +2919,7 @@ inline regNumber genMapFloatRegArgNumToRegNum(unsigned argNum)
 
 __forceinline regNumber genMapRegArgNumToRegNum(unsigned argNum, var_types type)
 {
-    if (varTypeIsFloating(type))
+    if (varTypeUsesFloatArgReg(type))
     {
         return genMapFloatRegArgNumToRegNum(argNum);
     }
@@ -2957,7 +2957,7 @@ inline regMaskTP genMapFloatRegArgNumToRegMask(unsigned argNum)
 __forceinline regMaskTP genMapArgNumToRegMask(unsigned argNum, var_types type)
 {
     regMaskTP result;
-    if (varTypeIsFloating(type))
+    if (varTypeUsesFloatArgReg(type))
     {
         result = genMapFloatRegArgNumToRegMask(argNum);
 #ifdef _TARGET_ARM_
@@ -3076,7 +3076,7 @@ inline unsigned genMapFloatRegNumToRegArgNum(regNumber regNum)
 
 inline unsigned genMapRegNumToRegArgNum(regNumber regNum, var_types type)
 {
-    if (varTypeIsFloating(type))
+    if (varTypeUsesFloatArgReg(type))
     {
         return genMapFloatRegNumToRegArgNum(regNum);
     }
diff --git a/src/jit/dll/jit.nativeproj b/src/jit/dll/jit.nativeproj
index 7b8dacb78c52..56e94be9cac8 100644
--- a/src/jit/dll/jit.nativeproj
+++ b/src/jit/dll/jit.nativeproj
@@ -31,7 +31,7 @@
 
     <LinkModuleDefinitionFile>$(OutputName).def</LinkModuleDefinitionFile>
 
-    <ClDefines Condition="'$(BuildArchitecture)' == 'amd64'">$(ClDefines);FEATURE_SIMD</ClDefines>
+    <ClDefines Condition="'$(BuildArchitecture)' == 'amd64' or '$(BuildArchitecture)' == 'arm64'">$(ClDefines);FEATURE_SIMD</ClDefines>
 
     <Win32DllLibs>$(SdkLibPath)\kernel32.lib;$(SdkLibPath)\user32.lib;$(SdkLibPath)\advapi32.lib;$(SdkLibPath)\oleaut32.lib;$(SdkLibPath)\uuid.lib</Win32DllLibs>
     <Win32DllLibs>$(Win32DllLibs);$(ClrLibPath)\utilcode.lib</Win32DllLibs>
diff --git a/src/jit/flowgraph.cpp b/src/jit/flowgraph.cpp
index b24ea0e42256..236302cca5ae 100644
--- a/src/jit/flowgraph.cpp
+++ b/src/jit/flowgraph.cpp
@@ -23273,7 +23273,7 @@ GenTreeStmt* Compiler::fgInlinePrependStatements(InlineInfo* inlineInfo)
                     if (varTypeIsStruct(argType))
                     {
                         structHnd = gtGetStructHandleIfPresent(argNode);
-                        noway_assert(structHnd != NO_CLASS_HANDLE);
+                        noway_assert((structHnd != NO_CLASS_HANDLE) || (argType != TYP_STRUCT));
                     }
 
                     // Unsafe value cls check is not needed for
diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp
index be064c285afa..9e58db0df4fe 100644
--- a/src/jit/gentree.cpp
+++ b/src/jit/gentree.cpp
@@ -17682,7 +17682,8 @@ GenTreeSIMD* Compiler::gtNewSIMDNode(
     assert(op1 != nullptr);
     SetOpLclRelatedToSIMDIntrinsic(op1);
 
-    return new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
+    GenTreeSIMD* simdNode = new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
+    return simdNode;
 }
 
 GenTreeSIMD* Compiler::gtNewSIMDNode(
@@ -17692,7 +17693,8 @@ GenTreeSIMD* Compiler::gtNewSIMDNode(
     SetOpLclRelatedToSIMDIntrinsic(op1);
     SetOpLclRelatedToSIMDIntrinsic(op2);
 
-    return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
+    GenTreeSIMD* simdNode = new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
+    return simdNode;
 }
 
 //-------------------------------------------------------------------
@@ -18064,7 +18066,7 @@ void ReturnTypeDesc::InitializeStructReturnType(Compiler* comp, CORINFO_CLASS_HA
         case Compiler::SPK_PrimitiveType:
         {
             assert(returnType != TYP_UNKNOWN);
-            assert(!varTypeIsStruct(returnType));
+            assert(returnType != TYP_STRUCT);
             m_regType[0] = returnType;
             break;
         }
@@ -18075,7 +18077,7 @@ void ReturnTypeDesc::InitializeStructReturnType(Compiler* comp, CORINFO_CLASS_HA
             var_types hfaType = comp->GetHfaType(retClsHnd);
 
             // We should have an hfa struct type
-            assert(varTypeIsFloating(hfaType));
+            assert(varTypeIsValidHfaType(hfaType));
 
             // Note that the retail build issues a warning about a potential divsion by zero without this Max function
             unsigned elemSize = Max((unsigned)1, EA_SIZE_IN_BYTES(emitActualTypeSize(hfaType)));
diff --git a/src/jit/gentree.h b/src/jit/gentree.h
index 5d45427a395b..b294748b67c8 100644
--- a/src/jit/gentree.h
+++ b/src/jit/gentree.h
@@ -3532,6 +3532,9 @@ struct GenTreeCall final : public GenTree
         return varTypeIsLong(gtType);
 #elif FEATURE_MULTIREG_RET && defined(_TARGET_ARM_)
         return varTypeIsLong(gtType) || (varTypeIsStruct(gtType) && !HasRetBufArg());
+#elif defined(FEATURE_HFA) && defined(_TARGET_ARM64_)
+        // SIMD types are returned in vector regs on ARM64.
+        return (gtType == TYP_STRUCT) && !HasRetBufArg();
 #elif FEATURE_MULTIREG_RET
         return varTypeIsStruct(gtType) && !HasRetBufArg();
 #else
diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp
index 8600bf304cc5..88638092fe90 100644
--- a/src/jit/importer.cpp
+++ b/src/jit/importer.cpp
@@ -1217,7 +1217,7 @@ GenTree* Compiler::impAssignStructPtr(GenTree*             destAddr,
                 // If it is a multi-reg struct return, don't change the oper to GT_LCL_FLD.
                 // That is, the IR will be of the form lclVar = call for multi-reg return
                 //
-                GenTree* lcl = destAddr->gtOp.gtOp1;
+                GenTreeLclVar* lcl = destAddr->gtOp.gtOp1->AsLclVar();
                 if (src->AsCall()->HasMultiRegRetVal())
                 {
                     // Mark the struct LclVar as used in a MultiReg return context
@@ -1227,7 +1227,7 @@ GenTree* Compiler::impAssignStructPtr(GenTree*             destAddr,
                     lcl->gtFlags |= GTF_DONT_CSE;
                     lvaTable[lcl->gtLclVarCommon.gtLclNum].lvIsMultiRegRet = true;
                 }
-                else // The call result is not a multireg return
+                else if (lcl->gtType != src->gtType)
                 {
                     // We change this to a GT_LCL_FLD (from a GT_ADDR of a GT_LCL_VAR)
                     lcl->ChangeOper(GT_LCL_FLD);
@@ -1532,7 +1532,7 @@ var_types Compiler::impNormStructType(CORINFO_CLASS_HANDLE structHnd,
 
 #ifdef FEATURE_SIMD
     // Check to see if this is a SIMD type.
-    if (featureSIMD && !mayContainGCPtrs)
+    if (supportSIMDTypes() && !mayContainGCPtrs)
     {
         unsigned originalSize = info.compCompHnd->getClassSize(structHnd);
 
@@ -9057,7 +9057,7 @@ GenTree* Compiler::impFixupStructReturnType(GenTree* op, CORINFO_CLASS_HANDLE re
     {
         // It is possible that we now have a lclVar of scalar type.
         // If so, don't transform it to GT_LCL_FLD.
-        if (varTypeIsStruct(lvaTable[op->AsLclVar()->gtLclNum].lvType))
+        if (lvaTable[op->AsLclVar()->gtLclNum].lvType != info.compRetNativeType)
         {
             op->ChangeOper(GT_LCL_FLD);
         }
@@ -18983,7 +18983,7 @@ void Compiler::impInlineInitVars(InlineInfo* pInlineInfo)
         if ((!foundSIMDType || (type == TYP_STRUCT)) && isSIMDorHWSIMDClass(&(lclVarInfo[i + argCnt].lclVerTypeInfo)))
         {
             foundSIMDType = true;
-            if (featureSIMD && type == TYP_STRUCT)
+            if (supportSIMDTypes() && type == TYP_STRUCT)
             {
                 var_types structType = impNormStructType(lclVarInfo[i + argCnt].lclVerTypeInfo.GetClassHandle());
                 lclVarInfo[i + argCnt].lclTypeInfo = structType;
diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp
index 6f34e24f32f7..d7d0f1113451 100644
--- a/src/jit/lclvars.cpp
+++ b/src/jit/lclvars.cpp
@@ -124,7 +124,7 @@ void Compiler::lvaInitTypeRef()
     info.compILargsCount = info.compArgsCount;
 
 #ifdef FEATURE_SIMD
-    if (featureSIMD && (info.compRetNativeType == TYP_STRUCT))
+    if (supportSIMDTypes() && (info.compRetNativeType == TYP_STRUCT))
     {
         var_types structType = impNormStructType(info.compMethodInfo->args.retTypeClass);
         info.compRetType     = structType;
@@ -149,7 +149,7 @@ void Compiler::lvaInitTypeRef()
         if ((howToReturnStruct == SPK_PrimitiveType) || (howToReturnStruct == SPK_EnclosingType))
         {
             assert(returnType != TYP_UNKNOWN);
-            assert(!varTypeIsStruct(returnType));
+            assert(returnType != TYP_STRUCT);
 
             info.compRetNativeType = returnType;
 
@@ -397,7 +397,7 @@ void Compiler::lvaInitThisPtr(InitVarDscInfo* varDscInfo)
         {
             varDsc->lvType = TYP_BYREF;
 #ifdef FEATURE_SIMD
-            if (featureSIMD)
+            if (supportSIMDTypes())
             {
                 var_types simdBaseType = TYP_UNKNOWN;
                 var_types type         = impNormStructType(info.compClassHnd, nullptr, nullptr, &simdBaseType);
@@ -505,7 +505,7 @@ void Compiler::lvaInitRetBuffArg(InitVarDscInfo* varDscInfo)
             }
         }
 #ifdef FEATURE_SIMD
-        else if (featureSIMD && varTypeIsSIMD(info.compRetType))
+        else if (supportSIMDTypes() && varTypeIsSIMD(info.compRetType))
         {
             varDsc->lvSIMDType = true;
             varDsc->lvBaseType =
@@ -598,8 +598,9 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
             // If the argType is a struct, then check if it is an HFA
             if (varTypeIsStruct(argType))
             {
-                hfaType  = GetHfaType(typeHnd); // set to float or double if it is an HFA, otherwise TYP_UNDEF
-                isHfaArg = varTypeIsFloating(hfaType);
+                // hfaType is set to float, double or SIMD type if it is an HFA, otherwise TYP_UNDEF.
+                hfaType  = GetHfaType(typeHnd);
+                isHfaArg = varTypeIsValidHfaType(hfaType);
             }
         }
         else if (info.compIsVarArgs)
@@ -616,11 +617,12 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
 
         if (isHfaArg)
         {
-            // We have an HFA argument, so from here on out treat the type as a float or double.
+            // We have an HFA argument, so from here on out treat the type as a float, double or vector.
             // The orginal struct type is available by using origArgType
             // We also update the cSlots to be the number of float/double fields in the HFA
             argType = hfaType;
-            cSlots  = varDsc->lvHfaSlots();
+            varDsc->SetHfaType(hfaType);
+            cSlots = varDsc->lvHfaSlots();
         }
         // The number of slots that must be enregistered if we are to consider this argument enregistered.
         // This is normally the same as cSlots, since we normally either enregister the entire object,
@@ -818,18 +820,31 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
             if (isHfaArg)
             {
                 // We need to save the fact that this HFA is enregistered
-                varDsc->lvSetIsHfa();
-                varDsc->lvSetIsHfaRegArg();
-                varDsc->SetHfaType(hfaType);
-                varDsc->lvIsMultiRegArg = (varDsc->lvHfaSlots() > 1);
+                // Note that we can have HVAs of SIMD types even if we are not recognizing intrinsics.
+                // In that case, we won't have normalized the vector types on the varDsc, so if we have a single vector
+                // register, we need to set the type now. Otherwise, later we'll assume this is passed by reference.
+                if (varDsc->lvHfaSlots() != 1)
+                {
+                    varDsc->lvIsMultiRegArg = true;
+                }
             }
 
             varDsc->lvIsRegArg = 1;
 
 #if FEATURE_MULTIREG_ARGS
+#ifdef _TARGET_ARM64_
+            if (argType == TYP_STRUCT)
+            {
+                varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, TYP_I_IMPL);
+                if (cSlots == 2)
+                {
+                    varDsc->lvOtherArgReg          = genMapRegArgNumToRegNum(firstAllocatedRegArgNum + 1, TYP_I_IMPL);
+                    varDscInfo->hasMultiSlotStruct = true;
+                }
+            }
+#elif defined(UNIX_AMD64_ABI)
             if (varTypeIsStruct(argType))
             {
-#if defined(UNIX_AMD64_ABI)
                 varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, firstEightByteType);
 
                 // If there is a second eightbyte, get a register for it too and map the arg to the reg number.
@@ -844,17 +859,13 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
                 {
                     varDsc->lvOtherArgReg = genMapRegArgNumToRegNum(secondAllocatedRegArgNum, secondEightByteType);
                 }
-#else // ARM32 or ARM64
+            }
+#else  // ARM32
+            if (varTypeIsStruct(argType))
+            {
                 varDsc->lvArgReg = genMapRegArgNumToRegNum(firstAllocatedRegArgNum, TYP_I_IMPL);
-#ifdef _TARGET_ARM64_
-                if (cSlots == 2)
-                {
-                    varDsc->lvOtherArgReg          = genMapRegArgNumToRegNum(firstAllocatedRegArgNum + 1, TYP_I_IMPL);
-                    varDscInfo->hasMultiSlotStruct = true;
-                }
-#endif //  _TARGET_ARM64_
-#endif // defined(UNIX_AMD64_ABI)
             }
+#endif // ARM32
             else
 #endif // FEATURE_MULTIREG_ARGS
             {
@@ -879,14 +890,13 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
                     isFloat = varTypeIsFloating(firstEightByteType);
                 }
                 else
-#else
+#endif // !UNIX_AMD64_ABI
                 {
                     isFloat = varTypeIsFloating(argType);
                 }
-#endif // !UNIX_AMD64_ABI
 
 #if defined(UNIX_AMD64_ABI)
-                    if (varTypeIsStruct(argType))
+                if (varTypeIsStruct(argType))
                 {
                     // Print both registers, just to be clear
                     if (firstEightByteType == TYP_UNDEF)
@@ -1270,7 +1280,11 @@ void Compiler::lvaInitVarDsc(LclVarDsc*              varDsc,
         varDsc->lvStructGcCount = 1;
     }
 
-    // Set the lvType (before this point it is TYP_UNDEF).
+// Set the lvType (before this point it is TYP_UNDEF).
+
+#ifdef FEATURE_HFA
+    varDsc->SetHfaType(TYP_UNDEF);
+#endif
     if ((varTypeIsStruct(type)))
     {
         lvaSetStruct(varNum, typeHnd, typeHnd != nullptr, !tiVerificationNeeded);
@@ -2513,10 +2527,9 @@ void Compiler::lvaSetStruct(unsigned varNum, CORINFO_CLASS_HANDLE typeHnd, bool
             if (varDsc->lvExactSize <= MAX_PASS_MULTIREG_BYTES)
             {
                 var_types hfaType = GetHfaType(typeHnd); // set to float or double if it is an HFA, otherwise TYP_UNDEF
-                if (varTypeIsFloating(hfaType))
+                if (varTypeIsValidHfaType(hfaType))
                 {
-                    varDsc->_lvIsHfa = true;
-                    varDsc->lvSetHfaTypeIsFloat(hfaType == TYP_FLOAT);
+                    varDsc->SetHfaType(hfaType);
 
                     // hfa variables can never contain GC pointers
                     assert(varDsc->lvStructGcCount == 0);
@@ -2588,8 +2601,7 @@ void Compiler::lvaSetStructUsedAsVarArg(unsigned varNum)
     LclVarDsc* varDsc = &lvaTable[varNum];
     // For varargs methods incoming and outgoing arguments should not be treated
     // as HFA.
-    varDsc->_lvIsHfa          = false;
-    varDsc->_lvHfaTypeIsFloat = false;
+    varDsc->SetHfaType(TYP_UNDEF);
 #endif // defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
 #endif // FEATURE_HFA
 }
@@ -6913,16 +6925,9 @@ void Compiler::lvaDumpEntry(unsigned lclNum, FrameLayoutState curState, size_t r
         }
     }
 
-    if (varDsc->lvIsHfaRegArg())
+    if (varDsc->lvIsHfa())
     {
-        if (varDsc->lvHfaTypeIsFloat())
-        {
-            printf(" (enregistered HFA: float) ");
-        }
-        else
-        {
-            printf(" (enregistered HFA: double)");
-        }
+        printf(" HFA(%s) ", varTypeName(varDsc->GetHfaType()));
     }
 
     if (varDsc->lvDoNotEnregister)
diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp
index a2f7080a9d40..75c354dc3884 100644
--- a/src/jit/lower.cpp
+++ b/src/jit/lower.cpp
@@ -5691,10 +5691,11 @@ void Lowering::ContainCheckRet(GenTreeOp* ret)
         {
             GenTreeLclVarCommon* lclVarCommon = op1->AsLclVarCommon();
             LclVarDsc*           varDsc       = &(comp->lvaTable[lclVarCommon->gtLclNum]);
-            assert(varDsc->lvIsMultiRegRet);
+            // This must be a multi-reg return or an HFA of a single element.
+            assert(varDsc->lvIsMultiRegRet || (varDsc->lvIsHfa() && varTypeIsValidHfaType(varDsc->lvType)));
 
             // Mark var as contained if not enregistrable.
-            if (!varTypeIsEnregisterableStruct(op1))
+            if (!varTypeIsEnregisterable(op1))
             {
                 MakeSrcContained(ret, op1);
             }
diff --git a/src/jit/lsra.cpp b/src/jit/lsra.cpp
index 8a5323ced2bd..adc418c2a0b2 100644
--- a/src/jit/lsra.cpp
+++ b/src/jit/lsra.cpp
@@ -1415,7 +1415,7 @@ bool LinearScan::isRegCandidate(LclVarDsc* varDsc)
     // or enregistered, on x86 -- it is believed that we can enregister pinned (more properly, "pinning")
     // references when using the general GC encoding.
     unsigned lclNum = (unsigned)(varDsc - compiler->lvaTable);
-    if (varDsc->lvAddrExposed || !varTypeIsEnregisterableStruct(varDsc))
+    if (varDsc->lvAddrExposed || !varTypeIsEnregisterable(varDsc))
     {
 #ifdef DEBUG
         Compiler::DoNotEnregisterReason dner = Compiler::DNER_AddrExposed;
diff --git a/src/jit/lsraarmarch.cpp b/src/jit/lsraarmarch.cpp
index 251bf53ab267..9a54c3227978 100644
--- a/src/jit/lsraarmarch.cpp
+++ b/src/jit/lsraarmarch.cpp
@@ -208,7 +208,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
         assert(retTypeDesc != nullptr);
         dstCandidates = retTypeDesc->GetABIReturnRegs();
     }
-    else if (varTypeIsFloating(registerType))
+    else if (varTypeUsesFloatArgReg(registerType))
     {
         dstCandidates = RBM_FLOATRET;
     }
diff --git a/src/jit/lsrabuild.cpp b/src/jit/lsrabuild.cpp
index da1fa8f7d37b..1b0f61ddcca2 100644
--- a/src/jit/lsrabuild.cpp
+++ b/src/jit/lsrabuild.cpp
@@ -1841,15 +1841,7 @@ void LinearScan::updateRegStateForArg(LclVarDsc* argDsc)
     {
         RegState* intRegState   = &compiler->codeGen->intRegState;
         RegState* floatRegState = &compiler->codeGen->floatRegState;
-        // In the case of AMD64 we'll still use the floating point registers
-        // to model the register usage for argument on vararg calls, so
-        // we will ignore the varargs condition to determine whether we use
-        // XMM registers or not for setting up the call.
-        bool isFloat = (isFloatRegType(argDsc->lvType)
-#ifndef _TARGET_AMD64_
-                        && !compiler->info.compIsVarArgs
-#endif
-                        && !compiler->opts.compUseSoftFP);
+        bool      isFloat       = emitter::isFloatReg(argDsc->lvArgReg);
 
         if (argDsc->lvIsHfaRegArg())
         {
@@ -3070,6 +3062,15 @@ int LinearScan::BuildReturn(GenTree* tree)
         regMaskTP useCandidates = RBM_NONE;
 
 #if FEATURE_MULTIREG_RET
+#ifdef _TARGET_ARM64_
+        if (varTypeIsSIMD(tree))
+        {
+            useCandidates = allSIMDRegs();
+            BuildUse(op1, useCandidates);
+            return 1;
+        }
+#endif // !_TARGET_ARM64_
+
         if (varTypeIsStruct(tree))
         {
             // op1 has to be either an lclvar or a multi-reg returning call
@@ -3209,7 +3210,7 @@ int LinearScan::BuildPutArgReg(GenTreeUnOp* node)
         GenTreeObj* obj  = op1->AsObj();
         GenTree*    addr = obj->Addr();
         unsigned    size = obj->gtBlkSize;
-        assert(size <= TARGET_POINTER_SIZE);
+        assert(size <= MAX_PASS_SINGLEREG_BYTES);
         if (addr->OperIsLocalAddr())
         {
             // We don't need a source register.
diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index 50143d3c7fb2..95be6dcec903 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -828,6 +828,7 @@ void fgArgTabEntry::Dump()
 {
     printf("fgArgTabEntry[arg %u", argNum);
     printf(" %d.%s", node->gtTreeID, GenTree::OpName(node->gtOper));
+    printf(" %s", varTypeName(argType));
     if (regNum != REG_STK)
     {
         printf(", %u reg%s:", numRegs, numRegs == 1 ? "" : "s");
@@ -867,7 +868,7 @@ void fgArgTabEntry::Dump()
     }
     if (isHfaRegArg)
     {
-        printf(", isHfa");
+        printf(", isHfa(%s)", varTypeName(GetHfaType()));
     }
     if (isBackFilled)
     {
@@ -1140,6 +1141,7 @@ fgArgTabEntry* fgArgInfo::AddRegArg(unsigned  argNum,
 
     curArgTabEntry->argNum     = argNum;
     curArgTabEntry->node       = node;
+    curArgTabEntry->argType    = node->TypeGet();
     curArgTabEntry->parent     = parent;
     curArgTabEntry->slotNum    = 0;
     curArgTabEntry->numRegs    = numRegs;
@@ -1153,7 +1155,7 @@ fgArgTabEntry* fgArgInfo::AddRegArg(unsigned  argNum,
     curArgTabEntry->needPlace  = false;
     curArgTabEntry->processed  = false;
 #ifdef FEATURE_HFA
-    curArgTabEntry->_isHfaArg = false;
+    curArgTabEntry->_hfaElemKind = HFA_ELEM_NONE;
 #endif
     curArgTabEntry->isBackFilled  = false;
     curArgTabEntry->isNonStandard = false;
@@ -1213,6 +1215,7 @@ fgArgTabEntry* fgArgInfo::AddStkArg(unsigned argNum,
     curArgTabEntry->setRegNum(0, REG_STK);
     curArgTabEntry->argNum     = argNum;
     curArgTabEntry->node       = node;
+    curArgTabEntry->argType    = node->TypeGet();
     curArgTabEntry->parent     = parent;
     curArgTabEntry->slotNum    = nextSlotNum;
     curArgTabEntry->numRegs    = 0;
@@ -1226,7 +1229,7 @@ fgArgTabEntry* fgArgInfo::AddStkArg(unsigned argNum,
     curArgTabEntry->needPlace  = false;
     curArgTabEntry->processed  = false;
 #ifdef FEATURE_HFA
-    curArgTabEntry->_isHfaArg = false;
+    curArgTabEntry->_hfaElemKind = HFA_ELEM_NONE;
 #endif
     curArgTabEntry->isBackFilled  = false;
     curArgTabEntry->isNonStandard = false;
@@ -2300,12 +2303,16 @@ void fgArgInfo::EvalArgsToTemps()
                     {
                         setupArg = compiler->fgMorphCopyBlock(setupArg);
 #if defined(_TARGET_ARMARCH_) || defined(UNIX_AMD64_ABI)
-                        // This scalar LclVar widening step is only performed for ARM and AMD64 unix.
-                        //
-                        CORINFO_CLASS_HANDLE clsHnd     = compiler->lvaGetStruct(tmpVarNum);
-                        unsigned             structSize = varDsc->lvExactSize;
+                        if (lclVarType == TYP_STRUCT)
+                        {
+                            // This scalar LclVar widening step is only performed for ARM architectures.
+                            //
+                            CORINFO_CLASS_HANDLE clsHnd     = compiler->lvaGetStruct(tmpVarNum);
+                            unsigned             structSize = varDsc->lvExactSize;
 
-                        scalarType = compiler->getPrimitiveTypeForStruct(structSize, clsHnd, curArgTabEntry->isVararg);
+                            scalarType =
+                                compiler->getPrimitiveTypeForStruct(structSize, clsHnd, curArgTabEntry->isVararg);
+                        }
 #endif // _TARGET_ARMARCH_ || defined (UNIX_AMD64_ABI)
                     }
 
@@ -2391,7 +2398,7 @@ void fgArgInfo::EvalArgsToTemps()
 
 #else // !defined(_TARGET_AMD64_) || defined(UNIX_AMD64_ABI)
 
-            if (varTypeIsStruct(defArg))
+            if (defArg->TypeGet() == TYP_STRUCT)
             {
                 clsHnd = compiler->gtGetStructHandleIfPresent(defArg);
                 noway_assert(clsHnd != NO_CLASS_HANDLE);
@@ -3079,7 +3086,7 @@ void Compiler::fgInitArgInfo(GenTreeCall* call)
 
 #ifdef FEATURE_HFA
         hfaType  = GetHfaType(argx);
-        isHfaArg = varTypeIsFloating(hfaType);
+        isHfaArg = varTypeIsValidHfaType(hfaType);
 
 #if defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
         // Make sure for vararg methods isHfaArg is not true.
@@ -3628,7 +3635,7 @@ void Compiler::fgInitArgInfo(GenTreeCall* call)
 #ifdef FEATURE_HFA
         if (isHfaArg)
         {
-            newArgEntry->setHfaType(hfaType, hfaSlots);
+            newArgEntry->SetHfaType(hfaType, hfaSlots);
         }
 #endif // FEATURE_HFA
         newArgEntry->SetMultiRegNums();
@@ -3872,7 +3879,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                 {
                     if (isPow2(passingSize))
                     {
-                        canTransform = true;
+                        canTransform = (!argEntry->isHfaArg || (passingSize == genTypeSize(argEntry->GetHfaType())));
                     }
 
 #if defined(_TARGET_ARM64_) || defined(UNIX_AMD64_ABI)
@@ -3957,15 +3964,16 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                 }
                 else
                 {
-                    // We have a struct argument that's less than pointer size, and it is either a power of 2,
+                    // We have a struct argument that fits into a register, and it is either a power of 2,
                     // or a local.
-                    // Change our GT_OBJ into a GT_IND of the correct type.
+                    // Change our argument, as needed, into a value of the appropriate type.
                     CLANG_FORMAT_COMMENT_ANCHOR;
 
 #ifdef _TARGET_ARM_
                     assert((size == 1) || ((structBaseType == TYP_DOUBLE) && (size == 2)));
 #else
-                    assert(size == 1);
+                    assert((size == 1) ||
+                           (varTypeIsSIMD(structBaseType) && size == (genTypeSize(structBaseType) / REGSIZE_BYTES)));
 #endif
 
                     assert((structBaseType != TYP_STRUCT) && (genTypeSize(structBaseType) >= originalSize));
@@ -4012,7 +4020,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                                     // we will use the first and only promoted field
                                     argObj->gtLclVarCommon.SetLclNum(varDsc->lvFieldLclStart);
 
-                                    if (varTypeCanReg(fieldVarDsc->TypeGet()) &&
+                                    if (varTypeIsEnregisterable(fieldVarDsc->TypeGet()) &&
                                         (genTypeSize(fieldVarDsc->TypeGet()) == originalSize))
                                     {
                                         // Just use the existing field's type
@@ -4025,7 +4033,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                                         argObj->ChangeOper(GT_LCL_FLD);
                                         argObj->gtType = structBaseType;
                                     }
-                                    assert(varTypeCanReg(argObj->TypeGet()));
+                                    assert(varTypeIsEnregisterable(argObj->TypeGet()));
                                     assert(copyBlkClass == NO_CLASS_HANDLE);
                                 }
                                 else
@@ -4043,7 +4051,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                                 copyBlkClass = objClass;
                             }
                         }
-                        else if (!varTypeIsIntegralOrI(varDsc->TypeGet()))
+                        else if (genActualType(varDsc->TypeGet()) != structBaseType)
                         {
                             // Not a promoted struct, so just swizzle the type by using GT_LCL_FLD
                             argObj->ChangeOper(GT_LCL_FLD);
@@ -4055,44 +4063,41 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
                         // Not a GT_LCL_VAR, so we can just change the type on the node
                         argObj->gtType = structBaseType;
                     }
-                    assert(varTypeCanReg(argObj->TypeGet()) ||
-                           ((copyBlkClass != NO_CLASS_HANDLE) && varTypeCanReg(structBaseType)));
-
-                    size = 1;
+                    assert(varTypeIsEnregisterable(argObj->TypeGet()) ||
+                           ((copyBlkClass != NO_CLASS_HANDLE) && varTypeIsEnregisterable(structBaseType)));
                 }
 #endif // !_TARGET_X86_
 
 #ifndef UNIX_AMD64_ABI
                 // We still have a struct unless we converted the GT_OBJ into a GT_IND above...
-                if (varTypeIsStruct(structBaseType) && !argEntry->passedByRef)
+                if (isHfaArg && passUsingFloatRegs)
                 {
-                    if (isHfaArg && passUsingFloatRegs)
-                    {
-                        size = argEntry->numRegs;
-                    }
-                    else
-                    {
-                        // If the valuetype size is not a multiple of TARGET_POINTER_SIZE,
-                        // we must copyblk to a temp before doing the obj to avoid
-                        // the obj reading memory past the end of the valuetype
-                        CLANG_FORMAT_COMMENT_ANCHOR;
+                    size = argEntry->numRegs;
+                }
+                else if (structBaseType == TYP_STRUCT)
+                {
+                    // If the valuetype size is not a multiple of TARGET_POINTER_SIZE,
+                    // we must copyblk to a temp before doing the obj to avoid
+                    // the obj reading memory past the end of the valuetype
+                    CLANG_FORMAT_COMMENT_ANCHOR;
 
-                        if (roundupSize > originalSize)
-                        {
-                            copyBlkClass = objClass;
+                    if (roundupSize > originalSize)
+                    {
+                        copyBlkClass = objClass;
 
-                            // There are a few special cases where we can omit using a CopyBlk
-                            // where we normally would need to use one.
+                        // There are a few special cases where we can omit using a CopyBlk
+                        // where we normally would need to use one.
 
-                            if (argObj->gtObj.gtOp1->IsLocalAddrExpr() != nullptr) // Is the source a LclVar?
-                            {
-                                copyBlkClass = NO_CLASS_HANDLE;
-                            }
+                        if (argObj->OperIs(GT_OBJ) &&
+                            argObj->AsObj()->gtGetOp1()->IsLocalAddrExpr() != nullptr) // Is the source a LclVar?
+                        {
+                            copyBlkClass = NO_CLASS_HANDLE;
                         }
-
-                        size = roundupSize / TARGET_POINTER_SIZE; // Normalize size to number of pointer sized items
                     }
+
+                    size = roundupSize / TARGET_POINTER_SIZE; // Normalize size to number of pointer sized items
                 }
+
 #endif // !UNIX_AMD64_ABI
             }
         }
@@ -4159,7 +4164,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
 #if FEATURE_MULTIREG_ARGS
         if (isStructArg)
         {
-            if (size > 1 || isHfaArg)
+            if ((argEntry->numRegs + argEntry->numSlots) > 1)
             {
                 hasMultiregStructArgs = true;
             }
@@ -4376,11 +4381,37 @@ void Compiler::fgMorphMultiregStructArgs(GenTreeCall* call)
         }
 
         unsigned size = (fgEntryPtr->numRegs + fgEntryPtr->numSlots);
-        if ((size > 1) || fgEntryPtr->isHfaArg)
+        if (size > 1)
         {
             foundStructArg = true;
             if (varTypeIsStruct(argx) && !argx->OperIs(GT_FIELD_LIST))
             {
+                if (fgEntryPtr->isHfaArg)
+                {
+                    var_types hfaType = fgEntryPtr->hfaType;
+                    unsigned  structSize;
+                    if (argx->OperIs(GT_OBJ))
+                    {
+                        structSize = argx->AsObj()->gtBlkSize;
+                    }
+                    else
+                    {
+                        assert(argx->OperIs(GT_LCL_VAR));
+                        structSize = lvaGetDesc(argx->AsLclVar()->gtLclNum)->lvExactSize;
+                    }
+                    assert(structSize > 0);
+                    if (structSize == genTypeSize(hfaType))
+                    {
+                        if (argx->OperIs(GT_OBJ))
+                        {
+                            fgMorphBlkToInd(argx->AsObj(), hfaType);
+                        }
+                        else
+                        {
+                            argx->gtType = hfaType;
+                        }
+                    }
+                }
                 arg = fgMorphMultiregStructArg(arg, fgEntryPtr);
 
                 // Did we replace 'argx' with a new tree?
@@ -4490,14 +4521,19 @@ GenTree* Compiler::fgMorphMultiregStructArg(GenTree* arg, fgArgTabEntry* fgEntry
 #if FEATURE_MULTIREG_ARGS
     // Examine 'arg' and setup argValue objClass and structSize
     //
-    CORINFO_CLASS_HANDLE objClass   = gtGetStructHandleIfPresent(arg);
-    GenTree*             argValue   = arg; // normally argValue will be arg, but see right below
-    unsigned             structSize = 0;
+    CORINFO_CLASS_HANDLE objClass = gtGetStructHandleIfPresent(arg);
+    noway_assert(objClass != NO_CLASS_HANDLE);
+    GenTree* argValue   = arg; // normally argValue will be arg, but see right below
+    unsigned structSize = 0;
 
-    if (arg->OperGet() == GT_OBJ)
+    if (arg->TypeGet() != TYP_STRUCT)
+    {
+        structSize = genTypeSize(arg->TypeGet());
+        assert(structSize == info.compCompHnd->getClassSize(objClass));
+    }
+    else if (arg->OperGet() == GT_OBJ)
     {
         GenTreeObj* argObj = arg->AsObj();
-        objClass           = argObj->gtClass;
         structSize         = argObj->Size();
         assert(structSize == info.compCompHnd->getClassSize(objClass));
 
@@ -4527,7 +4563,6 @@ GenTree* Compiler::fgMorphMultiregStructArg(GenTree* arg, fgArgTabEntry* fgEntry
     }
     else
     {
-        objClass   = gtGetStructHandleIfPresent(arg);
         structSize = info.compCompHnd->getClassSize(objClass);
     }
     noway_assert(objClass != NO_CLASS_HANDLE);
@@ -4538,8 +4573,8 @@ GenTree* Compiler::fgMorphMultiregStructArg(GenTree* arg, fgArgTabEntry* fgEntry
     unsigned  elemSize                = 0;
     var_types type[MAX_ARG_REG_COUNT] = {}; // TYP_UNDEF = 0
 
-    hfaType = GetHfaType(objClass); // set to float or double if it is an HFA, otherwise TYP_UNDEF
-    if (varTypeIsFloating(hfaType)
+    hfaType = fgEntryPtr->hfaType;
+    if (varTypeIsValidHfaType(hfaType)
 #if !defined(_HOST_UNIX_) && defined(_TARGET_ARM64_)
         && !fgEntryPtr->isVararg
 #endif // !defined(_HOST_UNIX_) && defined(_TARGET_ARM64_)
@@ -4657,8 +4692,13 @@ GenTree* Compiler::fgMorphMultiregStructArg(GenTree* arg, fgArgTabEntry* fgEntry
 #endif // !defined(_HOST_UNIX_) && defined(_TARGET_ARM64_)
             )
         {
-            // We have a HFA struct
-            noway_assert(elemType == (varDsc->lvHfaTypeIsFloat() ? TYP_FLOAT : TYP_DOUBLE));
+            // We have a HFA struct.
+            // Note that GetHfaType may not be the same as elemType, since TYP_SIMD8 is handled the same as TYP_DOUBLE.
+            var_types useElemType = elemType;
+#ifdef _TARGET_ARM64_
+            useElemType = (elemType == TYP_SIMD8) ? TYP_DOUBLE : useElemType;
+#endif // _TARGET_ARM64_
+            noway_assert(useElemType == varDsc->GetHfaType());
             noway_assert(elemSize == genTypeSize(elemType));
             noway_assert(elemCount == (varDsc->lvExactSize / elemSize));
             noway_assert(elemSize * elemCount == varDsc->lvExactSize);
@@ -5291,7 +5331,7 @@ void Compiler::fgFixupStructReturn(GenTree* callNode)
 
 #if FEATURE_MULTIREG_RET
     // Either we don't have a struct now or if struct, then it is a struct returned in regs or in return buffer.
-    assert(!varTypeIsStruct(call) || call->HasMultiRegRetVal() || callHasRetBuffArg);
+    assert((call->gtType != TYP_STRUCT) || call->HasMultiRegRetVal() || callHasRetBuffArg);
 #else // !FEATURE_MULTIREG_RET
     // No more struct returns
     assert(call->TypeGet() != TYP_STRUCT);
@@ -7104,7 +7144,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
 
 #elif defined(_TARGET_ARM64_) // ARM64
                 var_types hfaType  = GetHfaType(argx);
-                bool      isHfaArg = varTypeIsFloating(hfaType);
+                bool      isHfaArg = varTypeIsValidHfaType(hfaType);
                 size_t    size     = 1;
 
                 if (isHfaArg)
@@ -17414,12 +17454,10 @@ void Compiler::fgMorphLocalField(GenTree* tree, GenTree* parent)
             // The field must be an enregisterable type; otherwise it would not be a promoted field.
             // The tree type may not match, e.g. for return types that have been morphed, but both
             // must be enregisterable types.
-            // TODO-Cleanup: varTypeCanReg should presumably return true for SIMD types, but
-            // there may be places where that would violate existing assumptions.
             var_types treeType  = tree->TypeGet();
             var_types fieldType = fldVarDsc->TypeGet();
-            assert((varTypeCanReg(treeType) || varTypeIsSIMD(treeType)) &&
-                   (varTypeCanReg(fieldType) || varTypeIsSIMD(fieldType)));
+            assert((varTypeIsEnregisterable(treeType) || varTypeIsSIMD(treeType)) &&
+                   (varTypeIsEnregisterable(fieldType) || varTypeIsSIMD(fieldType)));
 
             tree->ChangeOper(GT_LCL_VAR);
             assert(tree->gtLclVarCommon.gtLclNum == fieldLclIndex);
@@ -17474,23 +17512,28 @@ void Compiler::fgMarkImplicitByRefArgs()
 
         if (varDsc->lvIsParam && varTypeIsStruct(varDsc))
         {
-            size_t size;
+            size_t size = varDsc->lvExactSize;
+            assert(size == info.compCompHnd->getClassSize(varDsc->lvVerTypeInfo.GetClassHandle()));
 
-            if (varDsc->lvSize() > REGSIZE_BYTES)
+            bool isPassedByReference;
+#if defined(_TARGET_AMD64_)
+            isPassedByReference = (size > REGSIZE_BYTES || (size & (size - 1)) != 0);
+#elif defined(_TARGET_ARM64_)
+            if (size > TARGET_POINTER_SIZE)
             {
-                size = varDsc->lvSize();
+                CORINFO_CLASS_HANDLE clsHnd = varDsc->lvVerTypeInfo.GetClassHandleForValueClass();
+                structPassingKind    howToPassStruct;
+                var_types            type =
+                    getArgTypeForStruct(clsHnd, &howToPassStruct, this->info.compIsVarArgs, varDsc->lvExactSize);
+                isPassedByReference = (howToPassStruct == SPK_ByReference);
             }
             else
             {
-                CORINFO_CLASS_HANDLE typeHnd = varDsc->lvVerTypeInfo.GetClassHandle();
-                size                         = info.compCompHnd->getClassSize(typeHnd);
+                isPassedByReference = false;
             }
-
-#if defined(_TARGET_AMD64_)
-            if (size > REGSIZE_BYTES || (size & (size - 1)) != 0)
-#elif defined(_TARGET_ARM64_)
-            if ((size > TARGET_POINTER_SIZE) && !lvaIsMultiregStruct(varDsc, this->info.compIsVarArgs))
 #endif
+
+            if (isPassedByReference)
             {
                 // Previously nobody was ever setting lvIsParam and lvIsTemp on the same local
                 // So I am now using it to indicate that this is one of the weird implicit
@@ -17636,8 +17679,7 @@ void Compiler::fgRetypeImplicitByRefArgs()
                     // the parameter which is really a pointer to the struct.
                     fieldVarDsc->lvIsRegArg      = false;
                     fieldVarDsc->lvIsMultiRegArg = false;
-                    fieldVarDsc->lvSetIsHfaRegArg(false);
-                    fieldVarDsc->lvArgReg = REG_NA;
+                    fieldVarDsc->lvArgReg        = REG_NA;
 #if FEATURE_MULTIREG_ARGS
                     fieldVarDsc->lvOtherArgReg = REG_NA;
 #endif
diff --git a/src/jit/register_arg_convention.h b/src/jit/register_arg_convention.h
index 28f29b7c13c3..ad20b4a0f543 100644
--- a/src/jit/register_arg_convention.h
+++ b/src/jit/register_arg_convention.h
@@ -58,7 +58,7 @@ struct InitVarDscInfo
     // return ref to current register arg for this type
     unsigned& regArgNum(var_types type)
     {
-        return varTypeIsFloating(type) ? floatRegArgNum : intRegArgNum;
+        return varTypeUsesFloatArgReg(type) ? floatRegArgNum : intRegArgNum;
     }
 
     // Allocate a set of contiguous argument registers. "type" is either an integer
@@ -110,7 +110,7 @@ struct InitVarDscInfo
     // return max register arg for this type
     unsigned maxRegArgNum(var_types type)
     {
-        return varTypeIsFloating(type) ? maxFloatRegArgNum : maxIntRegArgNum;
+        return varTypeUsesFloatArgReg(type) ? maxFloatRegArgNum : maxIntRegArgNum;
     }
 
     bool enoughAvailRegs(var_types type, unsigned numRegs = 1);
diff --git a/src/jit/simd.cpp b/src/jit/simd.cpp
index b4cecb3e2046..81494b02361c 100644
--- a/src/jit/simd.cpp
+++ b/src/jit/simd.cpp
@@ -121,7 +121,7 @@ int Compiler::getSIMDTypeAlignment(var_types simdType)
 //
 var_types Compiler::getBaseTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeHnd, unsigned* sizeBytes /*= nullptr */)
 {
-    assert(featureSIMD);
+    assert(supportSIMDTypes());
 
     if (m_simdHandleCache == nullptr)
     {
diff --git a/src/jit/target.h b/src/jit/target.h
index 97df447f4d15..a225d1a4bb1c 100644
--- a/src/jit/target.h
+++ b/src/jit/target.h
@@ -246,6 +246,7 @@ typedef unsigned char   regNumberSmall;
   #define FEATURE_MULTIREG_ARGS_OR_RET  1  // Support for passing and/or returning single values in more than one register
   #define FEATURE_MULTIREG_ARGS         0  // Support for passing a single argument in more than one register  
   #define FEATURE_MULTIREG_RET          1  // Support for returning a single value in more than one register
+  #define MAX_PASS_SINGLEREG_BYTES      8  // Maximum size of a struct passed in a single register (double).
   #define MAX_PASS_MULTIREG_BYTES       0  // No multireg arguments (note this seems wrong as MAX_ARG_REG_COUNT is 2)
   #define MAX_RET_MULTIREG_BYTES        8  // Maximum size of a struct that could be returned in more than one register
 
@@ -540,6 +541,7 @@ typedef unsigned char   regNumberSmall;
   #define FEATURE_FASTTAILCALL     1       // Tail calls made as epilog+jmp
   #define FEATURE_TAILCALL_OPT     1       // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
   #define FEATURE_SET_FLAGS        0       // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
+  #define MAX_PASS_SINGLEREG_BYTES      8  // Maximum size of a struct passed in a single register (double).
 #ifdef    UNIX_AMD64_ABI
   #define FEATURE_MULTIREG_ARGS_OR_RET  1  // Support for passing and/or returning single values in more than one register
   #define FEATURE_MULTIREG_ARGS         1  // Support for passing a single argument in more than one register  
@@ -924,6 +926,7 @@ typedef unsigned char   regNumberSmall;
   #define FEATURE_MULTIREG_ARGS         1  // Support for passing a single argument in more than one register (including passing HFAs)
   #define FEATURE_MULTIREG_RET          1  // Support for returning a single value in more than one register (including HFA returns)
   #define FEATURE_STRUCT_CLASSIFIER     0  // Uses a classifier function to determine is structs are passed/returned in more than one register
+  #define MAX_PASS_SINGLEREG_BYTES      8  // Maximum size of a struct passed in a single register (double).
   #define MAX_PASS_MULTIREG_BYTES      32  // Maximum size of a struct that could be passed in more than one register (Max is an HFA of 4 doubles)
   #define MAX_RET_MULTIREG_BYTES       32  // Maximum size of a struct that could be returned in more than one register (Max is an HFA of 4 doubles)
   #define MAX_ARG_REG_COUNT             4  // Maximum registers used to pass a single argument in multiple registers. (max is 4 floats or doubles using an HFA)
@@ -1231,9 +1234,10 @@ typedef unsigned char   regNumberSmall;
   #define FEATURE_MULTIREG_ARGS         1  // Support for passing a single argument in more than one register  
   #define FEATURE_MULTIREG_RET          1  // Support for returning a single value in more than one register  
   #define FEATURE_STRUCT_CLASSIFIER     0  // Uses a classifier function to determine is structs are passed/returned in more than one register
-  #define MAX_PASS_MULTIREG_BYTES      32  // Maximum size of a struct that could be passed in more than one register (max is 4 doubles using an HFA)
-  #define MAX_RET_MULTIREG_BYTES       32  // Maximum size of a struct that could be returned in more than one register (Max is an HFA of 4 doubles)
-  #define MAX_ARG_REG_COUNT             4  // Maximum registers used to pass a single argument in multiple registers. (max is 4 floats or doubles using an HFA)
+  #define MAX_PASS_SINGLEREG_BYTES     16  // Maximum size of a struct passed in a single register (16-byte vector).
+  #define MAX_PASS_MULTIREG_BYTES      64  // Maximum size of a struct that could be passed in more than one register (max is 4 16-byte vectors using an HVA)
+  #define MAX_RET_MULTIREG_BYTES       64  // Maximum size of a struct that could be returned in more than one register (Max is an HVA of 4 16-byte vectors)
+  #define MAX_ARG_REG_COUNT             4  // Maximum registers used to pass a single argument in multiple registers. (max is 4 128-bit vectors using an HVA)
   #define MAX_RET_REG_COUNT             4  // Maximum registers used to return a value.
 
   #define NOGC_WRITE_BARRIERS      1       // We have specialized WriteBarrier JIT Helpers that DO-NOT trash the RBM_CALLEE_TRASH registers
@@ -1955,10 +1959,10 @@ inline regNumber regNextOfType(regNumber reg, var_types type)
  *  Type checks
  */
 
-inline bool isFloatRegType(int /* s/b "var_types" */ type)
+inline bool isFloatRegType(var_types type)
 {
 #if CPU_HAS_FP_SUPPORT
-    return type == TYP_DOUBLE || type == TYP_FLOAT;
+    return varTypeUsesFloatReg(type);
 #else
     return false;
 #endif
diff --git a/src/jit/vartype.h b/src/jit/vartype.h
index 04793ea86830..83824ac13576 100644
--- a/src/jit/vartype.h
+++ b/src/jit/vartype.h
@@ -174,9 +174,9 @@ inline bool varTypeIsI(T vt)
 }
 
 template <class T>
-inline bool varTypeCanReg(T vt)
+inline bool varTypeIsEnregisterable(T vt)
 {
-    return ((varTypeClassification[TypeGet(vt)] & (VTF_INT | VTF_I | VTF_FLT)) != 0);
+    return (TypeGet(vt) != TYP_STRUCT);
 }
 
 template <class T>
@@ -271,9 +271,56 @@ inline bool varTypeIsStruct(T vt)
 }
 
 template <class T>
-inline bool varTypeIsEnregisterableStruct(T vt)
+inline bool varTypeUsesFloatReg(T vt)
 {
-    return (TypeGet(vt) != TYP_STRUCT);
+    // Note that not all targets support SIMD, but if they don't, varTypeIsSIMD will
+    // always return false.
+    return varTypeIsFloating(vt) || varTypeIsSIMD(vt);
+}
+
+template <class T>
+inline bool varTypeUsesFloatArgReg(T vt)
+{
+#ifdef _TARGET_ARM64_
+    // Arm64 passes SIMD types in floating point registers.
+    return varTypeUsesFloatReg(vt);
+#else
+    // Other targets pass them as regular structs - by reference or by value.
+    return varTypeIsFloating(vt);
+#endif
+}
+
+//------------------------------------------------------------------------
+// varTypeIsValidHfaType: Determine if the type is a valid HFA type
+//
+// Arguments:
+//    vt - the type of interest
+//
+// Return Value:
+//    Returns true iff the type is a valid HFA type.
+//
+// Notes:
+//    This should only be called with the return value from GetHfaType().
+//    The only valid values are TYP_UNDEF, for which this returns false,
+//    TYP_FLOAT, TYP_DOUBLE, or (ARM64-only) TYP_SIMD*.
+//
+template <class T>
+inline bool varTypeIsValidHfaType(T vt)
+{
+#ifdef FEATURE_HFA
+    bool isValid = (TypeGet(vt) != TYP_UNDEF);
+    if (isValid)
+    {
+#ifdef _TARGET_ARM64_
+        assert(varTypeUsesFloatReg(vt));
+#else  // !_TARGET_ARM64_
+        assert(varTypeIsFloating(vt));
+#endif // !_TARGET_ARM64_
+    }
+    return isValid;
+#else  // !FEATURE_HFA
+    return false;
+#endif // !FEATURE_HFA
 }
 
 /*****************************************************************************/
diff --git a/src/vm/argdestination.h b/src/vm/argdestination.h
index 386ba57c821f..8ddd7b210412 100644
--- a/src/vm/argdestination.h
+++ b/src/vm/argdestination.h
@@ -60,22 +60,24 @@ class ArgDestination
     //  fieldBytes - size of the structure
     void CopyHFAStructToRegister(void *src, int fieldBytes)
     {
-        // We are either copying either a float or double HFA and need to
+        // We are copying a float, double or vector HFA/HVA and need to
         // enregister each field.
 
         int floatRegCount = m_argLocDescForStructInRegs->m_cFloatReg;
-        bool typeFloat = m_argLocDescForStructInRegs->m_isSinglePrecision;
+        int hfaFieldSize = m_argLocDescForStructInRegs->m_hfaFieldSize;
         UINT64* dest = (UINT64*) this->GetDestinationAddress();
 
         for (int i = 0; i < floatRegCount; ++i) 
         {
             // Copy 4 or 8 bytes from src.
-            UINT64 val = typeFloat ? *((UINT32*)src + i) : *((UINT64*)src + i);
+            UINT64 val = (hfaFieldSize == 4) ? *((UINT32*)src) : *((UINT64*)src);
             // Always store 8 bytes
             *(dest++) = val;
-            // For now, always zero the next 8 bytes.
-            // (When HVAs are supported we will get the next 8 bytes from src.)
-            *(dest++) = 0;
+            // Either zero the next 8 bytes or get the next 8 bytes from src for 16-byte vector.
+            *(dest++) = (hfaFieldSize == 16) ? *((UINT64*)src + 1) : 0;
+
+            // Increment src by the appropriate amount.
+            src = (void*)((char*)src + hfaFieldSize);
         }
     }
 
diff --git a/src/vm/arm64/CallDescrWorkerARM64.asm b/src/vm/arm64/CallDescrWorkerARM64.asm
index fe277ceb6282..9f2ec2461159 100644
--- a/src/vm/arm64/CallDescrWorkerARM64.asm
+++ b/src/vm/arm64/CallDescrWorkerARM64.asm
@@ -93,7 +93,7 @@ LNoFloatingPoint
         bne     LNoDoubleReturn
 
 LFloatReturn
-        str     d0, [x19, #(CallDescrData__returnValue + 0)]
+        str     q0, [x19, #(CallDescrData__returnValue + 0)]
         b       LReturnDone
 
 LNoDoubleReturn
@@ -117,6 +117,16 @@ LNoFloatHFAReturn
 
 LNoDoubleHFAReturn
 
+        ;;VectorHFAReturn  return case
+        cmp     w3, #64
+        bne     LNoVectorHFAReturn
+
+        stp     q0, q1, [x19, #(CallDescrData__returnValue + 0)]
+        stp     q2, q3, [x19, #(CallDescrData__returnValue + 0x20)]
+        b       LReturnDone
+
+LNoVectorHFAReturn
+
         EMIT_BREAKPOINT ; Unreachable
 
 LIntReturn
diff --git a/src/vm/arm64/asmconstants.h b/src/vm/arm64/asmconstants.h
index caffa809eb50..8c99ed841967 100644
--- a/src/vm/arm64/asmconstants.h
+++ b/src/vm/arm64/asmconstants.h
@@ -61,7 +61,7 @@ ASMCONSTANTS_C_ASSERT(SIZEOF__FloatArgumentRegisters == sizeof(FloatArgumentRegi
 #define CallDescrData__fpReturnSize             0x20
 #define CallDescrData__pTarget                  0x28
 #define CallDescrData__pRetBuffArg              0x30
-#define CallDescrData__returnValue              0x38
+#define CallDescrData__returnValue              0x40
 
 ASMCONSTANTS_C_ASSERT(CallDescrData__pSrc                 == offsetof(CallDescrData, pSrc))
 ASMCONSTANTS_C_ASSERT(CallDescrData__numStackSlots        == offsetof(CallDescrData, numStackSlots))
diff --git a/src/vm/arm64/asmhelpers.asm b/src/vm/arm64/asmhelpers.asm
index 5883597aaa37..71e53d3d09e8 100644
--- a/src/vm/arm64/asmhelpers.asm
+++ b/src/vm/arm64/asmhelpers.asm
@@ -705,8 +705,9 @@ NoFloatingPointRetVal
 
         ; x0 = fpRetSize
 
-        ; return value is stored before float argument registers
-        add         x1, sp, #(__PWTB_FloatArgumentRegisters - 0x20)
+        ; The return value is stored before float argument registers
+        ; The maximum size of a return value is 0x40 (HVA of 4x16)
+        add         x1, sp, #(__PWTB_FloatArgumentRegisters - 0x40)
         bl          setStubReturnValue
 
         EPILOG_WITH_TRANSITION_BLOCK_RETURN
diff --git a/src/vm/arm64/calldescrworkerarm64.S b/src/vm/arm64/calldescrworkerarm64.S
index f987d402ddee..8e8084ba3496 100644
--- a/src/vm/arm64/calldescrworkerarm64.S
+++ b/src/vm/arm64/calldescrworkerarm64.S
@@ -85,7 +85,7 @@ LOCAL_LABEL(NoFloatingPoint):
     bne     LOCAL_LABEL(NoDoubleReturn)
 
 LOCAL_LABEL(FloatReturn):
-    str     d0, [x19, #(CallDescrData__returnValue + 0)]
+    str     q0, [x19, #(CallDescrData__returnValue + 0)]
     b       LOCAL_LABEL(ReturnDone)
 
 LOCAL_LABEL(NoDoubleReturn):
@@ -97,6 +97,7 @@ LOCAL_LABEL(NoDoubleReturn):
     stp     s0, s1, [x19, #(CallDescrData__returnValue + 0)]
     stp     s2, s3, [x19, #(CallDescrData__returnValue + 0x08)]
     b       LOCAL_LABEL(ReturnDone)
+
 LOCAL_LABEL(NoFloatHFAReturn):
 
     //DoubleHFAReturn  return case
@@ -109,6 +110,16 @@ LOCAL_LABEL(NoFloatHFAReturn):
 
 LOCAL_LABEL(NoDoubleHFAReturn):
 
+    //VectorHFAReturn  return case
+    cmp     w3, #64
+    bne     LOCAL_LABEL(LNoVectorHFAReturn)
+
+    stp     q0, q1, [x19, #(CallDescrData__returnValue + 0)]
+    stp     q2, q3, [x19, #(CallDescrData__returnValue + 0x20)]
+    b       LOCAL_LABEL(ReturnDone)
+
+LOCAL_LABEL(LNoVectorHFAReturn):
+
     EMIT_BREAKPOINT // Unreachable
 
 LOCAL_LABEL(IntReturn):
diff --git a/src/vm/arm64/cgencpu.h b/src/vm/arm64/cgencpu.h
index fd1fbafe96da..a2cac4eb7c20 100644
--- a/src/vm/arm64/cgencpu.h
+++ b/src/vm/arm64/cgencpu.h
@@ -51,7 +51,7 @@ extern PCODE GetPreStubEntryPoint();
 #define CACHE_LINE_SIZE                         64
 #define LOG2SLOT                                LOG2_PTRSIZE
 
-#define ENREGISTERED_RETURNTYPE_MAXSIZE         32  // bytes (four FP registers: d0,d1,d2 and d3)
+#define ENREGISTERED_RETURNTYPE_MAXSIZE         64  // bytes (four vector registers: q0,q1,q2 and q3)
 #define ENREGISTERED_RETURNTYPE_INTEGER_MAXSIZE 16  // bytes (two int registers: x0 and x1)
 #define ENREGISTERED_PARAMTYPE_MAXSIZE          16  // bytes (max value type size that can be passed by value)
 
diff --git a/src/vm/callhelpers.h b/src/vm/callhelpers.h
index db9cfad6cb1e..d04412b68e17 100644
--- a/src/vm/callhelpers.h
+++ b/src/vm/callhelpers.h
@@ -39,8 +39,13 @@ struct CallDescrData
     // Return value
     //
 #ifdef ENREGISTERED_RETURNTYPE_MAXSIZE
+#ifdef _TARGET_ARM64_
+    // Use NEON128 to ensure proper alignment for vectors.
+    __declspec(align(16)) NEON128 returnValue[ENREGISTERED_RETURNTYPE_MAXSIZE / sizeof(NEON128)];
+#else
     // Use UINT64 to ensure proper alignment
     UINT64 returnValue[ENREGISTERED_RETURNTYPE_MAXSIZE / sizeof(UINT64)];
+#endif
 #else
     UINT64 returnValue;
 #endif
diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h
index 7368fecac816..cb117278b511 100644
--- a/src/vm/callingconvention.h
+++ b/src/vm/callingconvention.h
@@ -50,8 +50,7 @@ struct ArgLocDesc
 #endif // UNIX_AMD64_ABI
 
 #if defined(_TARGET_ARM64_)
-    bool    m_isSinglePrecision;  // For determining if HFA is single or double
-                                  // precision
+    int      m_hfaFieldSize;      // Size of HFA field
 #endif // defined(_TARGET_ARM64_)
 
 #if defined(_TARGET_ARM_)
@@ -76,7 +75,7 @@ struct ArgLocDesc
         m_fRequires64BitAlignment = FALSE;
 #endif
 #if defined(_TARGET_ARM64_)
-        m_isSinglePrecision = FALSE;
+        m_hfaFieldSize = 0;
 #endif // defined(_TARGET_ARM64_)
 #if defined(UNIX_AMD64_ABI)
         m_eeClass = NULL;
@@ -589,10 +588,19 @@ class ArgIteratorTemplate : public ARGITERATOR_BASE
             if (!m_argTypeHandle.IsNull() && m_argTypeHandle.IsHFA())
             {
                 CorElementType type = m_argTypeHandle.GetHFAType();
-                bool isFloatType = (type == ELEMENT_TYPE_R4);
+                int hfaFieldSize = 0;
+                switch (type)
+                {
+                case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
+                case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
+#ifdef _TARGET_ARM64_
+                case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
+#endif
+                default: _ASSERTE(!"Invalid HFA Type");
+                }
 
-                pLoc->m_cFloatReg = isFloatType ? GetArgSize()/sizeof(float): GetArgSize()/sizeof(double);
-                pLoc->m_isSinglePrecision = isFloatType;
+                pLoc->m_cFloatReg = GetArgSize()/hfaFieldSize;
+                pLoc->m_hfaFieldSize = hfaFieldSize;
             }
             else
             {
@@ -1297,16 +1305,25 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
         if (thValueType.IsHFA())
         {
             CorElementType type = thValueType.GetHFAType();
-            bool isFloatType = (type == ELEMENT_TYPE_R4);
 
-            cFPRegs = (type == ELEMENT_TYPE_R4)? (argSize/sizeof(float)): (argSize/sizeof(double));
+            int hfaFieldSize = 0;
+            switch (type)
+            {
+            case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
+            case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
+#ifdef _TARGET_ARM64_
+            case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
+#endif
+            default: _ASSERTE(!"Invalid HFA Type");
+            }
+            cFPRegs = argSize/hfaFieldSize;
 
             m_argLocDescForStructInRegs.Init();
             m_argLocDescForStructInRegs.m_cFloatReg = cFPRegs;
             m_argLocDescForStructInRegs.m_idxFloatReg = m_idxFPReg;
 
-            m_argLocDescForStructInRegs.m_isSinglePrecision = isFloatType;
-                
+            m_argLocDescForStructInRegs.m_hfaFieldSize = hfaFieldSize;
+
             m_hasArgLocDescForStructInRegs = true;
         }
         else 
@@ -1474,9 +1491,17 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags()
             {
                 CorElementType hfaType = thValueType.GetHFAType();
 
-                flags |= (hfaType == ELEMENT_TYPE_R4) ? 
-                    ((4 * sizeof(float)) << RETURN_FP_SIZE_SHIFT) : 
-                    ((4 * sizeof(double)) << RETURN_FP_SIZE_SHIFT);
+                int hfaFieldSize = 0;
+                switch (hfaType)
+                {
+                case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
+                case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
+#ifdef _TARGET_ARM64_
+                case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
+#endif
+                default: _ASSERTE(!"Invalid HFA Type");
+                }
+                flags |= ((4 * hfaFieldSize) << RETURN_FP_SIZE_SHIFT);
 
                 break;
             }
diff --git a/src/vm/class.cpp b/src/vm/class.cpp
index af1073fb5109..14eb0595b450 100644
--- a/src/vm/class.cpp
+++ b/src/vm/class.cpp
@@ -1172,6 +1172,58 @@ bool MethodTable::IsHFA()
 }
 #endif // !FEATURE_HFA
 
+//*******************************************************************************
+int MethodTable::GetVectorSize()
+{
+    // This is supported for finding HVA types for Arm64. In order to support the altjit,
+    // we support this on 64-bit platforms (i.e. Arm64 and X64).
+#ifdef _TARGET_64BIT_
+    if (IsIntrinsicType())
+    {
+        LPCUTF8 namespaceName;
+        LPCUTF8 className = GetFullyQualifiedNameInfo(&namespaceName);
+        int vectorSize = 0;
+
+        if (strcmp(className, "Vector`1") == 0)
+        {
+            vectorSize = GetNumInstanceFieldBytes();
+            _ASSERTE(strcmp(namespaceName, "System.Numerics") == 0);
+            return vectorSize;
+        }
+        if (strcmp(className, "Vector128`1") == 0)
+        {
+            vectorSize = 16;
+        }
+        else if (strcmp(className, "Vector256`1") == 0)
+        {
+            vectorSize = 32;
+        }
+        else if (strcmp(className, "Vector64`1") == 0)
+        {
+            vectorSize = 8;
+        }
+        if (vectorSize != 0)
+        {
+            // We need to verify that T (the element or "base" type) is a primitive type.
+            TypeHandle typeArg = GetInstantiation()[0];
+            CorElementType corType = typeArg.GetSignatureCorElementType();
+            bool isSupportedElementType = (corType >= ELEMENT_TYPE_I1 && corType <= ELEMENT_TYPE_R8);
+            // These element types are not supported for Vector64<T>.
+            if ((vectorSize == 8) && (corType == ELEMENT_TYPE_I8 || corType == ELEMENT_TYPE_U8 || corType == ELEMENT_TYPE_R8))
+            {
+                isSupportedElementType = false;
+            }
+            if (isSupportedElementType)
+            {
+                _ASSERTE(strcmp(namespaceName, "System.Runtime.Intrinsics") == 0);
+                return vectorSize;
+            }
+        }
+    }
+#endif // _TARGET_64BIT_
+    return 0;
+}
+
 //*******************************************************************************
 CorElementType MethodTable::GetHFAType()
 {
@@ -1191,17 +1243,28 @@ CorElementType MethodTable::GetHFAType()
         _ASSERTE(pMT->IsValueType());
         _ASSERTE(pMT->GetNumInstanceFields() > 0);
 
+        int vectorSize = pMT->GetVectorSize();
+        if (vectorSize != 0)
+        {
+            return (vectorSize == 8) ? ELEMENT_TYPE_R8 : ELEMENT_TYPE_VALUETYPE;
+        }
+
         PTR_FieldDesc pFirstField = pMT->GetApproxFieldDescListRaw();
 
         CorElementType fieldType = pFirstField->GetFieldType();
-        
+
         // All HFA fields have to be of the same type, so we can just return the type of the first field
         switch (fieldType)
         {
         case ELEMENT_TYPE_VALUETYPE:
             pMT = pFirstField->LookupApproxFieldTypeHandle().GetMethodTable();
+            vectorSize = pMT->GetVectorSize();
+            if (vectorSize != 0)
+            {
+                return (vectorSize == 8) ? ELEMENT_TYPE_R8 : ELEMENT_TYPE_VALUETYPE;
+            }
             break;
-            
+
         case ELEMENT_TYPE_R4:
         case ELEMENT_TYPE_R8:
             return fieldType;
@@ -1212,7 +1275,7 @@ CorElementType MethodTable::GetHFAType()
             _ASSERTE(false);
             return ELEMENT_TYPE_END;
         }
-    }    
+    }
 }
 
 bool MethodTable::IsNativeHFA()
@@ -1231,6 +1294,7 @@ CorElementType MethodTable::GetNativeHFAType()
 //
 // When FEATURE_HFA is defined, we cache the value; otherwise we recompute it with each
 // call. The latter is only for the armaltjit and the arm64altjit.
+//
 bool
 #if defined(FEATURE_HFA)
 EEClass::CheckForHFA(MethodTable ** pByValueClassCache)
@@ -1243,25 +1307,18 @@ EEClass::CheckForHFA()
     // This method should be called for valuetypes only
     _ASSERTE(GetMethodTable()->IsValueType());
 
-    // The SIMD Intrinsic types are meant to be handled specially and should not be treated as HFA
-    if (GetMethodTable()->IsIntrinsicType())
-    {
-        LPCUTF8 namespaceName;
-        LPCUTF8 className = GetMethodTable()->GetFullyQualifiedNameInfo(&namespaceName);
 
-        if ((strcmp(className, "Vector256`1") == 0) || (strcmp(className, "Vector128`1") == 0) ||
-            (strcmp(className, "Vector64`1") == 0))
-        {
-            assert(strcmp(namespaceName, "System.Runtime.Intrinsics") == 0);
-            return false;
-        }
-       
-        if ((strcmp(className, "Vector`1") == 0) && (strcmp(namespaceName, "System.Numerics") == 0))
-        {
-            return false;
-        }
+    // The opaque Vector types appear to have multiple fields, but need to be treated
+    // as an opaque type of a single vector.
+    if (GetMethodTable()->GetVectorSize() != 0)
+    {
+#if defined(FEATURE_HFA)
+        GetMethodTable()->SetIsHFA();
+#endif
+        return true;
     }
 
+    int elemSize = 0;
     CorElementType hfaType = ELEMENT_TYPE_END;
 
     FieldDesc *pFieldDescList = GetFieldDescList();
@@ -1278,11 +1335,41 @@ EEClass::CheckForHFA()
         switch (fieldType)
         {
         case ELEMENT_TYPE_VALUETYPE:
+            {
+#ifdef _TARGET_ARM64_
+            // hfa/hva types are unique by size, except for Vector64 which we can conveniently
+                // treat as if it were a double for ABI purposes. However, it only qualifies as
+                // an HVA if all fields are the same type. This will ensure that we only
+                // consider it an HVA if all the fields are ELEMENT_TYPE_VALUETYPE (which have been
+                // determined above to be vectors) of the same size.
+                MethodTable* pMT;
+#if defined(FEATURE_HFA)
+                pMT = pByValueClassCache[i];
+#else
+                pMT = pFD->LookupApproxFieldTypeHandle().AsMethodTable();
+#endif
+                int thisElemSize = pMT->GetVectorSize();
+                if (thisElemSize != 0)
+                {
+                    if (elemSize == 0)
+                    {
+                        elemSize = thisElemSize;
+                    }
+                    else if ((thisElemSize != elemSize) || (hfaType != ELEMENT_TYPE_VALUETYPE))
+                    {
+                        return false;
+                    }
+                }
+                else
+#endif // _TARGET_ARM64_
+                {
 #if defined(FEATURE_HFA)
-            fieldType = pByValueClassCache[i]->GetHFAType();
+                    fieldType = pByValueClassCache[i]->GetHFAType();
 #else
-            fieldType = pFD->LookupApproxFieldTypeHandle().AsMethodTable()->GetHFAType();
+                    fieldType = pFD->LookupApproxFieldTypeHandle().AsMethodTable()->GetHFAType();
 #endif
+                }
+            }
             break;
 
         case ELEMENT_TYPE_R4:
@@ -1326,14 +1413,31 @@ EEClass::CheckForHFA()
         }
     }
 
-    if (hfaType == ELEMENT_TYPE_END)
+    switch (hfaType)
+    {
+    case ELEMENT_TYPE_R4:
+        elemSize = 4;
+        break;
+    case ELEMENT_TYPE_R8:
+        elemSize = 8;
+        break;
+#ifdef _TARGET_ARM64_
+    case ELEMENT_TYPE_VALUETYPE:
+        // Should already have set elemSize, but be conservative
+        if (elemSize == 0)
+        {
+            return false;
+        }
+        break;
+#endif
+    default:
+        // ELEMENT_TYPE_END
         return false;
+    }
         
     if (!hasZeroOffsetField) // If the struct doesn't have a zero-offset field, it's not an HFA.
         return false;
 
-    int elemSize = (hfaType == ELEMENT_TYPE_R8) ? sizeof(double) : sizeof(float);
-
     // Note that we check the total size, but do not perform any checks on number of fields:
     // - Type of fields can be HFA valuetype itself
     // - Managed C++ HFA valuetypes have just one <alignment member> of type float to signal that 
@@ -1348,7 +1452,7 @@ EEClass::CheckForHFA()
     if (totalSize / elemSize > 4)
         return false;
 
-    // All the above tests passed. It's HFA!
+    // All the above tests passed. It's HFA(/HVA)!
 #if defined(FEATURE_HFA)
     GetMethodTable()->SetIsHFA();
 #endif
@@ -1421,7 +1525,16 @@ CorElementType EEClassLayoutInfo::GetNativeHFATypeRaw()
     if (hfaType == ELEMENT_TYPE_END)
         return ELEMENT_TYPE_END;
 
-    int elemSize = (hfaType == ELEMENT_TYPE_R8) ? sizeof(double) : sizeof(float);
+    int elemSize = 1;
+    switch (hfaType)
+    {
+    case ELEMENT_TYPE_R4: elemSize = sizeof(float); break;
+    case ELEMENT_TYPE_R8: elemSize = sizeof(double); break;
+#ifdef _TARGET_ARM64_
+    case ELEMENT_TYPE_VALUETYPE: elemSize = 16; break;
+#endif
+    default: _ASSERTE(!"Invalid HFA Type");
+    }
 
     // Note that we check the total size, but do not perform any checks on number of fields:
     // - Type of fields can be HFA valuetype itself
diff --git a/src/vm/class.h b/src/vm/class.h
index 2853aee330e2..fae62795a8b7 100644
--- a/src/vm/class.h
+++ b/src/vm/class.h
@@ -414,8 +414,11 @@ class EEClassLayoutInfo
 #endif // UNIX_AMD64_ABI
 #ifdef FEATURE_HFA
             // HFA type of the unmanaged layout
+            // Note that these are not flags, they are discrete values.
             e_R4_HFA                    = 0x10,
             e_R8_HFA                    = 0x20,
+            e_16_HFA                    = 0x30,
+            e_HFATypeFlags              = 0x30,
 #endif
         };
 
@@ -526,15 +529,19 @@ class EEClassLayoutInfo
         bool IsNativeHFA()
         {
             LIMITED_METHOD_CONTRACT;
-            return (m_bFlags & (e_R4_HFA | e_R8_HFA)) != 0;
+            return (m_bFlags & e_HFATypeFlags) != 0;
         }
 
         CorElementType GetNativeHFAType()
         {
             LIMITED_METHOD_CONTRACT;
-            if (IsNativeHFA())                      
-                return (m_bFlags & e_R4_HFA) ? ELEMENT_TYPE_R4 : ELEMENT_TYPE_R8;
-            return ELEMENT_TYPE_END;
+            switch (m_bFlags & e_HFATypeFlags)
+            {
+            case e_R4_HFA: return ELEMENT_TYPE_R4;
+            case e_R8_HFA: return ELEMENT_TYPE_R8;
+            case e_16_HFA: return ELEMENT_TYPE_VALUETYPE;
+            default:       return ELEMENT_TYPE_END;
+            }
         }
 #else // !FEATURE_HFA
         bool IsNativeHFA()
@@ -580,7 +587,13 @@ class EEClassLayoutInfo
         void SetNativeHFAType(CorElementType hfaType)
         {
             LIMITED_METHOD_CONTRACT;
-            m_bFlags |= (hfaType == ELEMENT_TYPE_R4) ? e_R4_HFA : e_R8_HFA;
+            switch (hfaType)
+            {
+            case ELEMENT_TYPE_R4: m_bFlags |= e_R4_HFA; break;
+            case ELEMENT_TYPE_R8: m_bFlags |= e_R8_HFA; break;
+            case ELEMENT_TYPE_VALUETYPE: m_bFlags |= e_16_HFA; break;
+            default: _ASSERTE(!"Invalid HFA Type");
+            }
         }
 #endif
 #ifdef UNIX_AMD64_ABI
diff --git a/src/vm/methodtable.h b/src/vm/methodtable.h
index 74febebc39bc..154efa2ee4aa 100644
--- a/src/vm/methodtable.h
+++ b/src/vm/methodtable.h
@@ -1929,6 +1929,9 @@ class MethodTable
     bool IsHFA();
 #endif // FEATURE_HFA
 
+    // Returns the size in bytes of this type if it is a HW vector type; 0 otherwise.
+    int GetVectorSize();
+
     // Get the HFA type. This is supported both with FEATURE_HFA, in which case it
     // depends on the cached bit on the class, or without, in which case it is recomputed
     // for each invocation.

From a17f160ae67f3ada98095dac673036524fcb6cdd Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Wed, 10 Apr 2019 17:39:46 -0700
Subject: [PATCH 2/7] Re-enable and fix disabled Simd tests

---
 src/jit/hwintrinsicArm64.cpp                  | 36 ++++++++++++++-----
 src/jit/lowerarmarch.cpp                      |  6 ++++
 .../src/JIT/HardwareIntrinsics/Arm64/Simd.cs  |  4 ---
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/src/jit/hwintrinsicArm64.cpp b/src/jit/hwintrinsicArm64.cpp
index 98b495623855..65b7fa85abc3 100644
--- a/src/jit/hwintrinsicArm64.cpp
+++ b/src/jit/hwintrinsicArm64.cpp
@@ -214,8 +214,8 @@ GenTree* Compiler::addRangeCheckIfNeeded(GenTree* immOp, unsigned int max, bool
 {
     assert(immOp != nullptr);
 
-    // Need to range check only if we're must expand and don't have an appropriate constant
-    if (mustExpand && (!immOp->IsCnsIntOrI() || (immOp->AsIntConCommon()->IconValue() < max)))
+    // Need to range check only if we're must expand.
+    if (mustExpand)
     {
         GenTree* upperBoundNode = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, max);
         GenTree* index          = nullptr;
@@ -463,20 +463,40 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
             return gtNewSimdHWIntrinsicNode(simdType, op1, intrinsic, simdBaseType, simdSizeBytes);
 
         case HWIntrinsicInfo::SimdExtractOp:
-            op2 =
-                addRangeCheckIfNeeded(impPopStack().val, getSIMDVectorLength(simdSizeBytes, simdBaseType), mustExpand);
+        {
+            int vectorLength = getSIMDVectorLength(simdSizeBytes, simdBaseType);
+            op2 = impStackTop().val;
+            if (!mustExpand && (!op2->IsCnsIntOrI() || op2->AsIntConCommon()->IconValue() >= vectorLength))
+            {
+                // This is either an out-of-range constant or a non-constant.
+                // We won't expand it; it will be handled recursively, at which point 'mustExpand'
+                // will be true.
+                return nullptr;
+            }
+            op2 = impPopStack().val;
+            op2 = addRangeCheckIfNeeded(op2, vectorLength, mustExpand);
             op1 = impSIMDPopStack(simdType);
 
             return gtNewScalarHWIntrinsicNode(JITtype2varType(sig->retType), op1, op2, intrinsic);
-
+        }
         case HWIntrinsicInfo::SimdInsertOp:
+        {
+            int vectorLength = getSIMDVectorLength(simdSizeBytes, simdBaseType);
+            op2 = impStackTop(1).val;
+            if (!mustExpand && (!op2->IsCnsIntOrI() || op2->AsIntConCommon()->IconValue() >= vectorLength))
+            {
+                // This is either an out-of-range constant or a non-constant.
+                // We won't expand it; it will be handled recursively, at which point 'mustExpand'
+                // will be true.
+                return nullptr;
+            }
             op3 = impPopStack().val;
-            op2 =
-                addRangeCheckIfNeeded(impPopStack().val, getSIMDVectorLength(simdSizeBytes, simdBaseType), mustExpand);
+            op2 = impPopStack().val;
+            op2 = addRangeCheckIfNeeded(op2, vectorLength, mustExpand);
             op1 = impSIMDPopStack(simdType);
 
             return gtNewSimdHWIntrinsicNode(simdType, op1, op2, op3, intrinsic, simdBaseType, simdSizeBytes);
-
+        }
         case HWIntrinsicInfo::Sha1HashOp:
             op3 = impSIMDPopStack(simdType);
             op2 = impPopStack().val;
diff --git a/src/jit/lowerarmarch.cpp b/src/jit/lowerarmarch.cpp
index 3e0d636e9921..adfc34ae94d5 100644
--- a/src/jit/lowerarmarch.cpp
+++ b/src/jit/lowerarmarch.cpp
@@ -892,6 +892,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             {
                 MakeSrcContained(node, op2);
 
+#if 0
+                // This is currently not supported downstream. The following (at least) need to be modifed:
+                //   GenTree::isContainableHWIntrinsic() needs to handle this.
+                //   CodeGen::genConsumRegs()
+                // 
                 GenTree* op3 = argList->Rest()->Rest()->Current();
 
                 // In the HW intrinsics C# API there is no direct way to specify a vector element to element mov
@@ -909,6 +914,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                         MakeSrcContained(node, op3);
                     }
                 }
+#endif
             }
             break;
 
diff --git a/tests/src/JIT/HardwareIntrinsics/Arm64/Simd.cs b/tests/src/JIT/HardwareIntrinsics/Arm64/Simd.cs
index 97c0a42bd199..ad7602246b9d 100644
--- a/tests/src/JIT/HardwareIntrinsics/Arm64/Simd.cs
+++ b/tests/src/JIT/HardwareIntrinsics/Arm64/Simd.cs
@@ -1355,7 +1355,6 @@ static void TestExtract()
                 testExtractOp<int,    Vector64< int   >>(name, (x) => Simd.Extract(x, 1), (x) => x[ 1]);
                 testExtractOp<uint,   Vector64< uint  >>(name, (x) => Simd.Extract(x, 0), (x) => x[ 0]);
                 testExtractOp<uint,   Vector64< uint  >>(name, (x) => Simd.Extract(x, 1), (x) => x[ 1]);
-#if Broken
 
                 // Test non-constant call
                 testExtractOp<float,  Vector128<float >>(name, (x) => simdExtract(x, 0), (x) => x[ 0]);
@@ -1472,7 +1471,6 @@ static void TestExtract()
                 testThrowsArgumentOutOfRangeException<ushort, Vector64< ushort>>(name, (x, y) => Simd.Extract(x, 4));
                 testThrowsArgumentOutOfRangeException<int,    Vector64< int   >>(name, (x, y) => Simd.Extract(x, 2));
                 testThrowsArgumentOutOfRangeException<uint,   Vector64< uint  >>(name, (x, y) => Simd.Extract(x, 2));
-#endif
 
                 testThrowsTypeNotSupported<Vector64< long >>(name, (x, y) => { return Simd.Extract(x, 1) > 1 ? x : y; });
                 testThrowsTypeNotSupported<Vector64< ulong>>(name, (x, y) => { return Simd.Extract(x, 1) > 1 ? x : y; });
@@ -1528,7 +1526,6 @@ static void TestInsert()
                 testPermuteOp<ushort, Vector64< ushort>>(name, (x, y) => Simd.Insert(x, 1, (ushort)2), (i, x, y) => (ushort)(i != 1 ? x[i] : 2));
                 testPermuteOp<int,    Vector64< int   >>(name, (x, y) => Simd.Insert(x, 1, (int   )2), (i, x, y) => (int   )(i != 1 ? x[i] : 2));
                 testPermuteOp<uint,   Vector64< uint  >>(name, (x, y) => Simd.Insert(x, 1, (uint  )2), (i, x, y) => (uint  )(i != 1 ? x[i] : 2));
-#if Broken
 
                 testPermuteOp<float,  Vector128<float >>(name, (x, y) => Simd.Insert(x, 3, Simd.Extract(y, 1)), (i, x, y) => (float )(i != 3 ? x[i] : y[1]));
                 testPermuteOp<double, Vector128<double>>(name, (x, y) => Simd.Insert(x, 0, Simd.Extract(y, 1)), (i, x, y) => (double)(i != 0 ? x[i] : y[1]));
@@ -1565,7 +1562,6 @@ static void TestInsert()
                 testThrowsArgumentOutOfRangeException<ushort, Vector64< ushort>, Vector64< ushort>>(name, (x, y) => Simd.Insert(x, 4, (ushort)1));
                 testThrowsArgumentOutOfRangeException<int,    Vector64< int   >, Vector64< int   >>(name, (x, y) => Simd.Insert(x, 2, (int   )1));
                 testThrowsArgumentOutOfRangeException<uint,   Vector64< uint  >, Vector64< uint  >>(name, (x, y) => Simd.Insert(x, 2, (uint  )1));
-#endif
 
                 testThrowsTypeNotSupported<Vector128<bool >>(name, (x, y) => Simd.Insert(x, 1,      true));
                 testThrowsTypeNotSupported<Vector64< long >>(name, (x, y) => Simd.Insert(x, 1, ( long )5));

From dc310e6974a35f7f3e14115be0e96ad6c07ebfa3 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Wed, 10 Apr 2019 18:46:45 -0700
Subject: [PATCH 3/7] Formatting

---
 src/jit/hwintrinsicArm64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/jit/hwintrinsicArm64.cpp b/src/jit/hwintrinsicArm64.cpp
index 65b7fa85abc3..dec60383bd2e 100644
--- a/src/jit/hwintrinsicArm64.cpp
+++ b/src/jit/hwintrinsicArm64.cpp
@@ -465,7 +465,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
         case HWIntrinsicInfo::SimdExtractOp:
         {
             int vectorLength = getSIMDVectorLength(simdSizeBytes, simdBaseType);
-            op2 = impStackTop().val;
+            op2              = impStackTop().val;
             if (!mustExpand && (!op2->IsCnsIntOrI() || op2->AsIntConCommon()->IconValue() >= vectorLength))
             {
                 // This is either an out-of-range constant or a non-constant.
@@ -482,7 +482,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
         case HWIntrinsicInfo::SimdInsertOp:
         {
             int vectorLength = getSIMDVectorLength(simdSizeBytes, simdBaseType);
-            op2 = impStackTop(1).val;
+            op2              = impStackTop(1).val;
             if (!mustExpand && (!op2->IsCnsIntOrI() || op2->AsIntConCommon()->IconValue() >= vectorLength))
             {
                 // This is either an out-of-range constant or a non-constant.

From 2c5a026c3f93f65e2bc1bed30aa2bf554f541fb0 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Thu, 11 Apr 2019 12:13:57 -0700
Subject: [PATCH 4/7] Fix condition for `hasMultiRegStructArgs`

---
 src/jit/morph.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp
index 95be6dcec903..fbfa17331191 100644
--- a/src/jit/morph.cpp
+++ b/src/jit/morph.cpp
@@ -4164,7 +4164,7 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
 #if FEATURE_MULTIREG_ARGS
         if (isStructArg)
         {
-            if ((argEntry->numRegs + argEntry->numSlots) > 1)
+            if (((argEntry->numRegs + argEntry->numSlots) > 1) || (isHfaArg && argx->TypeGet() == TYP_STRUCT))
             {
                 hasMultiregStructArgs = true;
             }
@@ -4381,7 +4381,7 @@ void Compiler::fgMorphMultiregStructArgs(GenTreeCall* call)
         }
 
         unsigned size = (fgEntryPtr->numRegs + fgEntryPtr->numSlots);
-        if (size > 1)
+        if ((size > 1) || (fgEntryPtr->isHfaArg && argx->TypeGet() == TYP_STRUCT))
         {
             foundStructArg = true;
             if (varTypeIsStruct(argx) && !argx->OperIs(GT_FIELD_LIST))

From dd186f2f9db108479837d3cc39f5b86d02c60b64 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Fri, 12 Apr 2019 21:49:32 -0700
Subject: [PATCH 5/7] PR Feedback

---
 src/jit/codegencommon.cpp  |  2 +-
 src/jit/compiler.cpp       |  6 ++--
 src/jit/compiler.h         |  3 ++
 src/jit/dll/jit.nativeproj |  2 +-
 src/vm/callhelpers.h       |  2 +-
 src/vm/callingconvention.h | 59 ++++++++++++++------------------------
 src/vm/class.h             |  2 ++
 7 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/src/jit/codegencommon.cpp b/src/jit/codegencommon.cpp
index 2362be5f2485..c7a6e83d01c9 100644
--- a/src/jit/codegencommon.cpp
+++ b/src/jit/codegencommon.cpp
@@ -10208,7 +10208,7 @@ var_types Compiler::GetHfaType(CORINFO_CLASS_HANDLE hClass)
         if (corType == CORINFO_TYPE_VALUECLASS)
         {
             // This is a vector type.
-            // HVAs are only supported on ARM64, and only for sizes of 8 or 16 bytes.
+            // HVAs are only supported on ARM64, and only for homogeneous aggregates of 8 or 16 byte vectors.
             // For 8-byte vectors corType will be returned as CORINFO_TYPE_DOUBLE.
             result = TYP_SIMD16;
             // This type may not appear elsewhere, but it will occupy a floating point register.
diff --git a/src/jit/compiler.cpp b/src/jit/compiler.cpp
index 1cc2f600ed16..15ed19c62d5b 100644
--- a/src/jit/compiler.cpp
+++ b/src/jit/compiler.cpp
@@ -800,9 +800,9 @@ var_types Compiler::getArgTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
         //
         if (structSize <= MAX_PASS_MULTIREG_BYTES)
         {
-            // Structs that are HFA's are passed by value in multiple registers.
-            // Arm64 Windows VarArg methods arguments will not classify HFA types, they will need to be treated
-            // as if they are not HFA types.
+            // Structs that are HFA/HVA's are passed by value in multiple registers.
+            // Arm64 Windows VarArg methods arguments will not classify HFA/HVA types, they will need to be treated
+            // as if they are not HFA/HVA types.
             var_types hfaType;
 #if defined(_TARGET_WINDOWS_) && defined(_TARGET_ARM64_)
             if (isVarArg)
diff --git a/src/jit/compiler.h b/src/jit/compiler.h
index 751020603c3f..dc1f844a3136 100644
--- a/src/jit/compiler.h
+++ b/src/jit/compiler.h
@@ -755,12 +755,15 @@ class LclVarDsc
                 assert(!"lvHfaSlots called for non-HFA");
                 break;
             case HFA_ELEM_FLOAT:
+                assert((lvExactSize % 4) == 0);
                 slots = lvExactSize >> 2;
                 break;
             case HFA_ELEM_DOUBLE:
+                assert((lvExactSize % 8) == 0);
                 slots = lvExactSize >> 3;
                 break;
             case HFA_ELEM_SIMD16:
+                assert((lvExactSize % 16) == 0);
                 slots = lvExactSize >> 4;
                 break;
             default:
diff --git a/src/jit/dll/jit.nativeproj b/src/jit/dll/jit.nativeproj
index 56e94be9cac8..7b8dacb78c52 100644
--- a/src/jit/dll/jit.nativeproj
+++ b/src/jit/dll/jit.nativeproj
@@ -31,7 +31,7 @@
 
     <LinkModuleDefinitionFile>$(OutputName).def</LinkModuleDefinitionFile>
 
-    <ClDefines Condition="'$(BuildArchitecture)' == 'amd64' or '$(BuildArchitecture)' == 'arm64'">$(ClDefines);FEATURE_SIMD</ClDefines>
+    <ClDefines Condition="'$(BuildArchitecture)' == 'amd64'">$(ClDefines);FEATURE_SIMD</ClDefines>
 
     <Win32DllLibs>$(SdkLibPath)\kernel32.lib;$(SdkLibPath)\user32.lib;$(SdkLibPath)\advapi32.lib;$(SdkLibPath)\oleaut32.lib;$(SdkLibPath)\uuid.lib</Win32DllLibs>
     <Win32DllLibs>$(Win32DllLibs);$(ClrLibPath)\utilcode.lib</Win32DllLibs>
diff --git a/src/vm/callhelpers.h b/src/vm/callhelpers.h
index d04412b68e17..f0d718c6ff5d 100644
--- a/src/vm/callhelpers.h
+++ b/src/vm/callhelpers.h
@@ -41,7 +41,7 @@ struct CallDescrData
 #ifdef ENREGISTERED_RETURNTYPE_MAXSIZE
 #ifdef _TARGET_ARM64_
     // Use NEON128 to ensure proper alignment for vectors.
-    __declspec(align(16)) NEON128 returnValue[ENREGISTERED_RETURNTYPE_MAXSIZE / sizeof(NEON128)];
+    DECLSPEC_ALIGN(16) NEON128 returnValue[ENREGISTERED_RETURNTYPE_MAXSIZE / sizeof(NEON128)];
 #else
     // Use UINT64 to ensure proper alignment
     UINT64 returnValue[ENREGISTERED_RETURNTYPE_MAXSIZE / sizeof(UINT64)];
diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h
index cb117278b511..4044a379f0df 100644
--- a/src/vm/callingconvention.h
+++ b/src/vm/callingconvention.h
@@ -50,7 +50,22 @@ struct ArgLocDesc
 #endif // UNIX_AMD64_ABI
 
 #if defined(_TARGET_ARM64_)
-    int      m_hfaFieldSize;      // Size of HFA field
+    unsigned m_hfaFieldSize;      // Size of HFA field in bytes.
+    static unsigned getHFAFieldSize(CorElementType  hfaType)
+    {
+        switch (hfaType)
+        {
+        case ELEMENT_TYPE_R4: return 4;
+        case ELEMENT_TYPE_R8: return 8;
+        // We overload VALUETYPE for 16-byte vectors.
+        case ELEMENT_TYPE_VALUETYPE: return 16;
+        default: _ASSERTE(!"Invalid HFA Type"); return 0;
+        }
+    }
+    void setHFAFieldSize(CorElementType  hfaType)
+    {
+        m_hfaFieldSize = getHFAFieldSize(hfaType);
+    }
 #endif // defined(_TARGET_ARM64_)
 
 #if defined(_TARGET_ARM_)
@@ -588,19 +603,9 @@ class ArgIteratorTemplate : public ARGITERATOR_BASE
             if (!m_argTypeHandle.IsNull() && m_argTypeHandle.IsHFA())
             {
                 CorElementType type = m_argTypeHandle.GetHFAType();
-                int hfaFieldSize = 0;
-                switch (type)
-                {
-                case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
-                case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
-#ifdef _TARGET_ARM64_
-                case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
-#endif
-                default: _ASSERTE(!"Invalid HFA Type");
-                }
+                pLoc->setHFAFieldSize(type);
+                pLoc->m_cFloatReg = GetArgSize()/pLoc->m_hfaFieldSize;
 
-                pLoc->m_cFloatReg = GetArgSize()/hfaFieldSize;
-                pLoc->m_hfaFieldSize = hfaFieldSize;
             }
             else
             {
@@ -1306,23 +1311,11 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
         {
             CorElementType type = thValueType.GetHFAType();
 
-            int hfaFieldSize = 0;
-            switch (type)
-            {
-            case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
-            case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
-#ifdef _TARGET_ARM64_
-            case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
-#endif
-            default: _ASSERTE(!"Invalid HFA Type");
-            }
-            cFPRegs = argSize/hfaFieldSize;
-
             m_argLocDescForStructInRegs.Init();
-            m_argLocDescForStructInRegs.m_cFloatReg = cFPRegs;
             m_argLocDescForStructInRegs.m_idxFloatReg = m_idxFPReg;
 
-            m_argLocDescForStructInRegs.m_hfaFieldSize = hfaFieldSize;
+            m_argLocDescForStructInRegs.setHFAFieldSize(type);
+            m_argLocDescForStructInRegs.m_cFloatReg = argSize/m_argLocDescForStructInRegs.m_hfaFieldSize;
 
             m_hasArgLocDescForStructInRegs = true;
         }
@@ -1491,18 +1484,8 @@ void ArgIteratorTemplate<ARGITERATOR_BASE>::ComputeReturnFlags()
             {
                 CorElementType hfaType = thValueType.GetHFAType();
 
-                int hfaFieldSize = 0;
-                switch (hfaType)
-                {
-                case ELEMENT_TYPE_R4: hfaFieldSize = 4; break;
-                case ELEMENT_TYPE_R8: hfaFieldSize = 8; break;
-#ifdef _TARGET_ARM64_
-                case ELEMENT_TYPE_VALUETYPE: hfaFieldSize = 16; break;
-#endif
-                default: _ASSERTE(!"Invalid HFA Type");
-                }
+                int hfaFieldSize = ArgLocDesc::getHFAFieldSize(hfaType);
                 flags |= ((4 * hfaFieldSize) << RETURN_FP_SIZE_SHIFT);
-
                 break;
             }
 #endif
diff --git a/src/vm/class.h b/src/vm/class.h
index fae62795a8b7..a1e7aebc1e21 100644
--- a/src/vm/class.h
+++ b/src/vm/class.h
@@ -587,6 +587,8 @@ class EEClassLayoutInfo
         void SetNativeHFAType(CorElementType hfaType)
         {
             LIMITED_METHOD_CONTRACT;
+            // We should call this at most once.
+            _ASSERTE((m_bFlags & e_HFATypeFlags) == 0);
             switch (hfaType)
             {
             case ELEMENT_TYPE_R4: m_bFlags |= e_R4_HFA; break;

From 582d5062dc94e886bd3a08eb36f191b200bc7521 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Sat, 13 Apr 2019 16:46:04 -0700
Subject: [PATCH 6/7] Restore a deleted assignment

---
 src/vm/callingconvention.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h
index 4044a379f0df..18217cc09b4e 100644
--- a/src/vm/callingconvention.h
+++ b/src/vm/callingconvention.h
@@ -1315,7 +1315,8 @@ int ArgIteratorTemplate<ARGITERATOR_BASE>::GetNextOffset()
             m_argLocDescForStructInRegs.m_idxFloatReg = m_idxFPReg;
 
             m_argLocDescForStructInRegs.setHFAFieldSize(type);
-            m_argLocDescForStructInRegs.m_cFloatReg = argSize/m_argLocDescForStructInRegs.m_hfaFieldSize;
+            cFPRegs = argSize/m_argLocDescForStructInRegs.m_hfaFieldSize;
+            m_argLocDescForStructInRegs.m_cFloatReg = cFPRegs;
 
             m_hasArgLocDescForStructInRegs = true;
         }

From 69e0a6f3efcb06b295982d21d67397f975fb3a85 Mon Sep 17 00:00:00 2001
From: Carol Eidt <carol.eidt@microsoft.com>
Date: Sat, 13 Apr 2019 17:29:03 -0700
Subject: [PATCH 7/7] Fix arm build

---
 src/vm/callingconvention.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/vm/callingconvention.h b/src/vm/callingconvention.h
index 18217cc09b4e..cbc6aad5c4f6 100644
--- a/src/vm/callingconvention.h
+++ b/src/vm/callingconvention.h
@@ -49,19 +49,21 @@ struct ArgLocDesc
 
 #endif // UNIX_AMD64_ABI
 
-#if defined(_TARGET_ARM64_)
-    unsigned m_hfaFieldSize;      // Size of HFA field in bytes.
+#ifdef FEATURE_HFA
     static unsigned getHFAFieldSize(CorElementType  hfaType)
     {
         switch (hfaType)
         {
         case ELEMENT_TYPE_R4: return 4;
         case ELEMENT_TYPE_R8: return 8;
-        // We overload VALUETYPE for 16-byte vectors.
+            // We overload VALUETYPE for 16-byte vectors.
         case ELEMENT_TYPE_VALUETYPE: return 16;
         default: _ASSERTE(!"Invalid HFA Type"); return 0;
         }
     }
+#endif
+#if defined(_TARGET_ARM64_)
+    unsigned m_hfaFieldSize;      // Size of HFA field in bytes.
     void setHFAFieldSize(CorElementType  hfaType)
     {
         m_hfaFieldSize = getHFAFieldSize(hfaType);