From 88f860c6f5548c458a5c3c9862b781b9df962498 Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Thu, 12 Sep 2024 18:51:04 +0200
Subject: [PATCH 1/2] JIT/ARM64: Add ability to generate frames compatible with
 Apple compact unwinding format.

For the NativeAOT/ARM64/Apple ABI do the following:
- Save callee-saved registers in the opposite order and in pairs.
- Prefer saving FP/LR at the top of the frame. Heuristics are used to
  avoid worse code quality outside of the prolog/epilog due to the
  addressing range limits of the ARM64 instruction set.
- Add an optimization to lvaFrameAddress that rewrites FP-x references to
  SP+y when possible. This allows efficient addressing using positive
  offsets when FP points to the top of the frame. It mimics a similar
  optimization on ARM32.
---
 src/coreclr/jit/codegen.h         |   1 +
 src/coreclr/jit/codegenarm64.cpp  |  46 ++++++++--
 src/coreclr/jit/codegencommon.cpp |  24 ++++++
 src/coreclr/jit/compiler.hpp      |  10 +++
 src/coreclr/jit/lclvars.cpp       | 139 ++++++++++++++++++------------
 5 files changed, 158 insertions(+), 62 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 1e53616c2d8e7..73ecd14e03c0d 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -656,6 +656,7 @@ class CodeGen final : public CodeGenInterface
     virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const;
     bool         genSaveFpLrWithAllCalleeSavedRegisters;
     bool         genForceFuncletFrameType5;
+    bool         genReverseAndPairCalleeSavedRegisters;
 #endif // TARGET_ARM64
 
     //-------------------------------------------------------------------------
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index ac28ebd30a19b..d56a92ed0d5ba 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -845,12 +845,19 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i
 
     for (int i = 0; i < regStack.Height(); ++i)
     {
-        RegPair regPair = regStack.Bottom(i);
+        RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Top(i) : regStack.Bottom(i);
         if (regPair.reg2 != REG_NA)
         {
             // We can use a STP instruction.
-            genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
-                                 nullptr);
+            if (genReverseAndPairCalleeSavedRegisters)
+            {
+                genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, REG_IP0, nullptr);
+            }
+            else
+            {
+                genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0,
+                                     nullptr);
+            }
 
             spOffset += 2 * slotSize;
         }
@@ -926,8 +933,9 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
 
     // Save integer registers at higher addresses than floating-point registers.
 
+    regMaskTP maskSaveRegsFrame = regsToSaveMask & (RBM_FP | RBM_LR);
     regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT;
-    regMaskTP maskSaveRegsInt   = regsToSaveMask & ~maskSaveRegsFloat;
+    regMaskTP maskSaveRegsInt   = regsToSaveMask & ~maskSaveRegsFloat & ~maskSaveRegsFrame;
 
     if (maskSaveRegsFloat != RBM_NONE)
     {
@@ -939,6 +947,13 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe
     if (maskSaveRegsInt != RBM_NONE)
     {
         genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset);
+        spDelta = 0;
+        lowestCalleeSavedOffset += genCountBits(maskSaveRegsInt) * FPSAVE_REGSIZE_BYTES;
+    }
+
+    if (maskSaveRegsFrame != RBM_NONE)
+    {
+        genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, nullptr);
         // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this.
     }
 }
@@ -970,13 +985,20 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta
             stackDelta = spDelta;
         }
 
-        RegPair regPair = regStack.Top(i);
+        RegPair regPair = genReverseAndPairCalleeSavedRegisters ? regStack.Bottom(i) : regStack.Top(i);
         if (regPair.reg2 != REG_NA)
         {
             spOffset -= 2 * slotSize;
 
-            genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, REG_IP1,
-                                    nullptr);
+            if (genReverseAndPairCalleeSavedRegisters)
+            {
+                genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, REG_IP1, nullptr);
+            }
+            else
+            {
+                genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair,
+                                        REG_IP1, nullptr);
+            }
         }
         else
         {
@@ -1043,11 +1065,19 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in
 
     // Save integer registers at higher addresses than floating-point registers.
 
+    regMaskTP maskRestoreRegsFrame = regsToRestoreMask & (RBM_FP | RBM_LR);
     regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT;
-    regMaskTP maskRestoreRegsInt   = regsToRestoreMask & ~maskRestoreRegsFloat;
+    regMaskTP maskRestoreRegsInt   = regsToRestoreMask & ~maskRestoreRegsFloat & ~maskRestoreRegsFrame;
 
     // Restore in the opposite order of saving.
 
+    if (maskRestoreRegsFrame != RBM_NONE)
+    {
+        int spFrameDelta = (maskRestoreRegsFloat != RBM_NONE || maskRestoreRegsInt != RBM_NONE) ? 0 : spDelta;
+        spOffset -= 2 * REGSIZE_BYTES;
+        genEpilogRestoreRegPair(REG_FP, REG_LR, spOffset, spFrameDelta, false, REG_IP1, nullptr);
+    }
+
     if (maskRestoreRegsInt != RBM_NONE)
     {
         int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment?
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 9920f9846d273..881b4ff0cc566 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -255,6 +255,7 @@ CodeGen::CodeGen(Compiler* theCompiler)
 #ifdef TARGET_ARM64
     genSaveFpLrWithAllCalleeSavedRegisters = false;
     genForceFuncletFrameType5              = false;
+    genReverseAndPairCalleeSavedRegisters  = false;
 #endif // TARGET_ARM64
 }
 
@@ -4846,6 +4847,29 @@ void CodeGen::genFinalizeFrame()
     }
 #endif // TARGET_ARM
 
+#ifdef TARGET_ARM64
+    if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform)
+    {
+        JITDUMP("Setting genReverseAndPairCalleeSavedRegisters = true\n");
+
+        genReverseAndPairCalleeSavedRegisters = true;
+
+        // Make sure we push the registers in pairs if possible. If we only allocate a contiguous
+        // block of registers this should add at most one integer and at most one floating point
+        // register to the list. The stack has to be 16-byte aligned, so in worst case it results
+        // in allocating 16 bytes more space on stack if odd number of integer and odd number of
+        // FP registers were occupied. Same number of instructions will be generated, just the
+        // STR instructions are replaced with STP (store pair).
+        regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask();
+        regMaskTP maskPairRegs     = ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) |
+                                 ((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1);
+        if (maskPairRegs != RBM_NONE)
+        {
+            regSet.rsSetRegsModified(maskPairRegs);
+        }
+    }
+#endif // TARGET_ARM64
+
 #ifdef DEBUG
     if (verbose)
     {
diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp
index 4f70a01a8a79e..401b5993dab1f 100644
--- a/src/coreclr/jit/compiler.hpp
+++ b/src/coreclr/jit/compiler.hpp
@@ -2807,6 +2807,16 @@ inline
     {
         *pBaseReg = REG_SPBASE;
     }
+#elif defined(TARGET_ARM64)
+    if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 &&
+        lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
+    {
+        int spVarOffset = varOffset + codeGen->genSPtoFPdelta();
+        JITDUMP("lvaFrameAddress optimization for V%02u: [FP-%d] -> [SP+%d]\n", varNum, -varOffset, spVarOffset);
+        FPbased   = false;
+        varOffset = spVarOffset;
+    }
+    *pFPbased = FPbased;
 #else
     *pFPbased = FPbased;
 #endif
diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp
index d4adbd6b9907a..1c5973f9d6bb7 100644
--- a/src/coreclr/jit/lclvars.cpp
+++ b/src/coreclr/jit/lclvars.cpp
@@ -5648,7 +5648,9 @@ void Compiler::lvaFixVirtualFrameOffsets()
 #endif
 
     // The delta to be added to virtual offset to adjust it relative to frame pointer or SP
-    int delta = 0;
+    int delta            = 0;
+    int frameLocalsDelta = 0;
+    int frameBoundary    = 0;
 
 #ifdef TARGET_XARCH
     delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64
@@ -5673,7 +5675,25 @@ void Compiler::lvaFixVirtualFrameOffsets()
         // We set FP to be after LR, FP
         delta += 2 * REGSIZE_BYTES;
     }
-#elif defined(TARGET_AMD64) || defined(TARGET_ARM64)
+#elif defined(TARGET_ARM64)
+    else
+    {
+        // FP is used.
+        delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta();
+
+        // If we placed FP/LR at the bottom of the frame we need to shift all the variables
+        // on the new frame to account for it. See lvaAssignVirtualFrameOffsetsToLocals.
+        if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters())
+        {
+            // We set FP to be after LR, FP
+            frameLocalsDelta = 2 * REGSIZE_BYTES;
+            frameBoundary    = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0;
+            if (info.compIsVarArgs)
+                frameBoundary -= MAX_REG_ARG * REGSIZE_BYTES;
+        }
+        JITDUMP("--- delta bump %d for FP frame, %d inside frame for FP/LR relocation\n", delta, frameLocalsDelta);
+    }
+#elif defined(TARGET_AMD64)
     else
     {
         // FP is used.
@@ -5741,7 +5761,7 @@ void Compiler::lvaFixVirtualFrameOffsets()
 
 #if defined(TARGET_X86)
             // On x86, we set the stack offset for a promoted field
-            // to match a struct parameter in lvAssignFrameOffsetsToPromotedStructs.
+            // to match a struct parameter in lvaAssignFrameOffsetsToPromotedStructs.
             if ((!varDsc->lvIsParam || parentvarDsc->lvIsParam) && promotionType == PROMOTION_TYPE_DEPENDENT)
 #else
             if (!varDsc->lvIsParam && promotionType == PROMOTION_TYPE_DEPENDENT)
@@ -5761,15 +5781,23 @@ void Compiler::lvaFixVirtualFrameOffsets()
 
         if (doAssignStkOffs)
         {
-            JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + delta);
-            varDsc->SetStackOffset(varDsc->GetStackOffset() + delta);
+            int localDelta = delta;
+
+            if (frameLocalsDelta != 0 && varDsc->GetStackOffset() < frameBoundary)
+            {
+                localDelta += frameLocalsDelta;
+            }
+
+            JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(),
+                    varDsc->GetStackOffset() + localDelta);
+            varDsc->SetStackOffset(varDsc->GetStackOffset() + localDelta);
 
 #if DOUBLE_ALIGN
             if (genDoubleAlign() && !codeGen->isFramePointerUsed())
             {
                 if (varDsc->lvFramePointerBased)
                 {
-                    varDsc->SetStackOffset(varDsc->GetStackOffset() - delta);
+                    varDsc->SetStackOffset(varDsc->GetStackOffset() - localDelta);
 
                     // We need to re-adjust the offsets of the parameters so they are EBP
                     // relative rather than stack/frame pointer relative
@@ -5791,9 +5819,13 @@ void Compiler::lvaFixVirtualFrameOffsets()
     assert(codeGen->regSet.tmpAllFree());
     for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp))
     {
-        temp->tdAdjustTempOffs(delta);
+        temp->tdAdjustTempOffs(delta + frameLocalsDelta);
     }
 
+    if (lvaCachedGenericContextArgOffs < frameBoundary)
+    {
+        lvaCachedGenericContextArgOffs += frameLocalsDelta;
+    }
     lvaCachedGenericContextArgOffs += delta;
 
 #if FEATURE_FIXED_OUT_ARGS
@@ -6050,30 +6082,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
         codeGen->setFramePointerUsed(codeGen->isFramePointerRequired());
     }
 
-#ifdef TARGET_ARM64
-    // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
-    // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
-    // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
-    // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
-    // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
-    // (It should be legal to use these frame types for every frame).
-
-    if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
-    {
-        // Default configuration
-        codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
-                                                        opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20));
-    }
-    else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
-    {
-        codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
-    }
-    else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
-    {
-        codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
-    }
-#endif // TARGET_ARM64
-
 #ifdef TARGET_XARCH
     // On x86/amd64, the return address has already been pushed by the call instruction in the caller.
     stkOffs -= TARGET_POINTER_SIZE; // return address;
@@ -6122,9 +6130,13 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 #endif // !TARGET_ARM
 
 #ifdef TARGET_ARM64
-    // If the frame pointer is used, then we'll save FP/LR at the bottom of the stack.
-    // Otherwise, we won't store FP, and we'll store LR at the top, with the other callee-save
-    // registers (if any).
+    // If the frame pointer is used, then we'll save FP/LR either at the bottom of the stack
+    // or at the top of the stack depending on frame type. We make the decision after assigning
+    // the variables on the frame and then fix up the offsets in lvaFixVirtualFrameOffsets.
+    // For now, we proceed as if FP/LR were saved with the callee registers. If we later
+    // decide to move the FP/LR to the bottom of the frame it shifts all the assigned
+    // variables and temporaries by 16 bytes. The largest alignment we currently make is 16
+    // bytes for SIMD.
 
     int initialStkOffs = 0;
     if (info.compIsVarArgs)
@@ -6135,17 +6147,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
         stkOffs -= initialStkOffs;
     }
 
-    if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || !isFramePointerUsed()) // Note that currently we always have
-                                                                                   // a frame pointer
-    {
-        stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;
-    }
-    else
-    {
-        // Subtract off FP and LR.
-        assert(compCalleeRegsPushed >= 2);
-        stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES;
-    }
+    stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES;
 
 #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 
@@ -6815,15 +6817,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
     }
 #endif // TARGET_AMD64
 
-#ifdef TARGET_ARM64
-    if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have
-                                                                                   // a frame pointer
-    {
-        // Create space for saving FP and LR.
-        stkOffs -= 2 * REGSIZE_BYTES;
-    }
-#endif // TARGET_ARM64
-
 #if FEATURE_FIXED_OUT_ARGS
     if (lvaOutgoingArgSpaceSize > 0)
     {
@@ -6861,6 +6854,44 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals()
 
     noway_assert(compLclFrameSize + originalFrameSize ==
                  (unsigned)-(stkOffs + (pushedCount * (int)TARGET_POINTER_SIZE)));
+
+#ifdef TARGET_ARM64
+    // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is
+    // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we
+    // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value,
+    // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the
+    // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress.
+    // (It should be legal to use these frame types for every frame).
+    //
+    // For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout that can
+    // be represented with compact unwinding information. In order to maintain code quality we only do it when
+    // we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress optimization, or if the
+    // whole frame is small enough that the negative FP-based addressing can address the whole frame.
+
+    if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0)
+    {
+        if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform &&
+            (!codeGen->isFramePointerRequired() || codeGen->genTotalFrameSize() < 0x100))
+        {
+            codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true);
+        }
+        else
+        {
+            // Default configuration
+            codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) ||
+                                                            opts.compDbgEnC ||
+                                                            compStressCompile(Compiler::STRESS_GENERIC_VARN, 20));
+        }
+    }
+    else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1)
+    {
+        codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames
+    }
+    else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3))
+    {
+        codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames
+    }
+#endif // TARGET_ARM64
 }
 
 //------------------------------------------------------------------------

From 73bea2efe7f62e6b36894363da0c1d4c21ed35b3 Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Thu, 12 Sep 2024 18:59:03 +0200
Subject: [PATCH 2/2] ObjWriter: For Mach-O ARM64 try to convert the DWARF CFI
 unwinding codes into compact unwinding code

---
 .../Compiler/ObjectWriter/MachNative.cs       |  13 ++
 .../Compiler/ObjectWriter/MachObjectWriter.cs | 182 +++++++++++++++---
 2 files changed, 171 insertions(+), 24 deletions(-)

diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs
index 14db96a935474..8b3af9392a7b1 100644
--- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs
+++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs
@@ -120,5 +120,18 @@ internal static class MachNative
         public const uint PLATFORM_TVOSSIMULATOR = 8;
         public const uint PLATFORM_WATCHOSSIMULATOR = 9;
         public const uint PLATFORM_DRIVERKIT = 10;
+
+        public const uint UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
+        public const uint UNWIND_ARM64_MODE_DWARF = 0x03000000;
+        public const uint UNWIND_ARM64_MODE_FRAME = 0x04000000;
+        public const uint UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001;
+        public const uint UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002;
+        public const uint UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004;
+        public const uint UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008;
+        public const uint UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010;
+        public const uint UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100;
+        public const uint UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200;
+        public const uint UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400;
+        public const uint UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800;
     }
 }
diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs
index 2424ec434126c..4a1742c36b8f5 100644
--- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs
+++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs
@@ -752,26 +752,164 @@ void EmitCompactUnwindSymbol(string symbolName)
 
         private protected override string ExternCName(string name) => "_" + name;
 
-        // This represents the following DWARF code:
-        //   DW_CFA_advance_loc: 4
-        //   DW_CFA_def_cfa_offset: +16
-        //   DW_CFA_offset: W29 -16
-        //   DW_CFA_offset: W30 -8
-        //   DW_CFA_advance_loc: 4
-        //   DW_CFA_def_cfa_register: W29
-        // which is generated for the following frame prolog/epilog:
-        //   stp fp, lr, [sp, #-10]!
-        //   mov fp, sp
-        //   ...
-        //   ldp fp, lr, [sp], #0x10
-        //   ret
-        private static ReadOnlySpan<byte> DwarfArm64EmptyFrame => new byte[]
+        /// <summary>
+        /// Converts the CFI unwind blob of an ARM64 method into an Apple compact unwinding code.
+        /// </summary>
+        /// <param name="blobData">CFI records, 8 bytes each: code offset (1 byte), opcode (1 byte),
+        /// DWARF register (int16, little-endian) and offset (int32, little-endian).</param>
+        /// <returns>A frame-based or frameless encoding with a flag for every saved register pair,
+        /// or <c>UNWIND_ARM64_MODE_DWARF</c> when the layout cannot be expressed in the compact
+        /// format.</returns>
+        private static uint GetArm64CompactUnwindCode(byte[] blobData)
         {
-            0x04, 0x00, 0xFF, 0xFF, 0x10, 0x00, 0x00, 0x00,
-            0x04, 0x02, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x04, 0x02, 0x1E, 0x00, 0x08, 0x00, 0x00, 0x00,
-            0x08, 0x01, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00
-        };
+            if (blobData == null || blobData.Length == 0)
+            {
+                return UNWIND_ARM64_MODE_FRAMELESS;
+            }
+
+            Debug.Assert(blobData.Length % 8 == 0);
+
+            short spReg = -1;
+
+            short cfaRegister = spReg;
+            int cfaOffset = 0;
+            int spOffset = 0;
+
+            const int REG_DWARF_X19 = 19;
+            const int REG_DWARF_X30 = 30;
+            const int REG_DWARF_FP = 29;
+            const int REG_DWARF_D8 = 72;
+            const int REG_DWARF_D15 = 79;
+            const int REG_IDX_X19 = 0;
+            const int REG_IDX_X28 = 9;
+            const int REG_IDX_FP = 10;
+            const int REG_IDX_LR = 11;
+            const int REG_IDX_D8 = 12;
+            const int REG_IDX_D15 = 19;
+            Span<int> registerOffset = stackalloc int[20];
+
+            registerOffset.Fill(int.MinValue);
+
+            // First process all the CFI codes to figure out the layout of X19-X28, FP, LR, and
+            // D8-D15 on the stack.
+            int offset = 0;
+            while (offset < blobData.Length)
+            {
+                offset++; // The code offset byte is not needed to compute the encoding.
+                CFI_OPCODE opcode = (CFI_OPCODE)blobData[offset++];
+                short dwarfReg = BinaryPrimitives.ReadInt16LittleEndian(blobData.AsSpan(offset));
+                offset += sizeof(short);
+                int cfiOffset = BinaryPrimitives.ReadInt32LittleEndian(blobData.AsSpan(offset));
+                offset += sizeof(int);
+
+                switch (opcode)
+                {
+                    case CFI_OPCODE.CFI_DEF_CFA_REGISTER:
+                        cfaRegister = dwarfReg;
+
+                        if (spOffset != 0)
+                        {
+                            for (int i = 0; i < registerOffset.Length; i++)
+                                if (registerOffset[i] != int.MinValue)
+                                    registerOffset[i] -= spOffset;
+
+                            cfaOffset += spOffset;
+                            spOffset = 0;
+                        }
+
+                        break;
+
+                    case CFI_OPCODE.CFI_REL_OFFSET:
+                        Debug.Assert(cfaRegister == spReg);
+                        if (dwarfReg >= REG_DWARF_X19 && dwarfReg <= REG_DWARF_X30) // X19 - X28, FP, LR
+                        {
+                            registerOffset[dwarfReg - REG_DWARF_X19 + REG_IDX_X19] = cfiOffset;
+                        }
+                        else if (dwarfReg >= REG_DWARF_D8 && dwarfReg <= REG_DWARF_D15) // D8 - D15
+                        {
+                            registerOffset[dwarfReg - REG_DWARF_D8 + REG_IDX_D8] = cfiOffset;
+                        }
+                        else
+                        {
+                            // We cannot represent this register in the compact unwinding format,
+                            // fallback to DWARF immediately.
+                            return UNWIND_ARM64_MODE_DWARF;
+                        }
+                        break;
+
+                    case CFI_OPCODE.CFI_ADJUST_CFA_OFFSET:
+                        if (cfaRegister != spReg)
+                        {
+                            cfaOffset += cfiOffset;
+                        }
+                        else
+                        {
+                            spOffset += cfiOffset;
+
+                            for (int i = 0; i < registerOffset.Length; i++)
+                                if (registerOffset[i] != int.MinValue)
+                                    registerOffset[i] += cfiOffset;
+                        }
+                        break;
+                }
+            }
+
+            uint unwindCode;
+            int nextOffset;
+
+            if (cfaRegister == REG_DWARF_FP &&
+                cfaOffset == 16 &&
+                registerOffset[REG_IDX_FP] == -16 &&
+                registerOffset[REG_IDX_LR] == -8)
+            {
+                // Frame format - FP/LR are saved on the top. SP is restored to FP+16
+                unwindCode = UNWIND_ARM64_MODE_FRAME;
+                nextOffset = -24;
+            }
+            else if (cfaRegister == -1 && spOffset <= 65520 &&
+                     registerOffset[REG_IDX_FP] == int.MinValue && registerOffset[REG_IDX_LR] == int.MinValue)
+            {
+                // Frameless format - FP/LR are not saved, SP must fit within the representable range
+                uint encodedSpOffset = (uint)(spOffset / 16) << 12;
+                unwindCode = UNWIND_ARM64_MODE_FRAMELESS | encodedSpOffset;
+                nextOffset = spOffset - 8;
+            }
+            else
+            {
+                return UNWIND_ARM64_MODE_DWARF;
+            }
+
+            // Check that the integer register pairs are in the right order and mark
+            // a flag for each successive pair that is present.
+            for (int i = REG_IDX_X19; i < REG_IDX_X28; i += 2)
+            {
+                if (registerOffset[i] == int.MinValue && registerOffset[i + 1] == int.MinValue)
+                    continue; // Pair not saved
+
+                // Half-saved, out-of-order or misplaced pairs cannot be encoded. Silently
+                // skipping them would make the unwinder miss a register restore.
+                if (registerOffset[i] != nextOffset || registerOffset[i + 1] != nextOffset - 8)
+                    return UNWIND_ARM64_MODE_DWARF;
+
+                nextOffset -= 16;
+                unwindCode |= UNWIND_ARM64_FRAME_X19_X20_PAIR << ((i - REG_IDX_X19) >> 1);
+            }
+
+            // Check that the floating point register pairs are in the right order and mark
+            // a flag for each successive pair that is present.
+            for (int i = REG_IDX_D8; i < REG_IDX_D15; i += 2)
+            {
+                if (registerOffset[i] == int.MinValue && registerOffset[i + 1] == int.MinValue)
+                    continue; // Pair not saved
+                if (registerOffset[i] != nextOffset || registerOffset[i + 1] != nextOffset - 8)
+                    return UNWIND_ARM64_MODE_DWARF; // Same rule as for the integer pairs
+                nextOffset -= 16;
+                // D8 is at index REG_IDX_D8, so rebase before deriving the flag bit.
+                unwindCode |= UNWIND_ARM64_FRAME_D8_D9_PAIR << ((i - REG_IDX_D8) >> 1);
+            }
+
+            return unwindCode;
+        }
 
         private protected override bool EmitCompactUnwinding(string startSymbolName, ulong length, string lsdaSymbolName, byte[] blob)
         {
@@ -779,11 +917,7 @@ private protected override bool EmitCompactUnwinding(string startSymbolName, ulo
 
             if (_cpuType == CPU_TYPE_ARM64)
             {
-                if (blob.AsSpan().SequenceEqual(DwarfArm64EmptyFrame))
-                {
-                    // Frame-based encoding, no saved registers
-                    encoding = 0x04000000;
-                }
+                encoding = GetArm64CompactUnwindCode(blob);
             }
 
             _compactUnwindCodes.Add(new CompactUnwindCode(