From 88f860c6f5548c458a5c3c9862b781b9df962498 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 12 Sep 2024 18:51:04 +0200 Subject: [PATCH 1/2] JIT/ARM64: Add ability to generate frames compatible with Apple compact unwinding format. For NativeAOT/ARM64/Apple API do the following: - Save callee registers in opposite order and in pairs. - Prefer saving FP/LR on the top of the frame. Heuristics are used to avoid worse code quality outside of prolog/epilog due to addressing range limits of the ARM64 instruction set. - Added optimization to lvaFrameAddress to rewrite FP-x references to SP+y when possible. This allows efficient addressing using positive indexes when FP points to the top of the frame. It mimics similar optimization on ARM32. --- src/coreclr/jit/codegen.h | 1 + src/coreclr/jit/codegenarm64.cpp | 46 ++++++++-- src/coreclr/jit/codegencommon.cpp | 24 ++++++ src/coreclr/jit/compiler.hpp | 10 +++ src/coreclr/jit/lclvars.cpp | 139 ++++++++++++++++++------------ 5 files changed, 158 insertions(+), 62 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 1e53616c2d8e7..73ecd14e03c0d 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -656,6 +656,7 @@ class CodeGen final : public CodeGenInterface virtual bool IsSaveFpLrWithAllCalleeSavedRegisters() const; bool genSaveFpLrWithAllCalleeSavedRegisters; bool genForceFuncletFrameType5; + bool genReverseAndPairCalleeSavedRegisters; #endif // TARGET_ARM64 //------------------------------------------------------------------------- diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index ac28ebd30a19b..d56a92ed0d5ba 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -845,12 +845,19 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i for (int i = 0; i < regStack.Height(); ++i) { - RegPair regPair = regStack.Bottom(i); + RegPair regPair = 
genReverseAndPairCalleeSavedRegisters ? regStack.Top(i) : regStack.Bottom(i); if (regPair.reg2 != REG_NA) { // We can use a STP instruction. - genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0, - nullptr); + if (genReverseAndPairCalleeSavedRegisters) + { + genPrologSaveRegPair(regPair.reg2, regPair.reg1, spOffset, spDelta, false, REG_IP0, nullptr); + } + else + { + genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, REG_IP0, + nullptr); + } spOffset += 2 * slotSize; } @@ -926,8 +933,9 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe // Save integer registers at higher addresses than floating-point registers. + regMaskTP maskSaveRegsFrame = regsToSaveMask & (RBM_FP | RBM_LR); regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; + regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat & ~maskSaveRegsFrame; if (maskSaveRegsFloat != RBM_NONE) { @@ -939,6 +947,13 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe if (maskSaveRegsInt != RBM_NONE) { genSaveCalleeSavedRegisterGroup(maskSaveRegsInt, spDelta, lowestCalleeSavedOffset); + spDelta = 0; + lowestCalleeSavedOffset += genCountBits(maskSaveRegsInt) * FPSAVE_REGSIZE_BYTES; + } + + if (maskSaveRegsFrame != RBM_NONE) + { + genPrologSaveRegPair(REG_FP, REG_LR, lowestCalleeSavedOffset, spDelta, false, REG_IP0, nullptr); // No need to update spDelta, lowestCalleeSavedOffset since they're not used after this. } } @@ -970,13 +985,20 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta stackDelta = spDelta; } - RegPair regPair = regStack.Top(i); + RegPair regPair = genReverseAndPairCalleeSavedRegisters ? 
regStack.Bottom(i) : regStack.Top(i); if (regPair.reg2 != REG_NA) { spOffset -= 2 * slotSize; - genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, REG_IP1, - nullptr); + if (genReverseAndPairCalleeSavedRegisters) + { + genEpilogRestoreRegPair(regPair.reg2, regPair.reg1, spOffset, stackDelta, false, REG_IP1, nullptr); + } + else + { + genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, + REG_IP1, nullptr); + } } else { @@ -1043,11 +1065,19 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in // Save integer registers at higher addresses than floating-point registers. + regMaskTP maskRestoreRegsFrame = regsToRestoreMask & (RBM_FP | RBM_LR); regMaskTP maskRestoreRegsFloat = regsToRestoreMask & RBM_ALLFLOAT; - regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat; + regMaskTP maskRestoreRegsInt = regsToRestoreMask & ~maskRestoreRegsFloat & ~maskRestoreRegsFrame; // Restore in the opposite order of saving. + if (maskRestoreRegsFrame != RBM_NONE) + { + int spFrameDelta = (maskRestoreRegsFloat != RBM_NONE || maskRestoreRegsInt != RBM_NONE) ? 0 : spDelta; + spOffset -= 2 * REGSIZE_BYTES; + genEpilogRestoreRegPair(REG_FP, REG_LR, spOffset, spFrameDelta, false, REG_IP1, nullptr); + } + if (maskRestoreRegsInt != RBM_NONE) { int spIntDelta = (maskRestoreRegsFloat != RBM_NONE) ? 0 : spDelta; // should we delay the SP adjustment? 
diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 9920f9846d273..881b4ff0cc566 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -255,6 +255,7 @@ CodeGen::CodeGen(Compiler* theCompiler) #ifdef TARGET_ARM64 genSaveFpLrWithAllCalleeSavedRegisters = false; genForceFuncletFrameType5 = false; + genReverseAndPairCalleeSavedRegisters = false; #endif // TARGET_ARM64 } @@ -4846,6 +4847,29 @@ void CodeGen::genFinalizeFrame() } #endif // TARGET_ARM +#ifdef TARGET_ARM64 + if (compiler->IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform) + { + JITDUMP("Setting genReverseAndPairCalleeSavedRegisters = true"); + + genReverseAndPairCalleeSavedRegisters = true; + + // Make sure we push the registers in pairs if possible. If we only allocate a contiguous + // block of registers this should add at most one integer and at most one floating point + // register to the list. The stack has to be 16-byte aligned, so in worst case it results + // in allocating 16 bytes more space on stack if odd number of integer and odd number of + // FP registers were occupied. Same number of instructions will be generated, just the + // STR instructions are replaced with STP (store pair). 
+ regMaskTP maskModifiedRegs = regSet.rsGetModifiedRegsMask(); + regMaskTP maskPairRegs = ((maskModifiedRegs & (RBM_V8 | RBM_V10 | RBM_V12 | RBM_V14)).getLow() << 1) | + ((maskModifiedRegs & (RBM_R19 | RBM_R21 | RBM_R23 | RBM_R25 | RBM_R27)).getLow() << 1); + if (maskPairRegs != RBM_NONE) + { + regSet.rsSetRegsModified(maskPairRegs); + } + } +#endif + #ifdef DEBUG if (verbose) { diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 4f70a01a8a79e..401b5993dab1f 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -2807,6 +2807,16 @@ inline { *pBaseReg = REG_SPBASE; } +#elif defined(TARGET_ARM64) + if (FPbased && !codeGen->isFramePointerRequired() && varOffset < 0 && + lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT && codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) + { + int spVarOffset = varOffset + codeGen->genSPtoFPdelta(); + JITDUMP("lvaFrameAddress optimization for V%02u: [FP-%d] -> [SP+%d]\n", varNum, -varOffset, spVarOffset); + FPbased = false; + varOffset = spVarOffset; + } + *pFPbased = FPbased; #else *pFPbased = FPbased; #endif diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index d4adbd6b9907a..1c5973f9d6bb7 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5648,7 +5648,9 @@ void Compiler::lvaFixVirtualFrameOffsets() #endif // The delta to be added to virtual offset to adjust it relative to frame pointer or SP - int delta = 0; + int delta = 0; + int frameLocalsDelta = 0; + int frameBoundary = 0; #ifdef TARGET_XARCH delta += REGSIZE_BYTES; // pushed PC (return address) for x86/x64 @@ -5673,7 +5675,25 @@ void Compiler::lvaFixVirtualFrameOffsets() // We set FP to be after LR, FP delta += 2 * REGSIZE_BYTES; } -#elif defined(TARGET_AMD64) || defined(TARGET_ARM64) +#elif defined(TARGET_ARM64) + else + { + // FP is used. 
+ delta += codeGen->genTotalFrameSize() - codeGen->genSPtoFPdelta(); + + // If we placed FP/LR at the bottom of the frame we need to shift all the variables + // on the new frame to account for it. See lvaAssignVirtualFrameOffsetsToLocals. + if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters()) + { + // We set FP to be after LR, FP + frameLocalsDelta = 2 * REGSIZE_BYTES; + frameBoundary = opts.IsOSR() ? -info.compPatchpointInfo->TotalFrameSize() : 0; + if (info.compIsVarArgs) + frameBoundary -= MAX_REG_ARG * REGSIZE_BYTES; + } + JITDUMP("--- delta bump %d for FP frame, %d inside frame for FP/LR relocation\n", delta, frameLocalsDelta); + } +#elif defined(TARGET_AMD64) else { // FP is used. @@ -5741,7 +5761,7 @@ void Compiler::lvaFixVirtualFrameOffsets() #if defined(TARGET_X86) // On x86, we set the stack offset for a promoted field - // to match a struct parameter in lvAssignFrameOffsetsToPromotedStructs. + // to match a struct parameter in lvaAssignFrameOffsetsToPromotedStructs. if ((!varDsc->lvIsParam || parentvarDsc->lvIsParam) && promotionType == PROMOTION_TYPE_DEPENDENT) #else if (!varDsc->lvIsParam && promotionType == PROMOTION_TYPE_DEPENDENT) @@ -5761,15 +5781,23 @@ void Compiler::lvaFixVirtualFrameOffsets() if (doAssignStkOffs) { - JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), varDsc->GetStackOffset() + delta); - varDsc->SetStackOffset(varDsc->GetStackOffset() + delta); + int localDelta = delta; + + if (frameLocalsDelta != 0 && varDsc->GetStackOffset() < frameBoundary) + { + localDelta += frameLocalsDelta; + } + + JITDUMP("-- V%02u was %d, now %d\n", lclNum, varDsc->GetStackOffset(), + varDsc->GetStackOffset() + localDelta); + varDsc->SetStackOffset(varDsc->GetStackOffset() + localDelta); #if DOUBLE_ALIGN if (genDoubleAlign() && !codeGen->isFramePointerUsed()) { if (varDsc->lvFramePointerBased) { - varDsc->SetStackOffset(varDsc->GetStackOffset() - delta); + varDsc->SetStackOffset(varDsc->GetStackOffset() - localDelta); // We need to 
re-adjust the offsets of the parameters so they are EBP // relative rather than stack/frame pointer relative @@ -5791,9 +5819,13 @@ void Compiler::lvaFixVirtualFrameOffsets() assert(codeGen->regSet.tmpAllFree()); for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp)) { - temp->tdAdjustTempOffs(delta); + temp->tdAdjustTempOffs(delta + frameLocalsDelta); } + if (lvaCachedGenericContextArgOffs < frameBoundary) + { + lvaCachedGenericContextArgOffs += frameLocalsDelta; + } lvaCachedGenericContextArgOffs += delta; #if FEATURE_FIXED_OUT_ARGS @@ -6050,30 +6082,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() codeGen->setFramePointerUsed(codeGen->isFramePointerRequired()); } -#ifdef TARGET_ARM64 - // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is - // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we - // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, - // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the - // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. - // (It should be legal to use these frame types for every frame). 
- - if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) - { - // Default configuration - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || - opts.compDbgEnC || compStressCompile(STRESS_GENERIC_VARN, 20)); - } - else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) - { - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames - } - else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) - { - codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames - } -#endif // TARGET_ARM64 - #ifdef TARGET_XARCH // On x86/amd64, the return address has already been pushed by the call instruction in the caller. stkOffs -= TARGET_POINTER_SIZE; // return address; @@ -6122,9 +6130,13 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #endif // !TARGET_ARM #ifdef TARGET_ARM64 - // If the frame pointer is used, then we'll save FP/LR at the bottom of the stack. - // Otherwise, we won't store FP, and we'll store LR at the top, with the other callee-save - // registers (if any). + // If the frame pointer is used, then we'll save FP/LR either at the bottom of the stack + // or at the top of the stack depending on frame type. We make the decision after assigning + // the variables on the frame and then fix up the offsets in lvaFixVirtualFrameOffsets. + // For now, we proceed as if FP/LR were saved with the callee registers. If we later + // decide to move the FP/LR to the bottom of the frame it shifts all the assigned + // variables and temporaries by 16 bytes. The largest alignment we currently make is 16 + // bytes for SIMD. 
int initialStkOffs = 0; if (info.compIsVarArgs) @@ -6135,17 +6147,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() stkOffs -= initialStkOffs; } - if (codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() || !isFramePointerUsed()) // Note that currently we always have - // a frame pointer - { - stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; - } - else - { - // Subtract off FP and LR. - assert(compCalleeRegsPushed >= 2); - stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; - } + stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) @@ -6815,15 +6817,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() } #endif // TARGET_AMD64 -#ifdef TARGET_ARM64 - if (!codeGen->IsSaveFpLrWithAllCalleeSavedRegisters() && isFramePointerUsed()) // Note that currently we always have - // a frame pointer - { - // Create space for saving FP and LR. - stkOffs -= 2 * REGSIZE_BYTES; - } -#endif // TARGET_ARM64 - #if FEATURE_FIXED_OUT_ARGS if (lvaOutgoingArgSpaceSize > 0) { @@ -6861,6 +6854,44 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() noway_assert(compLclFrameSize + originalFrameSize == (unsigned)-(stkOffs + (pushedCount * (int)TARGET_POINTER_SIZE))); + +#ifdef TARGET_ARM64 + // Decide where to save FP and LR registers. We store FP/LR registers at the bottom of the frame if there is + // a frame pointer used (so we get positive offsets from the frame pointer to access locals), but not if we + // need a GS cookie AND localloc is used, since we need the GS cookie to protect the saved return value, + // and also the saved frame pointer. See CodeGen::genPushCalleeSavedRegisters() for more details about the + // frame types. Since saving FP/LR at high addresses is a relatively rare case, force using it during stress. + // (It should be legal to use these frame types for every frame). 
+ // + // For Apple NativeAOT ABI we try to save the FP/LR registers on top to get canonical frame layout that can + // be represented with compact unwinding information. In order to maintain code quality we only do it when + // we can use SP-based addressing (!isFramePointerRequired) through lvaFrameAddress optimization, or if the + // whole frame is small enough that the negative FP-based addressing can address the whole frame. + + if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 0) + { + if (IsTargetAbi(CORINFO_NATIVEAOT_ABI) && TargetOS::IsApplePlatform && + (!codeGen->isFramePointerRequired() || codeGen->genTotalFrameSize() < 0x100)) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); + } + else + { + // Default configuration + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters((getNeedsGSSecurityCookie() && compLocallocUsed) || + opts.compDbgEnC || + compStressCompile(Compiler::STRESS_GENERIC_VARN, 20)); + } + } + else if (opts.compJitSaveFpLrWithCalleeSavedRegisters == 1) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(false); // Disable using new frames + } + else if ((opts.compJitSaveFpLrWithCalleeSavedRegisters == 2) || (opts.compJitSaveFpLrWithCalleeSavedRegisters == 3)) + { + codeGen->SetSaveFpLrWithAllCalleeSavedRegisters(true); // Force using new frames + } +#endif // TARGET_ARM64 } //------------------------------------------------------------------------ From 73bea2efe7f62e6b36894363da0c1d4c21ed35b3 Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Thu, 12 Sep 2024 18:59:03 +0200 Subject: [PATCH 2/2] ObjWriter: For Mach-O ARM64 try to convert the DWARF CFI unwinding codes into compact unwinding code --- .../Compiler/ObjectWriter/MachNative.cs | 13 ++ .../Compiler/ObjectWriter/MachObjectWriter.cs | 182 +++++++++++++++--- 2 files changed, 171 insertions(+), 24 deletions(-) diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs 
b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs index 14db96a935474..8b3af9392a7b1 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachNative.cs @@ -120,5 +120,18 @@ internal static class MachNative public const uint PLATFORM_TVOSSIMULATOR = 8; public const uint PLATFORM_WATCHOSSIMULATOR = 9; public const uint PLATFORM_DRIVERKIT = 10; + + public const uint UNWIND_ARM64_MODE_FRAMELESS = 0x02000000; + public const uint UNWIND_ARM64_MODE_DWARF = 0x03000000; + public const uint UNWIND_ARM64_MODE_FRAME = 0x04000000; + public const uint UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001; + public const uint UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002; + public const uint UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004; + public const uint UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008; + public const uint UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010; + public const uint UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100; + public const uint UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200; + public const uint UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400; + public const uint UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800; } } diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs index 2424ec434126c..4a1742c36b8f5 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/ObjectWriter/MachObjectWriter.cs @@ -752,26 +752,164 @@ void EmitCompactUnwindSymbol(string symbolName) private protected override string ExternCName(string name) => "_" + name; - // This represents the following DWARF code: - // DW_CFA_advance_loc: 4 - // DW_CFA_def_cfa_offset: +16 - // DW_CFA_offset: W29 -16 - // DW_CFA_offset: W30 -8 - // DW_CFA_advance_loc: 4 - 
// DW_CFA_def_cfa_register: W29
-        // which is generated for the following frame prolog/epilog:
-        // stp fp, lr, [sp, #-10]!
-        // mov fp, sp
-        // ...
-        // ldp fp, lr, [sp], #0x10
-        // ret
-        private static ReadOnlySpan<byte> DwarfArm64EmptyFrame => new byte[]
+        private static uint GetArm64CompactUnwindCode(byte[] blobData)
         {
-            0x04, 0x00, 0xFF, 0xFF, 0x10, 0x00, 0x00, 0x00,
-            0x04, 0x02, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00,
-            0x04, 0x02, 0x1E, 0x00, 0x08, 0x00, 0x00, 0x00,
-            0x08, 0x01, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00
-        };
+            if (blobData == null || blobData.Length == 0)
+            {
+                return UNWIND_ARM64_MODE_FRAMELESS; // no CFI codes: frameless encoding with a zero stack size
+            }
+            // Each CFI record is 8 bytes: code offset (1), opcode (1), DWARF register (2), offset (4).
+            Debug.Assert(blobData.Length % 8 == 0);
+
+            short spReg = -1;
+
+            int codeOffset = 0;
+            short cfaRegister = spReg;
+            int cfaOffset = 0;
+            int spOffset = 0;
+            // DWARF register numbers, and the slots those registers occupy in registerOffset.
+            const int REG_DWARF_X19 = 19;
+            const int REG_DWARF_X30 = 30;
+            const int REG_DWARF_FP = 29;
+            const int REG_DWARF_D8 = 72;
+            const int REG_DWARF_D15 = 79;
+            const int REG_IDX_X19 = 0;
+            const int REG_IDX_X28 = 9;
+            const int REG_IDX_FP = 10;
+            const int REG_IDX_LR = 11;
+            const int REG_IDX_D8 = 12;
+            const int REG_IDX_D15 = 19;
+            Span<int> registerOffset = stackalloc int[20];
+            // int.MinValue marks a register that no CFI code has saved.
+            registerOffset.Fill(int.MinValue);
+
+            // First process all the CFI codes to figure out the layout of X19-X28, FP, LR, and
+            // D8-D15 on the stack.
+            int offset = 0;
+            while (offset < blobData.Length)
+            {
+                codeOffset = Math.Max(codeOffset, blobData[offset++]);
+                CFI_OPCODE opcode = (CFI_OPCODE)blobData[offset++];
+                short dwarfReg = BinaryPrimitives.ReadInt16LittleEndian(blobData.AsSpan(offset));
+                offset += sizeof(short);
+                int cfiOffset = BinaryPrimitives.ReadInt32LittleEndian(blobData.AsSpan(offset));
+                offset += sizeof(int);
+
+                switch (opcode)
+                {
+                    case CFI_OPCODE.CFI_DEF_CFA_REGISTER:
+                        cfaRegister = dwarfReg;
+
+                        if (spOffset != 0)
+                        {
+                            for (int i = 0; i < registerOffset.Length; i++)
+                                if (registerOffset[i] != int.MinValue)
+                                    registerOffset[i] -= spOffset;
+
+                            cfaOffset += spOffset;
+                            spOffset = 0;
+                        }
+
+                        break;
+
+                    case CFI_OPCODE.CFI_REL_OFFSET:
+                        Debug.Assert(cfaRegister == spReg);
+                        if (dwarfReg >= REG_DWARF_X19 && dwarfReg <= REG_DWARF_X30) // X19 - X28, FP, LR
+                        {
+                            registerOffset[dwarfReg - REG_DWARF_X19 + REG_IDX_X19] = cfiOffset;
+                        }
+                        else if (dwarfReg >= REG_DWARF_D8 && dwarfReg <= REG_DWARF_D15) // D8 - D15
+                        {
+                            registerOffset[dwarfReg - REG_DWARF_D8 + REG_IDX_D8] = cfiOffset;
+                        }
+                        else
+                        {
+                            // We cannot represent this register in the compact unwinding format,
+                            // fallback to DWARF immediately.
+                            return UNWIND_ARM64_MODE_DWARF;
+                        }
+                        break;
+
+                    case CFI_OPCODE.CFI_ADJUST_CFA_OFFSET:
+                        if (cfaRegister != spReg)
+                        {
+                            cfaOffset += cfiOffset;
+                        }
+                        else
+                        {
+                            spOffset += cfiOffset;
+
+                            for (int i = 0; i < registerOffset.Length; i++)
+                                if (registerOffset[i] != int.MinValue)
+                                    registerOffset[i] += cfiOffset;
+                        }
+                        break;
+                }
+            }
+            // Classify the frame; nextOffset is the offset at which the first saved register pair must sit.
+            uint unwindCode;
+            int nextOffset;
+
+            if (cfaRegister == REG_DWARF_FP &&
+                cfaOffset == 16 &&
+                registerOffset[REG_IDX_FP] == -16 &&
+                registerOffset[REG_IDX_LR] == -8)
+            {
+                // Frame format - FP/LR are saved on the top. SP is restored to FP+16
+                unwindCode = UNWIND_ARM64_MODE_FRAME;
+                nextOffset = -24;
+            }
+            else if (cfaRegister == -1 && spOffset <= 65520 &&
+                     registerOffset[REG_IDX_FP] == int.MinValue && registerOffset[REG_IDX_LR] == int.MinValue)
+            {
+                // Frameless format - FP/LR are not saved, SP must fit within the representable range
+                uint encodedSpOffset = (uint)(spOffset / 16) << 12;
+                unwindCode = UNWIND_ARM64_MODE_FRAMELESS | encodedSpOffset;
+                nextOffset = spOffset - 8;
+            }
+            else
+            {
+                return UNWIND_ARM64_MODE_DWARF;
+            }
+
+            // Check that the integer register pairs are in the right order and mark a flag for each
+            // successive pair that is present; a pair saved anywhere else falls back to DWARF.
+            for (int i = REG_IDX_X19; i < REG_IDX_X28; i += 2)
+            {
+                if (registerOffset[i] == int.MinValue)
+                {
+                    if (registerOffset[i + 1] != int.MinValue)
+                        return UNWIND_ARM64_MODE_DWARF;
+                }
+                else
+                {
+                    if (registerOffset[i] != nextOffset || registerOffset[i + 1] != nextOffset - 8)
+                        return UNWIND_ARM64_MODE_DWARF;
+                    nextOffset -= 16;
+                    unwindCode |= UNWIND_ARM64_FRAME_X19_X20_PAIR << (i >> 1);
+                }
+            }
+
+            // Same check for the floating point register pairs. The flag bit is indexed relative
+            // to D8, so the slot index has to be rebased before it is turned into a shift count.
+            for (int i = REG_IDX_D8; i < REG_IDX_D15; i += 2)
+            {
+                if (registerOffset[i] == int.MinValue)
+                {
+                    if (registerOffset[i + 1] != int.MinValue)
+                        return UNWIND_ARM64_MODE_DWARF;
+                }
+                else
+                {
+                    if (registerOffset[i] != nextOffset || registerOffset[i + 1] != nextOffset - 8)
+                        return UNWIND_ARM64_MODE_DWARF;
+                    nextOffset -= 16;
+                    unwindCode |= UNWIND_ARM64_FRAME_D8_D9_PAIR << ((i - REG_IDX_D8) >> 1);
+                }
+            }
+
+            return unwindCode;
+        }
 
         private protected override bool EmitCompactUnwinding(string startSymbolName, ulong length, string lsdaSymbolName, byte[] blob)
         {
@@ -779,11 +917,7 @@ private protected override bool EmitCompactUnwinding(string startSymbolName, ulo
 
             if (_cpuType == CPU_TYPE_ARM64)
             {
-                if (blob.AsSpan().SequenceEqual(DwarfArm64EmptyFrame))
-                {
-                    // Frame-based encoding, no saved registers
-                    encoding = 0x04000000;
-                }
+                encoding = GetArm64CompactUnwindCode(blob);
             }
 
             _compactUnwindCodes.Add(new CompactUnwindCode(