[AArch64][SME] Support Windows/stack probes in MachineSMEABIPass #149063
base: users/MacDue/machine-sme
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

On Windows, or with stack probes on other targets, additional code needs to be inserted after dynamic stack allocations to validate stack accesses and/or ensure enough stack space has been allocated. Rather than handle this case in the MachineSMEABIPass (like we do for the standard case), we allocate the memory for the lazy save buffer in SelectionDAG, which allows the existing expansions to emit the correct code.

Note: This means in these cases we may allocate a lazy save buffer when there are no lazy saves present in the function (as we have to allocate the buffer before the MachineSMEABIPass runs).

Full diff: https://github.com/llvm/llvm-project/pull/149063.diff

7 Files Affected:
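For the ZA-state case, the SelectionDAG lowering is conceptually the same as if the function had allocated the buffer itself in IR. A minimal sketch (illustrative only; the lowering builds the equivalent DAG nodes directly, and the function name here is made up):

; Sketch: ZA-state function whose lazy save buffer is a dynamic alloca.
; On aarch64-windows (or with "probe-stack"="inline-asm"), the dynamic
; alloca is what triggers the existing __chkstk / probing-loop expansions.
declare i64 @llvm.aarch64.sme.cntsb()  ; streaming vector length in bytes (rdsvl #1)

define void @za_function() "aarch64_inout_za" {
  %svl  = call i64 @llvm.aarch64.sme.cntsb()
  %size = mul i64 %svl, %svl             ; lazy save buffer is SVL.B x SVL.B bytes
  %buf  = alloca i8, i64 %size, align 16 ; dynamic alloca -> stack probes
  ; ...the SME_STATE_ALLOC marker node is emitted after this point, so
  ; MachineSMEABIPass can insert the TPIDR2 block setup once the (possibly
  ; multi-block) allocation code is complete.
  ret void
}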
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index c0d118aa3afed..b76172fd2e934 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1642,6 +1642,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case AArch64::InOutZAUsePseudo:
case AArch64::RequiresZASavePseudo:
+ case AArch64::SMEStateAllocPseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 49135d05b689b..d586942582d8b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8154,7 +8154,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- if (!Subtarget->useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+ if (Subtarget->useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+ if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+ SDValue Size;
+ if (Attrs.hasZAState()) {
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
+ Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ SDValue Callee = DAG.getExternalSymbol(
+ "__arm_sme_state_size", getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1,
+ RetTy, Callee, {});
+ std::tie(Size, Chain) = LowerCallTo(CLI);
+ }
+ if (Size) {
+ SDValue Buffer = DAG.getNode(
+ ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+ Chain = Buffer.getValue(1);
+
+ Register BufferPtr =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+ DAG.getVTList(MVT::Other), Chain);
+ FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ }
+ } else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 3f6980fe11aea..0f9654939be53 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -239,6 +239,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;
+ // Holds the TPIDR2 block if allocated early (for Windows/stack probes
+ // support).
+ Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
// Note: The following properties are only used for the old SME ABI lowering:
/// The frame-index for the TPIDR2 object used for lazy saves.
TPIDR2Object TPIDR2;
@@ -257,6 +261,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
+ void setEarlyAllocSMESaveBuffer(Register Ptr) {
+ EarlyAllocSMESaveBuffer = Ptr;
+ }
+
+ Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
+
// Old SME ABI lowering state getters/setters:
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index d50dcd8ebd815..02d86a924e7d2 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -90,6 +90,8 @@ let hasSideEffects = 1 in {
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
}
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
def CommitZAPseudo
: Pseudo<(outs),
(ins GPR64:$tpidr2_el0, i64imm:$restore_routine, variable_ops), []>,
@@ -105,6 +107,11 @@ def AArch64_requires_za_save
[SDNPHasChain, SDNPInGlue]>;
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
+def AArch64_sme_state_alloc
+ : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
+ [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 9ef4d71d04568..287cc86e19bde 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -166,6 +166,7 @@ struct MachineSMEABI : public MachineFunctionPass {
SmallVector<BlockInfo> Blocks;
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
+ std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
} State;
EdgeBundles *Bundles = nullptr;
@@ -212,6 +213,13 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+ // The SMEStateAllocPseudo marker is added to a function if the save
+ // buffer was allocated in SelectionDAG. It marks the end of the
+ // allocation -- which is a safe point for this pass to insert any TPIDR2
+ // block setup.
+ if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+ State.AfterSMEProloguePt = MBBI;
+ }
auto [NeededState, InsertPt] = getInstNeededZAState(
TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface());
assert((InsertPt == MBBI ||
@@ -465,23 +473,25 @@ void MachineSMEABI::emitAllocateLazySaveBuffer(
auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- Register Buffer = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII.get(AArch64::RDSVLI_XI), SVL).addImm(1);
// 1. Allocate the lazy save buffer.
- {
+ if (Buffer == AArch64::NoRegister) {
// TODO This function grows the stack with a subtraction, which doesn't work
// on Windows. Some refactoring to share the functionality in
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
// supports SME
assert(!Subtarget.isTargetWindows() &&
"Lazy ZA save is not yet supported on Windows");
+ Buffer = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
// Get original stack pointer.
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), SP).addReg(AArch64::SP);
// Allocate a lazy-save buffer object of the size given, normally SVL * SVL
@@ -632,8 +642,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
// Allocate save buffer (if needed).
if (State.TPIDR2Block.has_value()) {
- MachineBasicBlock &EntryBlock = MF.front();
- emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ if (State.AfterSMEProloguePt) {
+ // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+ // entry block (due to the probing loop).
+ emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+ *State.AfterSMEProloguePt);
+ } else {
+ MachineBasicBlock &EntryBlock = MF.front();
+ emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ }
}
return true;
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
new file mode 100644
index 0000000000000..873b6d9244f46
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
+
+declare void @private_za_callee()
+declare void @shared_za_callee() "aarch64_inout_za"
+
+define void @test_lazy_save() nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test_lazy_save:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mul x9, x8, x8
+; CHECK-NEXT: lsr x15, x9, #4
+; CHECK-NEXT: bl __chkstk
+; CHECK-NEXT: sub x9, sp, x15, lsl #4
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: bl private_za_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB0_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: test_lazy_save:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT: mul x9, x8, x8
+; CHECK-NEWLOWERING-NEXT: lsr x15, x9, #4
+; CHECK-NEWLOWERING-NEXT: bl __chkstk
+; CHECK-NEWLOWERING-NEXT: sub x9, sp, x15, lsl #4
+; CHECK-NEWLOWERING-NEXT: mov sp, x9
+; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
+; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEWLOWERING-NEXT: bl private_za_callee
+; CHECK-NEWLOWERING-NEXT: smstart za
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2
+; CHECK-NEWLOWERING-NEXT: // %bb.1:
+; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: .LBB0_2:
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
+ call void @private_za_callee()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index a9ad6f695cf8f..1a039eeb76956 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -103,7 +103,6 @@ exit:
ret float %ret
}
-; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
@@ -165,26 +164,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEWLOWERING-NEXT: cmp sp, x9
+; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
+; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
+; CHECK-NEWLOWERING-NEXT: b .LBB2_1
+; CHECK-NEWLOWERING-NEXT: .LBB2_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x9
+; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
+; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
+; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT: b .LBB2_3
-; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
+; CHECK-NEWLOWERING-NEXT: b .LBB2_6
+; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
-; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
+; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
+; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
+; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
On a side-note, doing inline stack probes on Windows, instead of using __chkstk, is allowed; __chkstk is just faster for large allocations because it caches the size of the stack. Not sure if that changes what you want to do here.
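For context, the __chkstk sequence in the test checks above follows the arm64 Windows protocol: the caller passes the allocation size in x15 in 16-byte units, __chkstk probes the guard pages, and the caller then performs the actual sp adjustment. Annotated (comments added; the instructions are taken from the CHECK lines):

rdsvl x8, #1              // x8 = streaming vector length in bytes
mul   x9, x8, x8          // x9 = buffer size (SVL * SVL)
lsr   x15, x9, #4         // __chkstk expects the size in x15, in 16-byte units
bl    __chkstk            // probes each guard page for the requested size
sub   x9, sp, x15, lsl #4 // caller does the real allocation...
mov   sp, x9              // ...and commits the new stack pointer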
// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII.get(AArch64::RDSVLI_XI), SVL).addImm(1);

// 1. Allocate the lazy save buffer.
-    {
+    if (Buffer == AArch64::NoRegister) {
  // TODO This function grows the stack with a subtraction, which doesn't work
  // on Windows. Some refactoring to share the functionality in
  // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
  // supports SME
Update this comment?
I've rewritten this comment 👍 I think there's still a TODO to improve how this is handled.
What is allowed? I think part of your message got cut off 😅
I'll restate: On Windows, there are guard pages, so you have to grow the stack one page at a time. There are two ways to do this: __chkstk, or explicit memory accesses. __chkstk is faster for large allocations because it caches the size of the stack. LLVM currently relies on explicit accesses for prologues that allocate less than one page, and __chkstk for all other allocations. But the ABI doesn't require that; we can use whatever mix we want.
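The explicit-access flavour is what the updated multi_bb_stpidr2_save_required_stackprobe checks above show. Annotated (comments added, using that test's 64 KiB "stack-probe-size"):

.LBB2_1:                        // probing loop
  sub  sp, sp, #16, lsl #12     // step sp down by one probe interval (65536)
  cmp  sp, x9                   // reached the allocation target in x9?
  b.le .LBB2_3
  str  xzr, [sp]                // touch the new page, hitting guard pages in order
  b    .LBB2_1
.LBB2_3:
  mov  sp, x9                   // final adjustment to the exact size
  ldr  xzr, [sp]                // probe the final page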
Thanks 👍 My current thought is that it does not change anything for this patch (which mainly aims to restore the functionality that existed with the old lowering, which solved this the same way). However, it could help simplify the implementation if we move the handling for Windows into the MachineSMEABIPass (if we want to remove redundant buffer allocations on that target).