-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[AMDGPU] Select flat GVS loads on gfx1250 #149183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Select flat GVS loads on gfx1250 #149183
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec)

Changes

Patch is 117.84 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/149183.diff — 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 06e23dbb92450..3965b5dd8c5c3 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1250,6 +1250,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
(inst $saddr, $voffset, $offset, 0, $in)
>;
+class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
+ (inst $saddr, $voffset, $offset, (i32 0), $in)
+>;
+
+class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
+ (inst $saddr, $voffset, $offset, (i32 0))
+>;
+
class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, (i32 0))
@@ -1260,7 +1270,7 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $vaddr, $offset)
>;
-class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, 0)
>;
@@ -1444,7 +1454,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1454,7 +1464,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu
let AddedComplexity = 10;
}
- def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
@@ -1618,32 +1628,60 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
}
+multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat <inst, node, vt>;
+
+ def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
+multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+ def : FlatLoadPat_D16_t16 <inst, node, vt>;
+
+ def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let AddedComplexity = 9;
+ let SubtargetPredicate = HasFlatGVSMode;
+ }
+}
+
let OtherPredicates = [HasFlatAddressSpace] in {
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let True16Predicate = p in {
- def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
@@ -1651,28 +1689,28 @@ let True16Predicate = p in {
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
+ defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
-def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
foreach vt = Reg32Types.types in {
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
}
@@ -1684,7 +1722,7 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
foreach vt = VReg_128.RegTypes in {
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
+defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a1e14d90ebcab..6109a2c4dfc7f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6460,7 +6460,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
if (OldSAddrIdx < 0)
return false;
- assert(isSegmentSpecificFLAT(Inst));
+ assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode()));
int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
if (NewOpc < 0)
@@ -6537,7 +6537,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
// FIXME: Remove this when SelectionDAG is obsoleted.
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
- if (!isSegmentSpecificFLAT(MI))
+ if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode())
return;
// Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
new file mode 100644
index 0000000000000..f0988a17b35f0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -0,0 +1,2405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+; Test using saddr addressing mode of flat_*load_* instructions.
+
+; --------------------------------------------------------------------------------
+; No vgpr offset, constants
+; --------------------------------------------------------------------------------
+
+; SGPR base only
+define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) {
+; GFX1250-LABEL: flat_load_saddr_i8_offset_0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+ %load = load i8, ptr %sbase
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx1250 immediate offset
+define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) {
+; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum gfx1250 immediate offset + 1
+define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) {
+; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388608
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx1250 immediate offset
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) {
+; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388608
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; SGPR base with maximum negative gfx1250 immediate offset -1
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff7fffff
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388609
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
+; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000
+; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, 1
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967296
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967297
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971391
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
+ %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971392
+ %load = load i8, ptr %gep0
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) {
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0...
[truncated]
|
|
It will create some downstream conflicts, not all flat features are upstreamed yet. In particular some of the patterns shall take CPol operand, but that is a later patch. |
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

The following files introduce new uses of undef (file list not preserved in this capture):
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef` and `poison` values where possible; use a function argument instead.

For example, this is considered a bad practice:

define void @fn() {
...
br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
...
br i1 %cond, ...
Please refer to the Undefined Behavior Manual for more information.
No description provided.