Skip to content

Commit 17ff078

Browse files
committed
[AMDGPU] gfx1010 memory legalizer
Differential Revision: https://reviews.llvm.org/D61535 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360087 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent c8792bb commit 17ff078

7 files changed

+4813
-913
lines changed

lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 262 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,40 @@ class SIGfx7CacheControl : public SIGfx6CacheControl {
352352

353353
};
354354

355+
class SIGfx10CacheControl : public SIGfx7CacheControl {
356+
protected:
357+
bool CuMode = false;
358+
359+
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
360+
/// is modified, false otherwise.
361+
bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
362+
return enableNamedBit<AMDGPU::OpName::dlc>(MI);
363+
}
364+
365+
public:
366+
367+
SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
368+
SIGfx7CacheControl(ST), CuMode(CuMode) {};
369+
370+
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
371+
SIAtomicScope Scope,
372+
SIAtomicAddrSpace AddrSpace) const override;
373+
374+
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
375+
376+
bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
377+
SIAtomicScope Scope,
378+
SIAtomicAddrSpace AddrSpace,
379+
Position Pos) const override;
380+
381+
bool insertWait(MachineBasicBlock::iterator &MI,
382+
SIAtomicScope Scope,
383+
SIAtomicAddrSpace AddrSpace,
384+
SIMemOp Op,
385+
bool IsCrossAddrSpaceOrdering,
386+
Position Pos) const override;
387+
};
388+
355389
class SIMemoryLegalizer final : public MachineFunctionPass {
356390
private:
357391

@@ -623,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
623657
GCNSubtarget::Generation Generation = ST.getGeneration();
624658
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
625659
return make_unique<SIGfx6CacheControl>(ST);
626-
return make_unique<SIGfx7CacheControl>(ST);
660+
if (Generation < AMDGPUSubtarget::GFX10)
661+
return make_unique<SIGfx7CacheControl>(ST);
662+
return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
627663
}
628664

629665
bool SIGfx6CacheControl::enableLoadCacheBypass(
@@ -860,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
860896
return Changed;
861897
}
862898

899+
bool SIGfx10CacheControl::enableLoadCacheBypass(
900+
const MachineBasicBlock::iterator &MI,
901+
SIAtomicScope Scope,
902+
SIAtomicAddrSpace AddrSpace) const {
903+
assert(MI->mayLoad() && !MI->mayStore());
904+
bool Changed = false;
905+
906+
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
907+
/// TODO Do not set glc for rmw atomic operations as they
908+
/// implicitly bypass the L0/L1 caches.
909+
910+
switch (Scope) {
911+
case SIAtomicScope::SYSTEM:
912+
case SIAtomicScope::AGENT:
913+
Changed |= enableGLCBit(MI);
914+
Changed |= enableDLCBit(MI);
915+
break;
916+
case SIAtomicScope::WORKGROUP:
917+
// In WGP mode the waves of a work-group can be executing on either CU of
918+
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
919+
// CU mode and all waves of a work-group are on the same CU, and so the
920+
// L0 does not need to be bypassed.
921+
if (!CuMode) Changed |= enableGLCBit(MI);
922+
break;
923+
case SIAtomicScope::WAVEFRONT:
924+
case SIAtomicScope::SINGLETHREAD:
925+
// No cache to bypass.
926+
break;
927+
default:
928+
llvm_unreachable("Unsupported synchronization scope");
929+
}
930+
}
931+
932+
/// The scratch address space does not need the global memory caches
933+
/// to be bypassed as all memory operations by the same thread are
934+
/// sequentially consistent, and no other thread can access scratch
935+
/// memory.
936+
937+
/// Other address spaces do not hava a cache.
938+
939+
return Changed;
940+
}
941+
942+
bool SIGfx10CacheControl::enableNonTemporal(
943+
const MachineBasicBlock::iterator &MI) const {
944+
assert(MI->mayLoad() ^ MI->mayStore());
945+
bool Changed = false;
946+
947+
Changed |= enableSLCBit(MI);
948+
/// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
949+
950+
return Changed;
951+
}
952+
953+
/// Inserts the cache invalidate instructions required for \p Scope and
/// \p AddrSpace in front of the instruction at \p MI (Position::BEFORE is
/// assumed for any \p Pos other than AFTER), or in front of the instruction
/// following it when \p Pos is Position::AFTER.
///
/// \returns true if at least one instruction is inserted, false otherwise.
bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                                SIAtomicScope Scope,
                                                SIAtomicAddrSpace AddrSpace,
                                                Position Pos) const {
  // Capture the block and debug location before the iterator is moved.
  MachineBasicBlock &Block = *MI->getParent();
  const DebugLoc Loc = MI->getDebugLoc();
  const bool InsertAfter = (Pos == Position::AFTER);

  // First decide which caches have to be invalidated. Only the global address
  // space is considered: the scratch address space does not need the global
  // memory cache to be flushed as all memory operations by the same thread
  // are sequentially consistent, and no other thread can access scratch
  // memory. Other address spaces do not have a cache.
  bool InvalidateGL0 = false;
  bool InvalidateGL1 = false;
  const bool TouchesGlobal =
      (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
  if (TouchesGlobal) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      InvalidateGL0 = true;
      InvalidateGL1 = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP, so the per-CU L0 has to be invalidated. In CU mode all
      // waves of a work-group are on the same CU and the L0 does not need to
      // be invalidated.
      InvalidateGL0 = !CuMode;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  // Then emit the selected invalidates, GL0 ahead of GL1.
  if (InsertAfter)
    ++MI;

  if (InvalidateGL0)
    BuildMI(Block, MI, Loc, TII->get(AMDGPU::BUFFER_GL0_INV));
  if (InvalidateGL1)
    BuildMI(Block, MI, Loc, TII->get(AMDGPU::BUFFER_GL1_INV));

  // Step the iterator back by one to undo the advance above.
  if (InsertAfter)
    --MI;

  return InvalidateGL0 || InvalidateGL1;
}
1004+
1005+
/// Inserts the wait instructions required so that the memory operations
/// selected by \p Op on \p AddrSpace are complete at \p Scope.
///
/// The instructions are placed in front of the instruction at \p MI
/// (Position::BEFORE is assumed for any \p Pos other than AFTER), or in front
/// of the instruction following it when \p Pos is Position::AFTER.
/// \p IsCrossAddrSpaceOrdering selects whether LDS/GDS operations have to be
/// waited for as well.
///
/// \returns true if at least one instruction is inserted, false otherwise.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  // Capture the block and debug location before the iterator is moved.
  MachineBasicBlock &Block = *MI->getParent();
  const DebugLoc Loc = MI->getDebugLoc();
  const bool InsertAfter = (Pos == Position::AFTER);

  const bool CoversLoads = (Op & SIMemOp::LOAD) != SIMemOp::NONE;
  const bool CoversStores = (Op & SIMemOp::STORE) != SIMemOp::NONE;

  // Counters that have to be waited on.
  bool WaitVM = false;
  bool WaitVS = false;
  bool WaitLGKM = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      WaitVM = CoversLoads;
      WaitVS = CoversStores;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP, so operations must complete to be visible to waves in the
      // other CU as the L0 is per CU. In CU mode all waves of a work-group
      // are on the same CU, which shares the same L0.
      WaitVM = !CuMode && CoversLoads;
      WaitVS = !CuMode && CoversStores;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for work-items in
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // Without cross address space ordering an LDS waitcnt is not needed as
      // LDS operations for all waves are executed in a total global ordering
      // as observed by all waves. It is required when also synchronizing with
      // global/GDS memory as LDS operations could be reordered with respect
      // to later global/GDS memory operations of the same wave.
      if (IsCrossAddrSpaceOrdering)
        WaitLGKM = true;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Without cross address space ordering a GDS waitcnt is not needed as
      // GDS operations for all waves are executed in a total global ordering
      // as observed by all waves. It is required when also synchronizing with
      // global/LDS memory as GDS operations could be reordered with respect
      // to later global/LDS memory operations of the same wave.
      if (IsCrossAddrSpaceOrdering)
        WaitLGKM = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (InsertAfter)
    ++MI;

  bool Inserted = false;

  if (WaitVM || WaitLGKM) {
    // A requested counter is encoded as 0; every other field is encoded with
    // its full bit mask.
    const unsigned VmField = WaitVM ? 0 : getVmcntBitMask(IV);
    const unsigned ExpField = getExpcntBitMask(IV);
    const unsigned LgkmField = WaitLGKM ? 0 : getLgkmcntBitMask(IV);
    const unsigned Encoded =
        AMDGPU::encodeWaitcnt(IV, VmField, ExpField, LgkmField);
    BuildMI(Block, MI, Loc, TII->get(AMDGPU::S_WAITCNT)).addImm(Encoded);
    Inserted = true;
  }

  if (WaitVS) {
    // Stores are waited on with a separate instruction, emitted after any
    // S_WAITCNT above.
    BuildMI(Block, MI, Loc, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Inserted = true;
  }

  // Step the iterator back by one to undo the advance above.
  if (InsertAfter)
    --MI;

  return Inserted;
}
1123+
8631124
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
8641125
if (AtomicPseudoMIs.empty())
8651126
return false;

0 commit comments

Comments
 (0)