@@ -352,6 +352,40 @@ class SIGfx7CacheControl : public SIGfx6CacheControl {
 };
 
+class SIGfx10CacheControl : public SIGfx7CacheControl {
+protected:
+  bool CuMode = false;
+
+  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
+  /// is modified, false otherwise.
+  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
+    return enableNamedBit<AMDGPU::OpName::dlc>(MI);
+  }
+
+public:
+
+  SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
+      SIGfx7CacheControl(ST), CuMode(CuMode) {};
+
+  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace) const override;
+
+  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
+
+  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                             SIAtomicScope Scope,
+                             SIAtomicAddrSpace AddrSpace,
+                             Position Pos) const override;
+
+  bool insertWait(MachineBasicBlock::iterator &MI,
+                  SIAtomicScope Scope,
+                  SIAtomicAddrSpace AddrSpace,
+                  SIMemOp Op,
+                  bool IsCrossAddrSpaceOrdering,
+                  Position Pos) const override;
+};
+
 class SIMemoryLegalizer final : public MachineFunctionPass {
 private:
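Annotation: the new enableDLCBit helper reuses the enableNamedBit template inherited from the base cache-control class. A minimal self-contained sketch of that pattern follows; it is not the LLVM implementation, and FakeInstr with its string-keyed operand map is an invented stand-in for MachineInstr plus AMDGPU::getNamedOperandIdx:

```cpp
// Sketch of the "enable a named bit operand" pattern (illustration only).
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

struct FakeInstr {
  // Operand name -> immediate value (1-bit cache-policy flags like glc/dlc).
  std::map<std::string, int64_t> Operands;
};

// Mirrors the documented contract: returns true only if the instruction
// was actually modified.
bool enableNamedBit(FakeInstr &MI, const std::string &Name) {
  auto It = MI.Operands.find(Name);
  if (It == MI.Operands.end())
    return false;        // Instruction has no such operand.
  if (It->second != 0)
    return false;        // Bit already set: instruction unchanged.
  It->second = 1;
  return true;           // Instruction was modified.
}

int main() {
  FakeInstr Load{{{"glc", 0}, {"dlc", 0}}};
  std::cout << enableNamedBit(Load, "dlc") << "\n";  // 1: modified
  std::cout << enableNamedBit(Load, "dlc") << "\n";  // 0: already set
  std::cout << "dlc = " << Load.Operands["dlc"] << "\n";
}
```

The modified/unchanged return value matters because the callers below OR it into `Changed` to report whether the pass altered the function.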
@@ -623,7 +657,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
   GCNSubtarget::Generation Generation = ST.getGeneration();
   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
     return make_unique<SIGfx6CacheControl>(ST);
-  return make_unique<SIGfx7CacheControl>(ST);
+  if (Generation < AMDGPUSubtarget::GFX10)
+    return make_unique<SIGfx7CacheControl>(ST);
+  return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
 }
 
 bool SIGfx6CacheControl::enableLoadCacheBypass(
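Annotation: the factory now dispatches on three generation ranges instead of two, and only the GFX10 path carries extra state (whether the subtarget runs in CU or WGP mode). A compilable C++14 sketch of the dispatch shape, with stand-in types for GCNSubtarget and the cache-control classes:

```cpp
// Generation-dispatch factory sketch (simplified stand-ins, not LLVM types).
#include <iostream>
#include <memory>

enum class Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, GFX9, GFX10 };

struct CacheControl {
  virtual ~CacheControl() = default;
  virtual const char *name() const = 0;
};
struct Gfx6 : CacheControl { const char *name() const override { return "gfx6"; } };
struct Gfx7 : CacheControl { const char *name() const override { return "gfx7"; } };
struct Gfx10 : CacheControl {
  bool CuMode;
  explicit Gfx10(bool CuMode) : CuMode(CuMode) {}
  const char *name() const override { return CuMode ? "gfx10/CU" : "gfx10/WGP"; }
};

std::unique_ptr<CacheControl> create(Generation Gen, bool CuMode) {
  if (Gen <= Generation::SOUTHERN_ISLANDS)
    return std::make_unique<Gfx6>();
  if (Gen < Generation::GFX10)
    return std::make_unique<Gfx7>();      // GFX7 handling also covers GFX8/9.
  return std::make_unique<Gfx10>(CuMode); // Only GFX10 needs the CU-mode flag.
}

int main() {
  std::cout << create(Generation::GFX9, false)->name() << "\n";  // gfx7
  std::cout << create(Generation::GFX10, true)->name() << "\n";  // gfx10/CU
}
```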
@@ -860,6 +896,231 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
   return Changed;
 }
 
+bool SIGfx10CacheControl::enableLoadCacheBypass(
+    const MachineBasicBlock::iterator &MI,
+    SIAtomicScope Scope,
+    SIAtomicAddrSpace AddrSpace) const {
+  assert(MI->mayLoad() && !MI->mayStore());
+  bool Changed = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    /// TODO Do not set glc for rmw atomic operations as they
+    /// implicitly bypass the L0/L1 caches.
+
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      Changed |= enableGLCBit(MI);
+      Changed |= enableDLCBit(MI);
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU
+      // of the WGP. Therefore we need to bypass the L0, which is per CU. In
+      // CU mode all waves of a work-group are on the same CU, so the L0
+      // does not need to be bypassed.
+      if (!CuMode) Changed |= enableGLCBit(MI);
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to bypass.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory caches
+  /// to be bypassed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  return Changed;
+}
+
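Annotation: the scope switch above reduces to a small decision table. System and agent scope set both GLC and DLC (which on GFX10, per the AMDGPU memory-model documentation, roughly control L0 and L1 bypass respectively); workgroup scope sets GLC only when the WGP's two CUs do not share an L0; narrower scopes set nothing. A compilable sketch of that table as a pure function (Scope and Bits are illustrative stand-ins, not LLVM types):

```cpp
// Decision-table sketch for the GFX10 load-bypass policy (illustration only).
#include <cstdio>

enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };
struct Bits { bool GLC = false, DLC = false; };

Bits loadBypassBits(Scope S, bool CuMode) {
  Bits B;
  switch (S) {
  case Scope::System:
  case Scope::Agent:
    B.GLC = true;       // Bypass the per-CU L0.
    B.DLC = true;       // Bypass the L1 as well.
    break;
  case Scope::Workgroup:
    // Only in WGP mode does a work-group span two CUs with separate L0s;
    // in CU mode all waves share one L0, so no bypass is needed.
    B.GLC = !CuMode;
    break;
  case Scope::Wavefront:
  case Scope::SingleThread:
    break;              // No cache to bypass.
  }
  return B;
}

int main() {
  Bits B = loadBypassBits(Scope::Workgroup, /*CuMode=*/false);
  std::printf("workgroup/WGP: glc=%d dlc=%d\n", B.GLC, B.DLC);  // glc=1 dlc=0
}
```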
+bool SIGfx10CacheControl::enableNonTemporal(
+    const MachineBasicBlock::iterator &MI) const {
+  assert(MI->mayLoad() ^ MI->mayStore());
+  bool Changed = false;
+
+  Changed |= enableSLCBit(MI);
+  /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
+
+  return Changed;
+}
+
+bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
+                                                SIAtomicScope Scope,
+                                                SIAtomicAddrSpace AddrSpace,
+                                                Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
+      Changed = true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU
+      // of the WGP. Therefore we need to invalidate the L0, which is per CU.
+      // In CU mode all waves of a work-group are on the same CU, so the L0
+      // does not need to be invalidated.
+      if (!CuMode) {
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
+        Changed = true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // No cache to invalidate.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  /// The scratch address space does not need the global memory cache
+  /// to be flushed as all memory operations by the same thread are
+  /// sequentially consistent, and no other thread can access scratch
+  /// memory.
+
+  /// Other address spaces do not have a cache.
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
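Annotation: BuildMI inserts before the iterator it is given, so the Position::AFTER handling steps past MI before emitting the invalidates and steps back once afterwards, leaving the iterator on the last emitted instruction so a forward-iterating caller resumes after the inserted code. A self-contained illustration, using std::list as a stand-in for a MachineBasicBlock:

```cpp
// The ++MI / insert / --MI dance (illustration; not LLVM code).
#include <iostream>
#include <list>
#include <string>

int main() {
  std::list<std::string> Block = {"atomic_op", "next_inst"};
  auto MI = Block.begin();               // Points at "atomic_op".

  ++MI;                                  // Position::AFTER: step past MI.
  Block.insert(MI, "buffer_gl0_inv");    // insert() inserts *before* the
  Block.insert(MI, "buffer_gl1_inv");    // iterator, i.e. after atomic_op.
  --MI;                                  // Undo the ++; MI now points at the
                                         // last inserted instruction.
  for (const auto &I : Block)
    std::cout << I << "\n";  // atomic_op, buffer_gl0_inv, buffer_gl1_inv, next_inst
  std::cout << "MI -> " << *MI << "\n";  // MI -> buffer_gl1_inv
}
```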
+bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+                                     SIAtomicScope Scope,
+                                     SIAtomicAddrSpace AddrSpace,
+                                     SIMemOp Op,
+                                     bool IsCrossAddrSpaceOrdering,
+                                     Position Pos) const {
+  bool Changed = false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Pos == Position::AFTER)
+    ++MI;
+
+  bool VMCnt = false;
+  bool VSCnt = false;
+  bool LGKMCnt = false;
+
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+        VMCnt |= true;
+      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+        VSCnt |= true;
+      break;
+    case SIAtomicScope::WORKGROUP:
+      // In WGP mode the waves of a work-group can be executing on either CU
+      // of the WGP. Therefore we need to wait for operations to complete to
+      // ensure they are visible to waves in the other CU, as the L0 is per
+      // CU. In CU mode all waves of a work-group are on the same CU, which
+      // shares the same L0.
+      if (!CuMode) {
+        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+          VMCnt |= true;
+        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+          VSCnt |= true;
+      }
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The L0 cache keeps all memory operations in order for
+      // work-items in the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+    case SIAtomicScope::WORKGROUP:
+      // If there is no cross address space ordering then an LDS waitcnt is
+      // not needed, as LDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. It is required if also
+      // synchronizing with global/GDS memory, as LDS operations could be
+      // reordered with respect to later global/GDS memory operations of the
+      // same wave.
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The LDS keeps all memory operations in order for
+      // the same wavefront.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
+    switch (Scope) {
+    case SIAtomicScope::SYSTEM:
+    case SIAtomicScope::AGENT:
+      // If there is no cross address space ordering then a GDS waitcnt is
+      // not needed, as GDS operations for all waves are executed in a total
+      // global ordering as observed by all waves. It is required if also
+      // synchronizing with global/LDS memory, as GDS operations could be
+      // reordered with respect to later global/LDS memory operations of the
+      // same wave.
+      LGKMCnt |= IsCrossAddrSpaceOrdering;
+      break;
+    case SIAtomicScope::WORKGROUP:
+    case SIAtomicScope::WAVEFRONT:
+    case SIAtomicScope::SINGLETHREAD:
+      // The GDS keeps all memory operations in order for
+      // the same work-group.
+      break;
+    default:
+      llvm_unreachable("Unsupported synchronization scope");
+    }
+  }
+
+  if (VMCnt || LGKMCnt) {
+    unsigned WaitCntImmediate =
+      AMDGPU::encodeWaitcnt(IV,
+                            VMCnt ? 0 : getVmcntBitMask(IV),
+                            getExpcntBitMask(IV),
+                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+    Changed = true;
+  }
+
+  if (VSCnt) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+      .addImm(0);
+    Changed = true;
+  }
+
+  if (Pos == Position::AFTER)
+    --MI;
+
+  return Changed;
+}
+
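Annotation: GFX10 splits the store counter out of S_WAITCNT into a separate S_WAITCNT_VSCNT instruction, which is why VSCnt is emitted on its own while vmcnt and lgkmcnt share one packed immediate. A counter field of 0 means "wait until this counter drains"; its all-ones bitmask means "do not wait on it". A simplified sketch of the packing follows; the real AMDGPU::encodeWaitcnt uses ISA-version-specific field widths and positions taken from IV, so the 4/3/4-bit layout below is an assumption for illustration only:

```cpp
// Simplified waitcnt-immediate packing (assumed layout, not the real encoding).
#include <cstdio>

constexpr unsigned VmcntMask = 0xF;    // assumed 4-bit vmcnt field
constexpr unsigned ExpcntMask = 0x7;   // assumed 3-bit expcnt field
constexpr unsigned LgkmcntMask = 0xF;  // assumed 4-bit lgkmcnt field

unsigned encodeWaitcnt(unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
  return (Vmcnt & VmcntMask) |
         ((Expcnt & ExpcntMask) << 4) |
         ((Lgkmcnt & LgkmcntMask) << 7);
}

int main() {
  bool VMCnt = true, LGKMCnt = false;  // e.g. agent-scope load ordering
  unsigned Imm = encodeWaitcnt(VMCnt ? 0 : VmcntMask,
                               ExpcntMask,            // never wait on expcnt
                               LGKMCnt ? 0 : LgkmcntMask);
  std::printf("s_waitcnt imm = 0x%x\n", Imm);  // vmcnt(0), other fields masked
}
```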
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;