Skip to content

Commit 30fcd29

Browse files
committed
[X86][SSE] lowerShuffleWithSHUFPS - commute the '2*V1+2*V2 elements' mask if doing so allows a load to be folded
As mentioned on D73023.
1 parent 805c157 commit 30fcd29

File tree

2 files changed: 15 additions and 9 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13316,10 +13316,11 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13316 13316
/// It makes no assumptions about whether this is the *best* lowering, it simply
13317 13317
/// uses it.
13318 13318
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13319-
ArrayRef<int> Mask, SDValue V1,
13319+
ArrayRef<int> OriginalMask, SDValue V1,
13320 13320
SDValue V2, SelectionDAG &DAG) {
13321 13321
SDValue LowV = V1, HighV = V2;
13322-
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
13322+
SmallVector<int, 4> Mask(OriginalMask.begin(), OriginalMask.end());
13323+
SmallVector<int, 4> NewMask = Mask;
13323 13324

13324 13325
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13325 13326

@@ -13357,6 +13358,14 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13357 13358
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13358 13359
}
13359 13360
} else if (NumV2Elements == 2) {
13361+
// If we are likely to fold V1 but not V2, then commute the shuffle.
13362+
if (MayFoldLoad(V1) && !MayFoldLoad(V2)) {
13363+
ShuffleVectorSDNode::commuteMask(Mask);
13364+
NewMask = Mask;
13365+
std::swap(V1, V2);
13366+
std::swap(LowV, HighV);
13367+
}
13368+
13360 13369
if (Mask[0] < 4 && Mask[1] < 4) {
13361 13370
// Handle the easy case where we have V1 in the low lanes and V2 in the
13362 13371
// high lanes.

llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2471,17 +2471,14 @@ define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
2471 2471
define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
2472 2472
; SSE-LABEL: shuffle_mem_v4f32_0624:
2473 2473
; SSE: # %bb.0:
2474-
; SSE-NEXT: movaps (%rdi), %xmm1
2475-
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,0]
2476-
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2477-
; SSE-NEXT: movaps %xmm1, %xmm0
2474+
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2475+
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2478 2476
; SSE-NEXT: retq
2479 2477
;
2480 2478
; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2481 2479
; AVX1OR2: # %bb.0:
2482-
; AVX1OR2-NEXT: vmovaps (%rdi), %xmm1
2483-
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,0]
2484-
; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
2480+
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2481+
; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2485 2482
; AVX1OR2-NEXT: retq
2486 2483
;
2487 2484
; AVX512VL-LABEL: shuffle_mem_v4f32_0624:

0 commit comments

Comments
 (0)