Skip to content

Commit 4e69df0

Browse files
committed
Revert "AMDGPU: Temporary drop s_mul_hi_i/u32 patterns"
This reverts commit fe23ed2. It was never really clear that the s_mul_hi_i32/s_mul_hi_u32 selection patterns were actually responsible for the performance regressions that led to them being dropped. A long time has passed since then, and scalar patterns for these instructions are needed to get GlobalISel working.
1 parent 60249c2 commit 4e69df0

File tree

3 files changed

+41
-28
lines changed

3 files changed

+41
-28
lines changed

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -609,8 +609,12 @@ let SubtargetPredicate = isGFX9Plus in {
609609
def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
610610
} // End Defs = [SCC]
611611

612-
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
613-
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
612+
let isCommutable = 1 in {
613+
def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32",
614+
[(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>;
615+
def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32",
616+
[(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>;
617+
}
614618
} // End SubtargetPredicate = isGFX9Plus
615619

616620
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,13 +1354,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
13541354
; GFX9-NEXT: s_cbranch_execz BB6_2
13551355
; GFX9-NEXT: ; %bb.1:
13561356
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
1357-
; GFX9-NEXT: v_mov_b32_e32 v1, s6
13581357
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1359-
; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1
13601358
; GFX9-NEXT: s_mul_i32 s7, s3, s6
1359+
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
1360+
; GFX9-NEXT: s_add_i32 s8, s8, s7
13611361
; GFX9-NEXT: s_mul_i32 s6, s2, s6
13621362
; GFX9-NEXT: v_mov_b32_e32 v1, s6
1363-
; GFX9-NEXT: v_add_u32_e32 v2, s7, v2
1363+
; GFX9-NEXT: v_mov_b32_e32 v2, s8
13641364
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
13651365
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
13661366
; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1399,11 +1399,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
13991399
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
14001400
; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
14011401
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1402-
; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6
1403-
; GFX1064-NEXT: s_mul_i32 s7, s2, s6
1404-
; GFX1064-NEXT: s_mul_i32 s6, s3, s6
1405-
; GFX1064-NEXT: v_mov_b32_e32 v1, s7
1406-
; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2
1402+
; GFX1064-NEXT: s_mul_i32 s7, s3, s6
1403+
; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
1404+
; GFX1064-NEXT: s_mul_i32 s6, s2, s6
1405+
; GFX1064-NEXT: s_add_i32 s8, s8, s7
1406+
; GFX1064-NEXT: v_mov_b32_e32 v1, s6
1407+
; GFX1064-NEXT: v_mov_b32_e32 v2, s8
14071408
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14081409
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
14091410
; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1441,11 +1442,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
14411442
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
14421443
; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
14431444
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1444-
; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5
1445-
; GFX1032-NEXT: s_mul_i32 s6, s2, s5
1446-
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
1447-
; GFX1032-NEXT: v_mov_b32_e32 v1, s6
1448-
; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2
1445+
; GFX1032-NEXT: s_mul_i32 s6, s3, s5
1446+
; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
1447+
; GFX1032-NEXT: s_mul_i32 s5, s2, s5
1448+
; GFX1032-NEXT: s_add_i32 s7, s7, s6
1449+
; GFX1032-NEXT: v_mov_b32_e32 v1, s5
1450+
; GFX1032-NEXT: v_mov_b32_e32 v2, s7
14491451
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14501452
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
14511453
; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -2439,13 +2441,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
24392441
; GFX9-NEXT: s_cbranch_execz BB12_2
24402442
; GFX9-NEXT: ; %bb.1:
24412443
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
2442-
; GFX9-NEXT: v_mov_b32_e32 v1, s6
24432444
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2444-
; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1
24452445
; GFX9-NEXT: s_mul_i32 s7, s3, s6
2446+
; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
2447+
; GFX9-NEXT: s_add_i32 s8, s8, s7
24462448
; GFX9-NEXT: s_mul_i32 s6, s2, s6
24472449
; GFX9-NEXT: v_mov_b32_e32 v1, s6
2448-
; GFX9-NEXT: v_add_u32_e32 v2, s7, v2
2450+
; GFX9-NEXT: v_mov_b32_e32 v2, s8
24492451
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
24502452
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24512453
; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2484,11 +2486,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
24842486
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
24852487
; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
24862488
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2487-
; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6
2488-
; GFX1064-NEXT: s_mul_i32 s7, s2, s6
2489-
; GFX1064-NEXT: s_mul_i32 s6, s3, s6
2490-
; GFX1064-NEXT: v_mov_b32_e32 v1, s7
2491-
; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2
2489+
; GFX1064-NEXT: s_mul_i32 s7, s3, s6
2490+
; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
2491+
; GFX1064-NEXT: s_mul_i32 s6, s2, s6
2492+
; GFX1064-NEXT: s_add_i32 s8, s8, s7
2493+
; GFX1064-NEXT: v_mov_b32_e32 v1, s6
2494+
; GFX1064-NEXT: v_mov_b32_e32 v2, s8
24922495
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24932496
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
24942497
; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2526,11 +2529,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
25262529
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
25272530
; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
25282531
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2529-
; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5
2530-
; GFX1032-NEXT: s_mul_i32 s6, s2, s5
2531-
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
2532-
; GFX1032-NEXT: v_mov_b32_e32 v1, s6
2533-
; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2
2532+
; GFX1032-NEXT: s_mul_i32 s6, s3, s5
2533+
; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
2534+
; GFX1032-NEXT: s_mul_i32 s5, s2, s5
2535+
; GFX1032-NEXT: s_add_i32 s7, s7, s6
2536+
; GFX1032-NEXT: v_mov_b32_e32 v1, s5
2537+
; GFX1032-NEXT: v_mov_b32_e32 v2, s7
25342538
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
25352539
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
25362540
; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]

llvm/test/CodeGen/AMDGPU/mul.ll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,11 @@ define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
141141
; crash with a 'failed to select' error.
142142

143143
; FUNC-LABEL: {{^}}s_mul_i64:
144+
; GFX9_10-DAG: s_mul_i32
145+
; GFX9_10-DAG: s_mul_hi_u32
146+
; GFX9_10-DAG: s_mul_i32
147+
; GFX9_10-DAG: s_mul_i32
148+
; GFX9_10: s_endpgm
144149
define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
145150
%mul = mul i64 %a, %b
146151
store i64 %mul, i64 addrspace(1)* %out, align 8

0 commit comments

Comments
 (0)