@@ -1354,13 +1354,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
1354
1354
; GFX9-NEXT: s_cbranch_execz BB6_2
1355
1355
; GFX9-NEXT: ; %bb.1:
1356
1356
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
1357
- ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1358
1357
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1359
- ; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1
1360
1358
; GFX9-NEXT: s_mul_i32 s7, s3, s6
1359
+ ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
1360
+ ; GFX9-NEXT: s_add_i32 s8, s8, s7
1361
1361
; GFX9-NEXT: s_mul_i32 s6, s2, s6
1362
1362
; GFX9-NEXT: v_mov_b32_e32 v1, s6
1363
- ; GFX9-NEXT: v_add_u32_e32 v2, s7, v2
1363
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s8
1364
1364
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
1365
1365
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1366
1366
; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1399,11 +1399,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
1399
1399
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
1400
1400
; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
1401
1401
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1402
- ; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6
1403
- ; GFX1064-NEXT: s_mul_i32 s7, s2, s6
1404
- ; GFX1064-NEXT: s_mul_i32 s6, s3, s6
1405
- ; GFX1064-NEXT: v_mov_b32_e32 v1, s7
1406
- ; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2
1402
+ ; GFX1064-NEXT: s_mul_i32 s7, s3, s6
1403
+ ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
1404
+ ; GFX1064-NEXT: s_mul_i32 s6, s2, s6
1405
+ ; GFX1064-NEXT: s_add_i32 s8, s8, s7
1406
+ ; GFX1064-NEXT: v_mov_b32_e32 v1, s6
1407
+ ; GFX1064-NEXT: v_mov_b32_e32 v2, s8
1407
1408
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1408
1409
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
1409
1410
; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -1441,11 +1442,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
1441
1442
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
1442
1443
; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
1443
1444
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1444
- ; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5
1445
- ; GFX1032-NEXT: s_mul_i32 s6, s2, s5
1446
- ; GFX1032-NEXT: s_mul_i32 s5, s3, s5
1447
- ; GFX1032-NEXT: v_mov_b32_e32 v1, s6
1448
- ; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2
1445
+ ; GFX1032-NEXT: s_mul_i32 s6, s3, s5
1446
+ ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
1447
+ ; GFX1032-NEXT: s_mul_i32 s5, s2, s5
1448
+ ; GFX1032-NEXT: s_add_i32 s7, s7, s6
1449
+ ; GFX1032-NEXT: v_mov_b32_e32 v1, s5
1450
+ ; GFX1032-NEXT: v_mov_b32_e32 v2, s7
1449
1451
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1450
1452
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
1451
1453
; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
@@ -2439,13 +2441,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
2439
2441
; GFX9-NEXT: s_cbranch_execz BB12_2
2440
2442
; GFX9-NEXT: ; %bb.1:
2441
2443
; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
2442
- ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2443
2444
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2444
- ; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1
2445
2445
; GFX9-NEXT: s_mul_i32 s7, s3, s6
2446
+ ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
2447
+ ; GFX9-NEXT: s_add_i32 s8, s8, s7
2446
2448
; GFX9-NEXT: s_mul_i32 s6, s2, s6
2447
2449
; GFX9-NEXT: v_mov_b32_e32 v1, s6
2448
- ; GFX9-NEXT: v_add_u32_e32 v2, s7, v2
2450
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s8
2449
2451
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
2450
2452
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2451
2453
; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2484,11 +2486,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
2484
2486
; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
2485
2487
; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
2486
2488
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2487
- ; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6
2488
- ; GFX1064-NEXT: s_mul_i32 s7, s2, s6
2489
- ; GFX1064-NEXT: s_mul_i32 s6, s3, s6
2490
- ; GFX1064-NEXT: v_mov_b32_e32 v1, s7
2491
- ; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2
2489
+ ; GFX1064-NEXT: s_mul_i32 s7, s3, s6
2490
+ ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
2491
+ ; GFX1064-NEXT: s_mul_i32 s6, s2, s6
2492
+ ; GFX1064-NEXT: s_add_i32 s8, s8, s7
2493
+ ; GFX1064-NEXT: v_mov_b32_e32 v1, s6
2494
+ ; GFX1064-NEXT: v_mov_b32_e32 v2, s8
2492
2495
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2493
2496
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
2494
2497
; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
@@ -2526,11 +2529,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
2526
2529
; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
2527
2530
; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
2528
2531
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2529
- ; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5
2530
- ; GFX1032-NEXT: s_mul_i32 s6, s2, s5
2531
- ; GFX1032-NEXT: s_mul_i32 s5, s3, s5
2532
- ; GFX1032-NEXT: v_mov_b32_e32 v1, s6
2533
- ; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2
2532
+ ; GFX1032-NEXT: s_mul_i32 s6, s3, s5
2533
+ ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
2534
+ ; GFX1032-NEXT: s_mul_i32 s5, s2, s5
2535
+ ; GFX1032-NEXT: s_add_i32 s7, s7, s6
2536
+ ; GFX1032-NEXT: v_mov_b32_e32 v1, s5
2537
+ ; GFX1032-NEXT: v_mov_b32_e32 v2, s7
2534
2538
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2535
2539
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
2536
2540
; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
0 commit comments