-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[LLVM][NVPTX] Upstream tanh intrinsic for libdevice #149596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Meredith Julian (mjulian31) ChangesCurrently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test. Full diff: https://github.com/llvm/llvm-project/pull/149596.diff 5 Files Affected:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 86285a03c66bb..7b2640ede9c49 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1438,6 +1438,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
+ .Case("tanh.approx.f32", true)
.Default(false);
if (Expand) {
@@ -2532,6 +2533,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
+ } else if (Name == "tanh.approx.f32") {
+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32
+ FastMathFlags FMF;
+ FMF.setApproxFunc();
+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),
+ FMF);
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7aa06f9079b09..74511929bb1c6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -950,10 +950,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // fsin, fcos, and ftanh are not supported on f64
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a5bb83dfadb84..959e5297971f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1233,7 +1233,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1249,6 +1249,9 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index a17f11a680aa2..362586af4f9b7 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float)
declare float @llvm.nvvm.fabs.ftz.f(float)
declare double @llvm.nvvm.fabs.d(double)
+declare float @llvm.nvvm.tanh.approx.f32(float)
+
declare i16 @llvm.nvvm.max.s(i16, i16)
declare i32 @llvm.nvvm.max.i(i32, i32)
declare i64 @llvm.nvvm.max.ll(i64, i64)
@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) {
ret void
}
+; CHECK-LABEL: @tanh
+define void @tanh(float %a) {
+; CHECK: call afn float @llvm.tanh.f32(float %a)
+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)
+ ret void
+}
+
; CHECK-LABEL: @min_max
define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) {
; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
new file mode 100644
index 0000000000000..73f3ca68768e9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+
+
+define float @test1(float %in) local_unnamed_addr {
+; CHECK-LABEL: test1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define float @test2(float %in) local_unnamed_addr {
+; CHECK-LABEL: test2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = tail call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define half @test3(half %in) local_unnamed_addr {
+; CHECK-LABEL: test3(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+define half @test4(half %in) local_unnamed_addr {
+; CHECK-LABEL: test4(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = tail call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+declare float @llvm.tanh.f32(float)
+declare half @llvm.tanh.f16(half)
+
|
@llvm/pr-subscribers-llvm-ir Author: Meredith Julian (mjulian31) ChangesCurrently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test. Full diff: https://github.com/llvm/llvm-project/pull/149596.diff 5 Files Affected:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 86285a03c66bb..7b2640ede9c49 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1438,6 +1438,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
+ .Case("tanh.approx.f32", true)
.Default(false);
if (Expand) {
@@ -2532,6 +2533,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
+ } else if (Name == "tanh.approx.f32") {
+ // nvvm.tanh.approx.f32 -> afn llvm.tanh.f32
+ FastMathFlags FMF;
+ FMF.setApproxFunc();
+ Rep = Builder.CreateUnaryIntrinsic(Intrinsic::tanh, CI->getArgOperand(0),
+ FMF);
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7aa06f9079b09..74511929bb1c6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -950,10 +950,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
for (const auto &Op :
- {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
- setOperationAction(Op, MVT::f64, Legal);
+ // fsin, fcos, and ftanh are not supported on f64
+ if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
+ setOperationAction(Op, MVT::f64, Legal);
+ }
setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
setOperationAction(Op, MVT::bf16, Promote);
AddPromotedToType(Op, MVT::bf16, MVT::f32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index a5bb83dfadb84..959e5297971f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1233,7 +1233,7 @@ defm FMA_F32 : FMA<F32RT, allow_ftz = true>;
defm FMA_F32x2 : FMA<F32X2RT, allow_ftz = true, preds = [hasF32x2Instructions]>;
defm FMA_F64 : FMA<F64RT, allow_ftz = false>;
-// sin/cos
+// sin/cos/tanh
class UnaryOpAllowsApproxFn<SDPatternOperator operator>
: PatFrag<(ops node:$A),
@@ -1249,6 +1249,9 @@ def COS_APPROX_f32 :
BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz),
"cos.approx$ftz.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
+def TANH_APPROX_f32 :
+ BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32",
+ [(set f32:$dst, (UnaryOpAllowsApproxFn<ftanh> f32:$src))]>;
//-----------------------------------
// Bitwise operations
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index a17f11a680aa2..362586af4f9b7 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -17,6 +17,8 @@ declare float @llvm.nvvm.fabs.f(float)
declare float @llvm.nvvm.fabs.ftz.f(float)
declare double @llvm.nvvm.fabs.d(double)
+declare float @llvm.nvvm.tanh.approx.f32(float)
+
declare i16 @llvm.nvvm.max.s(i16, i16)
declare i32 @llvm.nvvm.max.i(i32, i32)
declare i64 @llvm.nvvm.max.ll(i64, i64)
@@ -138,6 +140,13 @@ define void @fabs(float %a, double %b) {
ret void
}
+; CHECK-LABEL: @tanh
+define void @tanh(float %a) {
+; CHECK: call afn float @llvm.tanh.f32(float %a)
+ %r1 = call float @llvm.nvvm.tanh.approx.f32(float %a)
+ ret void
+}
+
; CHECK-LABEL: @min_max
define void @min_max(i16 %a1, i16 %a2, i32 %b1, i32 %b2, i64 %c1, i64 %c2) {
; CHECK: [[maxs:%[a-zA-Z0-9.]+]] = icmp sge i16 %a1, %a2
diff --git a/llvm/test/CodeGen/NVPTX/tanhf.ll b/llvm/test/CodeGen/NVPTX/tanhf.ll
new file mode 100644
index 0000000000000..73f3ca68768e9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tanhf.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
+
+
+define float @test1(float %in) local_unnamed_addr {
+; CHECK-LABEL: test1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define float @test2(float %in) local_unnamed_addr {
+; CHECK-LABEL: test2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %call = tail call afn float @llvm.tanh.f32(float %in)
+ ret float %call
+}
+
+define half @test3(half %in) local_unnamed_addr {
+; CHECK-LABEL: test3(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test3_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+define half @test4(half %in) local_unnamed_addr {
+; CHECK-LABEL: test4(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [test4_param_0];
+; CHECK-NEXT: cvt.f32.f16 %r1, %rs1;
+; CHECK-NEXT: tanh.approx.f32 %r2, %r1;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT: ret;
+ %call = tail call afn half @llvm.tanh.f16(half %in)
+ ret half %call
+}
+
+declare float @llvm.tanh.f32(float)
+declare half @llvm.tanh.f16(half)
+
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with a nit.
For the context: it appears that the dependency on the new intrinsic was introduced in libdevice in CUDA-12.8+ and we're not touching it from the clang side yet, AFAICT. Please make sure that all tests pass -- github reports that some CI checks have failed. |
It looks like these tests are failing from this recent commit, and are not related to this change: #149571 |
And yes, this was added to libdevice in CUDA 12.8- hence adding the auto-upgrade for the previously-used intrinsic |
Currently __nv_fast_tanhf() in libdevice maps to an nvvm intrinsic that has not been upstreamed, which is causing issues when using the NVPTX backend from upstream. Instead of upstreaming the intrinsic, we can instead use the existing Intrinsic::tanh with the afn flag. This change adds NVPTX backend support for ISD::TANH, adds auto-upgrade for the old tanh_approx intrinsic to @llvm.tanh.f32 with afn flag so that libdevice works properly upstream, and adds a basic codegen test and a case to the auto-upgrade test.