From 4ace637a582bc9b769f697912d2c9e9f000763be Mon Sep 17 00:00:00 2001 From: aferust Date: Mon, 28 Nov 2022 15:17:30 +0300 Subject: [PATCH 01/17] initial effort to allocate shared stack memory --- .gitignore | 1 + dub.json | 2 +- source/dcompute/std/memory.d | 125 +++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5347b19..605e644 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ # DUB .dub +dub.selections.json docs.json __dummy.html diff --git a/dub.json b/dub.json index e205164..e6daab5 100644 --- a/dub.json +++ b/dub.json @@ -9,7 +9,7 @@ "derelict-cuda": "~>3.1.1", "taggedalgebraic": "~>0.10.7" }, - "dflags" : ["-mdcompute-targets=cuda-210" ,"-oq"], + "dflags" : ["-mdcompute-targets=cuda-300" ,"-oq"], "configurations": [ { "name": "library", diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index ce51b33..80c637e 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -2,6 +2,11 @@ import ldc.dcompute; +pragma(LDC_inline_ir) + R __irEx(string prefix, string code, string suffix, R, P...)(P) @trusted nothrow @nogc; + +pragma(LDC_inline_ir) + R inlineIR(string s, R, P...)(P) @trusted nothrow @nogc; /* *Pointer conversions: * *Pointer!T genericPtrTo*(GenericPointer!T ptr) @@ -15,3 +20,123 @@ import ldc.dcompute; for what this emulates any why. Memory aligned to A = reduce!max(T.alignof) Returns a tuple of {SharedPointer!(align(A) T), length} "arrays" */ + +alias SharedArr = ulong; + +SharedArr sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ + ulong address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; + `, ` + %el0 = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(3)* @`~uniqueName~`, `~llvmType!T~` 0, i64 0 + %ptrint = ptrtoint `~llvmType!T~` addrspace(3)* %el0 to i64 + ret i64 %ptrint + `, ``, ulong)(); + return address; +} + +void setSharedVal(T)(SharedArr ptrint, size_t index, uint val){ + inlineIR!(` + %sptr = inttoptr i64 %0 to `~llvmType!T~` addrspace(3)* + %lptr = getelementptr inbounds `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %sptr, i64 %1 + store `~llvmType!T~` %2, `~llvmType!T~` addrspace(3)* %lptr, align 4 + ret void`,void)(ptrint, index, val); +} + + +T getSharedVal(T)(SharedArr ptrint, size_t index){ + return inlineIR!(` + %sptr = inttoptr i64 %0 to `~llvmType!T~` addrspace(3)* + %lptr = getelementptr inbounds `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %sptr, i64 %1 + %r = load `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %lptr, align 4 + ret `~llvmType!T~` %r`, T)(ptrint, index); +} + +/+ +SharedPointer!T sharedStaticReserve(T: T[N], string uuid, size_t N)() +{ + /++ + TODO: + -check uid (var name) for compliance + -check N for max available size in terms of microarchitecture of device (fermi, kepler etc) + + +/ + + void* _vp = __irEx!(` + @`~uuid~` = addrspace(3) global [`~Itoa!(N)~` x `~llvmType!T~`] zeroinitializer, align 4 ; + %SharedType = type { `~llvmType!T~` addrspace(3)* } + `, ` + %el0 = getelementptr inbounds [`~Itoa!(N)~` x `~llvmType!T~`], [`~Itoa!(N)~` x `~llvmType!T~`] addrspace(3)* @`~uuid~`, `~llvmType!T~` 0, i64 0 + %SharedTypePtr = alloca %SharedType, align 4 + %tmp = getelementptr inbounds %SharedType, %SharedType* %SharedTypePtr, `~llvmType!T~` 0, `~llvmType!T~` 0 + store `~llvmType!T~` addrspace(3)* %el0, `~llvmType!T~` addrspace(3)** %tmp, align 8 + %retptr = bitcast %SharedType* %SharedTypePtr to i8* + ret i8* %retptr + `, ``, void*)(); + + SharedPointer!T _sptr = *cast(SharedPointer!(T)*)_vp; + return _sptr; +} ++/ +package: +immutable(string) Digit(size_t n)() +{ // "0123456789"[n..n+1]; does not work either + + //enum Digit = `0123456789`[n..n+1]; + + static if(n == 0) + return 0.stringof; + else static if(n == 1) + return 1.stringof; + else static if(n == 2) + return 2.stringof; + else static if(n == 3) + return 3.stringof; + else static if(n == 4) + return 4.stringof; + else static if(n == 5) + return 5.stringof; + else static if(n == 6) + return 6.stringof; + else static if(n == 7) + return 7.stringof; + else static if(n == 8) + return 8.stringof; + else static if(n == 9) + return 9.stringof; + else static assert(0); +} + + +immutable(string) Itoa(uint n)() +{ + static if(n < 0){ + enum ret = "-" ~ Itoa!(-n); + return ret; + } + else static if (n < 10){ + enum ret = Digit!(n); + return ret; + } + else{ + enum ret = Itoa!(n / 10) ~ Digit!(n % 10); + return ret; + } +} + +immutable(string) llvmType(T)() +{ + static if(is(T == float)) + return "float"; + else static if(is(T == double)) + return "double"; + else static if(is(T == byte) || is(T == ubyte) || is(T == void)) + return "i8"; + else static if(is(T == short) || is(T == ushort)) + return "i16"; + else static if(is(T == int) || is(T == uint)) + return "i32"; + else static if(is(T == long) || is(T == ulong)) + return "i64"; + else + static assert(0, + "Can't determine llvm type for D type " ~ T.stringof); +} \ No newline at end of file From 4dc85d45bcebe5f0c8d9b3442b5d74ef5cb756db Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 2 Dec 2022 00:45:10 +0300 Subject: [PATCH 02/17] sharedStaticReserve --- source/dcompute/std/memory.d | 72 ++++++++---------------------------- 1 file changed, 16 insertions(+), 56 deletions(-) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index 80c637e..51fa4c4 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -21,67 +21,27 @@ pragma(LDC_inline_ir) Returns a tuple of {SharedPointer!(align(A) T), length} "arrays" */ -alias SharedArr = ulong; - -SharedArr sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ - ulong address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; +SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ + void* address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; + %Dummy = type { `~llvmType!T~` addrspace(3)* } `, ` - %el0 = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(3)* @`~uniqueName~`, `~llvmType!T~` 0, i64 0 - %ptrint = ptrtoint `~llvmType!T~` addrspace(3)* %el0 to i64 - ret i64 %ptrint - `, ``, ulong)(); - return address; -} - -void setSharedVal(T)(SharedArr ptrint, size_t index, uint val){ - inlineIR!(` - %sptr = inttoptr i64 %0 to `~llvmType!T~` addrspace(3)* - %lptr = getelementptr inbounds `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %sptr, i64 %1 - store `~llvmType!T~` %2, `~llvmType!T~` addrspace(3)* %lptr, align 4 - ret void`,void)(ptrint, index, val); + %sharedptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(3)* @`~uniqueName~`, `~llvmType!T~` 0, i64 0 + + %.structliteral = alloca %Dummy, align 8 + + %dumptr = getelementptr inbounds %Dummy, %Dummy* %.structliteral, i32 0, i32 0 + + store `~llvmType!T~` addrspace(3)* %sharedptr, `~llvmType!T~` addrspace(3)** %dumptr + + %vptr = bitcast %Dummy* %.structliteral to i8* + ret i8* %vptr + `, ``, void*)(); + return *(cast(SharedPointer!(uint)*)address); } - -T getSharedVal(T)(SharedArr ptrint, size_t index){ - return inlineIR!(` - %sptr = inttoptr i64 %0 to `~llvmType!T~` addrspace(3)* - %lptr = getelementptr inbounds `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %sptr, i64 %1 - %r = load `~llvmType!T~`, `~llvmType!T~` addrspace(3)* %lptr, align 4 - ret `~llvmType!T~` %r`, T)(ptrint, index); -} - -/+ -SharedPointer!T sharedStaticReserve(T: T[N], string uuid, size_t N)() -{ - /++ - TODO: - -check uid (var name) for compliance - -check N for max available size in terms of microarchitecture of device (fermi, kepler etc) - - +/ - - void* _vp = __irEx!(` - @`~uuid~` = addrspace(3) global [`~Itoa!(N)~` x `~llvmType!T~`] zeroinitializer, align 4 ; - %SharedType = type { `~llvmType!T~` addrspace(3)* } - `, ` - %el0 = getelementptr inbounds [`~Itoa!(N)~` x `~llvmType!T~`], [`~Itoa!(N)~` x `~llvmType!T~`] addrspace(3)* @`~uuid~`, `~llvmType!T~` 0, i64 0 - %SharedTypePtr = alloca %SharedType, align 4 - %tmp = getelementptr inbounds %SharedType, %SharedType* %SharedTypePtr, `~llvmType!T~` 0, `~llvmType!T~` 0 - store `~llvmType!T~` addrspace(3)* %el0, `~llvmType!T~` addrspace(3)** %tmp, align 8 - %retptr = bitcast %SharedType* %SharedTypePtr to i8* - ret i8* %retptr - `, ``, void*)(); - - SharedPointer!T _sptr = *cast(SharedPointer!(T)*)_vp; - return _sptr; -} -+/ package: immutable(string) Digit(size_t n)() -{ // "0123456789"[n..n+1]; does not work either - - //enum Digit = `0123456789`[n..n+1]; - +{ static if(n == 0) return 0.stringof; else static if(n == 1) From 0c0159651a11b83e323a9e40bbedc605f5f18d98 Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 2 Dec 2022 01:13:45 +0300 Subject: [PATCH 03/17] fix typo --- source/dcompute/std/memory.d | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index 51fa4c4..a2a0063 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -36,7 +36,7 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ %vptr = bitcast %Dummy* %.structliteral to i8* ret i8* %vptr `, ``, void*)(); - return *(cast(SharedPointer!(uint)*)address); + return *(cast(SharedPointer!(T)*)address); } package: @@ -99,4 +99,4 @@ immutable(string) llvmType(T)() else static assert(0, "Can't determine llvm type for D type " ~ T.stringof); -} \ No newline at end of file +} From 88214a4ca9138ab734f160d8aa87dfb0d2b745c8 Mon Sep 17 00:00:00 2001 From: aferust Date: Thu, 8 Dec 2022 15:36:07 +0300 Subject: [PATCH 04/17] make helpers public --- source/dcompute/std/memory.d | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index a2a0063..b79ea76 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -21,6 +21,8 @@ pragma(LDC_inline_ir) Returns a tuple of {SharedPointer!(align(A) T), length} "arrays" */ +@nogc nothrow: + SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ void* address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; %Dummy = type { `~llvmType!T~` addrspace(3)* } @@ -39,7 +41,6 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ return *(cast(SharedPointer!(T)*)address); } -package: immutable(string) Digit(size_t n)() { static if(n == 0) From ad6a73d93d28b7de7ee9d7a1309a9f172d4f9320 Mon Sep 17 00:00:00 2001 From: aferust Date: Tue, 13 Dec 2022 15:43:07 +0300 Subject: [PATCH 05/17] add nvvm math intrinsics --- source/dcompute/std/cuda/math.d | 683 ++++++++++++++++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 source/dcompute/std/cuda/math.d diff --git a/source/dcompute/std/cuda/math.d b/source/dcompute/std/cuda/math.d new file mode 100644 index 0000000..c4bcbd4 --- /dev/null +++ b/source/dcompute/std/cuda/math.d @@ -0,0 +1,683 @@ +module dcompute.std.cuda.math; + +/++ auto-generated intrinsics definitions from CUDA 10.2 - libdevice.10.bc + the generator python script is available at the end of this file ++/ + +//declare float @llvm.nvvm.floor.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.floor.ftz.f") +float floor_ftz_f(float); + +//declare float @llvm.nvvm.floor.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.floor.f") +float floor_f(float); + +//declare double @llvm.nvvm.floor.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.floor.d") +double floor_d(double); + +//declare float @llvm.nvvm.fabs.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.fabs.ftz.f") +float fabs_ftz_f(float); + +//declare float @llvm.nvvm.fabs.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.fabs.f") +float fabs_f(float); + +//declare double @llvm.nvvm.fabs.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.fabs.d") +double fabs_d(double); + +//declare double @llvm.nvvm.rcp.approx.ftz.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.approx.ftz.d") +double rcp_approx_ftz_d(double); + +//declare float @llvm.nvvm.fmin.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fmin.ftz.f") +float fmin_ftz_f(float, float); + +//declare float @llvm.nvvm.fmin.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fmin.f") +float fmin_f(float, float); + +//declare float @llvm.nvvm.fmax.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fmax.ftz.f") +float fmax_ftz_f(float, float); + +//declare float @llvm.nvvm.fmax.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fmax.f") +float fmax_f(float, float); + +//declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rsqrt.approx.ftz.f") +float rsqrt_approx_ftz_f(float); + +//declare float @llvm.nvvm.rsqrt.approx.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rsqrt.approx.f") +float rsqrt_approx_f(float); + +//declare double @llvm.nvvm.fmin.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fmin.d") +double fmin_d(double, double); + +//declare double @llvm.nvvm.fmax.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fmax.d") +double fmax_d(double, double); + +//declare double @llvm.nvvm.rsqrt.approx.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rsqrt.approx.d") +double rsqrt_approx_d(double); + +//declare double @llvm.nvvm.ceil.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.ceil.d") +double ceil_d(double); + +//declare double @llvm.nvvm.trunc.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.trunc.d") +double trunc_d(double); + +//declare float @llvm.nvvm.ex2.approx.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.ex2.approx.ftz.f") +float ex2_approx_ftz_f(float); + +//declare float @llvm.nvvm.ex2.approx.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.ex2.approx.f") +float ex2_approx_f(float); + +//declare float @llvm.nvvm.lg2.approx.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.lg2.approx.ftz.f") +float lg2_approx_ftz_f(float); + +//declare float @llvm.nvvm.lg2.approx.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.lg2.approx.f") +float lg2_approx_f(float); + +//declare float @llvm.nvvm.sin.approx.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sin.approx.ftz.f") +float sin_approx_ftz_f(float); + +//declare float @llvm.nvvm.sin.approx.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sin.approx.f") +float sin_approx_f(float); + +//declare float @llvm.nvvm.cos.approx.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.cos.approx.ftz.f") +float cos_approx_ftz_f(float); + +//declare float @llvm.nvvm.cos.approx.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.cos.approx.f") +float cos_approx_f(float); + +//declare float @llvm.nvvm.round.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.round.ftz.f") +float round_ftz_f(float); + +//declare float @llvm.nvvm.round.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.round.f") +float round_f(float); + +//declare double @llvm.nvvm.round.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.round.d") +double round_d(double); + +//declare float @llvm.nvvm.trunc.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.trunc.ftz.f") +float trunc_ftz_f(float); + +//declare float @llvm.nvvm.trunc.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.trunc.f") +float trunc_f(float); + +//declare float @llvm.nvvm.ceil.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.ceil.ftz.f") +float ceil_ftz_f(float); + +//declare float @llvm.nvvm.ceil.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.ceil.f") +float ceil_f(float); + +//declare float @llvm.nvvm.saturate.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.saturate.ftz.f") +float saturate_ftz_f(float); + +//declare float @llvm.nvvm.saturate.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.saturate.f") +float saturate_f(float); + +//declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rn.ftz.f") +float fma_rn_ftz_f(float, float, float); + +//declare float @llvm.nvvm.fma.rn.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rn.f") +float fma_rn_f(float, float, float); + +//declare float @llvm.nvvm.fma.rz.ftz.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rz.ftz.f") +float fma_rz_ftz_f(float, float, float); + +//declare float @llvm.nvvm.fma.rz.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rz.f") +float fma_rz_f(float, float, float); + +//declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rm.ftz.f") +float fma_rm_ftz_f(float, float, float); + +//declare float @llvm.nvvm.fma.rm.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rm.f") +float fma_rm_f(float, float, float); + +//declare float @llvm.nvvm.fma.rp.ftz.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rp.ftz.f") +float fma_rp_ftz_f(float, float, float); + +//declare float @llvm.nvvm.fma.rp.f(float, float, float) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rp.f") +float fma_rp_f(float, float, float); + +//declare double @llvm.nvvm.fma.rn.d(double, double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rn.d") +double fma_rn_d(double, double, double); + +//declare double @llvm.nvvm.fma.rz.d(double, double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rz.d") +double fma_rz_d(double, double, double); + +//declare double @llvm.nvvm.fma.rm.d(double, double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rm.d") +double fma_rm_d(double, double, double); + +//declare double @llvm.nvvm.fma.rp.d(double, double, double) +pragma(LDC_intrinsic, "llvm.nvvm.fma.rp.d") +double fma_rp_d(double, double, double); + +//declare float @llvm.nvvm.div.approx.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.approx.ftz.f") +float div_approx_ftz_f(float, float); + +//declare float @llvm.nvvm.div.approx.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.approx.f") +float div_approx_f(float, float); + +//declare float @llvm.nvvm.div.rn.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rn.ftz.f") +float div_rn_ftz_f(float, float); + +//declare float @llvm.nvvm.div.rn.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rn.f") +float div_rn_f(float, float); + +//declare float @llvm.nvvm.div.rz.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rz.ftz.f") +float div_rz_ftz_f(float, float); + +//declare float @llvm.nvvm.div.rz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rz.f") +float div_rz_f(float, float); + +//declare float @llvm.nvvm.div.rm.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rm.ftz.f") +float div_rm_ftz_f(float, float); + +//declare float @llvm.nvvm.div.rm.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rm.f") +float div_rm_f(float, float); + +//declare float @llvm.nvvm.div.rp.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rp.ftz.f") +float div_rp_ftz_f(float, float); + +//declare float @llvm.nvvm.div.rp.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.div.rp.f") +float div_rp_f(float, float); + +//declare float @llvm.nvvm.rcp.rn.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rn.ftz.f") +float rcp_rn_ftz_f(float); + +//declare float @llvm.nvvm.rcp.rn.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rn.f") +float rcp_rn_f(float); + +//declare float @llvm.nvvm.rcp.rz.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rz.ftz.f") +float rcp_rz_ftz_f(float); + +//declare float @llvm.nvvm.rcp.rz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rz.f") +float rcp_rz_f(float); + +//declare float @llvm.nvvm.rcp.rm.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rm.ftz.f") +float rcp_rm_ftz_f(float); + +//declare float @llvm.nvvm.rcp.rm.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rm.f") +float rcp_rm_f(float); + +//declare float @llvm.nvvm.rcp.rp.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rp.ftz.f") +float rcp_rp_ftz_f(float); + +//declare float @llvm.nvvm.rcp.rp.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rp.f") +float rcp_rp_f(float); + +//declare float @llvm.nvvm.sqrt.rn.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rn.ftz.f") +float sqrt_rn_ftz_f(float); + +//declare float @llvm.nvvm.sqrt.rn.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rn.f") +float sqrt_rn_f(float); + +//declare float @llvm.nvvm.sqrt.rz.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rz.ftz.f") +float sqrt_rz_ftz_f(float); + +//declare float @llvm.nvvm.sqrt.rz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rz.f") +float sqrt_rz_f(float); + +//declare float @llvm.nvvm.sqrt.rm.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rm.ftz.f") +float sqrt_rm_ftz_f(float); + +//declare float @llvm.nvvm.sqrt.rm.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rm.f") +float sqrt_rm_f(float); + +//declare float @llvm.nvvm.sqrt.rp.ftz.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rp.ftz.f") +float sqrt_rp_ftz_f(float); + +//declare float @llvm.nvvm.sqrt.rp.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rp.f") +float sqrt_rp_f(float); + +//declare double @llvm.nvvm.div.rn.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.div.rn.d") +double div_rn_d(double, double); + +//declare double @llvm.nvvm.div.rz.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.div.rz.d") +double div_rz_d(double, double); + +//declare double @llvm.nvvm.div.rm.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.div.rm.d") +double div_rm_d(double, double); + +//declare double @llvm.nvvm.div.rp.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.div.rp.d") +double div_rp_d(double, double); + +//declare double @llvm.nvvm.rcp.rn.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rn.d") +double rcp_rn_d(double); + +//declare double @llvm.nvvm.rcp.rz.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rz.d") +double rcp_rz_d(double); + +//declare double @llvm.nvvm.rcp.rm.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rm.d") +double rcp_rm_d(double); + +//declare double @llvm.nvvm.rcp.rp.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.rcp.rp.d") +double rcp_rp_d(double); + +//declare double @llvm.nvvm.sqrt.rn.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rn.d") +double sqrt_rn_d(double); + +//declare double @llvm.nvvm.sqrt.rz.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rz.d") +double sqrt_rz_d(double); + +//declare double @llvm.nvvm.sqrt.rm.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rm.d") +double sqrt_rm_d(double); + +//declare double @llvm.nvvm.sqrt.rp.d(double) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.rp.d") +double sqrt_rp_d(double); + +//declare float @llvm.nvvm.sqrt.f(float) +pragma(LDC_intrinsic, "llvm.nvvm.sqrt.f") +float sqrt_f(float); + +//declare double @llvm.nvvm.add.rn.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.add.rn.d") +double add_rn_d(double, double); + +//declare double @llvm.nvvm.add.rz.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.add.rz.d") +double add_rz_d(double, double); + +//declare double @llvm.nvvm.add.rm.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.add.rm.d") +double add_rm_d(double, double); + +//declare double @llvm.nvvm.add.rp.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.add.rp.d") +double add_rp_d(double, double); + +//declare double @llvm.nvvm.mul.rn.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rn.d") +double mul_rn_d(double, double); + +//declare double @llvm.nvvm.mul.rz.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rz.d") +double mul_rz_d(double, double); + +//declare double @llvm.nvvm.mul.rm.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rm.d") +double mul_rm_d(double, double); + +//declare double @llvm.nvvm.mul.rp.d(double, double) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rp.d") +double mul_rp_d(double, double); + +//declare float @llvm.nvvm.add.rm.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rm.ftz.f") +float add_rm_ftz_f(float, float); + +//declare float @llvm.nvvm.add.rm.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rm.f") +float add_rm_f(float, float); + +//declare float @llvm.nvvm.add.rp.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rp.ftz.f") +float add_rp_ftz_f(float, float); + +//declare float @llvm.nvvm.add.rp.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rp.f") +float add_rp_f(float, float); + +//declare float @llvm.nvvm.mul.rm.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rm.ftz.f") +float mul_rm_ftz_f(float, float); + +//declare float @llvm.nvvm.mul.rm.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rm.f") +float mul_rm_f(float, float); + +//declare float @llvm.nvvm.mul.rp.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rp.ftz.f") +float mul_rp_ftz_f(float, float); + +//declare float @llvm.nvvm.mul.rp.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rp.f") +float mul_rp_f(float, float); + +//declare float @llvm.nvvm.add.rn.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rn.ftz.f") +float add_rn_ftz_f(float, float); + +//declare float @llvm.nvvm.add.rn.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rn.f") +float add_rn_f(float, float); + +//declare float @llvm.nvvm.add.rz.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rz.ftz.f") +float add_rz_ftz_f(float, float); + +//declare float @llvm.nvvm.add.rz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.add.rz.f") +float add_rz_f(float, float); + +//declare float @llvm.nvvm.mul.rn.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rn.ftz.f") +float mul_rn_ftz_f(float, float); + +//declare float @llvm.nvvm.mul.rn.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rn.f") +float mul_rn_f(float, float); + +//declare float @llvm.nvvm.mul.rz.ftz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rz.ftz.f") +float mul_rz_ftz_f(float, float); + +//declare float @llvm.nvvm.mul.rz.f(float, float) +pragma(LDC_intrinsic, "llvm.nvvm.mul.rz.f") +float mul_rz_f(float, float); + +//declare float @llvm.nvvm.d2f.rn.ftz(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rn.ftz") +float d2f_rn_ftz(double); + +//declare float @llvm.nvvm.d2f.rn(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rn") +float d2f_rn(double); + +//declare float @llvm.nvvm.d2f.rz.ftz(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rz.ftz") +float d2f_rz_ftz(double); + +//declare float @llvm.nvvm.d2f.rz(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rz") +float d2f_rz(double); + +//declare float @llvm.nvvm.d2f.rm.ftz(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rm.ftz") +float d2f_rm_ftz(double); + +//declare float @llvm.nvvm.d2f.rm(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rm") +float d2f_rm(double); + +//declare float @llvm.nvvm.d2f.rp.ftz(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rp.ftz") +float d2f_rp_ftz(double); + +//declare float @llvm.nvvm.d2f.rp(double) +pragma(LDC_intrinsic, "llvm.nvvm.d2f.rp") +float d2f_rp(double); + +//declare double @llvm.nvvm.i2d.rn(i32) +pragma(LDC_intrinsic, "llvm.nvvm.i2d.rn") +double i2d_rn(int); + +//declare double @llvm.nvvm.ui2d.rn(i32) +pragma(LDC_intrinsic, "llvm.nvvm.ui2d.rn") +double ui2d_rn(int); + +//declare float @llvm.nvvm.i2f.rn(i32) +pragma(LDC_intrinsic, "llvm.nvvm.i2f.rn") +float i2f_rn(int); + +//declare float @llvm.nvvm.i2f.rz(i32) +pragma(LDC_intrinsic, "llvm.nvvm.i2f.rz") +float i2f_rz(int); + +//declare float @llvm.nvvm.i2f.rm(i32) +pragma(LDC_intrinsic, "llvm.nvvm.i2f.rm") +float i2f_rm(int); + +//declare float @llvm.nvvm.i2f.rp(i32) +pragma(LDC_intrinsic, "llvm.nvvm.i2f.rp") +float i2f_rp(int); + +//declare float @llvm.nvvm.ui2f.rn(i32) +pragma(LDC_intrinsic, "llvm.nvvm.ui2f.rn") +float ui2f_rn(int); + +//declare float @llvm.nvvm.ui2f.rz(i32) +pragma(LDC_intrinsic, "llvm.nvvm.ui2f.rz") +float ui2f_rz(int); + +//declare float @llvm.nvvm.ui2f.rm(i32) +pragma(LDC_intrinsic, "llvm.nvvm.ui2f.rm") +float ui2f_rm(int); + +//declare float @llvm.nvvm.ui2f.rp(i32) +pragma(LDC_intrinsic, "llvm.nvvm.ui2f.rp") +float ui2f_rp(int); + +//declare double @llvm.nvvm.lohi.i2d(i32, i32) +pragma(LDC_intrinsic, "llvm.nvvm.lohi.i2d") +double lohi_i2d(int, int); + +//declare float @llvm.nvvm.ll2f.rn(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2f.rn") +float ll2f_rn(long); + +//declare float @llvm.nvvm.ll2f.rz(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2f.rz") +float ll2f_rz(long); + +//declare float @llvm.nvvm.ll2f.rm(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2f.rm") +float ll2f_rm(long); + +//declare float @llvm.nvvm.ll2f.rp(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2f.rp") +float ll2f_rp(long); + +//declare float @llvm.nvvm.ull2f.rn(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2f.rn") +float ull2f_rn(long); + +//declare float @llvm.nvvm.ull2f.rz(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2f.rz") +float ull2f_rz(long); + +//declare float @llvm.nvvm.ull2f.rm(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2f.rm") +float ull2f_rm(long); + +//declare float @llvm.nvvm.ull2f.rp(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2f.rp") +float ull2f_rp(long); + +//declare double @llvm.nvvm.ll2d.rn(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2d.rn") +double ll2d_rn(long); + +//declare double @llvm.nvvm.ll2d.rz(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2d.rz") +double ll2d_rz(long); + +//declare double @llvm.nvvm.ll2d.rm(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2d.rm") +double ll2d_rm(long); + +//declare double @llvm.nvvm.ll2d.rp(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ll2d.rp") +double ll2d_rp(long); + +//declare double @llvm.nvvm.ull2d.rn(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2d.rn") +double ull2d_rn(long); + +//declare double @llvm.nvvm.ull2d.rz(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2d.rz") +double ull2d_rz(long); + +//declare double @llvm.nvvm.ull2d.rm(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2d.rm") +double ull2d_rm(long); + +//declare double @llvm.nvvm.ull2d.rp(i64) +pragma(LDC_intrinsic, "llvm.nvvm.ull2d.rp") +double ull2d_rp(long); + +/+ python codegen from libdevice.10.ll + +# use `llvm-dis libdevice.10.bc` to obtain libdevice.10.ll then run this script + +import re + +def use_regex(input_text): + pattern = re.compile(r"[a-zA-Z]+ [a-zA-Z]+ @([A-Za-z0-9]+(\.[A-Za-z0-9]+)+)\([^)]*\)", re.IGNORECASE) + return pattern.match(input_text) + +def unique(list1): + # initialize a null list + unique_list = [] + # traverse for all elements + for x in list1: + # check if exists in unique_list or not + if x not in unique_list: + unique_list.append(x) + return unique_list + +def getReturnType(s): + rt = "" + i = 0 + while s[i] != " ": + rt += s[i] + i += 1 + return rt + +def getFunNameandArgInd(s): + index = -1; + try: + index = s.index('@llvm.nvvm.') + except ValueError: + return None + fnstartind = index + 11 + i = fnstartind + name = "" + while s[i] != "(": + name += s[i] + i += 1 + name = name.replace(".", "_") + return (name, i) + +def getNVVMname(s): + i = -1; + try: + i = s.index('llvm.nvvm.') + except ValueError: + return None + name = "" + while s[i] != "(": + name += s[i] + i += 1 + return name + +def getParams(s, i): + params = "" + while s[i] != ")": + params += s[i] + i += 1 + return params + ")" + +file1 = open('libdevice.10.ll', 'r') +Lines = file1.readlines() + +mlist = [] + +for line in Lines: + some = use_regex(line) + if some is not None: + mlist.append(some.group(0)) + +ftemplate = "//%s\npragma(LDC_intrinsic, \"%s\")\n%s %s%s;\n" + + +for d in unique(mlist): + raw = d[8:] + rtype = getReturnType(raw) + + rtype = rtype.replace("i64", "long") + rtype = rtype.replace("i32", "int") + + tup = getFunNameandArgInd(raw) + if tup is None: + continue + + lfn = getNVVMname(raw) + if lfn is None: + continue + + name, i = tup + params = getParams(raw, i) + params = params.replace("i64", "long") + params = params.replace("i32", "int") + + print(ftemplate % (d, lfn, rtype, name, params)) ++/ \ No newline at end of file From 727cd8b47fb26718ffa839b3ded024b2ed6ae46b Mon Sep 17 00:00:00 2001 From: aferust Date: Tue, 13 Dec 2022 16:15:38 +0300 Subject: [PATCH 06/17] Update math.d add missing attr --- source/dcompute/std/cuda/math.d | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/dcompute/std/cuda/math.d b/source/dcompute/std/cuda/math.d index c4bcbd4..edd458b 100644 --- a/source/dcompute/std/cuda/math.d +++ b/source/dcompute/std/cuda/math.d @@ -1,4 +1,6 @@ -module dcompute.std.cuda.math; +@compute(CompileFor.hostAndDevice) module dcompute.std.cuda.math; + +import ldc.dcompute; /++ auto-generated intrinsics definitions from CUDA 10.2 - libdevice.10.bc the generator python script is available at the end of this file @@ -680,4 +682,4 @@ for d in unique(mlist): params = params.replace("i32", "int") print(ftemplate % (d, lfn, rtype, name, params)) -+/ \ No newline at end of file ++/ From 9c3c4b4bd2b9a20aada73f66c805b389ee1469a0 Mon Sep 17 00:00:00 2001 From: aferust Date: Mon, 19 Dec 2022 09:01:51 +0300 Subject: [PATCH 07/17] fix sharedStaticReserve --- source/dcompute/std/memory.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index b79ea76..e4839a4 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -27,7 +27,7 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ void* address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; %Dummy = type { `~llvmType!T~` addrspace(3)* } `, ` - %sharedptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(3)* @`~uniqueName~`, `~llvmType!T~` 0, i64 0 + %sharedptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(3)* @`~uniqueName~`, i32 0, i32 0 %.structliteral = alloca %Dummy, align 8 From c0f0570f87f80972f5a9e87f3721b0c80a78cae5 Mon Sep 17 00:00:00 2001 From: aferust Date: Thu, 22 Dec 2022 14:03:45 +0300 Subject: [PATCH 08/17] implement constStaticReserve constStaticReserve --- source/dcompute/std/memory.d | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index e4839a4..201cf22 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -41,6 +41,17 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ return *(cast(SharedPointer!(T)*)address); } +immutable(T)* constStaticReserve(T : T[N], string uniqueName, size_t N)(){ + immutable(T)* address = __irEx!(`@`~uniqueName~` = addrspace(4) externally_initialized global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 + `, ` + %mptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(4)* @`~uniqueName~`, i32 0, i32 0 + + %r = addrspacecast `~llvmType!T~` addrspace(4)* %mptr to `~llvmType!T~`* + ret `~llvmType!T~`* %r + `, ``, immutable(T)*)(); + return address; +} + immutable(string) Digit(size_t n)() { static if(n == 0) From 96e9a5cdbba0215d36519899c7f94a8f6c3eb67f Mon Sep 17 00:00:00 2001 From: aferust Date: Thu, 22 Dec 2022 14:05:47 +0300 Subject: [PATCH 09/17] wrap cuModuleGetGlobal and implement getGlobal wrap cuModuleGetGlobal and implement getGlobal --- source/dcompute/driver/cuda/program.d | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/source/dcompute/driver/cuda/program.d b/source/dcompute/driver/cuda/program.d index 25191ac..55691f9 100644 --- a/source/dcompute/driver/cuda/program.d +++ b/source/dcompute/driver/cuda/program.d @@ -22,6 +22,12 @@ struct Program //cuModuleGetGlobal //cuModuleGetTexRef //cuModuleGetSurfRef + static size_t getGlobal(ref size_t bytes, immutable(char)* name){ + size_t globptr; + status = cast(Status)cuModuleGetGlobal(&globptr, &bytes, globalProgram.raw, name); + checkErrors(); + return globptr; + } static Program fromFile(string name) { From 24723cc3f782618cd2a8018b4ea9afaa3f62bbf6 Mon Sep 17 00:00:00 2001 From: aferust Date: Thu, 22 Dec 2022 14:26:03 +0300 Subject: [PATCH 10/17] CUDA texture sampling return types CUDA texture sampling return types --- source/dcompute/std/cuda/texture.d | 236 +++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 source/dcompute/std/cuda/texture.d diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d new file mode 100644 index 0000000..d7360e3 --- /dev/null +++ b/source/dcompute/std/cuda/texture.d @@ -0,0 +1,236 @@ +module dcompute.std.cuda.texture; + +// CUDA texture sampling return types +struct int4 +{ + int x, y, z, w; +} + +struct float4 +{ + float x, y, z, w; + + float4 opBinary(string op)(float s) if (op == "+"){ + return float4(x+s, y+s, z+s, w+s); + } + float4 opBinary(string op)(float s) if (op == "*") { + return float4(x*s, y*s, z*s, w*s); + } + float4 opBinary(string op)(float s) if (op == "/") { + return float4(x/s, y/s, z/s, w/s); + } + + float4 opBinary(string op)(float4 other){ + static if (op == "+"){ + return float4(x+other.x, y+other.y, z+other.z, w+other.w); + } else + static assert(0, "op is not implemented"); + } +} + +/++ +Sampling a 1D texture: + +%float4 @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %tex, i32 %x) +%float4 @llvm.nvvm.tex.unified.1d.v4f32.f32(i64 %tex, float %x) +%float4 @llvm.nvvm.tex.unified.1d.level.v4f32.f32(i64 %tex, float %x, + float %level) +%float4 @llvm.nvvm.tex.unified.1d.grad.v4f32.f32(i64 %tex, float %x, + float %dPdx, + float %dPdy) + +%int4 @llvm.nvvm.tex.unified.1d.v4s32.s32(i64 %tex, i32 %x) +%int4 @llvm.nvvm.tex.unified.1d.v4s32.f32(i64 %tex, float %x) +%int4 @llvm.nvvm.tex.unified.1d.level.v4s32.f32(i64 %tex, float %x, + float %level) +%int4 @llvm.nvvm.tex.unified.1d.grad.v4s32.f32(i64 %tex, float %x, + float %dPdx, + float %dPdy) + +%int4 @llvm.nvvm.tex.unified.1d.v4u32.s32(i64 %tex, i32 %x) +%int4 @llvm.nvvm.tex.unified.1d.v4u32.f32(i64 %tex, float %x) +%int4 @llvm.nvvm.tex.unified.1d.level.v4u32.f32(i64 %tex, float %x, + float %level) +%int4 @llvm.nvvm.tex.unified.1d.grad.v4u32.f32(i64 %tex, float %x, + float %dPdx, + float %dPdy) + + +Sampling a 1D texture array: + +%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.s32(i64 %tex, i32 %idx, i32 %x) +%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.f32(i64 %tex, i32 %idx, float %x) +%float4 @llvm.nvvm.tex.unified.1d.array.level.v4f32.f32(i64 %tex, i32 %idx, + float %x, + float %level) +%float4 @llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32(i64 %tex, i32 %idx, + float %x, + float %dPdx, + float %dPdy) + +%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.s32(i64 %tex, i32 %idx, i32 %x) +%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.f32(i64 %tex, i32 %idx, float %x) +%int4 @llvm.nvvm.tex.unified.1d.array.level.v4s32.f32(i64 %tex, i32 %idx, + float %x, + float %level) +%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32(i64 %tex, i32 %idx, + float %x, + float %dPdx, + float %dPdy) + +%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.s32(i64 %tex, i32 %idx, i32 %x) +%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.f32(i64 %tex, i32 %idx, float %x) +%int4 @llvm.nvvm.tex.unified.1d.array.level.v4u32.f32(i64 %tex, i32 %idx, + float %x, + float %level) +%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32(i64 %tex, i32 %idx, + float %x, + float %dPdx, + float %dPdy) + + +Sampling a 2D texture: + +%float4 @llvm.nvvm.tex.unified.2d.v4f32.s32(i64 %tex, i32 %x, i32 %y) +%float4 @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %tex, float %x, float %y) +%float4 @llvm.nvvm.tex.unified.2d.level.v4f32.f32(i64 %tex, float %x, float %y, + float %level) +%float4 @llvm.nvvm.tex.unified.2d.grad.v4f32.f32(i64 %tex, float %x, float %y, + float %dPdx_x, float %dPdx_y, + float %dPdy_x, float %dPdy_y) + +%int4 @llvm.nvvm.tex.unified.2d.v4s32.s32(i64 %tex, i32 %x, i32 %y) +%int4 @llvm.nvvm.tex.unified.2d.v4s32.f32(i64 %tex, float %x, float %y,) +%int4 @llvm.nvvm.tex.unified.2d.level.v4s32.f32(i64 %tex, float %x, float %y, + float %level) +%int4 @llvm.nvvm.tex.unified.2d.grad.v4s32.f32(i64 %tex, float %x, float %y, + float %dPdx_x, float %dPdx_y, + float %dPdy_x, float %dPdy_y) + +%int4 @llvm.nvvm.tex.unified.2d.v4u32.s32(i64 %tex, i32 %x i32 %y) +%int4 @llvm.nvvm.tex.unified.2d.v4u32.f32(i64 %tex, float %x float %y) +%int4 @llvm.nvvm.tex.unified.2d.level.v4u32.f32(i64 %tex, float %x, float %y, + float %level) +%int4 @llvm.nvvm.tex.unified.2d.grad.v4u32.f32(i64 %tex, float %x, float %y, + float %dPdx_x, float %dPdx_y, + float %dPdy_x, float %dPdy_y) + +Sampling a 2D texture array: + +%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.s32(i64 %tex, i32 %idx, + i32 %x, i32 %y) +%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.f32(i64 %tex, i32 %idx, + float %x, float %y) +%float4 @llvm.nvvm.tex.unified.2d.array.level.v4f32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %level) +%float4 @llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %dPdx_x, + float %dPdx_y, + float %dPdy_x, + float %dPdy_y) + +%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.s32(i64 %tex, i32 %idx, + i32 %x, i32 %y) +%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.f32(i64 %tex, i32 %idx, + float %x, float %y) +%int4 @llvm.nvvm.tex.unified.2d.array.level.v4s32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %level) +%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %dPdx_x, + float %dPdx_y, + float %dPdy_x, + float %dPdy_y) + +%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.s32(i64 %tex, i32 %idx, + i32 %x i32 %y) +%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.f32(i64 %tex, i32 %idx, + float %x float %y) +%int4 @llvm.nvvm.tex.unified.2d.array.level.v4u32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %level) +%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %dPdx_x, + float %dPdx_y, + float %dPdy_x, + float %dPdy_y) + +Sampling a 3D texture: + +%float4 @llvm.nvvm.tex.unified.3d.v4f32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) +%float4 @llvm.nvvm.tex.unified.3d.v4f32.f32(i64 %tex, float %x, float %y, + float %z) +%float4 @llvm.nvvm.tex.unified.3d.level.v4f32.f32(i64 %tex,float %x, float %y, + float %z, float %level) +%float4 @llvm.nvvm.tex.unified.3d.grad.v4f32.f32(i64 %tex, float %x, float %y, + float %z, float %dPdx_x, + float %dPdx_y, float %dPdx_z, + float %dPdy_x, float %dPdy_y, + float %dPdy_z) + +%int4 @llvm.nvvm.tex.unified.3d.v4s32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) +%int4 @llvm.nvvm.tex.unified.3d.v4s32.f32(i64 %tex, float %x, float %y, + float %z) +%int4 @llvm.nvvm.tex.unified.3d.level.v4s32.f32(i64 %tex, float %x, float %y, + float %z, float %level) +%int4 @llvm.nvvm.tex.unified.3d.grad.v4s32.f32(i64 %tex, float %x, float %y, + float %z, float %dPdx_x, + float %dPdx_y, float %dPdx_z, + float %dPdy_x, float %dPdy_y, + float %dPdy_z) + +%int4 @llvm.nvvm.tex.unified.3d.v4u32.s32(i64 %tex, i32 %x i32 %y, i32 %z) +%int4 @llvm.nvvm.tex.unified.3d.v4u32.f32(i64 %tex, float %x, float %y, + float %z) +%int4 @llvm.nvvm.tex.unified.3d.level.v4u32.f32(i64 %tex, float %x, float %y, + float %z, float %level) +%int4 @llvm.nvvm.tex.unified.3d.grad.v4u32.f32(i64 %tex, float %x, float %y, + float %z, float %dPdx_x, + float %dPdx_y, float %dPdx_z, + float %dPdy_x, float %dPdy_y, + float %dPdy_z) + +Sampling a cube texture: + +%float4 @llvm.nvvm.tex.unified.cube.v4f32.f32(i64 %tex, float %x, float %y, + float %z) +%float4 @llvm.nvvm.tex.unified.cube.level.v4f32.f32(i64 %tex,float %x, float %y, + float %z, float %level) + +%int4 @llvm.nvvm.tex.unified.cube.v4s32.f32(i64 %tex, float %x, float %y, + float %z) +%int4 @llvm.nvvm.tex.unified.cube.level.v4s32.f32(i64 %tex, float %x, float %y, + float %z, float %level) + +%int4 @llvm.nvvm.tex.unified.cube.v4u32.f32(i64 %tex, float %x, float %y, + float %z) +%int4 @llvm.nvvm.tex.unified.cube.level.v4u32.f32(i64 %tex, float %x, float %y, + float %z, float %level) + +Sampling a cube texture array: + +%float4 @llvm.nvvm.tex.unified.cube.array.v4f32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %z) +%float4 @llvm.nvvm.tex.unified.cube.array.level.v4f32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %z, + float %level) + +%int4 @llvm.nvvm.tex.unified.cube.array.v4s32.f32(i64 %tex, i32 %idx, float %x, + float %y, float %z) +%int4 @llvm.nvvm.tex.unified.cube.array.level.v4s32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %z, float %level) + +%int4 @llvm.nvvm.tex.unified.cube.array.v4u32.f32(i64 %tex, i32 %idx, float %x, + float %y, float %z) +%int4 @llvm.nvvm.tex.unified.cube.array.level.v4u32.f32(i64 %tex, i32 %idx, + float %x, float %y, + float %z, float %level) + + +/ From 192d1c174efdf5fd1e30dc19efaabdaf7d314b59 Mon Sep 17 00:00:00 2001 From: aferust Date: Thu, 22 Dec 2022 14:37:57 +0300 Subject: [PATCH 11/17] Update texture.d --- source/dcompute/std/cuda/texture.d | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d index d7360e3..f00c6f6 100644 --- a/source/dcompute/std/cuda/texture.d +++ b/source/dcompute/std/cuda/texture.d @@ -13,6 +13,9 @@ struct float4 float4 opBinary(string op)(float s) if (op == "+"){ return float4(x+s, y+s, z+s, w+s); } + float4 opBinary(string op)(float s) if (op == "-"){ + return float4(x-s, y-s, z-s, w-s); + } float4 opBinary(string op)(float s) if (op == "*") { return float4(x*s, y*s, z*s, w*s); } @@ -23,7 +26,17 @@ struct float4 float4 opBinary(string op)(float4 other){ static if (op == "+"){ return float4(x+other.x, y+other.y, z+other.z, w+other.w); - } else + }else + static if (op == "-"){ + return float4(x-other.x, y-other.y, z-other.z, w-other.w); + }else + static if (op == "*"){ + return float4(x*other.x, y*other.y, z*other.z, w*other.w); + }else + static if (op == "/"){ + return float4(x/other.x, y/other.y, z/other.z, w/other.w); + } + else static assert(0, "op is not implemented"); } } From 82dffc865d3b1e1dceb0beb9ce76146e1ec030f1 Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 15:00:08 +0300 Subject: [PATCH 12/17] add texture sampling functions for cuda add texture sampling functions for cuda --- source/dcompute/std/cuda/texture.d | 511 +++++++++++++++++------------ 1 file changed, 305 insertions(+), 206 deletions(-) diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d index f00c6f6..7fe2472 100644 --- a/source/dcompute/std/cuda/texture.d +++ b/source/dcompute/std/cuda/texture.d @@ -41,209 +41,308 @@ struct float4 } } -/++ -Sampling a 1D texture: - -%float4 @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %tex, i32 %x) -%float4 @llvm.nvvm.tex.unified.1d.v4f32.f32(i64 %tex, float %x) -%float4 @llvm.nvvm.tex.unified.1d.level.v4f32.f32(i64 %tex, float %x, - float %level) -%float4 @llvm.nvvm.tex.unified.1d.grad.v4f32.f32(i64 %tex, float %x, - float %dPdx, - float %dPdy) - -%int4 @llvm.nvvm.tex.unified.1d.v4s32.s32(i64 %tex, i32 %x) -%int4 @llvm.nvvm.tex.unified.1d.v4s32.f32(i64 %tex, float %x) -%int4 @llvm.nvvm.tex.unified.1d.level.v4s32.f32(i64 %tex, float %x, - float %level) -%int4 @llvm.nvvm.tex.unified.1d.grad.v4s32.f32(i64 %tex, float %x, - float %dPdx, - float %dPdy) - -%int4 @llvm.nvvm.tex.unified.1d.v4u32.s32(i64 %tex, i32 %x) -%int4 @llvm.nvvm.tex.unified.1d.v4u32.f32(i64 %tex, float %x) -%int4 @llvm.nvvm.tex.unified.1d.level.v4u32.f32(i64 %tex, float %x, - float %level) -%int4 @llvm.nvvm.tex.unified.1d.grad.v4u32.f32(i64 %tex, float %x, - float %dPdx, - float %dPdy) - - -Sampling a 1D texture array: - -%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.s32(i64 %tex, i32 %idx, i32 %x) -%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.f32(i64 %tex, i32 %idx, float %x) -%float4 @llvm.nvvm.tex.unified.1d.array.level.v4f32.f32(i64 %tex, i32 %idx, - float %x, - float %level) -%float4 @llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32(i64 %tex, i32 %idx, - float %x, - float %dPdx, - float %dPdy) - -%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.s32(i64 %tex, i32 %idx, i32 %x) -%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.f32(i64 %tex, i32 %idx, float %x) -%int4 @llvm.nvvm.tex.unified.1d.array.level.v4s32.f32(i64 %tex, i32 %idx, - float %x, - float %level) -%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32(i64 %tex, i32 %idx, - float %x, - float %dPdx, - float %dPdy) - -%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.s32(i64 %tex, i32 %idx, i32 %x) -%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.f32(i64 %tex, i32 %idx, float %x) -%int4 @llvm.nvvm.tex.unified.1d.array.level.v4u32.f32(i64 %tex, i32 %idx, - float %x, - float %level) -%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32(i64 %tex, i32 %idx, - float %x, - float %dPdx, - float %dPdy) - - -Sampling a 2D texture: - -%float4 @llvm.nvvm.tex.unified.2d.v4f32.s32(i64 %tex, i32 %x, i32 %y) -%float4 @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %tex, float %x, float %y) -%float4 @llvm.nvvm.tex.unified.2d.level.v4f32.f32(i64 %tex, float %x, float %y, - float %level) -%float4 @llvm.nvvm.tex.unified.2d.grad.v4f32.f32(i64 %tex, float %x, float %y, - float %dPdx_x, float %dPdx_y, - float %dPdy_x, float %dPdy_y) - -%int4 @llvm.nvvm.tex.unified.2d.v4s32.s32(i64 %tex, i32 %x, i32 %y) -%int4 @llvm.nvvm.tex.unified.2d.v4s32.f32(i64 %tex, float %x, float %y,) -%int4 @llvm.nvvm.tex.unified.2d.level.v4s32.f32(i64 %tex, float %x, float %y, - float %level) -%int4 @llvm.nvvm.tex.unified.2d.grad.v4s32.f32(i64 %tex, float %x, float %y, - float %dPdx_x, float %dPdx_y, - float %dPdy_x, float %dPdy_y) - -%int4 @llvm.nvvm.tex.unified.2d.v4u32.s32(i64 %tex, i32 %x i32 %y) -%int4 @llvm.nvvm.tex.unified.2d.v4u32.f32(i64 %tex, float %x float %y) -%int4 @llvm.nvvm.tex.unified.2d.level.v4u32.f32(i64 %tex, float %x, float %y, - float %level) -%int4 @llvm.nvvm.tex.unified.2d.grad.v4u32.f32(i64 %tex, float %x, float %y, - float %dPdx_x, float %dPdx_y, - float %dPdy_x, float %dPdy_y) - -Sampling a 2D texture array: - -%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.s32(i64 %tex, i32 %idx, - i32 %x, i32 %y) -%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.f32(i64 %tex, i32 %idx, - float %x, float %y) -%float4 @llvm.nvvm.tex.unified.2d.array.level.v4f32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %level) -%float4 @llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %dPdx_x, - float %dPdx_y, - float %dPdy_x, - float %dPdy_y) - -%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.s32(i64 %tex, i32 %idx, - i32 %x, i32 %y) -%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.f32(i64 %tex, i32 %idx, - float %x, float %y) -%int4 @llvm.nvvm.tex.unified.2d.array.level.v4s32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %level) -%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %dPdx_x, - float %dPdx_y, - float %dPdy_x, - float %dPdy_y) - -%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.s32(i64 %tex, i32 %idx, - i32 %x i32 %y) -%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.f32(i64 %tex, i32 %idx, - float %x float %y) -%int4 @llvm.nvvm.tex.unified.2d.array.level.v4u32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %level) -%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %dPdx_x, - float %dPdx_y, - float %dPdy_x, - float %dPdy_y) - -Sampling a 3D texture: - -%float4 @llvm.nvvm.tex.unified.3d.v4f32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) -%float4 @llvm.nvvm.tex.unified.3d.v4f32.f32(i64 %tex, float %x, float %y, - float %z) -%float4 @llvm.nvvm.tex.unified.3d.level.v4f32.f32(i64 %tex,float %x, float %y, - float %z, float %level) -%float4 @llvm.nvvm.tex.unified.3d.grad.v4f32.f32(i64 %tex, float %x, float %y, - float %z, float %dPdx_x, - float %dPdx_y, float %dPdx_z, - float %dPdy_x, float %dPdy_y, - float %dPdy_z) - -%int4 @llvm.nvvm.tex.unified.3d.v4s32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) -%int4 @llvm.nvvm.tex.unified.3d.v4s32.f32(i64 %tex, float %x, float %y, - float %z) -%int4 @llvm.nvvm.tex.unified.3d.level.v4s32.f32(i64 %tex, float %x, float %y, - float %z, float %level) -%int4 @llvm.nvvm.tex.unified.3d.grad.v4s32.f32(i64 %tex, float %x, float %y, - float %z, float %dPdx_x, - float %dPdx_y, float %dPdx_z, - float %dPdy_x, float %dPdy_y, - float %dPdy_z) - -%int4 @llvm.nvvm.tex.unified.3d.v4u32.s32(i64 %tex, i32 %x i32 %y, i32 %z) -%int4 @llvm.nvvm.tex.unified.3d.v4u32.f32(i64 %tex, float %x, float %y, - float %z) -%int4 @llvm.nvvm.tex.unified.3d.level.v4u32.f32(i64 %tex, float %x, float %y, - float %z, float %level) -%int4 @llvm.nvvm.tex.unified.3d.grad.v4u32.f32(i64 %tex, float %x, float %y, - float %z, float %dPdx_x, - float %dPdx_y, float %dPdx_z, - float %dPdy_x, float %dPdy_y, - float %dPdy_z) - -Sampling a cube texture: - -%float4 @llvm.nvvm.tex.unified.cube.v4f32.f32(i64 %tex, float %x, float %y, - float %z) -%float4 @llvm.nvvm.tex.unified.cube.level.v4f32.f32(i64 %tex,float %x, float %y, - float %z, float %level) - -%int4 @llvm.nvvm.tex.unified.cube.v4s32.f32(i64 %tex, float %x, float %y, - float %z) -%int4 @llvm.nvvm.tex.unified.cube.level.v4s32.f32(i64 %tex, float %x, float %y, - float %z, float %level) - -%int4 @llvm.nvvm.tex.unified.cube.v4u32.f32(i64 %tex, float %x, float %y, - float %z) -%int4 @llvm.nvvm.tex.unified.cube.level.v4u32.f32(i64 %tex, float %x, float %y, - float %z, float %level) - -Sampling a cube texture array: - -%float4 @llvm.nvvm.tex.unified.cube.array.v4f32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %z) -%float4 @llvm.nvvm.tex.unified.cube.array.level.v4f32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %z, - float %level) - -%int4 @llvm.nvvm.tex.unified.cube.array.v4s32.f32(i64 %tex, i32 %idx, float %x, - float %y, float %z) -%int4 @llvm.nvvm.tex.unified.cube.array.level.v4s32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %z, float %level) - -%int4 @llvm.nvvm.tex.unified.cube.array.v4u32.f32(i64 %tex, i32 %idx, float %x, - float %y, float %z) -%int4 @llvm.nvvm.tex.unified.cube.array.level.v4u32.f32(i64 %tex, i32 %idx, - float %x, float %y, - float %z, float %level) - - +/ +// Sampling a 1D texture: + +//%float4 @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %tex, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4f32.s32") +float4 tex_unified_1d_v4f32_s32(long tex, int x); + +//%float4 @llvm.nvvm.tex.unified.1d.v4f32.f32(i64 %tex, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4f32.f32") +float4 tex_unified_1d_v4f32_f32(long tex, float x); + +//%float4 @llvm.nvvm.tex.unified.1d.level.v4f32.f32(i64 %tex, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.level.v4f32.f32") +float4 tex_unified_1d_level_v4f32_f32(long tex, float x, float level); + +//%float4 @llvm.nvvm.tex.unified.1d.grad.v4f32.f32(i64 %tex, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.grad.v4f32.f32") +float4 tex_unified_1d_grad_v4f32_f32(long tex, float x, float dPdx, float dPdy); + +//%int4 @llvm.nvvm.tex.unified.1d.v4s32.s32(i64 %tex, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4s32.s32") +int4 tex_unified_1d_v4s32_s32(long tex, int x); + +//%int4 @llvm.nvvm.tex.unified.1d.v4s32.f32(i64 %tex, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4s32.f32") +int4 tex_unified_1d_v4s32_f32(long tex, float x); + +//%int4 @llvm.nvvm.tex.unified.1d.level.v4s32.f32(i64 %tex, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.level.v4s32.f32") +int4 tex_unified_1d_level_v4s32_f32(long tex, float x, float level); + +//%int4 @llvm.nvvm.tex.unified.1d.grad.v4s32.f32(i64 %tex, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.grad.v4s32.f32") +int4 tex_unified_1d_grad_v4s32_f32(long tex, float x, float dPdx, float dPdy); + +//%int4 @llvm.nvvm.tex.unified.1d.v4u32.s32(i64 %tex, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4u32.s32") +int4 tex_unified_1d_v4u32_s32(long tex, int x); + +//%int4 @llvm.nvvm.tex.unified.1d.v4u32.f32(i64 %tex, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.v4u32.f32") +int4 tex_unified_1d_v4u32_f32(long tex, float x); + +//%int4 @llvm.nvvm.tex.unified.1d.level.v4u32.f32(i64 %tex, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.level.v4u32.f32") +int4 tex_unified_1d_level_v4u32_f32(long tex, float x, float level); + +//%int4 @llvm.nvvm.tex.unified.1d.grad.v4u32.f32(i64 %tex, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.grad.v4u32.f32") +int4 tex_unified_1d_grad_v4u32_f32(long tex, float x, float dPdx, float dPdy); + + +// Sampling a 1D texture array: + +//%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.s32(i64 %tex, i32 %idx, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4f32.s32") +float4 tex_unified_1d_array_v4f32_s32(long tex, int idx, int x); + +//%float4 @llvm.nvvm.tex.unified.1d.array.v4f32.f32(i64 %tex, i32 %idx, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4f32.f32") +float4 tex_unified_1d_array_v4f32_f32(long tex, int idx, float x); + +//%float4 @llvm.nvvm.tex.unified.1d.array.level.v4f32.f32(i64 %tex, i32 %idx, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.level.v4f32.f32") +float4 tex_unified_1d_array_level_v4f32_f32(long tex, int idx, float x, float level); + +//%float4 @llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32(i64 %tex, i32 %idx, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32") +float4 tex_unified_1d_array_grad_v4f32_f32(long tex, int idx, float x, float dPdx, float dPdy); + +//%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.s32(i64 %tex, i32 %idx, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4s32.s32") +int4 tex_unified_1d_array_v4s32_s32(long tex, int idx, int x); + +//%int4 @llvm.nvvm.tex.unified.1d.array.v4s32.f32(i64 %tex, i32 %idx, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4s32.f32") +int4 tex_unified_1d_array_v4s32_f32(long tex, int idx, float x); + +//%int4 @llvm.nvvm.tex.unified.1d.array.level.v4s32.f32(i64 %tex, i32 %idx, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.level.v4s32.f32") +int4 tex_unified_1d_array_level_v4s32_f32(long tex, int idx, float x, float level); + +//%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32(i64 %tex, i32 %idx, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32") +int4 tex_unified_1d_array_grad_v4s32_f32(long tex, int idx, float x, float dPdx, float dPdy); + +//%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.s32(i64 %tex, i32 %idx, i32 %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4u32.s32") +int4 tex_unified_1d_array_v4u32_s32(long tex, int idx, int x); + +//%int4 @llvm.nvvm.tex.unified.1d.array.v4u32.f32(i64 %tex, i32 %idx, float %x) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.v4u32.f32") +int4 tex_unified_1d_array_v4u32_f32(long tex, int idx, float x); + +//%int4 @llvm.nvvm.tex.unified.1d.array.level.v4u32.f32(i64 %tex, i32 %idx, float %x, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.level.v4u32.f32") +int4 tex_unified_1d_array_level_v4u32_f32(long tex, int idx, float x, float level); + +//%int4 @llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32(i64 %tex, i32 %idx, float %x, float %dPdx, float %dPdy) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32") +int4 tex_unified_1d_array_grad_v4u32_f32(long tex, int idx, float x, float dPdx, float dPdy); + + +// Sampling a 2D texture: + + +//%float4 @llvm.nvvm.tex.unified.2d.v4f32.s32(i64 %tex, i32 %x, i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4f32.s32") +float4 tex_unified_2d_v4f32_s32(long tex, int x, int y); + +//%float4 @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %tex, float %x, float %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4f32.f32") +float4 tex_unified_2d_v4f32_f32(long tex, float x, float y); + +//%float4 @llvm.nvvm.tex.unified.2d.level.v4f32.f32(i64 %tex, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.level.v4f32.f32") +float4 tex_unified_2d_level_v4f32_f32(long tex, float x, float y, float level); + +//%float4 @llvm.nvvm.tex.unified.2d.grad.v4f32.f32(i64 %tex, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.grad.v4f32.f32") +float4 tex_unified_2d_grad_v4f32_f32(long tex, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +//%int4 @llvm.nvvm.tex.unified.2d.v4s32.s32(i64 %tex, i32 %x, i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4s32.s32") +int4 tex_unified_2d_v4s32_s32(long tex, int x, int y); + +//%int4 @llvm.nvvm.tex.unified.2d.v4s32.f32(i64 %tex, float %x, float %y,) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4s32.f32") +int4 tex_unified_2d_v4s32_f32(long tex, float x, float y,); + +//%int4 @llvm.nvvm.tex.unified.2d.level.v4s32.f32(i64 %tex, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.level.v4s32.f32") +int4 tex_unified_2d_level_v4s32_f32(long tex, float x, float y, float level); + +//%int4 @llvm.nvvm.tex.unified.2d.grad.v4s32.f32(i64 %tex, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.grad.v4s32.f32") +int4 tex_unified_2d_grad_v4s32_f32(long tex, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +//%int4 @llvm.nvvm.tex.unified.2d.v4u32.s32(i64 %tex, i32 %x i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4u32.s32") +int4 tex_unified_2d_v4u32_s32(long tex, int x, int y); + +//%int4 @llvm.nvvm.tex.unified.2d.v4u32.f32(i64 %tex, float %x float %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.v4u32.f32") +int4 tex_unified_2d_v4u32_f32(long tex, float x, float y); + +//%int4 @llvm.nvvm.tex.unified.2d.level.v4u32.f32(i64 %tex, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.level.v4u32.f32") +int4 tex_unified_2d_level_v4u32_f32(long tex, float x, float y, float level); + +//%int4 @llvm.nvvm.tex.unified.2d.grad.v4u32.f32(i64 %tex, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.grad.v4u32.f32") +int4 tex_unified_2d_grad_v4u32_f32(long tex, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +// Sampling a 2D texture array: + + +//%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.s32(i64 %tex, i32 %idx, i32 %x, i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4f32.s32") +float4 tex_unified_2d_array_v4f32_s32(long tex, int idx, int x, int y); + +//%float4 @llvm.nvvm.tex.unified.2d.array.v4f32.f32(i64 %tex, i32 %idx, float %x, float %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4f32.f32") +float4 tex_unified_2d_array_v4f32_f32(long tex, int idx, float x, float y); + +//%float4 @llvm.nvvm.tex.unified.2d.array.level.v4f32.f32(i64 %tex, i32 %idx, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.level.v4f32.f32") +float4 tex_unified_2d_array_level_v4f32_f32(long tex, int idx, float x, float y, float level); + +//%float4 @llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32(i64 %tex, i32 %idx, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32") +float4 tex_unified_2d_array_grad_v4f32_f32(long tex, int idx, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.s32(i64 %tex, i32 %idx, i32 %x, i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4s32.s32") +int4 tex_unified_2d_array_v4s32_s32(long tex, int idx, int x, int y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.v4s32.f32(i64 %tex, i32 %idx, float %x, float %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4s32.f32") +int4 tex_unified_2d_array_v4s32_f32(long tex, int idx, float x, float y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.level.v4s32.f32(i64 %tex, i32 %idx, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.level.v4s32.f32") +int4 tex_unified_2d_array_level_v4s32_f32(long tex, int idx, float x, float y, float level); + +//%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32(i64 %tex, i32 %idx, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32") +int4 tex_unified_2d_array_grad_v4s32_f32(long tex, int idx, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.s32(i64 %tex, i32 %idx, i32 %x i32 %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4u32.s32") +int4 tex_unified_2d_array_v4u32_s32(long tex, int idx, int x, int y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.v4u32.f32(i64 %tex, i32 %idx, float %x float %y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.v4u32.f32") +int4 tex_unified_2d_array_v4u32_f32(long tex, int idx, float x, float y); + +//%int4 @llvm.nvvm.tex.unified.2d.array.level.v4u32.f32(i64 %tex, i32 %idx, float %x, float %y, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.level.v4u32.f32") +int4 tex_unified_2d_array_level_v4u32_f32(long tex, int idx, float x, float y, float level); + +//%int4 @llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32(i64 %tex, i32 %idx, float %x, float %y, float %dPdx_x, float %dPdx_y, float %dPdy_x, float %dPdy_y) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32") +int4 tex_unified_2d_array_grad_v4u32_f32(long tex, int idx, float x, float y, float dPdx_x, float dPdx_y, float dPdy_x, float dPdy_y); + +// Sampling a 3D texture: + +//%float4 @llvm.nvvm.tex.unified.3d.v4f32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4f32.s32") +float4 tex_unified_3d_v4f32_s32(long tex, int x, int y, int z); + +//%float4 @llvm.nvvm.tex.unified.3d.v4f32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4f32.f32") +float4 tex_unified_3d_v4f32_f32(long tex, float x, float y, float z); + +//%float4 @llvm.nvvm.tex.unified.3d.level.v4f32.f32(i64 %tex,float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.level.v4f32.f32") +float4 tex_unified_3d_level_v4f32_f32(long tex,float x, float y, float z, float level); + +//%float4 @llvm.nvvm.tex.unified.3d.grad.v4f32.f32(i64 %tex, float %x, float %y, float %z, float %dPdx_x, float %dPdx_y, float %dPdx_z, float %dPdy_x, float %dPdy_y, float %dPdy_z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.grad.v4f32.f32") +float4 tex_unified_3d_grad_v4f32_f32(long tex, float x, float y, float z, float dPdx_x, float dPdx_y, float dPdx_z, float dPdy_x, float dPdy_y, float dPdy_z); + +//%int4 @llvm.nvvm.tex.unified.3d.v4s32.s32(i64 %tex, i32 %x, i32 %y, i32 %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4s32.s32") +int4 tex_unified_3d_v4s32_s32(long tex, int x, int y, int z); + +//%int4 @llvm.nvvm.tex.unified.3d.v4s32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4s32.f32") +int4 tex_unified_3d_v4s32_f32(long tex, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.3d.level.v4s32.f32(i64 %tex, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.level.v4s32.f32") +int4 tex_unified_3d_level_v4s32_f32(long tex, float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.3d.grad.v4s32.f32(i64 %tex, float %x, float %y, float %z, float %dPdx_x, float %dPdx_y, float %dPdx_z, float %dPdy_x, float %dPdy_y, float %dPdy_z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.grad.v4s32.f32") +int4 tex_unified_3d_grad_v4s32_f32(long tex, float x, float y, float z, float dPdx_x, float dPdx_y, float dPdx_z, float dPdy_x, float dPdy_y, float dPdy_z); + +//%int4 @llvm.nvvm.tex.unified.3d.v4u32.s32(i64 %tex, i32 %x i32 %y, i32 %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4u32.s32") +int4 tex_unified_3d_v4u32_s32(long tex, int x, int y, int z); + +//%int4 @llvm.nvvm.tex.unified.3d.v4u32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.v4u32.f32") +int4 tex_unified_3d_v4u32_f32(long tex, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.3d.level.v4u32.f32(i64 %tex, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.level.v4u32.f32") +int4 tex_unified_3d_level_v4u32_f32(long tex, float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.3d.grad.v4u32.f32(i64 %tex, float %x, float %y, float %z, float %dPdx_x, float %dPdx_y, float %dPdx_z, float %dPdy_x, float %dPdy_y, float %dPdy_z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.3d.grad.v4u32.f32") +int4 tex_unified_3d_grad_v4u32_f32(long tex, float x, float y, float z, float dPdx_x, float dPdx_y, float dPdx_z, float dPdy_x, float dPdy_y, float dPdy_z); + +// Sampling a cube texture: + +//%float4 @llvm.nvvm.tex.unified.cube.v4f32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.v4f32.f32") +float4 tex_unified_cube_v4f32_f32(long tex, float x, float y, float z); + +//%float4 @llvm.nvvm.tex.unified.cube.level.v4f32.f32(i64 %tex,float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.level.v4f32.f32") +float4 tex_unified_cube_level_v4f32_f32(long tex,float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.cube.v4s32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.v4s32.f32") +int4 tex_unified_cube_v4s32_f32(long tex, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.cube.level.v4s32.f32(i64 %tex, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.level.v4s32.f32") +int4 tex_unified_cube_level_v4s32_f32(long tex, float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.cube.v4u32.f32(i64 %tex, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.v4u32.f32") +int4 tex_unified_cube_v4u32_f32(long tex, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.cube.level.v4u32.f32(i64 %tex, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.level.v4u32.f32") +int4 tex_unified_cube_level_v4u32_f32(long tex, float x, float y, float z, float level); + +// Sampling a cube texture array: + +//%float4 @llvm.nvvm.tex.unified.cube.array.v4f32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.v4f32.f32") +float4 tex_unified_cube_array_v4f32_f32(long tex, int idx, float x, float y, float z); + +//%float4 @llvm.nvvm.tex.unified.cube.array.level.v4f32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.level.v4f32.f32") +float4 tex_unified_cube_array_level_v4f32_f32(long tex, int idx, float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.cube.array.v4s32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.v4s32.f32") +int4 tex_unified_cube_array_v4s32_f32(long tex, int idx, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.cube.array.level.v4s32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.level.v4s32.f32") +int4 tex_unified_cube_array_level_v4s32_f32(long tex, int idx, float x, float y, float z, float level); + +//%int4 @llvm.nvvm.tex.unified.cube.array.v4u32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.v4u32.f32") +int4 tex_unified_cube_array_v4u32_f32(long tex, int idx, float x, float y, float z); + +//%int4 @llvm.nvvm.tex.unified.cube.array.level.v4u32.f32(i64 %tex, i32 %idx, float %x, float %y, float %z, float %level) +pragma(LDC_intrinsic, "llvm.nvvm.tex.unified.cube.array.level.v4u32.f32") +int4 tex_unified_cube_array_level_v4u32_f32(long tex, int idx, float x, float y, float z, float level); From 776ffa5ee41604902180ed2ea51a2d2e5f863ef6 Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 15:00:40 +0300 Subject: [PATCH 13/17] Update texture.d --- source/dcompute/std/cuda/texture.d | 1 + 1 file changed, 1 insertion(+) diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d index 7fe2472..56b5995 100644 --- a/source/dcompute/std/cuda/texture.d +++ b/source/dcompute/std/cuda/texture.d @@ -41,6 +41,7 @@ struct float4 } } +@nogc nothrow: // Sampling a 1D texture: //%float4 @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %tex, i32 %x) From 3391cc5b1d43b7dc054803e053c8c63872034607 Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 15:14:20 +0300 Subject: [PATCH 14/17] fix module dec --- source/dcompute/std/cuda/texture.d | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d index 56b5995..00cab32 100644 --- a/source/dcompute/std/cuda/texture.d +++ b/source/dcompute/std/cuda/texture.d @@ -1,4 +1,8 @@ -module dcompute.std.cuda.texture; +@compute(CompileFor.deviceOnly) +module bilateral; +pragma(LDC_no_moduleinfo); + +import ldc.dcompute; // CUDA texture sampling return types struct int4 @@ -9,7 +13,9 @@ struct int4 struct float4 { float x, y, z, w; - + + @nogc nothrow: + float4 opBinary(string op)(float s) if (op == "+"){ return float4(x+s, y+s, z+s, w+s); } From def18f204b07aa3596df6647dd885d84e7981cdb Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 15:33:30 +0300 Subject: [PATCH 15/17] Update memory.d --- source/dcompute/std/memory.d | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index 201cf22..e6d21ba 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -23,6 +23,8 @@ pragma(LDC_inline_ir) @nogc nothrow: +// SharedPointer and constant memory allocations, only tested for Cuda backend + SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ void* address = __irEx!(`@`~uniqueName~` = addrspace(3) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 ; %Dummy = type { `~llvmType!T~` addrspace(3)* } @@ -41,6 +43,14 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ return *(cast(SharedPointer!(T)*)address); } +/+ defines and allocates a global constant. Host code must initialize this array like + ` + size_t nbytes; + size_t _gaussConstAddr = Program.getGlobal(nbytes, "gauss0"); // uniqueName must match here + cuMemcpyHtoD(_gaussConstAddr, Gaussian.ptr, nbytes); + ` + This type is immutable for device, but the memory can be updated in host code. ++/ immutable(T)* constStaticReserve(T : T[N], string uniqueName, size_t N)(){ immutable(T)* address = __irEx!(`@`~uniqueName~` = addrspace(4) externally_initialized global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 `, ` @@ -50,6 +60,7 @@ immutable(T)* constStaticReserve(T : T[N], string uniqueName, size_t N)(){ ret `~llvmType!T~`* %r `, ``, immutable(T)*)(); return address; + // returning a ConstPointer!T causes an LLVM error for CUDA backend. immutable(T)* is a convenient type anyway. } immutable(string) Digit(size_t n)() From d4b336f0a10ba80b9dce125b81f4643d7c3f8e2a Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 15:42:56 +0300 Subject: [PATCH 16/17] Update texture.d --- source/dcompute/std/cuda/texture.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/dcompute/std/cuda/texture.d b/source/dcompute/std/cuda/texture.d index 00cab32..c27b2ec 100644 --- a/source/dcompute/std/cuda/texture.d +++ b/source/dcompute/std/cuda/texture.d @@ -1,5 +1,5 @@ @compute(CompileFor.deviceOnly) -module bilateral; +module dcompute.std.cuda.texture; pragma(LDC_no_moduleinfo); import ldc.dcompute; From 6b9853052222821475f82e0b18fb30b2d5b9007b Mon Sep 17 00:00:00 2001 From: aferust Date: Fri, 23 Dec 2022 21:34:17 +0300 Subject: [PATCH 17/17] use __dcompute_reflect in std memory use __dcompute_reflect for using a correct addrspace for cuda and opencl --- source/dcompute/std/memory.d | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/source/dcompute/std/memory.d b/source/dcompute/std/memory.d index e6d21ba..8282af0 100644 --- a/source/dcompute/std/memory.d +++ b/source/dcompute/std/memory.d @@ -52,15 +52,28 @@ SharedPointer!T sharedStaticReserve(T : T[N], string uniqueName, size_t N)(){ This type is immutable for device, but the memory can be updated in host code. +/ immutable(T)* constStaticReserve(T : T[N], string uniqueName, size_t N)(){ - immutable(T)* address = __irEx!(`@`~uniqueName~` = addrspace(4) externally_initialized global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 + if(__dcompute_reflect(ReflectTarget.CUDA)){ + immutable(T)* address = __irEx!(`@`~uniqueName~` = addrspace(4) externally_initialized global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 `, ` - %mptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(4)* @`~uniqueName~`, i32 0, i32 0 + %mptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(4)* @`~uniqueName~`, i32 0, i32 0 - %r = addrspacecast `~llvmType!T~` addrspace(4)* %mptr to `~llvmType!T~`* - ret `~llvmType!T~`* %r + %r = addrspacecast `~llvmType!T~` addrspace(4)* %mptr to `~llvmType!T~`* + ret `~llvmType!T~`* %r `, ``, immutable(T)*)(); - return address; - // returning a ConstPointer!T causes an LLVM error for CUDA backend. immutable(T)* is a convenient type anyway. + return address; + // returning a ConstPointer!T causes an LLVM error for CUDA backend. immutable(T)* is a convenient type anyway. + } else if(__dcompute_reflect(ReflectTarget.OpenCL)){ + immutable(T)* address = __irEx!(`@`~uniqueName~` = addrspace(2) global [`~Itoa!N~` x `~llvmType!T~`] zeroinitializer, align 4 + `, ` + %mptr = getelementptr inbounds [`~Itoa!N~` x `~llvmType!T~`], [`~Itoa!N~` x `~llvmType!T~`] addrspace(2)* @`~uniqueName~`, i32 0, i32 0 + + %r = addrspacecast `~llvmType!T~` addrspace(2)* %mptr to `~llvmType!T~`* + ret `~llvmType!T~`* %r + `, ``, immutable(T)*)(); + return address; + } else + assert(0); + } immutable(string) Digit(size_t n)()