Skip to content

Commit 558dfc6

Browse files
authored
Fix inversion bug
The inversion of p -> 1-p was not triggered sometimes, leading to wrong statistics in part of the sampled array. Added distributional tests for mean and variance.
2 parents 613da4b + 9aebcae commit 558dfc6

File tree

5 files changed

+199
-96
lines changed

5 files changed

+199
-96
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "BinomialGPU"
22
uuid = "c5bbfde1-2136-42cd-9b65-d5719df69ebf"
33
authors = ["Simone Carlo Surace"]
4-
version = "0.4.2"
4+
version = "0.4.3"
55

66
[deps]
77
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

src/kernels.jl

Lines changed: 102 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,66 @@ end
3333

3434
# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
3535

36-
## Kernel for scalar parameters
36+
## Kernels for scalar parameters
37+
function kernel_naive_scalar!(A, n, p, seed::UInt32, counter::UInt32)
38+
device_rng = Random.default_rng()
39+
40+
# initialize the state
41+
@inbounds Random.seed!(device_rng, seed, counter)
42+
43+
# grid-stride loop
44+
tid = threadIdx().x
45+
window = (blockDim().x - 1i32) * gridDim().x
46+
offset = (blockIdx().x - 1i32) * blockDim().x
47+
48+
while offset < length(A)
49+
i = tid + offset
50+
51+
k = 0
52+
ctr = 1
53+
while ctr <= n
54+
rand(Float32) < p && (k += 1)
55+
ctr += 1
56+
end
57+
58+
if i <= length(A)
59+
@inbounds A[i] = k
60+
end
61+
offset += window
62+
end
63+
return nothing
64+
end
65+
function kernel_inversion_scalar!(A, n, p, seed::UInt32, counter::UInt32)
66+
device_rng = Random.default_rng()
67+
68+
# initialize the state
69+
@inbounds Random.seed!(device_rng, seed, counter)
70+
71+
# grid-stride loop
72+
tid = threadIdx().x
73+
window = (blockDim().x - 1i32) * gridDim().x
74+
offset = (blockIdx().x - 1i32) * blockDim().x
75+
76+
while offset < length(A)
77+
i = tid + offset
78+
79+
logp = CUDA.log(1f0-p)
80+
geom_sum = 0f0
81+
k = 0
82+
while true
83+
geom = ceil(CUDA.log(rand(Float32)) / logp)
84+
geom_sum += geom
85+
geom_sum > n && break
86+
k += 1
87+
end
88+
89+
if i <= length(A)
90+
@inbounds A[i] = k
91+
end
92+
offset += window
93+
end
94+
return nothing
95+
end
3796
function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
3897
device_rng = Random.default_rng()
3998

@@ -49,80 +108,47 @@ function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
49108
while offset < length(A)
50109
i = tid + offset
51110

52-
# edge cases
53-
if p <= 0 || n <= 0
54-
k = 0
55-
elseif p >= 1
56-
k = n
57-
# Use naive algorithm for n <= 17
58-
elseif n <= 17
59-
k = 0
60-
ctr = 1
61-
while ctr <= n
62-
rand(Float32) < p && (k += 1)
63-
ctr += 1
64-
end
65-
# Use inversion algorithm for n*p < 10
66-
elseif n * p < 10f0
67-
logp = CUDA.log(1f0-p)
68-
geom_sum = 0f0
69-
k = 0
70-
while true
71-
geom = ceil(CUDA.log(rand(Float32)) / logp)
72-
geom_sum += geom
73-
geom_sum > n && break
74-
k += 1
111+
r = p/(1f0-p)
112+
s = p*(1f0-p)
113+
114+
stddev = sqrt(n * s)
115+
b = 1.15f0 + 2.53f0 * stddev
116+
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
117+
c = n * p + 0.5f0
118+
v_r = 0.92f0 - 4.2f0 / b
119+
120+
alpha = (2.83f0 + 5.1f0 / b) * stddev;
121+
m = floor((n + 1) * p)
122+
123+
ks = 0f0
124+
125+
while true
126+
usample = rand(Float32) - 0.5f0
127+
vsample = rand(Float32)
128+
129+
us = 0.5f0 - abs(usample)
130+
ks = floor((2 * a / us + b) * usample + c)
131+
132+
if us >= 0.07f0 && vsample <= v_r
133+
break
75134
end
76-
# BTRS algorithm
77-
else
78-
# BTRS approximations work well for p <= 0.5
79-
# invert p and set `invert` flag
80-
(invert = p > 0.5f0) && (p = 1f0 - p)
81135

82-
r = p/(1f0-p)
83-
s = p*(1f0-p)
84-
85-
stddev = sqrt(n * s)
86-
b = 1.15f0 + 2.53f0 * stddev
87-
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
88-
c = n * p + 0.5f0
89-
v_r = 0.92f0 - 4.2f0 / b
90-
91-
alpha = (2.83f0 + 5.1f0 / b) * stddev;
92-
m = floor((n + 1) * p)
136+
if ks < 0 || ks > n
137+
continue
138+
end
93139

94-
ks = 0f0
95-
96-
while true
97-
usample = rand(Float32) - 0.5f0
98-
vsample = rand(Float32)
99-
100-
us = 0.5f0 - abs(usample)
101-
ks = floor((2 * a / us + b) * usample + c)
102-
103-
if us >= 0.07f0 && vsample <= v_r
104-
break
105-
end
106-
107-
if ks < 0 || ks > n
108-
continue
109-
end
110-
111-
v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
112-
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
113-
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
114-
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
115-
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
116-
if v2 <= ub
117-
break
118-
end
140+
v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
141+
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
142+
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
143+
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
144+
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
145+
if v2 <= ub
146+
break
119147
end
120-
invert && (ks = n - ks)
121-
k = Int(ks)
122148
end
123149

124150
if i <= length(A)
125-
@inbounds A[i] = k
151+
@inbounds A[i] = ks
126152
end
127153
offset += window
128154
end
@@ -165,6 +191,9 @@ function kernel_BTRS!(
165191
@inbounds n = count[I1]
166192
@inbounds p = prob[CartesianIndex(I1, I2)]
167193
end
194+
# BTRS approximations work well for p <= 0.5
195+
# invert p and set `invert` flag
196+
(invert = p > 0.5f0) && (p = 1-p)
168197
else
169198
n = 0
170199
p = 0f0
@@ -197,10 +226,6 @@ function kernel_BTRS!(
197226
end
198227
# BTRS algorithm
199228
else
200-
# BTRS approximations work well for p <= 0.5
201-
# invert p and set `invert` flag
202-
(invert = p > 0.5f0) && (p = 1f0 - p)
203-
204229
r = p/(1f0-p)
205230
s = p*(1f0-p)
206231

@@ -239,12 +264,15 @@ function kernel_BTRS!(
239264
break
240265
end
241266
end
242-
invert && (ks = n - ks)
243267
k = Int(ks)
244268
end
245269

246270
if i <= length(A)
247-
@inbounds A[i] = k
271+
if invert
272+
@inbounds A[i] = n - k
273+
else
274+
@inbounds A[i] = k
275+
end
248276
end
249277
offset += window
250278
end

src/rand_binomial.jl

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,56 @@ end
6565

6666
## constant (scalar) parameters
6767
function rand_binom!(rng, A::BinomialArray, count::Integer, prob::AbstractFloat)
68-
kernel = @cuda launch=false kernel_BTRS_scalar!(
69-
A, count, Float32(prob), rng.seed, rng.counter
70-
)
68+
n = count
69+
70+
# edge cases
71+
if prob <= 0 || n <= 0
72+
A .= 0
73+
return A
74+
elseif prob >= 1
75+
A .= n
76+
return A
77+
end
78+
79+
invert = prob > 0.5f0
80+
if invert
81+
p = 1 - prob
82+
else
83+
p = prob
84+
end
85+
86+
# Use naive algorithm for n <= 17
87+
if n <= 17
88+
kernel = @cuda launch=false kernel_naive_scalar!(
89+
A, n, Float32(p), rng.seed, rng.counter
90+
)
91+
# Use inversion algorithm for n*p < 10
92+
elseif n * p < 10f0
93+
kernel = @cuda launch=false kernel_inversion_scalar!(
94+
A, n, Float32(p), rng.seed, rng.counter
95+
)
96+
# BTRS algorithm
97+
else
98+
kernel = @cuda launch=false kernel_BTRS_scalar!(
99+
A, n, Float32(p), rng.seed, rng.counter
100+
)
101+
end
102+
71103
config = launch_configuration(kernel.fun)
72104
threads = max(32, min(config.threads, length(A)))
73105
blocks = min(config.blocks, cld(length(A), threads))
74-
kernel(A, count, Float32(prob), rng.seed, rng.counter; threads=threads, blocks=blocks)
106+
kernel(A, n, Float32(p), rng.seed, rng.counter; threads=threads, blocks=blocks)
75107

76108
new_counter = Int64(rng.counter) + length(A)
77109
overflow, remainder = fldmod(new_counter, typemax(UInt32))
78110
rng.seed += overflow # XXX: is this OK?
79111
rng.counter = remainder
80112

81-
return A
113+
if invert
114+
return n .- A
115+
else
116+
return A
117+
end
82118
end
83119

84120
## arrays of parameters

test/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
[deps]
22
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
33
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4+
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
5+
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
46
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

test/runtests.jl

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using BinomialGPU
22
using CUDA
3+
using Distributions
4+
using Statistics
35

46
using BenchmarkTools
57
using Test
@@ -92,22 +94,6 @@ using Test
9294
@test rand_binomial!(A, count = 2, prob = 1.5) == CUDA.fill(2, 256) # probabilities greater than 1 are equivalent to 1
9395
@test_throws MethodError rand_binomial!(A, count = 5., prob = 0.5) # non-integer counts throw an error
9496
end
95-
96-
@testset "benchmarks" begin
97-
# benchmarks
98-
A = CUDA.zeros(Int, 1024, 1024)
99-
n = 128
100-
p = 0.5
101-
ns = CUDA.fill(128, (1024, 1024))
102-
ps = CUDA.rand(1024, 1024)
103-
println("")
104-
println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
105-
display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
106-
println("")
107-
println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
108-
display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
109-
println("")
110-
end
11197
end # in-place
11298

11399
@testset "out-of-place" begin
@@ -179,4 +165,55 @@ using Test
179165
end
180166
end
181167
end # out-of-place
168+
169+
@testset "Distributional tests" begin
170+
function mean_var_CI(m, S2, n, p, N, α)
171+
truemean = n*p
172+
truevar = n*p*(1-p)
173+
a = quantile(Normal(), α/2)
174+
b = quantile(Normal(), 1-α/2)
175+
c = quantile(Chisq(N-1), α/2)
176+
d = quantile(Chisq(N-1), 1-α/2)
177+
@test a <= sqrt(N/truevar)*(m - truemean) <= b
178+
@test c <= (N-1)*S2/truevar <= d
179+
end
180+
@testset "Scalar parameters" begin
181+
function test_mean_variance(N, n, p)
182+
CUDA.@sync A = rand_binomial(N, count = n, prob = p)
183+
mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
184+
end
185+
N = 2^20
186+
@testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
187+
p in 0.1:0.1:0.9
188+
test_mean_variance(N, n, p)
189+
end
190+
end
191+
@testset "Arrays of parameters" begin
192+
function test_mean_variance(N, n, p)
193+
CUDA.@sync A = rand_binomial(N, count = fill(n, N), prob = fill(p, N))
194+
mean_var_CI(mean(A), var(A), n, p, N, 1e-5)
195+
end
196+
N = 2^20
197+
@testset "n = $n, p = $p" for n in [1, 10, 20, 50, 100, 200, 500, 1000],
198+
p in 0.1:0.1:0.9
199+
test_mean_variance(N, n, p)
200+
end
201+
end
202+
end # Distributional tests
203+
204+
@testset "benchmarks" begin
205+
# benchmarks
206+
A = CUDA.zeros(Int, 1024, 1024)
207+
n = 128
208+
p = 0.5
209+
ns = CUDA.fill(128, (1024, 1024))
210+
ps = CUDA.rand(1024, 1024)
211+
println("")
212+
println("Benchmarking constant parameter array: should run in less than 2ms on an RTX20xx card")
213+
display(@benchmark CUDA.@sync rand_binomial!($A, count = $n, prob = $p))
214+
println("")
215+
println("Benchmarking full parameter array: should run in less than 2ms on an RTX20xx card")
216+
display(@benchmark CUDA.@sync rand_binomial!($A, count = $ns, prob = $ps))
217+
println("")
218+
end
182219
end

0 commit comments

Comments
 (0)