Skip to content
4 changes: 4 additions & 0 deletions src/Distributions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ export
MatrixTDist,
MixtureModel,
Multinomial,
MvHypergeometric,
MultivariateNormal,
MvLogNormal,
MvNormal,
Expand Down Expand Up @@ -244,6 +245,7 @@ export
ncategories, # the number of categories in a Categorical distribution
ncomponents, # the number of components in a mixture model
ntrials, # the number of trials being performed in the experiment
nelements, # the number of elements of each type in a finite population
params, # get the tuple of parameters
params!, # provide storage space to calculate the tuple of parameters for a multivariate distribution like mvlognormal
partype, # returns a type large enough to hold all of a distribution's parameters' element types
Expand Down Expand Up @@ -281,6 +283,8 @@ export
# type system
include("common.jl")



# implementation helpers
include("utils.jl")
include("eachvariate.jl")
Expand Down
127 changes: 127 additions & 0 deletions src/multivariate/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""
The [Multivariate hypergeometric distribution](https://en.wikipedia.org/wiki/Hypergeometric_distribution#Multivariate_hypergeometric_distribution)
generalizes the *hypergeometric distribution*. Consider ``n`` draws from a finite population containing ``k`` types of elements. Suppose that the population has size `M` and there are ``m_i`` elements of type ``i`` for ``i = 1, .., k`` with ``m_1+...m_k = M``. Let ``X = (X_1, ..., X_k)`` where ``X_i`` represents the number of elements of type ``i`` drawn, then the distribution of ``X`` is a multivariate hypergeometric distribution. Each sample of a multivariate hypergeometric distribution is a ``k``-dimensional integer vector that sums to ``n`` and satisfies ``0 \\le X_i \\le m_i``.


The probability mass function is given by

```math
f(x; m, n) = {{{m_1 \\choose x_1}{m_2 \\choose x_2}\\cdots {m_k \\choose x_k}}\\over {N \\choose n}},
\\quad x_1 + \\cdots + x_k = n, \\quad x_i \\le m_i
```

```julia
MvHypergeometric(m, n) # Multivariate hypergeometric distribution for a population with
# m = (m_1, ..., m_k) elements of type 1 to k and n draws

params(d) # Get the parameters, i.e. (m, n)
```
"""
struct MvHypergeometric <: DiscreteMultivariateDistribution
m::AbstractVector{Int} # number of elements of each type
n::Int # number of draws
function MvHypergeometric(m::AbstractVector{Int}, n::Int; check_args::Bool=true)
@check_args(
MvHypergeometric,
(m, all(m .>= zero.(n))),
zero(n) <= n <= sum(m),
)
new(m, n)
end
end


# Parameters

ncategories(d::MvHypergeometric) = length(d.m)
length(d::MvHypergeometric) = ncategories(d)
nelements(d::MvHypergeometric) = d.m
ntrials(d::MvHypergeometric) = d.n

params(d::MvHypergeometric) = (d.m, d.n)

# Statistics

mean(d::MvHypergeometric) = d.n .* d.m ./ sum(d.m)

function var(d::MvHypergeometric)
m = nelements(d)
k = length(m)
n = ntrials(d)
M = sum(m)
p = m / M
f = n * (M - n) / (M-1)

v = Vector{Real}(undef, k)
for i = 1:k
@inbounds p_i = p[i]
v[i] = f * p_i * (1 - p_i)
end
v
end

function cov(d::MvHypergeometric)
m = nelements(d)
k = length(m)
n = ntrials(d)
M = sum(m)
p = m / M
f = n * (M - n) / (M-1)

C = Matrix{Real}(undef, k, k)
for j = 1:k
pj = p[j]
for i = 1:j-1
@inbounds C[i,j] = - f * p[i] * pj
end

@inbounds C[j,j] = f * pj * (1-pj)
end

for j = 1:k-1
for i = j+1:k
@inbounds C[i,j] = C[j,i]
end
end
C
end


# Evaluation
function insupport(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real
k = length(d)
m = nelements(d)
length(x) == k || return false
s = 0.0
for i = 1:k
@inbounds xi = x[i]
if !(isinteger(xi) && xi >= 0 && xi <= m[i])
return false
end
s += xi
end
return s == ntrials(d) # integer computation would not yield truncation errors
end

function _logpdf(d::MvHypergeometric, x::AbstractVector{T}) where T<:Real
m = nelements(d)
M = sum(m)
n = ntrials(d)
insupport(d,x) || return -Inf
s = -logabsbinomial(M, n)[1]
for i = 1:length(m)
@inbounds xi = x[i]
@inbounds m_i = m[i]
s += logabsbinomial(m_i, xi)[1]
end
return s
end

# Sampling is performed by sequentially sampling each entry from the
# hypergeometric distribution
_rand!(rng::AbstractRNG, d::MvHypergeometric, x::AbstractVector{Int}) =
mvhypergeom_rand!(rng, nelements(d), ntrials(d), x)




3 changes: 2 additions & 1 deletion src/multivariates.jl
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ for fname in ["dirichlet.jl",
"mvlognormal.jl",
"mvtdist.jl",
"product.jl", # deprecated
"vonmisesfisher.jl"]
"vonmisesfisher.jl",
"mvhypergeometric.jl"]
include(joinpath("multivariate", fname))
end
1 change: 1 addition & 0 deletions src/samplers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ for fname in ["aliastable.jl",
"exponential.jl",
"gamma.jl",
"multinomial.jl",
"mvhypergeometric.jl",
"vonmises.jl",
"vonmisesfisher.jl",
"discretenonparametric.jl",
Expand Down
41 changes: 41 additions & 0 deletions src/samplers/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
function mvhypergeom_rand!(rng::AbstractRNG, m::AbstractVector{Int}, n::Int,
x::AbstractVector{<:Real})
k = length(m)
length(x) == k || throw(DimensionMismatch("Invalid argument dimension."))

z = zero(eltype(m))
M = sum(m) # remaining population
i = 0
km1 = k - 1

while i < km1 && n > 0
i += 1
@inbounds mi = m[i]
if mi < M
# Sample from hypergeometric distribution
# element of type i are considered successes
# all other elements are considered failures
xi = rand(rng, Hypergeometric(mi, M-mi, n))
@inbounds x[i] = xi
# Remove elements of type i from population
n -= xi
M -= mi
else
# In this case, we don't even have to sample
# from Hypergeometric. Just assign remaining counts
@inbounds x[i] = n
n = 0
end
end

if i == km1
@inbounds x[k] = n
else # n must have been zero
z = zero(eltype(x))
for j = i+1 : k
@inbounds x[j] = z
end
end

return x
end
122 changes: 122 additions & 0 deletions test/multivariate/mvhypergeometric.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Tests for Multivariate Hypergeometric

using Distributions
using Test

@testset "Multivariate Hypergeometric" begin
@test_throws DomainError MvHypergeometric([5, 3, -2], 4)
@test_throws ArgumentError MvHypergeometric([5, 3], 10)

m = [5, 3, 2]
n = 4
d = MvHypergeometric(m, n)
@test length(d) == 3
@test d.n == n
@test nelements(d) == m
@test ncategories(d) == length(m)
@test params(d) == (m, n)

@test mean(d) ≈ [2.0, 1.2, 0.8]
@test var(d) ≈ [2/3, 56/100, 32/75]

covmat = cov(d)
@test covmat ≈ (8/3) .* [1/4 -3/20 -1/10; -3/20 21/100 -3/50; -1/10 -3/50 4/25]

@test insupport(d, [2, 1, 1])
@test !insupport(d, [3, 2, 1])
@test !insupport(d, [0, 0, 4])


# random sampling
x = rand(d)
@test isa(x, Vector{Int})
@test sum(x) == n
@test all(x .>= 0)
@test all(x .<= m)
@test insupport(d, x)

x = rand(d, 100)
@test isa(x, Matrix{Int})
@test all(sum(x, dims=1) .== n)
@test all(x .>= 0)
@test all(x .<= m)
@test all(insupport(d, x))

# random sampling with many catergories
m = [20, 2, 2, 2, 1, 1, 1]
n = 5
d2 = MvHypergeometric(m, n)
x = rand(d2)
@test isa(x, Vector{Int})
@test sum(x) == n
@test all(x .>= 0)
@test all(x .<= m)
@test insupport(d2, x)

# log pdf
x = [2, 1, 1]
@test pdf(d, x) ≈ 2/7
@test logpdf(d, x) ≈ log(2/7)
@test logpdf(d, x) ≈ log(pdf(d, x))

x = rand(d, 100)
pv = pdf(d, x)
lp = logpdf(d, x)
for i in 1 : size(x, 2)
@test pv[i] ≈ pdf(d, x[:,i])
@test lp[i] ≈ logpdf(d, x[:,i])
end

# test degenerate cases
d1 = MvHypergeometric([1], 1)
@test logpdf(d1, [1]) ≈ 0
@test logpdf(d1, [0]) == -Inf
d2 = MvHypergeometric([2, 0], 1)
@test logpdf(d2, [1, 0]) ≈ 0
@test logpdf(d2, [0, 1]) == -Inf

d3 = MvHypergeometric([5, 0, 0, 0], 3)
@test logpdf(d3, [3, 0, 0, 0]) ≈ 0
@test logpdf(d3, [2, 1, 0, 0]) == -Inf
@test logpdf(d3, [2, 0, 0, 0]) == -Inf

# behavior with n = 0
d0 = MvHypergeometric([5, 3, 2], 0)
@test logpdf(d0, [0, 0, 0]) ≈ 0
@test logpdf(d0, [1, 0, 0]) == -Inf

@test rand(d0) == [0, 0, 0]
@test mean(d0) == [0.0, 0.0, 0.0]
@test var(d0) == [0.0, 0.0, 0.0]
@test insupport(d0, [0, 0, 0])
@test !insupport(d0, [1, 0, 0])
@test length(d0) == 3

# compare with hypergeometric
ns = 3
nf = 5
n = 4
dh1 = MvHypergeometric([ns, nf], n)
dh2 = Hypergeometric(ns, nf, n)

x = 2
@test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x)
x = 3
@test pdf(dh1, [x, n-x]) ≈ pdf(dh2, x)

# comparing marginals to hypergeometric
m = [5, 3, 2]
n = 4
d = MvHypergeometric(m, n)
dh = Hypergeometric(m[1], sum(m[2:end]), n)
x1 = 2
@test pdf(dh, x1) ≈ sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]])

# comparing conditionals to hypergeometric
x1 = 2
dh = Hypergeometric(m[2], m[3], n-x1)
q = sum([pdf(d, [x1, x2, n-x1-x2]) for x2 in 0:m[2]])
for x2 = 0:m[2]
@test pdf(dh, x2) ≈ pdf(d, [x1, x2, n-x1-x2])/q
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ const tests = [
"univariate/continuous/triangular",
"statsapi",
"univariate/continuous/inversegaussian",
"multivariate/mvhypergeometric",

### missing files compared to /src:
# "common",
Expand Down
Loading