JuliaStats · matbesancon · Apr 18, 2022 · Apr 24, 2022 · Apr 25, 2022 · Apr 25, 2022
diff --git a/src/multivariate/mvnormal.jl b/src/multivariate/mvnormal.jl
@@ -253,7 +253,7 @@ Base.show(io::IO, d::MvNormal) =
 length(d::MvNormal) = length(d.μ)
 mean(d::MvNormal) = d.μ
 params(d::MvNormal) = (d.μ, d.Σ)
-@inline partype(d::MvNormal{T}) where {T<:Real} = T
+# Return `T`, the first type parameter of `MvNormal{T}`; only the argument's type is
+# inspected, so the argument is left unnamed.
+@inline partype(::MvNormal{T}) where {T<:Real} = T
 
 var(d::MvNormal) = diag(d.Σ)
 cov(d::MvNormal) = Matrix(d.Σ)
@@ -372,7 +372,7 @@ struct MvNormalStats <: SufficientStats
     tw::Float64         # total sample weight
 end
 
-function suffstats(D::Type{MvNormal}, x::AbstractMatrix{Float64})
+function suffstats(::Type{MvNormal}, x::AbstractMatrix{Float64})
     d = size(x, 1)
     n = size(x, 2)
     s = vec(sum(x, dims=2))
@@ -382,7 +382,7 @@ function suffstats(D::Type{MvNormal}, x::AbstractMatrix{Float64})
     MvNormalStats(s, m, s2, Float64(n))
 end
 
-function suffstats(D::Type{MvNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
+function suffstats(::Type{MvNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
     d = size(x, 1)
     n = size(x, 2)
     length(w) == n || throw(DimensionMismatch("Inconsistent argument dimensions."))
@@ -410,13 +410,13 @@ end
 # each kind of covariance
 #
 
-fit_mle(D::Type{MvNormal}, ss::MvNormalStats) = fit_mle(FullNormal, ss)
-fit_mle(D::Type{MvNormal}, x::AbstractMatrix{Float64}) = fit_mle(FullNormal, x)
-fit_mle(D::Type{MvNormal}, x::AbstractMatrix{Float64}, w::AbstractArray{Float64}) = fit_mle(FullNormal, x, w)
+# When the bare `MvNormal` type is requested, delegate to the `FullNormal` method that
+# takes the same remaining arguments.
+fit_mle(::Type{MvNormal}, ss::MvNormalStats) = fit_mle(FullNormal, ss)
+fit_mle(::Type{MvNormal}, x::AbstractMatrix{Float64}) = fit_mle(FullNormal, x)
+fit_mle(::Type{MvNormal}, x::AbstractMatrix{Float64}, w::AbstractArray{Float64}) = fit_mle(FullNormal, x, w)
 
-fit_mle(D::Type{FullNormal}, ss::MvNormalStats) = MvNormal(ss.m, ss.s2 * inv(ss.tw))
+# Build the fit from precomputed statistics: `ss.m` and `ss.s2` scaled by the inverse of
+# the total sample weight `ss.tw`. Dispatches on exactly `FullNormal`, like the
+# data-matrix method for `FullNormal`.
+fit_mle(::Type{FullNormal}, ss::MvNormalStats) = MvNormal(ss.m, ss.s2 * inv(ss.tw))
 
-function fit_mle(D::Type{FullNormal}, x::AbstractMatrix{Float64})
+function fit_mle(::Type{FullNormal}, x::AbstractMatrix{Float64})
     n = size(x, 2)
     mu = vec(mean(x, dims=2))
     z = x .- mu
@@ -425,7 +425,7 @@ function fit_mle(D::Type{FullNormal}, x::AbstractMatrix{Float64})
     MvNormal(mu, PDMat(C))
 end
 
-function fit_mle(D::Type{FullNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
+function fit_mle(::Type{<:FullNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
     m = size(x, 1)
     n = size(x, 2)
     length(w) == n || throw(DimensionMismatch("Inconsistent argument dimensions"))
@@ -445,7 +445,7 @@ function fit_mle(D::Type{FullNormal}, x::AbstractMatrix{Float64}, w::AbstractVec
     MvNormal(mu, PDMat(C))
 end
 
-function fit_mle(D::Type{DiagNormal}, x::AbstractMatrix{Float64})
+function fit_mle(::Type{DiagNormal}, x::AbstractMatrix{Float64})
     m = size(x, 1)
     n = size(x, 2)
 
@@ -460,7 +460,7 @@ function fit_mle(D::Type{DiagNormal}, x::AbstractMatrix{Float64})
     MvNormal(mu, PDiagMat(va))
 end
 
-function fit_mle(D::Type{DiagNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
+function fit_mle(::Type{<:DiagNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
     m = size(x, 1)
     n = size(x, 2)
     length(w) == n || throw(DimensionMismatch("Inconsistent argument dimensions"))
@@ -479,7 +479,7 @@ function fit_mle(D::Type{DiagNormal}, x::AbstractMatrix{Float64}, w::AbstractVec
     MvNormal(mu, PDiagMat(va))
 end
 
-function fit_mle(D::Type{IsoNormal}, x::AbstractMatrix{Float64})
+function fit_mle(::Type{IsoNormal}, x::AbstractMatrix{Float64})
     m = size(x, 1)
     n = size(x, 2)
 
@@ -495,7 +495,7 @@ function fit_mle(D::Type{IsoNormal}, x::AbstractMatrix{Float64})
     MvNormal(mu, ScalMat(m, va / (m * n)))
 end
 
-function fit_mle(D::Type{IsoNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
+function fit_mle(::Type{<:IsoNormal}, x::AbstractMatrix{Float64}, w::AbstractVector)
     m = size(x, 1)
     n = size(x, 2)
     length(w) == n || throw(DimensionMismatch("Inconsistent argument dimensions"))
@@ -515,3 +515,87 @@ function fit_mle(D::Type{IsoNormal}, x::AbstractMatrix{Float64}, w::AbstractVect
     end
     MvNormal(mu, ScalMat(m, va / (m * sw)))
 end
+
+## Differentiation
+
+# Forward-mode rule for `_logpdf(d, x)`, assembled from the `frule`s of `mvnormal_c0` and
+# `sqmahal`: the primal is `c0 - sq/2` and the tangent is `Δc0 - Δsq/2`.
+# Returns `nothing` (the ChainRulesCore convention for "no rule applies") when either
+# component has no `frule` for the concrete type of `d`.
+function ChainRulesCore.frule((_, Δd, Δx)::Tuple{Any,Any,Any}, ::typeof(_logpdf), d::AbstractMvNormal, x::AbstractVector)
+    c0_rule = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd), mvnormal_c0, d)
+    sq_rule = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd, Δx), sqmahal, d, x)
+    # The fallback `frule` returns `nothing`; propagate it instead of failing on destructuring.
+    (c0_rule === nothing || sq_rule === nothing) && return nothing
+    c0, Δc0 = c0_rule
+    sq, Δsq = sq_rule
+    Δc0 = ChainRulesCore.unthunk(Δc0)
+    Δsq = ChainRulesCore.unthunk(Δsq)
+    return c0 - sq/2, Δc0 - Δsq/2
+end
-function ChainRulesCore.frule((_, Δd, Δx)::Tuple{Any,Any,Any}, ::typeof(_logpdf), d::AbstractMvNormal, x::AbstractVector)
-    c0, Δc0 = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd), mvnormal_c0, d)
-    sq, Δsq = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd, Δx), sqmahal, d, x)
-    Δc0 = ChainRulesCore.unthunk(Δc0)
-    Δsq = ChainRulesCore.unthunk(Δsq)
-    return c0 - sq/2, Δc0 - Δsq/2
-end
-function ChainRulesCore.frule((_, Δd, Δx)::Tuple{Any,Any,Any}, ::typeof(_logpdf), d::AbstractMvNormal, x::AbstractVector)
-    c0, Δc0 = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd), mvnormal_c0, d)
-    sq, Δsq = ChainRulesCore.frule((ChainRulesCore.NoTangent(), Δd, Δx), sqmahal, d, x)
-    Δc0 = ChainRulesCore.unthunk(Δc0)
-    Δsq = ChainRulesCore.unthunk(Δsq)
-    return c0 - sq/2, Δc0 - Δsq/2
-end
+
+# Reverse-mode rule for `_logpdf(d, x)` with `d::MvNormal`, composed from the `rrule`s of
+# `mvnormal_c0` and `sqmahal`. The primal is `c0 - sq / 2`; the pullback combines the two
+# component cotangents with the same weights.
+function ChainRulesCore.rrule(::typeof(_logpdf), d::MvNormal, x::AbstractVector)
+    c0, c0_pullback = ChainRulesCore.rrule(mvnormal_c0, d)
+    sq, sq_pullback = ChainRulesCore.rrule(sqmahal, d, x)
+    function logpdf_MvNormal_pullback(dy)
+        ȳ = ChainRulesCore.unthunk(dy)
+        # cotangent of `d` coming from the `mvnormal_c0` term
+        (_, c0_d) = c0_pullback(ȳ)
+        c0_d = ChainRulesCore.unthunk(c0_d)
+        # cotangents of `d` and `x` coming from the `sqmahal` term
+        (_, sq_d, sq_x) = sq_pullback(ȳ)
+        sq_d = ChainRulesCore.unthunk(sq_d)
+        sq_x = ChainRulesCore.unthunk(sq_x)
+        ∂μ = c0_d.μ - 0.5 * sq_d.μ
+        ∂Σ = c0_d.Σ - 0.5 * sq_d.Σ
+        # The backing field types are taken from the `sqmahal` cotangent.
+        BackingT = NamedTuple{(:μ, :Σ), Tuple{typeof(sq_d.μ), typeof(sq_d.Σ)}}
+        ∂d = ChainRulesCore.Tangent{typeof(d), BackingT}(BackingT((∂μ, ∂Σ)))
+        return ChainRulesCore.NoTangent(), ∂d, sq_x / (-2)
+    end
+    return c0 - sq / 2, logpdf_MvNormal_pullback
+end
-function ChainRulesCore.rrule(::typeof(_logpdf), d::MvNormal, x::AbstractVector)
-    c0, c0_pullback = ChainRulesCore.rrule(mvnormal_c0, d)
-    sq, sq_pullback = ChainRulesCore.rrule(sqmahal, d, x)
-    function logpdf_MvNormal_pullback(dy)
-        dy = ChainRulesCore.unthunk(dy)
-        (_, ∂d_c0) = c0_pullback(dy)
-        ∂d_c0 = ChainRulesCore.unthunk(∂d_c0)
-        (_, ∂d_sq, ∂x_sq) = sq_pullback(dy)
-        ∂d_sq = ChainRulesCore.unthunk(∂d_sq)
-        ∂x_sq = ChainRulesCore.unthunk(∂x_sq)
-        backing = NamedTuple{(:μ, :Σ), Tuple{typeof(∂d_sq.μ), typeof(∂d_sq.Σ)}}((
-            (∂d_c0.μ - 0.5 * ∂d_sq.μ),
-            (∂d_c0.Σ - 0.5 * ∂d_sq.Σ),
-        ))
-        ∂d = ChainRulesCore.Tangent{typeof(d), typeof(backing)}(backing)
-        return ChainRulesCore.NoTangent(), ∂d, ∂x_sq / (-2)
-    end
-    return c0 - sq / 2, logpdf_MvNormal_pullback
-end
-function ChainRulesCore.rrule(::typeof(_logpdf), d::MvNormal, x::AbstractVector)
-    c0, c0_pullback = ChainRulesCore.rrule(mvnormal_c0, d)
-    sq, sq_pullback = ChainRulesCore.rrule(sqmahal, d, x)
-    function logpdf_MvNormal_pullback(dy)
-        dy = ChainRulesCore.unthunk(dy)
-        (_, ∂d_c0) = c0_pullback(dy)
-        ∂d_c0 = ChainRulesCore.unthunk(∂d_c0)
-        (_, ∂d_sq, ∂x_sq) = sq_pullback(dy)
-        ∂d_sq = ChainRulesCore.unthunk(∂d_sq)
-        ∂x_sq = ChainRulesCore.unthunk(∂x_sq)
-        backing = NamedTuple{(:μ, :Σ), Tuple{typeof(∂d_sq.μ), typeof(∂d_sq.Σ)}}((
-            (∂d_c0.μ - 0.5 * ∂d_sq.μ),
-            (∂d_c0.Σ - 0.5 * ∂d_sq.Σ),
-        ))
-        ∂d = ChainRulesCore.Tangent{typeof(d), typeof(backing)}(backing)
-        return ChainRulesCore.NoTangent(), ∂d, ∂x_sq / (-2)
-    end
-    return c0 - sq / 2, logpdf_MvNormal_pullback
-end
+
+# Forward-mode rule for `mvnormal_c0(d)` with `d::MvNormal`. Only the `Σ` field of the
+# tangent of `d` is read; the tangent of the output is `-dot(ΔΣ, invcov(d)) / 2`.
+function ChainRulesCore.frule((_, Δd)::Tuple{Any,Any}, ::typeof(mvnormal_c0), d::MvNormal)
+    primal = mvnormal_c0(d)
+    ΔΣ = ChainRulesCore.unthunk(Δd).Σ
+    return primal, -dot(ΔΣ, invcov(d)) / 2
+end
+
+# Reverse-mode rule for `mvnormal_c0(d)` with `d::MvNormal`. The pullback assigns `μ` a
+# `ZeroTangent` and `Σ` the cotangent `-dy/2 * invcov(d)`.
+function ChainRulesCore.rrule(::typeof(mvnormal_c0), d::MvNormal)
+    function mvnormal_c0_pullback(dy)
+        scale = -ChainRulesCore.unthunk(dy) / 2
+        ∂d = ChainRulesCore.Tangent{typeof(d)}(
+            μ = ChainRulesCore.ZeroTangent(),
+            Σ = scale * invcov(d),
+        )
+        return ChainRulesCore.NoTangent(), ∂d
+    end
+    return mvnormal_c0(d), mvnormal_c0_pullback
+end
+
+# Forward-mode rule for `sqmahal(d, x)` with `d::MvNormal`.
+# `dargs` is `(Δself, Δd, Δx)`; `Δd` must expose `.μ` and `.Σ`. Returns the primal value
+# and its directional derivative along `(Δd, Δx)`.
+function ChainRulesCore.frule(dargs::Tuple{Any,Any,Any}, ::typeof(sqmahal), d::MvNormal, x::AbstractVector)
+    y = sqmahal(d, x)
+    (_, Δd, Δx) = dargs
+    Δd = ChainRulesCore.unthunk(Δd)
+    Δx = ChainRulesCore.unthunk(Δx)
+    # z = Σ⁻¹ (x - μ) is shared by all three terms, so only matrix-vector products are
+    # needed (O(n²)) instead of forming n×n outer products and matrix-matrix products.
+    z = invcov(d) * (x - d.μ)
+    # -(x - μ)' Σ⁻¹ ΔΣ Σ⁻¹ (x - μ), using the symmetry of Σ⁻¹
+    dΣ = -dot(z, Δd.Σ * z)
+    dx = 2 * dot(z, Δx)
+    dμ = -2 * dot(z, Δd.μ)
+    Δy = dΣ + dx + dμ
+    return (y, Δy)
+end
+
+# Reverse-mode rule for `sqmahal(d, x)` with `d::MvNormal`. Both cotangents are returned as
+# thunks, so each one is only computed when the caller unthunks it.
+function ChainRulesCore.rrule(::typeof(sqmahal), d::MvNormal, x::AbstractVector)
+    y = sqmahal(d, x)
+    function sqmahal_pullback(dy)
+        # NOTE: each thunk binds the unthunked cotangent to a fresh local (`Δy`) instead of
+        # reassigning `dy`. Assigning to the captured argument from inside a closure makes
+        # Julia box it, which prevents inference of the unthunked results.
+        ∂x = ChainRulesCore.@thunk(begin
+            Δy = ChainRulesCore.unthunk(dy)
+            Σinv = invcov(d)
+            2Δy * Σinv * (x - d.μ)
+        end)
+        ∂d = ChainRulesCore.@thunk(begin
+            Δy = ChainRulesCore.unthunk(dy)
+            Σinv = invcov(d)
+            cx = x - d.μ
+            ∂μ = -2Δy * Σinv * cx
+            ∂J = Δy * cx * cx'
+            ∂Σ = - Σinv * ∂J * Σinv
+            ChainRulesCore.Tangent{typeof(d)}(μ = ∂μ, Σ = ∂Σ)
+        end)
+        return (ChainRulesCore.NoTangent(), ∂d, ∂x)
+    end
+    return y, sqmahal_pullback
+end
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
@@ -1,5 +1,6 @@
 # Tests on Multivariate Normal distributions
 
+import PDMats
 import PDMats: ScalMat, PDiagMat, PDMat
 if isdefined(PDMats, :PDSparseMat)
     import PDMats: PDSparseMat
@@ -9,6 +10,8 @@ using Distributions
 using LinearAlgebra, Random, Test
 using SparseArrays
 using FillArrays
+using ChainRulesCore
+using ChainRulesTestUtils
 
 ###### General Testing
 
@@ -302,3 +305,67 @@ end
     x = rand(d)
     @test logpdf(d, x) ≈ logpdf(Normal(), x[1]) + logpdf(Normal(), x[2])
 end
+
+# Checks the forward and reverse rules of `mvnormal_c0`, `sqmahal` and `_logpdf` against
+# finite differences along a small random perturbation, for two dimensions.
+@testset "MvNormal differentiation rules" begin
+    for n in (3, 10)
+        for _ in 1:10
+            A = Symmetric(rand(n,n)) .+ 4 * Matrix(I, n, n)
+            # recorded as a test so that a violated precondition shows up as a failure
+            @test isposdef(A)
+            d = MvNormal(randn(n), A)
+            # make ΔΣ symmetric, such that Σ ± ΔΣ is PSD
+            t = 0.001 * ChainRulesTestUtils.rand_tangent(d)
+            t.Σ .+= t.Σ'
+            # shrink the perturbation until Σ + ΔΣ has no negative eigenvalue
+            if eigmin(t.Σ) < 0
+                while eigmin(d.Σ + t.Σ) < 0
+                    t.Σ .*= 0.8
+                end
+            end
+            # shrink the perturbation until Σ - ΔΣ has no negative eigenvalue
+            if eigmax(t.Σ) > 0
+                while eigmin(d.Σ - t.Σ) < 0
+                    t.Σ .*= 0.8
+                end
+            end
+            # mvnormal_c0
+            (y, Δy) = @inferred ChainRulesCore.frule((ChainRulesCore.NoTangent(), t), Distributions.mvnormal_c0, d)
+            y_r, c0_pullback = @inferred ChainRulesCore.rrule(Distributions.mvnormal_c0, d)
+            @test y_r ≈ y
+            # forward (y2) and backward (y3) finite differences
+            y2 = Distributions.mvnormal_c0(MvNormal(d.μ, d.Σ + t.Σ))
+            @test unthunk(Δy) ≈ y2 - y atol= n * 1e-4
+            y3 = Distributions.mvnormal_c0(MvNormal(d.μ, d.Σ - t.Σ))
+            @test unthunk(Δy) ≈ y - y3 atol = n * 1e-4
+            (_, ∇c0) = c0_pullback(1.0)
+            ∇c0 = ChainRulesCore.unthunk(∇c0)
+            @test dot(∇c0.Σ, t.Σ) ≈ y2 - y atol = n * 1e-4
+            @test dot(∇c0.Σ, t.Σ) ≈ y - y3 atol = n * 1e-4
+            # sqmahal
+            x = randn(n)
+            Δx = 0.0001 * randn(n)
+            (y, Δy) = @inferred ChainRulesCore.frule((ChainRulesCore.NoTangent(), t, Δx), sqmahal, d, x)
+            (yr, sqmahal_pullback) = @inferred ChainRulesCore.rrule(sqmahal, d, x)
+            (_, ∇s_d, ∇s_x) = @inferred sqmahal_pullback(1.0)
+            ∇s_d = ChainRulesCore.unthunk(∇s_d)
+            ∇s_x = ChainRulesCore.unthunk(∇s_x)
+            @test yr ≈ y
+            y2 = Distributions.sqmahal(MvNormal(d.μ + t.μ, d.Σ + t.Σ), x + Δx)
+            y3 = Distributions.sqmahal(MvNormal(d.μ - t.μ, d.Σ - t.Σ), x - Δx)
+            @test unthunk(Δy) ≈ y2 - y atol = n * 1e-4
+            @test unthunk(Δy) ≈ y - y3 atol = n * 1e-4
+            @test dot(∇s_d.Σ, t.Σ) + dot(∇s_d.μ, t.μ) + dot(∇s_x, Δx) ≈ y2 - y atol = n * 1e-4
+            @test dot(∇s_d.Σ, t.Σ) + dot(∇s_d.μ, t.μ) + dot(∇s_x, Δx) ≈ y - y3 atol = n * 1e-4
+            # _logpdf
+            (y, Δy) = @inferred ChainRulesCore.frule((ChainRulesCore.NoTangent(), t, Δx), Distributions._logpdf, d, x)
+            (yr, logpdf_MvNormal_pullback) = @inferred ChainRulesCore.rrule(Distributions._logpdf, d, x)
+            @test y ≈ yr
+            # inference broken
+            # (_, ∇s_d, ∇s_x) = @inferred logpdf_MvNormal_pullback(1.0)
+            (_, ∇s_d, ∇s_x) = logpdf_MvNormal_pullback(1.0)
+
+            y2 = Distributions._logpdf(MvNormal(d.μ + t.μ, d.Σ + t.Σ), x + Δx)
+            y3 = Distributions._logpdf(MvNormal(d.μ - t.μ, d.Σ - t.Σ), x - Δx)
+            @test unthunk(Δy) ≈ y - y3 atol = n * 1e-4
+            @test unthunk(Δy) ≈ y2 - y atol = n * 1e-4
+            @test dot(∇s_d.Σ, t.Σ) + dot(∇s_d.μ, t.μ) + dot(∇s_x, Δx) ≈ y2 - y atol = n * 1e-4
+            @test dot(∇s_d.Σ, t.Σ) + dot(∇s_d.μ, t.μ) + dot(∇s_x, Δx) ≈ y - y3 atol = n * 1e-4
+        end
+    end
+end