Enable batch computation of OT via Sinkhorn (#67)

zsteve · devmotion · web-flow · commit caec5407c157 · 2021-05-29T13:30:37.000-07:00
* allow barycenter to be computed with batch kernel reduction (changes
calling convention)

* sinkhorn batch computation

* update example

* sinkhorn batch computation

* fix tests

* format

* fix type instability as per review

* type instability fix attempt 2

* type instability fix attempt 3

* formatting

* implement common output type for sinkhorn and sinkhorn2

* formatting

* Update src/OptimalTransport.jl

Co-authored-by: David Widmann <devmotion@users.noreply.github.com>

* removed multiple cost matrix from sinkhorn_barycenter, updated docs

* Update examples/basic/script.jl

Co-authored-by: David Widmann <devmotion@users.noreply.github.com>

* Update examples/basic/script.jl

Co-authored-by: David Widmann <devmotion@users.noreply.github.com>

* allow one to many sinkhorn computation

* rebase

* fix output dimensions

* format

* update docstrings

* increment version

* update tests

* formatting

Co-authored-by: David Widmann <devmotion@users.noreply.github.com>
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "OptimalTransport"
 uuid = "7e02d93a-ae51-4f58-b602-d97af76e3b33"
 authors = ["zsteve <stephenz@student.unimelb.edu.au>"]
-version = "0.3.4"
+version = "0.3.5"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
diff --git a/examples/basic/script.jl b/examples/basic/script.jl
@@ -206,10 +206,10 @@ heatmap(
 # the entropically regularised **barycenter** in $\mathcal{P}$ is the discrete probability
 # measure $\mu$ that solves
 # ```math
-# \inf_{\mu \in \mathcal{P}} \sum_{i = 1}^N \lambda_i \mathrm{entropicOT}^{\epsilon}_{C_i}(\mu, \mu_i)
+# \inf_{\mu \in \mathcal{P}} \sum_{i = 1}^N \lambda_i \operatorname{OT}_{\epsilon}(\mu, \mu_i)
 # ```
-# where $\mathrm{entropicOT}^\epsilon_{C_i}(\mu, \mu_i)$ denotes the entropically regularised
-# optimal transport cost with marginals $\mu$ and $\mu_i$, cost matrix $C_i$, and entropic
+# where $\operatorname{OT}_\epsilon(\mu, \mu_i)$ denotes the entropically regularised
+# optimal transport cost with marginals $\mu$ and $\mu_i$, cost matrix $C$, and entropic
 # regularisation parameter $\epsilon$.
 #
 # We set up two measures and compute the weighted barycenters. We choose weights
@@ -225,9 +225,8 @@ plt = plot(; size=(800, 400), legend=:outertopright)
 plot!(plt, support, mu1; label=raw"$\mu_1$")
 plot!(plt, support, mu2; label=raw"$\mu_2$")
 
-mu = hcat(mu1, mu2)'
-C1 = C2 = pairwise(SqEuclidean(), support'; dims=2)
-C = [C1, C2]
+mu = hcat(mu1, mu2)
+C = pairwise(SqEuclidean(), support'; dims=2)
 for λ1 in (0.25, 0.5, 0.75)
     λ2 = 1 - λ1
     a = sinkhorn_barycenter(mu, C, 0.01, [λ1, λ2]; max_iter=1000)
diff --git a/src/OptimalTransport.jl b/src/OptimalTransport.jl
@@ -21,6 +21,12 @@ export ot_cost, ot_plan
 
 const MOI = MathOptInterface
 
+dot_matwise(x::AbstractMatrix, y::AbstractMatrix) = dot(x, y)
+function dot_matwise(x::AbstractArray, y::AbstractMatrix)
+    xmat = reshape(x, size(x, 1) * size(x, 2), :)
+    return reshape(reshape(y, 1, :) * xmat, size(x)[3:end])
+end
+
 """
     emd(μ, ν, C, optimizer)
 
@@ -138,14 +144,19 @@ and ``v`` as
 ```math
 \\gamma = \\operatorname{diag}(u) K \\operatorname{diag}(v).
 ```
-
 Every `check_convergence` steps it is assessed if the algorithm is converged by checking if
 the iterate of the transport plan `G` satisfies
 ```julia
 isapprox(sum(G; dims=2), μ; atol=atol, rtol=rtol, norm=x -> norm(x, 1))
 ```
 The default `rtol` depends on the types of `μ`, `ν`, and `K`. After `maxiter` iterations,
 the computation is stopped.
+
+Note that for a common kernel `K`, multiple histograms may be provided for a batch computation by passing `μ` and `ν`
+as matrices whose columns `μ[:, i]` and `ν[:, i]` correspond to pairs of histograms. 
+The outputs are then matrices `u` and `v` such that `u[:, i]` and `v[:, i]` are the dual variables for `μ[:, i]` and `ν[:, i]`.
+
+In addition, the case where one of `μ` or `ν` is a single histogram and the other a matrix of histograms is supported.
 """
 function sinkhorn_gibbs(
     μ,
@@ -170,7 +181,14 @@ function sinkhorn_gibbs(
             :sinkhorn_gibbs,
         )
     end
-    sum(μ) ≈ sum(ν) ||
+    if (size(μ, 2) != size(ν, 2)) && (min(size(μ, 2), size(ν, 2)) > 1)
+        throw(
+            DimensionMismatch(
+                "Error: number of columns in μ and ν must coincide, if both are matrix valued",
+            ),
+        )
+    end
+    all(sum(μ; dims=1) .≈ sum(ν; dims=1)) ||
         throw(ArgumentError("source and target marginals must have the same mass"))
 
     # set default values of tolerances
@@ -179,32 +197,37 @@ function sinkhorn_gibbs(
     _rtol = rtol === nothing ? (_atol > zero(_atol) ? zero(T) : sqrt(eps(T))) : rtol
 
     # initial iteration
-    u = μ ./ sum(K; dims=2)
+    u = if isequal(size(μ, 2), size(ν, 2))
+        similar(μ)
+    else
+        repeat(similar(μ[:, 1]); outer=(1, max(size(μ, 2), size(ν, 2))))
+    end
+    u .= μ ./ vec(sum(K; dims=2))
     v = ν ./ (K' * u)
     tmp1 = K * v
     tmp2 = similar(u)
 
-    norm_μ = sum(abs, μ) # for convergence check
+    norm_μ = sum(abs, μ; dims=1) # for convergence check
     isconverged = false
     check_step = check_convergence === nothing ? 10 : check_convergence
     for iter in 0:maxiter
         if iter % check_step == 0
             # check source marginal
             # do not overwrite `tmp1` but reuse it for computing `u` if not converged
             @. tmp2 = u * tmp1
-            norm_uKv = sum(abs, tmp2)
+            norm_uKv = sum(abs, tmp2; dims=1)
             @. tmp2 = μ - tmp2
-            norm_diff = sum(abs, tmp2)
+            norm_diff = sum(abs, tmp2; dims=1)
 
             @debug "Sinkhorn algorithm (" *
                    string(iter) *
                    "/" *
                    string(maxiter) *
                    ": absolute error of source marginal = " *
-                   string(norm_diff)
+                   string(maximum(norm_diff))
 
             # check stopping criterion
-            if norm_diff < max(_atol, _rtol * max(norm_μ, norm_uKv))
+            if all(@. norm_diff < max(_atol, _rtol * max(norm_μ, norm_uKv)))
                 @debug "Sinkhorn algorithm ($iter/$maxiter): converged"
                 isconverged = true
                 break
@@ -227,6 +250,13 @@ function sinkhorn_gibbs(
     return u, v
 end
 
+function add_singleton(x::AbstractArray, ::Val{dim}) where {dim}
+    shape = ntuple(ndims(x) + 1) do i
+        return i < dim ? size(x, i) : (i > dim ? size(x, i - 1) : 1)
+    end
+    return reshape(x, shape)
+end
+
 """
     sinkhorn(
         μ, ν, C, ε; atol=0, rtol=atol > 0 ? 0 : √eps, check_convergence=10, maxiter=1_000
@@ -252,6 +282,11 @@ isapprox(sum(G; dims=2), μ; atol=atol, rtol=rtol, norm=x -> norm(x, 1))
 The default `rtol` depends on the types of `μ`, `ν`, and `C`. After `maxiter` iterations,
 the computation is stopped.
 
+Note that for a common cost `C`, multiple histograms may be provided for a batch computation by passing `μ` and `ν`
+as matrices whose columns `μ[:, i]` and `ν[:, i]` correspond to pairs of histograms. 
+
+The output in this case is an `Array` `γ` of coupling matrices such that `γ[:, :, i]` is a coupling of `μ[:, i]` and `ν[:, i]`.
+
 See also: [`sinkhorn2`](@ref)
 """
 function sinkhorn(μ, ν, C, ε; kwargs...)
@@ -260,8 +295,7 @@ function sinkhorn(μ, ν, C, ε; kwargs...)
 
     # compute dual potentials
     u, v = sinkhorn_gibbs(μ, ν, K; kwargs...)
-
-    return K .* u .* v'
+    return K .* add_singleton(u, Val(2)) .* add_singleton(v, Val(1))
 end
 
 """
@@ -286,18 +320,19 @@ function sinkhorn2(μ, ν, C, ε; regularization=false, plan=nothing, kwargs...)
         sinkhorn(μ, ν, C, ε; kwargs...)
     else
         # check dimensions
-        size(C) == (length(μ), length(ν)) ||
-            error("cost matrix `C` must be of size `(length(μ), length(ν))`")
-        size(plan) == size(C) || error(
+        size(C) == (size(μ, 1), size(ν, 1)) || error(
+            "cost matrix `C` must be of size `(size(μ, dims = 1), size(ν, dims = 1))`",
+        )
+        (size(plan, 1), size(plan, 2)) == size(C) || error(
             "optimal transport plan `plan` and cost matrix `C` must be of the same size",
         )
         plan
     end
-
     cost = if regularization
-        dot(γ, C) + ε * sum(LogExpFunctions.xlogx, γ)
+        dot_matwise(γ, C) .+
+        ε * reshape(sum(LogExpFunctions.xlogx, γ; dims=(1, 2)), size(γ)[3:end])
     else
-        dot(γ, C)
+        dot_matwise(γ, C)
     end
 
     return cost
@@ -668,54 +703,36 @@ function sinkhorn_stabilized(
 end
 
 """
-    sinkhorn_barycenter(mu_all, C_all, eps, lambda_all; tol = 1e-9, check_marginal_step = 10, max_iter = 1000)
-
-Compute the entropically regularised (i.e. Sinkhorn) barycenter for a collection of `N`
-histograms `mu_all` with respective cost matrices `C_all`, relative weights `lambda_all`,
-and entropic regularisation parameter `eps`. 
+    sinkhorn_barycenter(μ, C, ε, w; tol=1e-9, check_marginal_step=10, max_iter=1000)
 
- - `mu_all` is taken to contain `N` histograms `mu_all[i, :]` for `math i = 1, \\ldots, N`.
- - `C_all` is taken to be a list of `N` cost matrices corresponding to the `mu_all[i, :]`.
- - `eps` is the scalar regularisation parameter.
- - `lambda_all` are positive weights.
-
-Returns the entropically regularised barycenter of the `mu_all`, i.e. the distribution that minimises
+Compute the Sinkhorn barycenter for a collection of `N` histograms contained in the columns of `μ`, for a cost matrix `C` of size `(size(μ, 1), size(μ, 1))`, relative weights `w` of size `N`, and entropic regularisation parameter `ε`. 
+Returns the entropically regularised barycenter of the `μ`, i.e. the histogram `ρ` of length `size(μ, 1)` that solves 
 
 ```math
-\\min_{\\mu \\in \\Sigma} \\sum_{i = 1}^N \\lambda_i \\mathrm{entropicOT}^{\\epsilon}_{C_i}(\\mu, \\mu_i)
+\\min_{\\rho \\in \\Sigma} \\sum_{i = 1}^N w_i \\operatorname{OT}_{\\varepsilon}(\\mu_i, \\rho)
 ```
 
-where ``\\mathrm{entropicOT}^{\\epsilon}_{C}`` denotes the entropic optimal transport cost with cost ``C`` and entropic regularisation level ``\\epsilon``.
+where ``\\operatorname{OT}_{\\varepsilon}(\\mu, \\nu) = \\inf_{\\gamma \\in \\Pi(\\mu, \\nu)} \\langle \\gamma, C \\rangle + \\varepsilon \\Omega(\\gamma)``
+is the entropic optimal transport loss with cost ``C`` and regularisation ``\\varepsilon``.
 """
-function sinkhorn_barycenter(
-    mu_all, C_all, eps, lambda_all; tol=1e-9, check_marginal_step=10, max_iter=1000
-)
-    sums = sum(mu_all; dims=2)
+function sinkhorn_barycenter(μ, C, ε, w; tol=1e-9, check_marginal_step=10, max_iter=1000)
+    sums = sum(μ; dims=1)
     if !isapprox(extrema(sums)...)
         throw(ArgumentError("Error: marginals are unbalanced"))
     end
-    K_all = [exp.(-C_all[i] / eps) for i in 1:length(C_all)]
+    K = exp.(-C / ε)
     converged = false
-    v_all = ones(size(mu_all))
-    u_all = ones(size(mu_all))
-    N = size(mu_all, 1)
+    v = ones(size(μ))
+    u = ones(size(μ))
+    N = size(μ, 2)
     for n in 1:max_iter
-        for i in 1:N
-            v_all[i, :] = mu_all[i, :] ./ (K_all[i]' * u_all[i, :])
-        end
-        a = ones(size(u_all, 2))
-        for i in 1:N
-            a = a .* ((K_all[i] * v_all[i, :]) .^ (lambda_all[i]))
-        end
-        for i in 1:N
-            u_all[i, :] = a ./ (K_all[i] * v_all[i, :])
-        end
+        v = μ ./ (K' * u)
+        a = ones(size(u, 1))
+        a = prod((K * v)' .^ w; dims=1)'
+        u = a ./ (K * v)
         if n % check_marginal_step == 0
             # check marginal errors
-            err = maximum([
-                maximum(abs.(mu_all[i, :] .- v_all[i, :] .* (K_all[i]' * u_all[i, :]))) for
-                i in 1:N
-            ])
+            err = maximum(abs.(μ .- v .* (K' * u)))
             @debug "Sinkhorn algorithm: iteration $n" err
             if err < tol
                 converged = true
@@ -726,7 +743,7 @@ function sinkhorn_barycenter(
     if !converged
         @warn "Sinkhorn did not converge"
     end
-    return u_all[1, :] .* (K_all[1] * v_all[1, :])
+    return u[:, 1] .* (K * v[:, 1])
 end
 
 """
diff --git a/test/entropic.jl b/test/entropic.jl
@@ -79,6 +79,53 @@ Random.seed!(100)
             # compare with POT
             c_pot = POT.sinkhorn2(μ, ν, C, eps; numItermax=5_000, stopThr=1e-6)[1]
             @test Float32(c_pot) ≈ c rtol = 1e-3
+
+            # batch
+            d = 10
+            μ = fill(Float32(1 / M), (M, d))
+            ν = fill(Float32(1 / N), N)
+
+            γ_all = sinkhorn(μ, ν, C, eps; maxiter=5_000, rtol=1e-6)
+            γ_pot = [
+                POT.sinkhorn(μ[:, i], vec(ν), C, eps; numItermax=5_000, stopThr=1e-6) for
+                i in 1:d
+            ]
+            @test all([
+                isapprox(Float32.(γ_pot[i]), γ_all[:, :, i]; rtol=1e-3) for i in 1:d
+            ])
+            @test eltype(γ_all) == Float32
+        end
+
+        @testset "batch" begin
+            # create two sets of batch histograms 
+            d = 10
+            μ = rand(Float64, (M, d))
+            μ = μ ./ sum(μ; dims=1)
+            ν = rand(Float64, (N, d))
+            ν = ν ./ sum(ν; dims=1)
+
+            # create random cost matrix
+            C = pairwise(SqEuclidean(), rand(1, M), rand(1, N); dims=2)
+
+            # compute optimal transport map (Julia implementation + POT)
+            eps = 0.01
+            γ_all = sinkhorn(μ, ν, C, eps; maxiter=5_000)
+            γ_pot = [POT.sinkhorn(μ[:, i], ν[:, i], C, eps; numItermax=5_000) for i in 1:d]
+            @test all([isapprox(γ_all[:, :, i], γ_pot[i]; rtol=1e-6) for i in 1:d])
+
+            c_all = sinkhorn2(μ, ν, C, eps; maxiter=5_000)
+            c_pot = [
+                POT.sinkhorn2(μ[:, i], ν[:, i], C, eps; numItermax=5_000)[1] for i in 1:d
+            ]
+            @test c_all ≈ c_pot rtol = 1e-6
+
+            γ_all = sinkhorn(μ[:, 1], ν, C, eps; maxiter=5_000)
+            γ_pot = [POT.sinkhorn(μ[:, 1], ν[:, i], C, eps; numItermax=5_000) for i in 1:d]
+            @test all([isapprox(γ_all[:, :, i], γ_pot[i]; rtol=1e-6) for i in 1:d])
+
+            γ_all = sinkhorn(μ, ν[:, 1], C, eps; maxiter=5_000)
+            γ_pot = [POT.sinkhorn(μ[:, i], ν[:, 1], C, eps; numItermax=5_000) for i in 1:d]
+            @test all([isapprox(γ_all[:, :, i], γ_pot[i]; rtol=1e-6) for i in 1:d])
         end
 
         @testset "deprecations" begin
@@ -146,22 +193,24 @@ Random.seed!(100)
         end
     end
 
-    @testset "sinkhorn_barycenter" begin
-        # set up support
-        support = range(-1; stop=1, length=250)
-        μ1 = exp.(-(support .+ 0.5) .^ 2 ./ 0.1^2)
-        μ1 ./= sum(μ1)
-        μ2 = exp.(-(support .- 0.5) .^ 2 ./ 0.1^2)
-        μ2 ./= sum(μ2)
-        μ_all = hcat(μ1, μ2)'
-
-        # create cost matrix
-        C = pairwise(SqEuclidean(), support'; dims=2)
-
-        # compute Sinkhorn barycenter (Julia implementation + POT)
-        eps = 0.01
-        μ_interp = sinkhorn_barycenter(μ_all, [C, C], eps, [0.5, 0.5])
-        μ_interp_pot = POT.barycenter(μ_all', C, eps; weights=[0.5, 0.5], stopThr=1e-9)
-        @test μ_interp ≈ μ_interp_pot
+    @testset "sinkhorn barycenter" begin
+        @testset "example" begin
+            # set up support
+            support = range(-1; stop=1, length=250)
+            μ1 = exp.(-(support .+ 0.5) .^ 2 ./ 0.1^2)
+            μ1 ./= sum(μ1)
+            μ2 = exp.(-(support .- 0.5) .^ 2 ./ 0.1^2)
+            μ2 ./= sum(μ2)
+            μ_all = hcat(μ1, μ2)
+            # create cost matrix
+            C = pairwise(SqEuclidean(), support'; dims=2)
+
+            # compute Sinkhorn barycenter (Julia implementation + POT)
+            eps = 0.01
+            μ_interp = sinkhorn_barycenter(μ_all, C, eps, [0.5, 0.5])
+            μ_interp_pot = POT.barycenter(μ_all, C, eps; weights=[0.5, 0.5], stopThr=1e-9)
+            # need to use a larger tolerance here because of a quirk with the POT solver 
+            @test μ_interp ≈ μ_interp_pot rtol = 1e-6
+        end
     end
 end