
Define DotProduct as Binary Op #194


Closed · wants to merge 17 commits
3 changes: 1 addition & 2 deletions docs/.gitignore
@@ -1,4 +1,3 @@
build/
site/

#Temp to avoid to many changes
src/examples/
2 changes: 2 additions & 0 deletions docs/Project.toml
@@ -1,4 +1,6 @@
[deps]
AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"

Empty file.
72 changes: 72 additions & 0 deletions docs/examples/deepkernellearning.jl
@@ -0,0 +1,72 @@
# # Deep Kernel Learning with Flux
# ## Package loading
# We load a couple of packages for plotting and for optimizing
# the different hyper-parameters
using KernelFunctions
using Flux
using Distributions, LinearAlgebra
using Plots
using ProgressMeter
using AbstractGPs
pyplot(); default(legendfontsize = 15.0, linewidth = 3.0)

# ## Data creation
# We create a simple 1D problem whose behaviour varies strongly across the domain
xmin = -3; xmax = 3 # Limits
N = 150
noise = 0.01
x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc evaluated on a rapidly varying argument
target_f(x::AbstractArray) = target_f(first(x))
y_train = target_f.(x_train) + randn(N) * noise
x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset
# ## Model definition
# We create a neural net with 3 dense layers (1 → 20 → 30 → 5 units)
# The data is passed through the neural net before being used in the kernel
neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
# We compose the Squared Exponential kernel with the neural network features:
k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))
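# Conceptually (a hedged sketch of the construction above, not a formula taken from
# the package documentation), the resulting deep kernel is
# ```math
# k_{\mathrm{deep}}(x, x') = \exp\left(-\tfrac{1}{2} \lVert \mathrm{NN}(x) - \mathrm{NN}(x') \rVert^2\right),
# ```
# i.e. the squared exponential kernel evaluated on the neural network features ``\mathrm{NN}(x)``.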

# We use AbstractGPs.jl to define our model
gpprior = GP(k) # GP Prior
fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f
fp = posterior(fx, y_train) # Posterior of f

# The loss is the negative log evidence of `y` under the GP prior;
# we will use it as the objective to minimize
loss(y) = -logpdf(fx, y)

@info "Init Loss = $(loss(y_train))"

# Flux will automatically extract all the parameters of the kernel
ps = Flux.params(k)

# We show the initial prediction with the untrained model
p_init = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))")
Plots.scatter!(vcat(x_train...), y_train, lab = "data")
pred = marginals(fp(x_test))
Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction")
# ## Training
anim = Animation()
nmax = 1000
opt = Flux.ADAM(0.1)
@showprogress for i in 1:nmax
    global grads = gradient(ps) do
        loss(y_train)
    end
    Flux.Optimise.update!(opt, ps, grads)
    if i % 100 == 0
        L = loss(y_train)
        @info "$i/$nmax, Loss = $L"
        p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $L")
        p = Plots.scatter!(vcat(x_train...), y_train, lab = "data")
        pred = marginals(posterior(fx, y_train)(x_test))
        Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction")
        frame(anim)
        display(p)
    end
end
gif(anim, fps = 5)
71 changes: 71 additions & 0 deletions docs/examples/kernelridgeregression.jl
@@ -0,0 +1,71 @@
# # Kernel Ridge Regression
# ## We load KernelFunctions and some other packages
using KernelFunctions
using LinearAlgebra
using Distributions
using Plots; default(lw = 2.0, legendfontsize = 15.0)
using Flux: Optimise
using ForwardDiff
using Random: seed!
seed!(42)

# ## Data Generation
# We generate data in 1 dimension
xmin = -3; xmax = 3 # Bounds of the data
N = 50 # Number of samples
x_train = rand(Uniform(xmin, xmax), N) # We sample N points uniformly at random
σ = 0.1
y_train = sinc.(x_train) + randn(N) * σ # We evaluate the sinc function and add Gaussian noise
x_test = range(xmin-0.1, xmax+0.1, length=300)

# Plot the data
scatter(x_train, y_train, lab = "data")
plot!(x_test, sinc, lab = "true function")

# ## Kernel training
# To train the kernel parameters via ForwardDiff.jl
# we need a function that builds a kernel from an array of parameters
kernelcall(θ) = transform(
exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(),
exp(θ[3]),
)

# From the theory of kernel ridge regression, we know the prediction for a test set `x` given
# the kernel parameters and the regularization constant
function f(x, x_train, y_train, θ)
k = kernelcall(θ[1:3])
kernelmatrix(k, x, x_train) *
inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y_train
end
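# The function above is the standard kernel ridge regression predictor
# (a hedged restatement, with ``\lambda = \exp(\theta_4)``):
# ```math
# \hat{f}(x_*) = k(x_*, X) \left(k(X, X) + \lambda I\right)^{-1} y
# ```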

# Let's see what the prediction looks like
# with the starting parameters `[1.0, 1.0, 1.0, 1.0]` (passed as `log.(ones(4))`):
ŷ = f(x_test, x_train, y_train, log.(ones(4)))
scatter(x_train, y_train, lab = "data")
plot!(x_test, sinc, lab = "true function")
plot!(x_test, ŷ, lab = "prediction")

# We define the loss using the L2 norm both for the data-fit term
# and for the regularization of the predictions
function loss(θ)
ŷ = f(x_train, x_train, y_train, θ)
sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
end

# The loss at our starting point:
loss(log.(ones(4)))

# ## Training the model
θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector
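# (A hedged reading of the entries, based on `kernelcall` and `f` above: θ[1] is the
# log-variance of the SqExponentialKernel, θ[2] the unused log-weight of the commented-out
# Matern32Kernel, θ[3] the log input-scaling, and θ[4] the log-regularization term.)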
anim = Animation()
opt = Optimise.ADAGrad(0.5)
for i in 1:30
    grads = ForwardDiff.gradient(loss, θ) # We compute the gradients w.r.t. the kernel parameters and regularization
    Δ = Optimise.apply!(opt, θ, grads)
    θ .-= Δ # We apply a simple gradient descent step (rescaled by ADAGrad)
    p = scatter(x_train, y_train, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))")
    plot!(x_test, sinc, lab = "true function")
    plot!(x_test, f(x_test, x_train, y_train, θ), lab = "Prediction", lw = 3.0)
    frame(anim)
end
gif(anim)
48 changes: 48 additions & 0 deletions docs/examples/svm.jl
@@ -0,0 +1,48 @@
# # Support Vector Machines
# ## Package loading
using KernelFunctions
using Distributions, LinearAlgebra
using Plots; default(legendfontsize = 15.0, ms = 5.0)

# ## Data Generation
# ### We first generate a mixture of two Gaussians in 2 dimensions
xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂
μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers
# ### We then sample both y and x
N = 100 # Number of samples
y = rand((-1, 1), N) # Select randomly between the two classes
x = Vector{Vector{Float64}}(undef, N) # We preallocate x
x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y.==1)] # Features for samples of class 1
x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y.==-1)] # Features for samples of class -1
scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1", title = "Data")
scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = -1")

# ## Model Definition
# TODO Write theory here
# ### We create a kernel k
k = SqExponentialKernel() # SqExponentialKernel/RBFKernel
λ = 1.0 # Regularization parameter

# ### We create a function returning the prediction for
# test data `x_new`
function f(x_new, x, y, k, λ)
kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f
end

# ### We also compute the total loss of the model that we want to minimize
hingeloss(y, ŷ) = max(zero(ŷ), 1 - y * ŷ) # hinge loss function
function reg_hingeloss(k, x, y, λ)
    ŷ = f(x, x, y, k, λ)
    return sum(hingeloss.(y, ŷ)) + λ * norm(ŷ) # Total SVM loss with regularization
end
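# In other words (a hedged restatement of the two functions above), the objective we
# would like to minimize is
# ```math
# \mathcal{L} = \sum_{i=1}^N \max\left(0, 1 - y_i \hat{y}_i\right) + \lambda \lVert \hat{y} \rVert,
# ```
# where ``\hat{y} = k(X, X) \left(k(X, X) + \lambda I\right)^{-1} y``.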
# ### We create a 2D grid based on the maximum values of the data
N_test = 100 # Size of the grid
xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid
xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid
# ### We predict the value of y on this grid and plot it against the data
y_grid = f(xgrid_v, x, y, k, λ) # Compute the predictions on the grid
contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model")
scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1")
scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = -1")
xlims!(extrema(xgrid))
ylims!(extrema(xgrid))
15 changes: 15 additions & 0 deletions docs/make.jl
@@ -7,6 +7,21 @@ end

using KernelFunctions

if ispath(joinpath(@__DIR__, "src", "examples"))
    rm(joinpath(@__DIR__, "src", "examples"), recursive=true)
end

for filename in readdir(joinpath(@__DIR__, "..", "examples"))
    endswith(filename, ".jl") || continue
    name = splitext(filename)[1]
    Literate.markdown(
        joinpath(@__DIR__, "..", "examples", filename),
        joinpath(@__DIR__, "src", "examples"),
        name = name,
        documenter = true,
    )
end

DocMeta.setdocmeta!(
KernelFunctions,
:DocTestSetup,
6 changes: 3 additions & 3 deletions docs/src/create_kernel.md
@@ -8,15 +8,15 @@ Here are a few ways depending on how complicated your kernel is:

### SimpleKernel for kernel functions depending on a metric

If your kernel function is of the form `k(x, y) = f(d(x, y))` where `d(x, y)` is a `PreMetric`,
you can construct your custom kernel by defining `kappa` and `metric` for your kernel.
If your kernel function is of the form `k(x, y) = f(binary_op(x, y))`, where `binary_op(x, y)` is a `PreMetric` or another object implementing `pairwise` and `evaluate` from `Distances.jl`,
you can construct your custom kernel by defining `kappa` and `binary_op` for your kernel.
For example, here is how one can define the `SqExponentialKernel` again:

```julia
struct MyKernel <: KernelFunctions.SimpleKernel end

KernelFunctions.kappa(::MyKernel, d2::Real) = exp(-d2)
KernelFunctions.metric(::MyKernel) = SqEuclidean()
KernelFunctions.binary_op(::MyKernel) = SqEuclidean()
```
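
As a quick sanity check (a hypothetical usage sketch, assuming the definition above has been evaluated), the kernel can then be used like any other kernel:

```julia
using KernelFunctions, Distances

x, y = rand(3), rand(3)
MyKernel()(x, y)                                  # should equal exp(-sum(abs2, x - y))
kernelmatrix(MyKernel(), [rand(3) for _ in 1:5])  # 5×5 kernel matrix
```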

### Kernel for more complex kernels
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -11,7 +11,7 @@ The main goals of this package compared to its predecessors/concurrents in [MLKe

The methodology of how kernels are computed is quite simple and is done in three phases (sketched below the list):
- A `Transform` object is applied sample-wise on every sample
- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `Metric` proper to each kernel
- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `BinaryOp` like a `Metric` proper to each kernel
- The `Kernel` function is applied element-wise on the pairwise matrix
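
As a rough illustration (a hedged, conceptual sketch, not the package's actual implementation), these three phases could be spelled out as follows for an untransformed `SqExponentialKernel` on scalar inputs:

```julia
using KernelFunctions, Distances

k = SqExponentialKernel()
x = rand(4)
y = rand(6)

# Phase 1: a `Transform` would be applied sample-wise here (we skip it, i.e. identity).
# Phase 2: pairwise matrix via the kernel's binary operation (`SqEuclidean` for this kernel).
D = [SqEuclidean()(xi, yj) for xi in x, yj in y]
# Phase 3: the kernel function is applied element-wise.
K = KernelFunctions.kappa.(Ref(k), D)

K ≈ kernelmatrix(k, x, y)  # should hold up to floating point error
```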

For a quick introduction on how to use it go to [User guide](@ref)
29 changes: 20 additions & 9 deletions docs/src/metrics.md
@@ -1,31 +1,42 @@
# Metrics
# Binary Operations

`SimpleKernel` implementations rely on [Distances.jl](https://github.com/JuliaStats/Distances.jl) for efficiently computing the pairwise matrix.
This requires a distance measure or metric, such as the commonly used `SqEuclidean` and `Euclidean`.

The metric used by a given kernel type is specified as
```julia
KernelFunctions.metric(::CustomKernel) = SqEuclidean()
KernelFunctions.binary_op(::CustomKernel) = SqEuclidean()
```

However, there are kernels that can be implemented efficiently using "metrics" that do not respect all the definitions expected by Distances.jl. For this reason, KernelFunctions.jl provides additional "metrics" such as `DotProduct` ($\langle x, y \rangle$) and `Delta` ($\delta(x,y)$).


## Adding a new metric
## Adding a new binary operation

If you want to create a new "metric" just implement the following:
If you want to create a new binary operation, there are two options.
If your operation satisfies all [`PreMetric` conditions](https://en.wikipedia.org/wiki/Metric_(mathematics)#Generalized_metrics), you can simply implement the following:

```julia
struct Delta <: Distances.PreMetric
end
struct Delta <: Distances.PreMetric end

@inline function Distances._evaluate(::Delta,a::AbstractVector{T},b::AbstractVector{T}) where {T}
@inline function Distances._evaluate(
::Delta,
a::AbstractVector{T},
b::AbstractVector{T}
) where {T}
@boundscheck if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
end
return a==b
end

@inline (dist::Delta)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist,a,b)
@inline (dist::Delta)(a::Number,b::Number) = a==b
@inline (dist::Delta)(a::AbstractArray, b::AbstractArray) = Distances._evaluate(dist,a,b)
@inline (dist::Delta)(a::Number, b::Number) = a==b
```

However, if your operation does not satisfy some of these conditions (for instance, `d(x, y) ≥ 0` does not hold for the `DotProduct`), we provide a similar backend:
```julia
struct DotProduct <: KernelFunctions.AbstractBinaryOp end
(d::DotProduct)(a, b) = evaluate(d, a, b)
Distances.evaluate(::DotProduct, a, b) = dot(a, b)
```
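
For instance (a hedged usage sketch, assuming the definitions above are loaded), the `DotProduct` can return negative values, which is exactly why it cannot be a `PreMetric`:

```julia
using LinearAlgebra

d = DotProduct()
d([1.0, 2.0], [3.0, 4.0])   # 11.0
d([1.0, -2.0], [3.0, 4.0])  # -5.0, so `d(x, y) ≥ 0` does not hold
```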
12 changes: 8 additions & 4 deletions src/KernelFunctions.jl
@@ -67,11 +67,15 @@ using TensorCore
abstract type Kernel end
abstract type SimpleKernel <: Kernel end

abstract type AbstractBinaryOp end
const BinaryOp = Union{Distances.Metric, AbstractBinaryOp}

include("utils.jl")
include(joinpath("distances", "pairwise.jl"))
include(joinpath("distances", "dotproduct.jl"))
include(joinpath("distances", "delta.jl"))
include(joinpath("distances", "sinus.jl"))

include(joinpath("binary_op", "abstractbinaryop.jl"))
include(joinpath("binary_op", "dotproduct.jl"))
include(joinpath("binary_op", "delta.jl"))
include(joinpath("binary_op", "sinus.jl"))

include(joinpath("transform", "transform.jl"))
include(joinpath("transform", "scaletransform.jl"))
6 changes: 3 additions & 3 deletions src/basekernels/constant.jl
@@ -17,7 +17,7 @@ struct ZeroKernel <: SimpleKernel end

kappa(κ::ZeroKernel, d::T) where {T<:Real} = zero(T)

metric(::ZeroKernel) = Delta()
binary_op(::ZeroKernel) = Delta()

Base.show(io::IO, ::ZeroKernel) = print(io, "Zero Kernel")

@@ -44,7 +44,7 @@ const EyeKernel = WhiteKernel

kappa(κ::WhiteKernel, δₓₓ::Real) = δₓₓ

metric(::WhiteKernel) = Delta()
binary_op(::WhiteKernel) = Delta()

Base.show(io::IO, ::WhiteKernel) = print(io, "White Kernel")

@@ -75,6 +75,6 @@ end

kappa(κ::ConstantKernel, x::Real) = first(κ.c) * one(x)

metric(::ConstantKernel) = Delta()
binary_op(::ConstantKernel) = Delta()

Base.show(io::IO, κ::ConstantKernel) = print(io, "Constant Kernel (c = ", first(κ.c), ")")
2 changes: 1 addition & 1 deletion src/basekernels/cosine.jl
@@ -14,6 +14,6 @@ struct CosineKernel <: SimpleKernel end

kappa(::CosineKernel, d::Real) = cospi(d)

metric(::CosineKernel) = Euclidean()
binary_op(::CosineKernel) = Euclidean()

Base.show(io::IO, ::CosineKernel) = print(io, "Cosine Kernel")
6 changes: 3 additions & 3 deletions src/basekernels/exponential.jl
@@ -16,7 +16,7 @@ struct SqExponentialKernel <: SimpleKernel end

kappa(::SqExponentialKernel, d²::Real) = exp(-d² / 2)

metric(::SqExponentialKernel) = SqEuclidean()
binary_op(::SqExponentialKernel) = SqEuclidean()

iskroncompatible(::SqExponentialKernel) = true

@@ -63,7 +63,7 @@ struct ExponentialKernel <: SimpleKernel end

kappa(::ExponentialKernel, d::Real) = exp(-d)

metric(::ExponentialKernel) = Euclidean()
binary_op(::ExponentialKernel) = Euclidean()

iskroncompatible(::ExponentialKernel) = true

@@ -114,7 +114,7 @@ end

kappa(κ::GammaExponentialKernel, d::Real) = exp(-d^first(κ.γ))

metric(::GammaExponentialKernel) = Euclidean()
binary_op(::GammaExponentialKernel) = Euclidean()

iskroncompatible(::GammaExponentialKernel) = true

2 changes: 1 addition & 1 deletion src/basekernels/exponentiated.jl
@@ -14,7 +14,7 @@ struct ExponentiatedKernel <: SimpleKernel end

kappa(::ExponentiatedKernel, xᵀy::Real) = exp(xᵀy)

metric(::ExponentiatedKernel) = DotProduct()
binary_op(::ExponentiatedKernel) = DotProduct()

iskroncompatible(::ExponentiatedKernel) = true
