
Define DotProduct as Binary Op #194


Closed · wants to merge 17 commits
3 changes: 1 addition & 2 deletions docs/.gitignore
@@ -1,4 +1,3 @@
build/
site/

#Temp to avoid to many changes
src/examples/
2 changes: 2 additions & 0 deletions docs/Project.toml
@@ -1,4 +1,6 @@
[deps]
AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"

Empty file.
72 changes: 72 additions & 0 deletions docs/examples/deepkernellearning.jl
@@ -0,0 +1,72 @@
# # Deep Kernel Learning with Flux
# ## Package loading
# We load a couple of packages for plotting and for optimizing
# the different hyper-parameters
using KernelFunctions
using Flux
using Distributions, LinearAlgebra
using Plots
using ProgressMeter
using AbstractGPs
pyplot(); default(legendfontsize = 15.0, linewidth = 3.0)

# ## Data creation
# We create a simple 1D problem whose behaviour varies strongly across the domain
xmin = -3; xmax = 3 # Limits
N = 150
noise = 0.01
x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc evaluated on a rapidly varying argument
target_f(x::AbstractArray) = target_f(first(x))
y_train = target_f.(x_train) + randn(N) * noise
x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset
# ## Model definition
# We create a neural net with 3 dense layers (1 → 20 → 30 → 5 units)
# The data is passed through the neural net before being used in the kernel
neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
# We compose the Squared Exponential kernel with the neural network features:
k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))
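# Conceptually (a hedged sketch of the construction above, not a formula taken from
# the package documentation), the resulting deep kernel is
# ```math
# k_{\mathrm{deep}}(x, x') = \exp\left(-\tfrac{1}{2} \lVert \mathrm{NN}(x) - \mathrm{NN}(x') \rVert^2\right),
# ```
# i.e. the squared exponential kernel evaluated on the neural network features ``\mathrm{NN}(x)``.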

# We use AbstractGPs.jl to define our model
gpprior = GP(k) # GP Prior
fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f
fp = posterior(fx, y_train) # Posterior of f

# The loss is the negative log evidence of `y` under the GP prior;
# we will use it as the objective to minimize
loss(y) = -logpdf(fx, y)

@info "Init Loss = $(loss(y_train))"

# Flux will automatically extract all the parameters of the kernel
ps = Flux.params(k)

# We show the initial prediction with the untrained model
p_init = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))")
Plots.scatter!(vcat(x_train...), y_train, lab = "data")
pred = marginals(fp(x_test))
Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction")
# ## Training
anim = Animation()
nmax = 1000
opt = Flux.ADAM(0.1)
@showprogress for i in 1:nmax
    global grads = gradient(ps) do
        loss(y_train)
    end
    Flux.Optimise.update!(opt, ps, grads)
    if i % 100 == 0
        L = loss(y_train)
        @info "$i/$nmax, Loss = $L"
        p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $L")
        p = Plots.scatter!(vcat(x_train...), y_train, lab = "data")
        pred = marginals(posterior(fx, y_train)(x_test))
        Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction")
        frame(anim)
        display(p)
    end
end
gif(anim, fps = 5)
71 changes: 71 additions & 0 deletions docs/examples/kernelridgeregression.jl
@@ -0,0 +1,71 @@
# # Kernel Ridge Regression
# ## We load KernelFunctions and some other packages
using KernelFunctions
using LinearAlgebra
using Distributions
using Plots; default(lw = 2.0, legendfontsize = 15.0)
using Flux: Optimise
using ForwardDiff
using Random: seed!
seed!(42)

# ## Data Generation
# We generate data in 1 dimension
xmin = -3; xmax = 3 # Bounds of the data
N = 50 # Number of samples
x_train = rand(Uniform(xmin, xmax), N) # We sample N points uniformly at random
σ = 0.1
y_train = sinc.(x_train) + randn(N) * σ # We evaluate the sinc function and add Gaussian noise
x_test = range(xmin-0.1, xmax+0.1, length=300)

# Plot the data
scatter(x_train, y_train, lab = "data")
plot!(x_test, sinc, lab = "true function")

# ## Kernel training
# To train the kernel parameters via ForwardDiff.jl
# we need a function that builds a kernel from an array of parameters
kernelcall(θ) = transform(
exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(),
exp(θ[3]),
)

# From the theory of kernel ridge regression, we know the prediction for a test set `x` given
# the kernel parameters and the regularization constant
function f(x, x_train, y_train, θ)
k = kernelcall(θ[1:3])
kernelmatrix(k, x, x_train) *
inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y_train
end
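# The function above is the standard kernel ridge regression predictor
# (a hedged restatement, with ``\lambda = \exp(\theta_4)``):
# ```math
# \hat{f}(x_*) = k(x_*, X) \left(k(X, X) + \lambda I\right)^{-1} y
# ```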

# Let's see what the prediction looks like
# with the starting parameters `[1.0, 1.0, 1.0, 1.0]` (passed as `log.(ones(4))`):
ŷ = f(x_test, x_train, y_train, log.(ones(4)))
scatter(x_train, y_train, lab = "data")
plot!(x_test, sinc, lab = "true function")
plot!(x_test, ŷ, lab = "prediction")

# We define the loss using the L2 norm both for the data-fit term
# and for the regularization of the predictions
function loss(θ)
ŷ = f(x_train, x_train, y_train, θ)
sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
end

# The loss at our starting point:
loss(log.(ones(4)))

# ## Training the model
θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector
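# (A hedged reading of the entries, based on `kernelcall` and `f` above: θ[1] is the
# log-variance of the SqExponentialKernel, θ[2] the unused log-weight of the commented-out
# Matern32Kernel, θ[3] the log input-scaling, and θ[4] the log-regularization term.)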
anim = Animation()
opt = Optimise.ADAGrad(0.5)
for i in 1:30
    grads = ForwardDiff.gradient(loss, θ) # We compute the gradients w.r.t. the kernel parameters and regularization
    Δ = Optimise.apply!(opt, θ, grads)
    θ .-= Δ # We apply a simple gradient descent step (rescaled by ADAGrad)
    p = scatter(x_train, y_train, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))")
    plot!(x_test, sinc, lab = "true function")
    plot!(x_test, f(x_test, x_train, y_train, θ), lab = "Prediction", lw = 3.0)
    frame(anim)
end
gif(anim)
48 changes: 48 additions & 0 deletions docs/examples/svm.jl
@@ -0,0 +1,48 @@
# # Support Vector Machines
# ## Package loading
using KernelFunctions
using Distributions, LinearAlgebra
using Plots; default(legendfontsize = 15.0, ms = 5.0)

# ## Data Generation
# ### We first generate a mixture of two Gaussians in 2 dimensions
xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂
μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers
# ### We then sample both y and x
N = 100 # Number of samples
y = rand((-1, 1), N) # Select randomly between the two classes
x = Vector{Vector{Float64}}(undef, N) # We preallocate x
x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y.==1)] # Features for samples of class 1
x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y.==-1)] # Features for samples of class -1
scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1", title = "Data")
scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = -1")

# ## Model Definition
# TODO Write theory here
# ### We create a kernel k
k = SqExponentialKernel() # SqExponentialKernel/RBFKernel
λ = 1.0 # Regularization parameter

# ### We create a function returning the prediction for
# test data `x_new`
function f(x_new, x, y, k, λ)
kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f
end

# ### We also compute the total loss of the model that we want to minimize
hingeloss(y, ŷ) = max(zero(ŷ), 1 - y * ŷ) # hinge loss function
function reg_hingeloss(k, x, y, λ)
    ŷ = f(x, x, y, k, λ)
    return sum(hingeloss.(y, ŷ)) + λ * norm(ŷ) # Total SVM loss with regularization
end
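# In other words (a hedged restatement of the two functions above), the objective we
# would like to minimize is
# ```math
# \mathcal{L} = \sum_{i=1}^N \max\left(0, 1 - y_i \hat{y}_i\right) + \lambda \lVert \hat{y} \rVert,
# ```
# where ``\hat{y} = k(X, X) \left(k(X, X) + \lambda I\right)^{-1} y``.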
# ### We create a 2D grid based on the maximum values of the data
N_test = 100 # Size of the grid
xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid
xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid
# ### We predict the value of y on this grid and plot it against the data
y_grid = f(xgrid_v, x, y, k, λ) # Compute the predictions on the grid
contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model")
scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1")
scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = -1")
xlims!(extrema(xgrid))
ylims!(extrema(xgrid))
15 changes: 15 additions & 0 deletions docs/make.jl
@@ -7,6 +7,21 @@ end

using KernelFunctions

if ispath(joinpath(@__DIR__, "src", "examples"))
    rm(joinpath(@__DIR__, "src", "examples"), recursive=true)
end

for filename in readdir(joinpath(@__DIR__, "..", "examples"))
    endswith(filename, ".jl") || continue
    name = splitext(filename)[1]
    Literate.markdown(
        joinpath(@__DIR__, "..", "examples", filename),
        joinpath(@__DIR__, "src", "examples"),
        name = name,
        documenter = true,
    )
end

DocMeta.setdocmeta!(
KernelFunctions,
:DocTestSetup,
6 changes: 3 additions & 3 deletions docs/src/create_kernel.md
@@ -8,15 +8,15 @@ Here are a few ways depending on how complicated your kernel is:

### SimpleKernel for kernel functions depending on a metric

If your kernel function is of the form `k(x, y) = f(d(x, y))` where `d(x, y)` is a `PreMetric`,
you can construct your custom kernel by defining `kappa` and `metric` for your kernel.
If your kernel function is of the form `k(x, y) = f(binary_op(x, y))`, where `binary_op(x, y)` is a `PreMetric` or another object implementing `pairwise` and `evaluate` from `Distances.jl`,
you can construct your custom kernel by defining `kappa` and `binary_op` for your kernel.
For example, here is how one can define the `SqExponentialKernel` again:

```julia
struct MyKernel <: KernelFunctions.SimpleKernel end

KernelFunctions.kappa(::MyKernel, d2::Real) = exp(-d2)
KernelFunctions.metric(::MyKernel) = SqEuclidean()
KernelFunctions.binary_op(::MyKernel) = SqEuclidean()
```
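
As a quick sanity check (a hypothetical usage sketch, assuming the definition above has been evaluated), the kernel can then be used like any other kernel:

```julia
using KernelFunctions, Distances

x, y = rand(3), rand(3)
MyKernel()(x, y)                                  # should equal exp(-sum(abs2, x - y))
kernelmatrix(MyKernel(), [rand(3) for _ in 1:5])  # 5×5 kernel matrix
```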

### Kernel for more complex kernels
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -11,7 +11,7 @@ The main goals of this package compared to its predecessors/concurrents in [MLKe

The methodology of how kernels are computed is quite simple and is done in three phases (sketched below the list):
- A `Transform` object is applied sample-wise on every sample
- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `Metric` proper to each kernel
- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `BinaryOp` like a `Metric` proper to each kernel
- The `Kernel` function is applied element-wise on the pairwise matrix
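
As a rough illustration (a hedged, conceptual sketch, not the package's actual implementation), these three phases could be spelled out as follows for an untransformed `SqExponentialKernel` on scalar inputs:

```julia
using KernelFunctions, Distances

k = SqExponentialKernel()
x = rand(4)
y = rand(6)

# Phase 1: a `Transform` would be applied sample-wise here (we skip it, i.e. identity).
# Phase 2: pairwise matrix via the kernel's binary operation (`SqEuclidean` for this kernel).
D = [SqEuclidean()(xi, yj) for xi in x, yj in y]
# Phase 3: the kernel function is applied element-wise.
K = KernelFunctions.kappa.(Ref(k), D)

K ≈ kernelmatrix(k, x, y)  # should hold up to floating point error
```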

For a quick introduction on how to use it go to [User guide](@ref)
29 changes: 20 additions & 9 deletions docs/src/metrics.md
@@ -1,31 +1,42 @@
# Metrics
# Binary Operations

`SimpleKernel` implementations rely on [Distances.jl](https://github.com/JuliaStats/Distances.jl) for efficiently computing the pairwise matrix.
This requires a distance measure or metric, such as the commonly used `SqEuclidean` and `Euclidean`.

The metric used by a given kernel type is specified as
```julia
KernelFunctions.metric(::CustomKernel) = SqEuclidean()
KernelFunctions.binary_op(::CustomKernel) = SqEuclidean()
```

However, there are kernels that can be implemented efficiently using "metrics" that do not respect all the definitions expected by Distances.jl. For this reason, KernelFunctions.jl provides additional "metrics" such as `DotProduct` ($\langle x, y \rangle$) and `Delta` ($\delta(x,y)$).


## Adding a new metric
## Adding a new binary operation

If you want to create a new "metric" just implement the following:
If you want to create a new binary operation, there are two options.
If your operation satisfies all [`PreMetric` conditions](https://en.wikipedia.org/wiki/Metric_(mathematics)#Generalized_metrics), you can simply implement the following:

```julia
struct Delta <: Distances.PreMetric
end
struct Delta <: Distances.PreMetric end

@inline function Distances._evaluate(::Delta,a::AbstractVector{T},b::AbstractVector{T}) where {T}
@inline function Distances._evaluate(
::Delta,
a::AbstractVector{T},
b::AbstractVector{T}
) where {T}
@boundscheck if length(a) != length(b)
throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
end
return a==b
end

@inline (dist::Delta)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist,a,b)
@inline (dist::Delta)(a::Number,b::Number) = a==b
@inline (dist::Delta)(a::AbstractArray, b::AbstractArray) = Distances._evaluate(dist,a,b)
@inline (dist::Delta)(a::Number, b::Number) = a==b
```

However, if your operation does not satisfy some of these conditions (for instance, `d(x, y) ≥ 0` does not hold for the `DotProduct`), we provide a similar backend:
```julia
struct DotProduct <: KernelFunctions.AbstractBinaryOp end
(d::DotProduct)(a, b) = evaluate(d, a, b)
Distances.evaluate(::DotProduct, a, b) = dot(a, b)
```
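
For instance (a hedged usage sketch, assuming the definitions above are loaded), the `DotProduct` can return negative values, which is exactly why it cannot be a `PreMetric`:

```julia
using LinearAlgebra

d = DotProduct()
d([1.0, 2.0], [3.0, 4.0])   # 11.0
d([1.0, -2.0], [3.0, 4.0])  # -5.0, so `d(x, y) ≥ 0` does not hold
```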
12 changes: 8 additions & 4 deletions src/KernelFunctions.jl
@@ -67,11 +67,15 @@ using TensorCore
abstract type Kernel end
abstract type SimpleKernel <: Kernel end

abstract type AbstractBinaryOp end
const BinaryOp = Union{Distances.Metric, AbstractBinaryOp}

include("utils.jl")
include(joinpath("distances", "pairwise.jl"))
include(joinpath("distances", "dotproduct.jl"))
include(joinpath("distances", "delta.jl"))
include(joinpath("distances", "sinus.jl"))

include(joinpath("binary_op", "abstractbinaryop.jl"))
include(joinpath("binary_op", "dotproduct.jl"))
include(joinpath("binary_op", "delta.jl"))
include(joinpath("binary_op", "sinus.jl"))

include(joinpath("transform", "transform.jl"))
include(joinpath("transform", "scaletransform.jl"))
6 changes: 3 additions & 3 deletions src/basekernels/constant.jl
@@ -17,7 +17,7 @@ struct ZeroKernel <: SimpleKernel end

kappa(κ::ZeroKernel, d::T) where {T<:Real} = zero(T)

metric(::ZeroKernel) = Delta()
binary_op(::ZeroKernel) = Delta()

Base.show(io::IO, ::ZeroKernel) = print(io, "Zero Kernel")

@@ -44,7 +44,7 @@ const EyeKernel = WhiteKernel

kappa(κ::WhiteKernel, δₓₓ::Real) = δₓₓ

metric(::WhiteKernel) = Delta()
binary_op(::WhiteKernel) = Delta()

Base.show(io::IO, ::WhiteKernel) = print(io, "White Kernel")

@@ -75,6 +75,6 @@ end

kappa(κ::ConstantKernel, x::Real) = first(κ.c) * one(x)

metric(::ConstantKernel) = Delta()
binary_op(::ConstantKernel) = Delta()

Base.show(io::IO, κ::ConstantKernel) = print(io, "Constant Kernel (c = ", first(κ.c), ")")
2 changes: 1 addition & 1 deletion src/basekernels/cosine.jl
@@ -14,6 +14,6 @@ struct CosineKernel <: SimpleKernel end

kappa(::CosineKernel, d::Real) = cospi(d)

metric(::CosineKernel) = Euclidean()
binary_op(::CosineKernel) = Euclidean()

Base.show(io::IO, ::CosineKernel) = print(io, "Cosine Kernel")
6 changes: 3 additions & 3 deletions src/basekernels/exponential.jl
@@ -16,7 +16,7 @@ struct SqExponentialKernel <: SimpleKernel end

kappa(::SqExponentialKernel, d²::Real) = exp(-d² / 2)

metric(::SqExponentialKernel) = SqEuclidean()
binary_op(::SqExponentialKernel) = SqEuclidean()

iskroncompatible(::SqExponentialKernel) = true

@@ -63,7 +63,7 @@ struct ExponentialKernel <: SimpleKernel end

kappa(::ExponentialKernel, d::Real) = exp(-d)

metric(::ExponentialKernel) = Euclidean()
binary_op(::ExponentialKernel) = Euclidean()

iskroncompatible(::ExponentialKernel) = true

@@ -114,7 +114,7 @@ end

kappa(κ::GammaExponentialKernel, d::Real) = exp(-d^first(κ.γ))

metric(::GammaExponentialKernel) = Euclidean()
binary_op(::GammaExponentialKernel) = Euclidean()

iskroncompatible(::GammaExponentialKernel) = true

2 changes: 1 addition & 1 deletion src/basekernels/exponentiated.jl
@@ -14,7 +14,7 @@ struct ExponentiatedKernel <: SimpleKernel end

kappa(::ExponentiatedKernel, xᵀy::Real) = exp(xᵀy)

metric(::ExponentiatedKernel) = DotProduct()
binary_op(::ExponentiatedKernel) = DotProduct()

iskroncompatible(::ExponentiatedKernel) = true
