From dd0252d8503626595f488897000385975eb8a7a8 Mon Sep 17 00:00:00 2001
From: Parv Agarwal <65726543+Parvfect@users.noreply.github.com>
Date: Tue, 5 Apr 2022 09:33:01 +0100
Subject: [PATCH] Add NeuralTangentKernel Loss

Adapted from the paper "When and Why PINNs Fail to Train: A Neural
Tangent Kernel Perspective" - https://arxiv.org/pdf/2007.14527.pdf

Issues
1. Still not completely sure whether the values used in the struct should be
   calculated in the struct or somewhere else
2. Relies on ForwardDiff.compute_jacobian, which is not defined in this patch
3. Tests and dimension checks are still missing
4. Not completely clear on how the reweighting takes place during training
---
 src/pinns_pde_solve.jl | 54 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/src/pinns_pde_solve.jl b/src/pinns_pde_solve.jl
index a7a2dfa9fb..90999931d8 100644
--- a/src/pinns_pde_solve.jl
+++ b/src/pinns_pde_solve.jl
@@ -336,7 +336,61 @@ SciMLBase.@add_kwonly function MiniMaxAdaptiveLoss(reweight_every; pde_max_optim
 end
 
+"""
+A way of adaptively reweighting the components of the loss function, using the
+traces of the neural tangent kernel (NTK) matrices built from the Jacobians of
+the network predictions at the current parameters (infinite-width assumption).
+
+* `u_pred`: the network's predictions on the training data at the current state,
+* `r_pred`: the network's predictions on the boundary conditions at the current state,
+* `kernel_size`: the size of the NTK matrix blocks assembled from the Jacobians,
+* `reweight_every`: how often to reweight the PDE and BC loss functions, measured in iterations; each reweighting recomputes the NTK traces from the Jacobians,
+* `pde_max_optimiser`: a Flux.Optimise.AbstractOptimiser that is used internally to maximize the weights of the PDE loss functions,
+* `bc_max_optimiser`: a Flux.Optimise.AbstractOptimiser that is used internally to maximize the weights of the BC loss functions,
+* `pde_loss_weights`: either a scalar (which will be broadcast) or a vector the size of the number of PDE equations, which describes the initial weight each PDE loss has in the full loss sum,
+* `bc_loss_weights`: either a scalar (which will be broadcast) or a vector the size of the number of BC equations, which describes the initial weight each BC loss has in the full loss sum,
+* `additional_loss_weights`: a scalar which describes the weight the additional loss function has in the full loss sum; this is currently not adaptive and stays constant with this adaptive loss.
+
+Adapted from the paper
+When and Why PINNs Fail to Train: A Neural Tangent Kernel Perspective
+https://arxiv.org/pdf/2007.14527.pdf
+"""
+mutable struct NeuralTangentKernelAdaptiveLoss{T <: Real,
+                                               PDE_OPT <: Flux.Optimise.AbstractOptimiser,
+                                               BC_OPT <: Flux.Optimise.AbstractOptimiser} <: AbstractAdaptiveLoss
+    reweight_every::Int64
+    pde_max_optimiser::PDE_OPT
+    bc_max_optimiser::BC_OPT
+    pde_loss_weights::Vector{T}
+    bc_loss_weights::Vector{T}
+    additional_loss_weights::Vector{T}
+    SciMLBase.@add_kwonly function NeuralTangentKernelAdaptiveLoss{T, PDE_OPT, BC_OPT}(reweight_every;
+            pde_max_optimiser=Flux.ADAM(1e-4), bc_max_optimiser=Flux.ADAM(0.5),
+            u_pred::AbstractVector, r_pred::AbstractVector, kernel_size::Int,
+            pde_loss_weights=1, bc_loss_weights=1,
+            additional_loss_weights=1) where {T <: Real, PDE_OPT <: Flux.Optimise.AbstractOptimiser, BC_OPT <: Flux.Optimise.AbstractOptimiser}
+        # compute_jacobian is still an open issue (see commit message); it is expected
+        # to return a list of Jacobians of the predictions w.r.t. the parameters.
+        Jr = compute_jacobian(r_pred, bc_loss_weights)
+        Ju = compute_jacobian(u_pred, pde_loss_weights)
+        Kr = compute_ntk(Jr, kernel_size, Jr, kernel_size)
+        Ku = compute_ntk(Ju, kernel_size, Ju, kernel_size)
+        # Trace-based weights from the NTK paper: total trace over each block's own
+        # trace. tr is the matrix trace from LinearAlgebra.
+        lambda_r = (tr(Kr) + tr(Ku)) / tr(Kr)
+        lambda_u = (tr(Kr) + tr(Ku)) / tr(Ku)
+        new(convert(Int64, reweight_every),
+            convert(PDE_OPT, pde_max_optimiser), convert(BC_OPT, bc_max_optimiser),
+            lambda_r * vectorify(pde_loss_weights, T),
+            lambda_u * vectorify(bc_loss_weights, T),
+            vectorify(additional_loss_weights, T))
+    end
+end
+
+SciMLBase.@add_kwonly function NeuralTangentKernelAdaptiveLoss(reweight_every;
+        pde_max_optimiser=Flux.ADAM(1e-4), bc_max_optimiser=Flux.ADAM(0.5),
+        u_pred::AbstractVector, r_pred::AbstractVector, kernel_size::Int,
+        pde_loss_weights=1, bc_loss_weights=1, additional_loss_weights=1)
+    NeuralTangentKernelAdaptiveLoss{Float64, typeof(pde_max_optimiser), typeof(bc_max_optimiser)}(
+        reweight_every; pde_max_optimiser=pde_max_optimiser, bc_max_optimiser=bc_max_optimiser,
+        u_pred=u_pred, r_pred=r_pred, kernel_size=kernel_size,
+        pde_loss_weights=pde_loss_weights, bc_loss_weights=bc_loss_weights,
+        additional_loss_weights=additional_loss_weights)
+end
+
+# Assemble a D1 x D2 NTK block from two equal-length lists of Jacobians:
+# Ker = sum_k reshape(J1[k], D1, :) * reshape(J2[k], D2, :)'
+function compute_ntk(J1_list::AbstractVector, D1::Int, J2_list::AbstractVector, D2::Int)
+    Ker = zeros(D1, D2)
+    for k in 1:length(J1_list)
+        Ker += reshape(J1_list[k], (D1, :)) * transpose(reshape(J2_list[k], (D2, :)))
+    end
+    return Ker
+end
 
 """
 Create dictionary: variable => unique number for variable
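Note for reviewers: below is a self-contained sketch of the trace-based reweighting that the constructor performs, kept independent of the unresolved compute_jacobian helper. The Jacobians J_u and J_r are placeholder random matrices (rows = predictions at sampled points, columns = network parameters); only the lambda formulas mirror the patch.

    using LinearAlgebra

    # Hypothetical Jacobians of interior (u) and boundary (r) predictions
    # with respect to the network parameters; shapes are placeholders.
    J_u = randn(100, 400)
    J_r = randn(50, 400)

    # NTK Gram blocks under the infinite-width assumption: K = J * J'.
    K_uu = J_u * transpose(J_u)
    K_rr = J_r * transpose(J_r)

    # Trace-based weights from the paper: each loss term is scaled by
    # (total trace) / (own trace), so the term whose kernel is weaker
    # receives the larger weight.
    total = tr(K_uu) + tr(K_rr)
    lambda_u = total / tr(K_uu)
    lambda_r = total / tr(K_rr)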
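And a quick shape check for compute_ntk as defined in the hunk, again with placeholder data:

    # Five Jacobians, each reshapeable to (D, :) with D = 4.
    J_list = [randn(4, 20) for _ in 1:5]
    K = compute_ntk(J_list, 4, J_list, 4)
    @assert size(K) == (4, 4)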