15 changes: 8 additions & 7 deletions src/compiler/interface.jl
@@ -163,6 +163,7 @@ end

Base.show(io::IO, ps::Grads) = print(io, "Grads(...)")

@forward Grads.grads Base.getindex, Base.haskey, Base.iterate, Base.keys
@forward Grads.grads Base.setindex!
@forward Grads.params Base.length

@@ -171,15 +172,15 @@ const ADictOrGrads = Union{AbstractDict, Grads}
# Dictionary interface.
# Don't use the IdDict directly since it may contain some spurious pairs.
Base.haskey(gs::Grads, x) = x ∈ gs.params
Base.keys(gs::Grads) = gs.params
# Base.keys(gs::Grads) = gs.params
Base.values(gs::Grads) = (gs.grads[p] for p in gs.params)

Member

Forwarding keys to gs.grads while relying on gs.params for the values is not good. Either we base the dictionary interface entirely on gs.params or entirely on gs.grads; we can't mix the two.

The comment above,

# Don't use the IdDict directly since it may contain some spurious pairs

suggests that we should either think these changes through thoroughly or leave things as they are (which seems the best option to me).

Member Author

What is the spurious stuff you refer to? It contains references to the objects that had gradients along the way.

Member

I think Carlo is referring to the existing comment at https://github.com/FluxML/Zygote.jl/pull/823/files#diff-7511b224d7f3ebb56465690de8e307422e3c9798a22bdd4e960d5c86ba6528aaR173. My understanding of that is that Base.keys(Grads.grads) may contain items not in Base.keys(Grads.params). This seems like it should never happen though, so is the comment out of date or am I missing some scenario where it could?

Member

The comment is not outdated:

julia> using Flux

julia> m = Chain(Dense(2,2), x->relu.(x), BatchNorm(2)) 
Chain(Dense(2, 2), #1, BatchNorm(2))

julia> gs = gradient(() -> sum(m(rand(2,2))), Flux.params(m))
Grads(...)

julia> gs.grads
IdDict{Any, Any} with 8 entries:
  Float32[0.0, 0.0]                              => [0.0, 0.0]
  BatchNorm(2)                                   => RefValue{Any}((λ = nothing, β = nothing, γ = nothing, μ = nothing, σ² = nothing, ϵ = 0.0, momentum = nothing, affi
  Float32[-0.63824 0.222623; -0.785237 0.536415] => [0.0 0.0; 0.0 0.0]
  :(Main.m)                                      => (layers = (nothing, nothing, RefValue{Any}((λ = nothing, β = nothing, γ = nothing, μ = nothing, σ² = nothing, ϵ = 
  Box([0.0; 0.0])                                => RefValue{Any}((contents = nothing,))
  Float32[0.0, 0.0]                              => [2.0, 2.0]
  Box([0.0; 0.0])                                => RefValue{Any}((contents = nothing,))
  Float32[1.0, 1.0]                              => [0.0, 0.0]

Member

We have to keep the current dict interface based on gs.params.

Member Author

None of this is spurious, though: there is no prior knowledge of what needs to be tracked at the beginning of differentiation. The grads dictionary holds everything it needed to track, even entities that weren't present in the params; they may have been indirectly needed to get the grads of the params. What we can guarantee is that the grads dictionary will always have the params as keys. So the defensive thing is to return the entire dict, so that the values for the intermediaries are available to multiple levels of differentiation.

Member

Then, per Carlo's point, Base.values(gs::Grads) should forward to .grads as well. Having the two be differently sized is unexpected (i.e. potentially subtly breaking), and arguably breaks the contract between keys and values.

Member Author

Yes, I'll add that
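
To make the contract being discussed concrete, here is a minimal illustrative sketch of the params-based option, where keys, values, and iteration all derive from gs.params and therefore always agree in length and order (the grads-based alternative would instead forward all three to gs.grads). This is only a sketch for the discussion, not the code in this diff:

Base.keys(gs::Grads)   = gs.params
Base.values(gs::Grads) = (gs.grads[p] for p in gs.params)

function Base.iterate(gs::Grads, state...)
  res = iterate(gs.params, state...)
  isnothing(res) && return nothing
  p, next_state = res
  return gs[p], next_state  # yields the gradient of each param, in params order
end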


function Base.iterate(gs::Grads, state...)
res = iterate(gs.params, state...)
isnothing(res) && return nothing
p, next_state = res
return gs[p], next_state
end
# function Base.iterate(gs::Grads, state...)
# res = iterate(gs.params, state...)
# isnothing(res) && return nothing
# p, next_state = res
# return gs[p], next_state
# end

function Base.getindex(gs::Grads, x)
isbits(x) && error("Only reference types can be differentiated with `Params`.")
14 changes: 1 addition & 13 deletions src/lib/array.jl
@@ -552,6 +552,7 @@ end

@adjoint convert(::Type{R}, A::LinearAlgebra.HermOrSym{T,S}) where {T,S,R<:Array} = convert(R, A),
Δ -> (nothing, convert(S, Δ),)

@adjoint Matrix(A::LinearAlgebra.HermOrSym{T,S}) where {T,S} = Matrix(A),
Δ -> (convert(S, Δ),)

@@ -731,7 +732,6 @@ end
return ((uplo=nothing, info=nothing, factors=Δ_factors),)
end
end

@adjoint function logdet(C::Cholesky)
return logdet(C), function(Δ)
return ((uplo=nothing, info=nothing, factors=Diagonal(2 .* Δ ./ diag(C.factors))),)
@@ -756,14 +756,11 @@ end
@adjoint function -(S::UniformScaling, A::AbstractMatrix)
return S - A, Δ->((λ=tr(Δ),), -Δ)
end

@adjoint +(A::AbstractArray, B::AbstractArray) = A + B, Δ->(Δ, Δ)
@adjoint -(A::AbstractArray, B::AbstractArray) = A - B, Δ->(Δ, -Δ)
@adjoint -(A::AbstractArray) = -A, Δ->(-Δ,)

# Abstract FFT
# ===================

# AbstractFFTs functions do not work with FillArrays, which are needed
# for some functionality of Zygote. To make it work with FillArrays
# as well, overload the relevant functions
@@ -773,56 +770,47 @@ AbstractFFTs.ifft(x::Fill, dims...) = AbstractFFTs.ifft(collect(x), dims...)
AbstractFFTs.rfft(x::Fill, dims...) = AbstractFFTs.rfft(collect(x), dims...)
AbstractFFTs.irfft(x::Fill, d, dims...) = AbstractFFTs.irfft(collect(x), d, dims...)
AbstractFFTs.brfft(x::Fill, d, dims...) = AbstractFFTs.brfft(collect(x), d, dims...)
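
As a quick illustration of why these overloads matter (a sketch, assuming FFTW is loaded as the AbstractFFTs backend, with output as printed on a recent Julia version): the lazy Fill is materialized via collect before the transform runs.

julia> using Zygote, FFTW, FillArrays   # Zygote provides the Fill overloads above

julia> fft(Fill(1.0, 4))                # dispatches to AbstractFFTs.fft(x::Fill, dims...)
4-element Vector{ComplexF64}:
 4.0 + 0.0im
 0.0 + 0.0im
 0.0 + 0.0im
 0.0 + 0.0im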

# the adjoint jacobian of an FFT with respect to its input is the reverse FFT of the
# gradient of its inputs, but with different normalization factor
@adjoint function fft(xs)
return AbstractFFTs.fft(xs), function(Δ)
return (AbstractFFTs.bfft(Δ),)
end
end

Member (@CarloLucibello, Jun 6, 2021)

? these empty lines shouldn't be removed

@adjoint function *(P::AbstractFFTs.Plan, xs)
return P * xs, function(Δ)
N = prod(size(xs)[[P.region...]])
return (nothing, N * (P \ Δ))
end
end

@adjoint function \(P::AbstractFFTs.Plan, xs)
return P \ xs, function(Δ)
N = prod(size(Δ)[[P.region...]])
return (nothing, (P * Δ)/N)
end
end

# all of the plans normalize their inverse, while we need the unnormalized one.
@adjoint function ifft(xs)
return AbstractFFTs.ifft(xs), function(Δ)
N = length(xs)
return (AbstractFFTs.fft(Δ)/N,)
end
end

@adjoint function bfft(xs)
return AbstractFFTs.bfft(xs), function(Δ)
return (AbstractFFTs.fft(Δ),)
end
end

@adjoint function fftshift(x)
return fftshift(x), function(Δ)
return (ifftshift(Δ),)
end
end

@adjoint function ifftshift(x)
return ifftshift(x), function(Δ)
return (fftshift(Δ),)
end
end
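
As a sanity check on the fft adjoint above (an illustrative sketch, not part of this diff): for a real vector x, Parseval's theorem gives sum(abs2, fft(x)) == length(x) * sum(abs2, x), so the gradient should be 2 * length(x) * x, which the bfft-based pullback reproduces.

julia> using Zygote, FFTW

julia> x = rand(8);

julia> g, = gradient(x -> sum(abs2, fft(x)), x);

julia> g ≈ 2 * length(x) * x    # Parseval: sum(abs2, fft(x)) == length(x) * sum(abs2, x)
true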


# to actually use rfft, one needs to insure that everything
# that happens in the Fourier domain could've been done in
# the space domain with real numbers. This means enforcing
21 changes: 21 additions & 0 deletions src/lib/base.jl
@@ -44,6 +44,27 @@ end
end
end

@adjoint function Base._oidd_nextind(a, i)

Member

do we need to define an adjoint for an internal function of Base?

Member Author

Unfortunately I couldn't see a different way at the time. I'm with you on avoiding internal functions, but without this adjoint we ended up dropping some grads, which shouldn't happen.

Base._oidd_nextind(a, i), Δ -> begin
(nothing, nothing)
end
end
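If the only goal is to mark this internal helper as non-differentiable, Zygote's @nograd macro (already used for Channel further down in this file) would be an equivalent, terser way to express the same thing; shown here only as a hedged suggestion, not what the diff does:

@nograd Base._oidd_nextind  # equivalent in effect: no gradient flows through the IdDict index helper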
@adjoint! function get(d::AbstractDict, k, default)
hk = Ref{Bool}()
val = if haskey(d, k)
hk[] = true
d[k]
else
hk[] = false
d[k] = default

Member

get should not mutate; maybe you want to define the adjoint for get! instead?

Member Author

We aren't mutating anything in the user-defined objects, so it should be fine. get! would still mutate the gradient dictionary, so it's not any better.

Member

You are defining an adjoint for get(d::AbstractDict, k, default) that mutates d; this is not fine at all.

Member Author

Ok, I see what you mean

end
function back(Δ)
Δ2 = setindex!(grad_mut(__context__, d), Δ, k)
(Δ2, nothing, nothing)
end
val, back
end
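
Following up on the mutation concern in the thread above, a minimal non-mutating sketch (an illustration under assumptions, not the PR's final code) would return default on a miss instead of writing it into d, and route the gradient to the default argument on that path:

@adjoint! function get(d::AbstractDict, k, default)
  hk = haskey(d, k)
  val = hk ? d[k] : default            # never write into the user's dict
  function back(Δ)
    if hk
      Δd = setindex!(grad_mut(__context__, d), Δ, k)
      (Δd, nothing, nothing)
    else
      (nothing, nothing, Δ)            # the miss path returned `default`, so the gradient flows there
    end
  end
  val, back
end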

# Channels

@nograd Channel
30 changes: 30 additions & 0 deletions test/interface.jl
@@ -164,3 +164,33 @@ end
@test all(abs.(gs[b]) .<= 1e-5)
end
end

@testset "Params nesting" begin
struct Dense{F,T,S}
W::T
b::S
σ::F
end

(d::Dense)(x) = d.σ.(d.W * x .+ d.b)
d = Dense(ones(Float32, 3,3), zeros(Float32, 3), identity)
ps = Zygote.Params([d.W, d.b])
r = ones(Float32, 3,3)

gs = gradient(ps) do
p, pb = pullback(ps) do
sum(d(r))
end
g = pb(p)
sum(g[d.W]) # + sum(g[d.b])
end

@test gs[d.W] ≈ fill(81f0, (3,3))

# Test L2
l2g = gradient(ps) do
sum(sum(x .^ 2) for x in ps)
end
@test l2g[d.W] ≈ fill(2.f0, size(d.W))
@test l2g[d.b] ≈ fill(0.f0, size(d.b))
end