rm Flux.Zeros (#1882)

mcabbott · CarloLucibello · web-flow · commit 0edf602dc2eb · 2022-03-05T11:39:40.000-05:00
* rm Flux.Zeros, take N+1

* human-readable loadparams tests, same results

* fixup

* make the words match the code

* upgrade to test Chain, more errors, but same on master

* Update src/utils.jl

Co-authored-by: Carlo Lucibello <carlo.lucibello@gmail.com>

* Update src/layers/basic.jl

Co-authored-by: Carlo Lucibello <carlo.lucibello@gmail.com>
diff --git a/src/Flux.jl b/src/Flux.jl
@@ -37,7 +37,6 @@ using CUDA
 const use_cuda = Ref{Union{Nothing,Bool}}(nothing)
 
 include("utils.jl")
-include("zeros.jl")
 include("onehot.jl")
 include("functor.jl")
 
diff --git a/src/deprecations.jl b/src/deprecations.jl
@@ -26,6 +26,14 @@ end
 
 @deprecate frequencies(xs) group_counts(xs)
 
+struct Zeros
+  function Zeros()
+    Base.depwarn("Flux.Zeros is no more, has ceased to be, is bereft of life, is an ex-boondoggle... please use bias=false instead", :Zeros)
+    false
+  end
+end
+Zeros(args...) = Zeros()  # was used both Dense(10, 2, initb = Zeros) and Dense(rand(2,10), Zeros())
+
 # Channel notation: Changed to match Conv, but very softly deprecated!
 # Perhaps change to @deprecate for v0.14, but there is no plan to remove these.
 Dense(in::Integer, out::Integer, σ = identity; kw...) =
diff --git a/src/layers/basic.jl b/src/layers/basic.jl
@@ -167,7 +167,7 @@ end
 function Base.show(io::IO, l::Dense)
   print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1))
   l.σ == identity || print(io, ", ", l.σ)
-  l.bias == Zeros() && print(io, "; bias=false")
+  l.bias == false && print(io, "; bias=false")
   print(io, ")")
 end
 
@@ -301,7 +301,7 @@ end
     Bilinear((in1, in2) => out, σ=identity; bias=true, init=glorot_uniform)
     Bilinear(W::AbstractArray, [bias, σ])
 
-Creates a bilinear layer, which operates on two inputs at the same time.
+Creates a layer which is fully connected between two inputs and the output, and otherwise similar to [`Dense`](@ref).
 Its output, given vectors `x` & `y`, is another vector `z` with,
 for all `i ∈ 1:out`:
 
@@ -394,7 +394,7 @@ function Base.show(io::IO, l::Bilinear)
     print(io, "Bilinear((", size(l.weight, 2), ", ", size(l.weight, 3), ") => ", size(l.weight, 1))
   end
   l.σ == identity || print(io, ", ", l.σ)
-  l.bias == Flux.Zeros() && print(io, "; bias=false")
+  l.bias === false && print(io, "; bias=false")
   print(io, ")")
 end
 
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
@@ -6,6 +6,10 @@ _paddims(x::Tuple, y::Tuple) = (x..., y[(end - (length(y) - length(x) - 1)):end]
 expand(N, i::Tuple) = i
 expand(N, i::Integer) = ntuple(_ -> i, N)
 
+conv_reshape_bias(c) = c.bias isa AbstractVector ?
+   reshape(c.bias, map(_->1, c.stride)..., :, 1) :
+   c.bias
+
 """
     SamePad()
 
@@ -61,8 +65,8 @@ Then:
 
 Keywords to control initialization of the layer:
 * `init` - Function used to generate initial weights. Defaults to `glorot_uniform`.
-* `bias` - Initial bias is zero by default, this can be disabled entirely by setting it to
-  `false`, or another vector explicitly as `bias = randn(Float32, out)`.
+* `bias` - The initial bias vector is all zero by default. Trainable bias can be disabled entirely
+  by setting this to `false`, or another vector can be provided such as `bias = randn(Float32, out)`.
 
 See also [`ConvTranspose`](@ref), [`DepthwiseConv`](@ref), [`CrossCor`](@ref).
 
@@ -159,10 +163,9 @@ end
 @functor Conv
 
 function (c::Conv)(x::AbstractArray)
-  b = reshape(c.bias, map(_->1, c.stride)..., :, 1)
   σ = NNlib.fast_act(c.σ, x)
   cdims = DenseConvDims(x, c.weight; stride = c.stride, padding = c.pad, dilation = c.dilation, groups = c.groups)
-  σ.(conv(x, c.weight, cdims) .+ b)
+  σ.(conv(x, c.weight, cdims) .+ conv_reshape_bias(c))
 end
 
 _channels_in(l ::Conv) = size(l.weight, ndims(l.weight)-1) * l.groups
@@ -183,7 +186,7 @@ function _print_conv_opt(io::IO, l)
   if hasproperty(l, :groups)
     (l.groups == 1) || print(io, ", groups=", l.groups)
   end
-  (l.bias isa Zeros) && print(io, ", bias=false")
+  (l.bias === false) && print(io, ", bias=false")
 end
 
 """
@@ -276,10 +279,9 @@ end
 ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any)
 
 function (c::ConvTranspose)(x::AbstractArray)
-  b = reshape(c.bias, map(_->1, c.stride)..., :, 1)
   σ = NNlib.fast_act(c.σ, x)
   cdims = conv_transpose_dims(c, x)
-  σ.(∇conv_data(x, c.weight, cdims) .+ b)
+  σ.(∇conv_data(x, c.weight, cdims) .+ conv_reshape_bias(c))
 end
 
 function Base.show(io::IO, l::ConvTranspose)
@@ -371,10 +373,9 @@ depthwiseconvfilter(filter::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer};
                     init = glorot_uniform) where N = init(filter..., div(ch[2], ch[1]), ch[1])
 
 function (c::DepthwiseConv)(x)
-  b = reshape(c.bias, map(_->1, c.stride)..., :, 1)
   σ = NNlib.fast_act(c.σ, x)
   cdims = DepthwiseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
-  σ.(depthwiseconv(x, c.weight, cdims) .+ b)
+  σ.(depthwiseconv(x, c.weight, cdims) .+ conv_reshape_bias(c))
 end
 
 function Base.show(io::IO, l::DepthwiseConv)
@@ -452,10 +453,9 @@ function crosscor(x, w, ddims::DenseConvDims)
 end
 
 function (c::CrossCor)(x::AbstractArray)
-  b = reshape(c.bias, map(_->1, c.stride)..., :, 1)
   σ = NNlib.fast_act(c.σ, x)
   cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
-  σ.(crosscor(x, c.weight, cdims) .+ b)
+  σ.(crosscor(x, c.weight, cdims) .+ conv_reshape_bias(c))
 end
 
 function Base.show(io::IO, l::CrossCor)
diff --git a/src/utils.jl b/src/utils.jl
@@ -441,17 +441,18 @@ rand32(dims...) = Base.rand(Float32, dims...)
 randn32(dims...) = Base.randn(Float32, dims...)
 
 """
-    create_bias(weights, bias, length)
+    create_bias(weights, bias, size...)
 
 Return a bias parameter for a layer, based on the value given
 to the constructor's keyword `bias=bias`.
 
-* `bias == true` creates a zero vector, of the same type as weights.
-* `bias == false` returns `Zeros()`, a special struct which exists only to encode the absence of bias.
-* `bias::AbstractArray` uses the array provided, provided it has the correct size and eltype. If the type is wrong, it will be converted.
+* `bias == true` creates a trainable array of the given size, of the same type as `weights`, initialised to zero.
+* `bias == false` returns `false`, which is understood by AD to be non-differentiable.
+* `bias::AbstractArray` uses the array provided, provided it has the correct size.
+  It does not at present correct the `eltype` to match that of `weights`.
 """
 function create_bias(weights::AbstractArray, bias::Bool, dims::Integer...)
-  bias ? fill!(similar(weights, dims...), 0) : Zeros()
+  bias ? fill!(similar(weights, dims...), 0) : false
 end
 function create_bias(weights::AbstractArray, bias::AbstractArray, dims::Integer...)
   size(bias) == dims || throw(DimensionMismatch("expected bias of size $(dims), got size $(size(bias))"))
diff --git a/src/zeros.jl b/src/zeros.jl
diff --git a/test/cuda/layers.jl b/test/cuda/layers.jl
@@ -155,8 +155,8 @@ end
   end
 end
 
-@testset "Dense with Zeros bias" begin
-  l = Dense(ones(Float32, 4, 3), Flux.Zeros()) |> gpu
+@testset "Dense without bias" begin
+  l = Dense(ones(Float32, 4, 3), false) |> gpu
   ip = zeros(Float32, 3, 7) |> gpu
 
   @test sum(l(ip)) ≈ 0.f0
diff --git a/test/layers/basic.jl b/test/layers/basic.jl
@@ -175,7 +175,7 @@ import Flux: activations
       @test b1.σ == identity
 
       b2 = Flux.Bilinear(randn(3,4,5), false)
-      @test b2.bias == Flux.Zeros()
+      @test b2.bias === false
 
       b3 = Flux.Bilinear(randn(Float16, 3,4,5), true, tanh)
       @test b3.σ == tanh
diff --git a/test/layers/conv.jl b/test/layers/conv.jl
@@ -273,7 +273,7 @@ end
 
 @testset "constructors: $fun" for fun in [Conv, CrossCor, ConvTranspose, DepthwiseConv]
   @test fun(rand(2,3,4)).bias isa Vector{Float64}
-  @test fun(rand(2,3,4,5), false).bias isa Flux.Zeros
+  @test fun(rand(2,3,4,5), false).bias === false
   if fun == Conv
     @test fun(rand(2,3,4,5,6), rand(6)).bias isa Vector{Float64}
     @test_skip fun(rand(2,3,4,5,6), 1:6).bias isa Vector{Float64}
diff --git a/test/optimise.jl b/test/optimise.jl
@@ -15,7 +15,7 @@ using Random
                        Nesterov(), RMSProp(), Momentum()]
     Random.seed!(42)
     w′ = randn(10, 10)
-    b = Flux.Zeros()
+    b = false
     loss(x) = Flux.Losses.mse(w*x, w′*x .+ b)
     for t = 1: 10^5
       θ = params([w′, b])
diff --git a/test/utils.jl b/test/utils.jl