# Copyright 2018 The Lucid Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Redirected ReLu Gradient Overrides

When visualizing models we often[0] have to optimize through ReLu activation
functions. Where accessing pre-relu tensors is too hard, we use these
overrides to allow gradient to flow back through the ReLu—even if it didn't
activate ("dead neuron") and thus its derivative is 0.

Usage:
```python
from lucid.misc.gradient_override import gradient_override_map
from lucid.misc.redirected_relu_grad import redirected_relu_grad

with gradient_override_map({'Relu': redirected_relu_grad}):
  model.import_graph(…)
```
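
Relu6 activations can be overridden in the same way; a minimal sketch using the
redirected_relu6_grad defined below:
```python
from lucid.misc.gradient_override import gradient_override_map
from lucid.misc.redirected_relu_grad import redirected_relu6_grad

with gradient_override_map({'Relu6': redirected_relu6_grad}):
  model.import_graph(…)
```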

Discussion:
ReLus block the flow of the gradient during backpropagation when their input is
negative. ReLu6s also do so when the input is larger than 6. These overrides
change this behavior to allow gradients that push the input toward the desired
regime between these points.

In effect, this replaces the relu gradient with the following:

Regime        | Effect
============================================================
 0 <= x <= 6  | pass through gradient
 x < 0        | pass through gradient pushing the input up
 x > 6        | pass through gradient pushing the input down

Or visually:

  ReLu:                   |   |____________
                          |  /|
                          | / |
              ____________|/  |
                          0   6

  Override:   ------------|   |------------
                  allow ->     <- allow

Our implementation contains one extra complication:
tf.train.Optimizer performs gradient _descent_, so in the update step the
optimizer changes values in the opposite direction of the gradient. Thus, the
sign of the gradient in our overrides has the opposite of the intuitive effect:
negative gradient pushes the input up, positive pushes it down.
This is why the code below only allows _negative_ gradient when the input is
already negative, and _positive_ gradient when the input is already above 6.
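
As a minimal illustrative sketch of this sign convention (the values here are
made up for the example and are not part of the library):
```python
import tensorflow as tf

x = tf.constant([-1.0])    # a "dead" ReLu input
grad = tf.constant([0.5])  # under descent this would push x further down
neg_pushing_lower = tf.logical_and(x < 0., grad > 0.)  # -> [True]: zeroed out
# An incoming gradient of -0.5 would instead raise x toward 0 under descent,
# so the override lets it pass through unchanged.
```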

[0] That is because many model architectures don't provide easy access
to pre-relu tensors. For example, GoogLeNet's mixed__ layers are passed through
an activation function before being concatenated. We are still interested in
the entire concatenated layer; we would just like to skip the activation
function.
"""

import tensorflow as tf


def redirected_relu_grad(op, grad):
  assert op.type == "Relu"
  x = op.inputs[0]

  # Compute ReLu gradient
  relu_grad = tf.where(x < 0., tf.zeros_like(grad), grad)

  # Compute the redirected gradient: where do we need to zero out the incoming
  # gradient to prevent the input from going lower if it's already negative?
  neg_pushing_lower = tf.logical_and(x < 0., grad > 0.)
  redirected_grad = tf.where(neg_pushing_lower, tf.zeros_like(grad), grad)

  # Ensure we have at least a rank 2 tensor, as we expect a batch dimension
  assert_op = tf.Assert(tf.greater(tf.rank(relu_grad), 1), [tf.rank(relu_grad)])
  with tf.control_dependencies([assert_op]):
    # only use redirected gradient where nothing got through original gradient
    batch = tf.shape(relu_grad)[0]
    reshaped_relu_grad = tf.reshape(relu_grad, [batch, -1])
    relu_grad_mag = tf.norm(reshaped_relu_grad, axis=1)
    return tf.where(relu_grad_mag > 0., relu_grad, redirected_grad)


def redirected_relu6_grad(op, grad):
  assert op.type == "Relu6"
  x = op.inputs[0]

  # Compute ReLu6 gradient
  relu6_cond = tf.logical_or(x < 0., x > 6.)
  relu_grad = tf.where(relu6_cond, tf.zeros_like(grad), grad)

  # Compute the redirected gradient: where do we need to zero out the incoming
  # gradient to prevent the input from going lower if it's already negative,
  # or from going higher if it's already bigger than 6?
  neg_pushing_lower = tf.logical_and(x < 0., grad > 0.)
  pos_pushing_higher = tf.logical_and(x > 6., grad < 0.)
  dir_filter = tf.logical_or(neg_pushing_lower, pos_pushing_higher)
  redirected_grad = tf.where(dir_filter, tf.zeros_like(grad), grad)

  # Ensure we have at least a rank 2 tensor, as we expect a batch dimension
  assert_op = tf.Assert(tf.greater(tf.rank(relu_grad), 1), [tf.rank(relu_grad)])
  with tf.control_dependencies([assert_op]):
    # only use redirected gradient where nothing got through original gradient
    batch = tf.shape(relu_grad)[0]
    reshaped_relu_grad = tf.reshape(relu_grad, [batch, -1])
    relu_grad_mag = tf.norm(reshaped_relu_grad, axis=1)
    return tf.where(relu_grad_mag > 0., relu_grad, redirected_grad)
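

# A minimal, illustrative sketch (not part of the original module): run this
# file directly to see the override in action on a single "dead" ReLu input,
# using TF1-style graph mode as in the rest of this file. With the standard
# ReLu gradient the printed result would be [[0.]]; the redirected gradient
# lets the negative incoming gradient through, since under gradient descent it
# pushes the input up toward the active regime.
if __name__ == "__main__":
  from lucid.misc.gradient_override import gradient_override_map

  graph = tf.Graph()
  with graph.as_default():
    x = tf.constant([[-1.0]])  # rank 2: the overrides expect a batch dimension
    with gradient_override_map({"Relu": redirected_relu_grad}):
      y = tf.nn.relu(x)
    loss = -tf.reduce_sum(y)   # incoming gradient w.r.t. y is -1
    dx = tf.gradients(loss, x)[0]
  with tf.Session(graph=graph) as sess:
    print(sess.run(dx))        # [[-1.]]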