diff --git a/README.md b/README.md index 4125c53..102063c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # [Learning to Learn](https://arxiv.org/abs/1606.04474) in TensorFlow -Compatible with TensorFlow 1.0 + +## Dependencies + +* [TensorFlow >=1.0](https://www.tensorflow.org/) +* [Sonnet >=1.0](https://github.com/deepmind/sonnet) ## Training diff --git a/meta.py b/meta.py index 9de24a1..c4f2880 100644 --- a/meta.py +++ b/meta.py @@ -23,13 +23,13 @@ import os import mock +import sonnet as snt import tensorflow as tf from tensorflow.python.framework import ops from tensorflow.python.util import nest import networks -import nn def _nested_assign(ref, value): @@ -379,7 +379,7 @@ def time_step(t, fx_array, x, state): # Log internal variables. for k, net in nets.items(): print("Optimizer '{}' variables".format(k)) - print([op.name for op in nn.get_variables_in_module(net)]) + print([op.name for op in snt.get_variables_in_module(net)]) return MetaLoss(loss, update, reset, fx_final, x_final) diff --git a/meta_test.py b/meta_test.py index 0eb0c52..08fcee9 100644 --- a/meta_test.py +++ b/meta_test.py @@ -24,10 +24,10 @@ from nose_parameterized import parameterized import numpy as np from six.moves import xrange +import sonnet as snt import tensorflow as tf import meta -import nn import problems @@ -141,10 +141,10 @@ def testConvolutional(self): """Tests L2L applied to problem with convolutions.""" kernel_shape = 4 def convolutional_problem(): - conv = nn.Conv2D(output_channels=1, - kernel_shape=kernel_shape, - stride=1, - name="conv") + conv = snt.Conv2D(output_channels=1, + kernel_shape=kernel_shape, + stride=1, + name="conv") output = conv(tf.random_normal((100, 100, 3, 10))) return tf.reduce_sum(output) diff --git a/networks.py b/networks.py index 86301be..b588703 100644 --- a/networks.py +++ b/networks.py @@ -25,9 +25,9 @@ import dill as pickle import numpy as np import six +import sonnet as snt import tensorflow as tf -import nn import preprocess @@ -47,7 +47,7 @@ def factory(net, net_options=(), net_path=None): def save(network, sess, filename=None): """Save the variables contained by a network to disk.""" to_save = collections.defaultdict(dict) - variables = nn.get_variables_in_module(network) + variables = snt.get_variables_in_module(network) for v in variables: split = v.name.split(":")[0].split("/") @@ -63,7 +63,7 @@ def save(network, sess, filename=None): @six.add_metaclass(abc.ABCMeta) -class Network(nn.RNNCore): +class Network(snt.RNNCore): """Base class for meta-optimizer networks.""" @abc.abstractmethod @@ -166,8 +166,8 @@ def __init__(self, output_size, layers, preprocess_name="identity", tf modules). Default is `tf.identity`. preprocess_options: Gradient preprocessing options. scale: Gradient scaling (default is 1.0). - initializer: Variable initializer for linear layer. See `nn.Linear` and - `nn.LSTM` docs for more info. This parameter can be a string (e.g. + initializer: Variable initializer for linear layer. See `snt.Linear` and + `snt.LSTM` docs for more info. This parameter can be a string (e.g. "zeros" will be converted to tf.zeros_initializer). name: Module name. 
""" @@ -188,12 +188,12 @@ def __init__(self, output_size, layers, preprocess_name="identity", name = "lstm_{}".format(i) init = _get_layer_initializers(initializer, name, ("w_gates", "b_gates")) - self._cores.append(nn.LSTM(size, name=name, initializers=init)) - self._rnn = nn.DeepRNN(self._cores, skip_connections=False, - name="deep_rnn") + self._cores.append(snt.LSTM(size, name=name, initializers=init)) + self._rnn = snt.DeepRNN(self._cores, skip_connections=False, + name="deep_rnn") init = _get_layer_initializers(initializer, "linear", ("w", "b")) - self._linear = nn.Linear(output_size, name="linear", initializers=init) + self._linear = snt.Linear(output_size, name="linear", initializers=init) def _build(self, inputs, prev_state): """Connects the `StandardDeepLSTM` module into the graph. diff --git a/networks_test.py b/networks_test.py index 062dac9..145c0ce 100644 --- a/networks_test.py +++ b/networks_test.py @@ -20,10 +20,10 @@ from nose_parameterized import parameterized import numpy as np +import sonnet as snt import tensorflow as tf import networks -import nn class CoordinateWiseDeepLSTMTest(tf.test.TestCase): @@ -45,7 +45,7 @@ def testTrainable(self): state = net.initial_state_for_inputs(gradients) net(gradients, state) # Weights and biases for two layers. - variables = nn.get_variables_in_module(net) + variables = snt.get_variables_in_module(net) self.assertEqual(len(variables), 4) @parameterized.expand([ @@ -90,7 +90,7 @@ def testTrainable(self): state = net.initial_state_for_inputs(gradients) net(gradients, state) # Weights and biases for two layers. - variables = nn.get_variables_in_module(net) + variables = snt.get_variables_in_module(net) self.assertEqual(len(variables), 4) @parameterized.expand([ @@ -134,7 +134,7 @@ def testNonTrainable(self): net = networks.Sgd() state = net.initial_state_for_inputs(gradients) net(gradients, state) - variables = nn.get_variables_in_module(net) + variables = snt.get_variables_in_module(net) self.assertEqual(len(variables), 0) def testResults(self): @@ -169,7 +169,7 @@ def testNonTrainable(self): net = networks.Adam() state = net.initial_state_for_inputs(gradients) net(gradients, state) - variables = nn.get_variables_in_module(net) + variables = snt.get_variables_in_module(net) self.assertEqual(len(variables), 0) def testZeroLearningRate(self): diff --git a/nn/__init__.py b/nn/__init__.py deleted file mode 100644 index 7035fbe..0000000 --- a/nn/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""This python module contains Neural Network Modules for TensorFlow.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from nn.base import AbstractModule -from nn.base import Error -from nn.base import IncompatibleShapeError -from nn.base import Module -from nn.base import NotConnectedError -from nn.base import NotSupportedError -from nn.base import ParentNotBuiltError -from nn.base import Transposable -from nn.base import UnderspecifiedError -from nn.basic import BatchFlatten -from nn.basic import BatchReshape -from nn.basic import Linear -from nn.basic_rnn import DeepRNN -from nn.batch_norm import BatchNorm -from nn.conv import Conv2D -from nn.conv import Conv2DTranspose -from nn.conv import SAME -from nn.conv import VALID -from nn.convnet import ConvNet2D -from nn.gated_rnn import LSTM -from nn.mlp import MLP -from nn.rnn_core import RNNCore -from nn.rnn_core import TrainableInitialState -from nn.sequential import Sequential -from nn.util import get_variables_in_module diff --git a/nn/base.py b/nn/base.py deleted file mode 100644 index 19ac0c2..0000000 --- a/nn/base.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Base class for TensorFlow nn. - -This file contains the Abstract Base Class for defining Modules in TensorFlow. -A Module is an object which can be connected into the Graph multiple times -using the __call__ method, sharing variables automatically with no need to -explicitly use scopes or specify reuse=True. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import abc -import six -from six import string_types -from six.moves import xrange -import tensorflow as tf - - -class Error(Exception): - """Base class for all errors from nn. - - This is thrown to indicate a Neural Network specific problem, e.g. wrong - module arity, module is not connected to the graph when it should be, - tried to wire together incompatible modules, etc. - """ - - -class NotConnectedError(Error): - """Error raised when operating on a module that has not yet been connected. - - Some module properties / methods are valid to access before the module has - been connected into the graph, but some are not. This Error is raised when - the user attempts to do anything not valid before connection. - """ - - -class ParentNotBuiltError(Error): - """Error raised when the parent of a module has not been built yet. - - For example, when making a transpose of modules which inherit from - `module.Transposable`, the parent has to be connected to the graph before the - child transpose to ensure that shape inference has already occurred. 
- """ - - -class IncompatibleShapeError(Error): - """Error raised when the shape of the input at build time is incompatible.""" - - -class UnderspecifiedError(Error): - """Error raised when too little information is available. - - This does not typically mean the user is trying to do something that doesn't - work (in which case `IncompatibleShapeError` should be used), just that - some more information needs to be provided in order to build the Graph. - """ - - -class NotSupportedError(Error): - """Error raised when something that cannot be supported is requested. - - For example a Dilated Convolution module cannot be transposed. - """ - - -@six.add_metaclass(abc.ABCMeta) -class AbstractModule(object): - """Superclass for nn Modules. - - This class defines the functionality that every module should implement, - principally the `build` method which is wrapped using `tf.make_template` - and called from `__call__`. Every time the module is called it will - be connected into the graph but using the same shared set of variables, thanks - to the template. - - For this to work correctly, the `build` implementation in the derived class - must access all variables using `tf.get_variable`, not `tf.Variable`. The same - set of variables must be created each time, if this is not the case an Error - will be raised. - - Every subclass must call this class' `__init__` at the start of their - `__init__`, passing the relevant name. If this step is omitted variable - sharing will not work. - """ - - # Name of TensorFlow collection containing ops to update every step, such as - # moving average update ops. - UPDATE_OPS_COLLECTION = tf.GraphKeys.UPDATE_OPS - - def __init__(self, name): - """Performs the initialisation necessary for all AbstractModule instances. - - Every subclass of AbstractModule must begin their constructor with a call to - this constructor, i.e. `super(MySubModule, self).__init__(name=name)`. - - Avoid instantiating sub-modules in __init__ where possible, as they will not - be defined under the module's scope. Instead, instantiate sub-modules in - `build`. - - Args: - name: Name of this module. Used to construct the Templated build function. - - Raises: - ValueError: If name is not specified. - """ - if not isinstance(name, string_types): - raise ValueError("Name must be a string.") - self._is_connected = False - self._template = tf.make_template(name, self._build, - create_scope_now_=True) - - # Update __call__ and the object docstrings to enable better introspection - self.__doc__ = self._build.__doc__ - self.__call__.__func__.__doc__ = self._build.__doc__ - - @abc.abstractmethod - def _build(self, *args, **kwargs): - """Add elements to the Graph, computing output Tensors from input Tensors. - - Subclasses must implement this method, which will be wrapped in a Template. - - Args: - *args: Input Tensors. - **kwargs: Additional Python flags controlling connection. - """ - pass - - def __call__(self, *args, **kwargs): - out = self._template(*args, **kwargs) - # Connect the module only if self._template returns with no errors. - self._is_connected = True - return out - - @property - def variable_scope(self): - """Returns the variable_scope declared by the module. - - It is valid for library users to access the internal templated - variable_scope, but only makes sense to do so after connection. Therefore - we raise an error here if the variable_scope is requested before connection. 
- - The only case where it does make sense to access the variable_scope before - connection is to get the post-uniquification name, which we support using - the separate .name property. - - Returns: - variable_scope: `tf.VariableScope` instance of the internal `tf.Template`. - - Raises: - NotConnectedError: If the module is not connected to the Graph. - """ - self._ensure_is_connected() - return self._template.variable_scope - - @property - def name(self): - """Returns the name of the Module.""" - return self._template.variable_scope.name - - @property - def is_connected(self): - """Returns true iff the Module been connected to the Graph at least once.""" - return self._is_connected - - @classmethod - def get_possible_initializer_keys(cls): - """Returns the keys the dictionary of variable initializers may contain. - - This provides the user with a way of knowing the initializer keys that are - available without having to instantiate a nn module. Subclasses may - override this class method if they need additional arguments to determine - what initializer keys may be provided. - - Returns: - Set with strings corresponding to the strings that may be passed to the - constructor. - """ - return getattr(cls, "POSSIBLE_INITIALIZER_KEYS", set()) - - def _ensure_is_connected(self): - """Raise an Error if the module has not been connected yet. - - Until the module is connected into the Graph, any variables created do - not exist yet and cannot be created in advance due to not knowing the size - of the input Tensor(s). This assertion ensures that any variables contained - in this module must now exist. - - Raises: - NotConnectedError: If the module is not connected to the Graph. - """ - if not self.is_connected: - raise NotConnectedError( - "Variables in {} not instantiated yet, __call__ the module " - "first.".format(self.name)) - - -@six.add_metaclass(abc.ABCMeta) -class Transposable(object): - """Transposable module interface. - - The Transposable interface requires that transposable modules implement - a method called `transpose`, returning a module which is the transposed - version of the one the method is called on. - Calling the method twice should return a module with the same specifications - as the original module. - - When implementing a transposable module, special care is required to make - sure that parameters needed to instantiate the module are provided as - functions whose invocation is deferred to graph construction time. - - For example, in Linear we might want to call: - - ```python - linear = nn.Linear(name="linear", output_size=output_size) - linear_transpose = linear.transpose() - ``` - - where the output_size for linear_transpose is not known yet, as linear is - not yet connected to the graph: output_size is passed to linear_transpose's - constructor as a lambda returning linear.input_size. The lambda will return - the correct value once linear is given an input. - Notice that linear_transpose's output_size value does not need to be defined - until the module is connected to the graph. - """ - - @abc.abstractmethod - def transpose(self, name=None, **kwargs): - """Builds and returns transposed version of module. - - Args: - name: Name of the transposed module. - **kwargs: Additional Python flags controlling transposition. - - Returns: - Transposed version of the module. 
- """ - pass - - @abc.abstractmethod - def input_shape(self): - """Returns shape of input `Tensor` passed at last call to `build`.""" - pass - - -class Module(AbstractModule): - """Module wrapping a function provided by the user.""" - - def __init__(self, build, name="module"): - """Constructs a module with a given build function. - - The Module class can be used to wrap a function assembling a network into a - module. - - For example, the following code implements a simple one-hidden-layer MLP - model by defining a function called make_model and using a Module instance - to wrap it. - - ```python - def make_model(inputs): - lin1 = nn.Linear(name="lin1", output_size=10)(inputs) - relu1 = tf.nn.relu(lin1, name="relu1") - lin2 = nn.Linear(name="lin2", output_size=20)(relu1) - return lin2 - - model = nn.Module(name='simple_mlp', build=make_model) - outputs = model(inputs) - ``` - - The `partial` package from `functools` can be used to bake configuration - parameters into the function at construction time, as shown in the following - example. - - ```python - from functools import partial - - def make_model(inputs, output_sizes): - lin1 = nn.Linear(name="lin1", output_size=output_sizes[0])(inputs) - relu1 = tf.nn.relu(lin1, name="relu1") - lin2 = nn.Linear(name="lin2", output_size=output_sizes[1])(relu1) - return lin2 - - model = nn.Module(name='simple_mlp', - build=partial(make_model, output_size=[10, 20]) - outputs = model(inputs) - ``` - - Args: - build: Callable to be invoked when connecting the module to the graph. - The `build` function is invoked when the module is called, and its - role is to specify how to add elements to the Graph, and how to - compute output Tensors from input Tensors. - The `build` function signature can include the following parameters: - *args - Input Tensors. - **kwargs - Additional Python parameters controlling connection. - name: Module name. - - Raises: - TypeError: If build is not callable. - """ - super(Module, self).__init__(name) - - if not callable(build): - raise TypeError("Input 'build' must be callable.") - self._build = build - - def _build(self, *args, **kwargs): - """Forwards call to the passed-in build function.""" - return self._build(*args, **kwargs) diff --git a/nn/basic.py b/nn/basic.py deleted file mode 100644 index a1c448e..0000000 --- a/nn/basic.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Basic Modules for TensorFlow nn. - -Modules defining the simplest building blocks for Neural Networks. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numbers - - -import numpy as np -import tensorflow as tf - -from nn import base -from nn import util - - -def create_linear_initializer(input_size): - """Returns a default initializer for weights or bias of a linear module.""" - stddev = 1 / math.sqrt(input_size) - return tf.truncated_normal_initializer(stddev=stddev) - - -class Linear(base.AbstractModule, base.Transposable): - """Linear module, optionally including bias.""" - - def __init__(self, - output_size, - use_bias=True, - initializers=None, - partitioners=None, - name="linear"): - """Constructs a Linear module. - - Args: - output_size: Output dimensionality. `output_size` can be either an integer - or a callable. In the latter case, since the function invocation is - deferred to graph construction time, the user must only ensure that - output_size can be called, returning an integer, when build is called. - use_bias: Whether to include bias parameters. Default `True`. - initializers: Optional dict containing initializers to initialize the - weights (with key 'w') or biases (with key 'b'). The default - initializers are truncated normal initializers, which are commonly - used when the inputs are zero centered (see - https://arxiv.org/pdf/1502.03167v3.pdf). - partitioners: Optional dict containing partitioners to partition - weights (with key 'w') or biases (with key 'b'). As a default, no - partitioners are used. - name: Name of the module. - - Raises: - KeyError: If an initializer is provided for a key other than 'w' or 'b' if - `use_bias` is `True`.. - TypeError: If a provided initializer is not a callable function. - """ - super(Linear, self).__init__(name=name) - self._output_size = output_size - self._use_bias = use_bias - self._input_shape = None - self._w = None - self._b = None - self.possible_keys = self.get_possible_initializer_keys(use_bias=use_bias) - self._initializers = util.check_initializers( - initializers, self.possible_keys) - self._partitioners = util.check_partitioners( - partitioners, self.possible_keys) - - @classmethod - def get_possible_initializer_keys(cls, use_bias=True): - return {"w", "b"} if use_bias else {"w"} - - def _build(self, inputs): - """Connects the Linear module into the graph, with input Tensor `inputs`. - - If this is not the first time the module has been connected to the graph, - the Tensor provided here must have the same final dimension, in order for - the existing variables to be the correct size for the multiplication. The - batch size may differ for each connection. - - Args: - inputs: A 2D Tensor of size [batch_size, input_size]. - - Returns: - A 2D Tensor of size [batch_size, output_size]. - - Raises: - base.IncompatibleShapeError: If the input is not a 2-D `Tensor` with - the size of the second dimension specified. - base.IncompatibleShapeError: If reconnecting an already connected module - into the graph, and the shape of the input is not compatible with - previous inputs. 
- """ - input_shape = tuple(inputs.get_shape().as_list()) - - if len(input_shape) != 2: - raise base.IncompatibleShapeError( - "{}: rank of shape must be 2 not: {}".format( - self.name, len(input_shape))) - - if input_shape[1] is None: - raise base.IncompatibleShapeError( - "{}: Input size must be specified at module build time".format( - self.name)) - - if self._input_shape is not None and input_shape[1] != self._input_shape[1]: - raise base.IncompatibleShapeError( - "{}: Input shape must be [batch_size, {}] not: [batch_size, {}]" - .format(self.name, self._input_shape[1], input_shape[1])) - - self._input_shape = input_shape - - if "w" not in self._initializers: - self._initializers["w"] = create_linear_initializer(self._input_shape[1]) - - if "b" not in self._initializers and self._use_bias: - self._initializers["b"] = create_linear_initializer(self._input_shape[1]) - - weight_shape = (self._input_shape[1], self.output_size) - dtype = inputs.dtype - self._w = tf.get_variable("w", - shape=weight_shape, - dtype=dtype, - initializer=self._initializers["w"], - partitioner=self._partitioners.get("w", None)) - outputs = tf.matmul(inputs, self._w) - - if self._use_bias: - bias_shape = (self.output_size,) - self._b = tf.get_variable("b", - shape=bias_shape, - dtype=dtype, - initializer=self._initializers["b"], - partitioner=self._partitioners.get("b", None)) - outputs += self._b - - return outputs - - @property - def w(self): - """Returns the Variable containing the weight matrix. - - Returns: - Variable object containing the weights, from the most recent __call__. - - Raises: - base.NotConnectedError: If the module has not been connected to the - graph yet, meaning the variables do not exist. - """ - self._ensure_is_connected() - return self._w - - @property - def b(self): - """Returns the Variable containing the bias. - - Returns: - Variable object containing the bias, from the most recent __call__. - - Raises: - base.NotConnectedError: If the module has not been connected to the - graph yet, meaning the variables do not exist. - AttributeError: If the module does not use bias. - """ - self._ensure_is_connected() - if not self._use_bias: - raise AttributeError( - "No bias Variable in Linear Module when `use_bias=False`.") - return self._b - - @property - def output_size(self): - """Returns the module output size.""" - if callable(self._output_size): - self._output_size = self._output_size() - return self._output_size - - @property - def has_bias(self): - """Returns `True` if bias Variable is present in the module.""" - return self._use_bias - - # Implements Transposable interface. - @property - def input_shape(self): - """Returns shape of input `Tensor` passed at last call to `_build`.""" - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface - def transpose(self, name=None): - """Returns transposed `Linear` module. - - Args: - name: Optional string assigning name of transpose module. The default name - is constructed by appending "_transpose" to `self.name`. - - Returns: - Transposed `Linear` module. - """ - if name is None: - name = self.name + "_transpose" - return Linear(output_size=lambda: self.input_shape[1], - use_bias=self._use_bias, - initializers=self._initializers, - name=name) - - -class BatchReshape(base.AbstractModule, base.Transposable): - """Reshapes input Tensor, preserving the batch dimension.""" - - def __init__(self, shape, name="batch_reshape"): - """Constructs a BatchReshape module. 
- - Args: - shape: Shape to reshape the input Tensor to while preserving its - batch size; `shape` can be either a tuple/list, or a callable that - returns the actual shape. The callable does not need to be ready to - return something meaningful at construction time, but it will be - required to be able to do so when the module is connected to the - graph. When the special value -1 appears in `shape` the corresponding - size is automatically inferred. Note that -1 can only appear once in - `shape`. To flatten all non-batch dimensions, the nn.BatchFlatten - module can also be used. - name: Name of the module. - """ - super(BatchReshape, self).__init__(name=name) - - self._input_shape = None - self._shape = shape - - if not callable(self._shape): - self._shape = tuple(self._shape) - - def _infer_shape(self, dimensions): - """Replaces the -1 wildcard in the output shape vector. - - This function infers the correct output shape given the input dimensions. - - Args: - dimensions: List of input non-batch dimensions. - - Returns: - Tuple of non-batch output dimensions. - """ - # Size of input - n = np.prod(dimensions) - # Size of output where defined - m = np.prod(abs(np.array(self._shape))) - # Replace wildcard - v = np.array(self._shape) - v[v == -1] = n // m - return tuple(v) - - def _build(self, inputs): - """Connects the module into the graph, with input Tensor `inputs`. - - Args: - inputs: A Tensor of shape [batch_size] + input_shape. - - Returns: - A Tensor of shape [batch_size] + output_shape, with output_shape as - defined in constructor. - - Raises: - ValueError: If output shape is incompatible with input shape; or if - shape array contains non numeric entries; or if shape array contains - more than 1 wildcard -1. - """ - if callable(self._shape): - self._shape = tuple(self._shape()) - - if not all([isinstance(x, numbers.Integral) and (x > 0 or x == -1) - for x in self._shape]): - raise ValueError("Input array shape can contain positive integral " - "numbers only, and the wildcard -1 used once") - - if self._shape.count(-1) > 1: - raise ValueError("Wildcard -1 can appear only once in shape") - - self._input_shape = inputs.get_shape()[1:].as_list() - if self._shape.count(-1) > 0: - shape = (-1,) + self._infer_shape(self._input_shape) - else: - shape = (-1,) + self._shape - - if np.prod(self._input_shape) != np.prod(shape[1:]): - raise ValueError("Output shape is incompatible with input shape") - return tf.reshape(inputs, shape) - - @property - def input_shape(self): - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface. - def transpose(self, name=None): - """Returns transpose batch reshape.""" - if name is None: - name = self.name + "_transpose" - return BatchReshape(shape=lambda: self.input_shape, name=name) - - -class BatchFlatten(BatchReshape): - """Flattens the input Tensor, preserving the batch dimension.""" - - def __init__(self, name="batch_flatten"): - """Constructs a BatchFlatten module. - - Args: - name: Name of the module. - """ - super(BatchFlatten, self).__init__(name=name, shape=(-1,)) diff --git a/nn/basic_rnn.py b/nn/basic_rnn.py deleted file mode 100644 index fa7e499..0000000 --- a/nn/basic_rnn.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Basic RNN Cores for TensorFlow nn. - -This file contains the definitions of the simplest building blocks for Recurrent -Neural Networks. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -import tensorflow as tf - -from tensorflow.python.framework import tensor_shape -from tensorflow.python.util import nest - -from nn import rnn_core - - -def _get_flat_core_sizes(cores): - """Obtains the list flattened output sizes of a list of cores. - - Args: - cores: list of cores to get the shapes from. - - Returns: - List of lists that, for each core, contains the list of its output - dimensions. - """ - core_sizes_lists = [] - for core in cores: - flat_output_size = nest.flatten(core.output_size) - core_sizes_lists.append([tensor_shape.as_shape(size).as_list() - for size in flat_output_size]) - return core_sizes_lists - - -class DeepRNN(rnn_core.RNNCore): - """RNN core which passes data through a number of internal modules or ops. - - This module is constructed by passing an iterable of externally constructed - modules or ops. The DeepRNN takes `(input, prev_state)` as input and passes - the input through each internal module in the order they were presented, - using elements from `prev_state` as necessary for internal recurrent cores. - The output is `(output, next_state)` in common with other RNN cores. - By default, skip connections from the input to all internal modules and from - each intermediate output to the final output are used. - - E.g.: - - ```python - lin = nn.Linear(hidden_size=128) - tanh = tf.tanh - lstm = nn.LSTM(hidden_size=256) - deep_rnn = nn.DeepRNN([lin, tanh, lstm]) - output, next_state = deep_rnn(input, prev_state) - ``` - - The computation set up inside the DeepRNN has the same effect as: - - ```python - lin_output = lin(input) - tanh_output = tanh(tf.concat(1, [input, lin_output])) - lstm_output, lstm_next_state = lstm( - tf.concat(1, [input, tanh_output]), prev_state[0]) - - next_state = (lstm_next_state,) - output = tf.concat(1, [lin_output, tanh_output, lstm_output]) - ``` - - Every internal module receives the preceding module's output and the entire - core's input. The output is created by concatenating each internal module's - output. In the case of internal recurrent elements, corresponding elements - of the state are used such that `state[i]` is passed to the `i`'th internal - recurrent element. Note that the state of a `DeepRNN` is always a tuple, which - will contain the same number of elements as there are internal recurrent - cores. If no internal modules are recurrent, the state of the DeepRNN as a - whole is the empty tuple. Wrapping non-recurrent modules into a DeepRNN can - be useful to produce something API compatible with a "real" recurrent module, - simplifying code that handles the cores. - - Without skip connections the previous example would become the following - (note the only difference is the addition of `skip_connections=False`): - - ```python - # ... 
declare other modules as above - deep_rnn = nn.DeepRNN([lin, tanh, lstm], skip_connections=False) - output, next_state = deep_rnn(input, prev_state) - ``` - - which is equivalent to: - - ```python - lin_output = lin(input) - tanh_output = tanh(lin_output) - lstm_output, lstm_next_state = lstm(tanh_output, prev_state[0]) - - next_state = (lstm_next_state,) - output = lstm_output - ``` - """ - - def __init__(self, cores, skip_connections=True, name="deep_rnn"): - """Construct a Deep RNN core. - - Args: - cores: iterable of modules or ops. - skip_connections: a boolean that indicates whether to use skip - connections. This means that the input is fed to all the layers, after - being concatenated with the output of the previous layer. The output - of the module will be the concatenation of all the outputs of the - internal modules. - name: name of the module. - - Raises: - ValueError: if `cores` is not an iterable. - """ - super(DeepRNN, self).__init__(name=name) - - if not isinstance(cores, collections.Iterable): - raise ValueError("Cores should be an iterable object.") - self._cores = tuple(cores) - self._skip_connections = skip_connections - - if self._skip_connections: - self._check_cores_output_sizes() - - self._is_recurrent_list = [isinstance(core, rnn_core.RNNCore) - for core in self._cores] - self._num_recurrent = sum(self._is_recurrent_list) - - def _check_cores_output_sizes(self): - """Checks the output_sizes of the cores of the DeepRNN module. - - Raises: - ValueError: if the outputs of the cores cannot be concatenated along their - first dimension. - """ - for core_sizes in zip(*tuple(_get_flat_core_sizes(self._cores))): - first_core_list = core_sizes[0][1:] - for i, core_list in enumerate(core_sizes[1:]): - if core_list[1:] != first_core_list: - raise ValueError("The outputs of the provided cores are not able " - "to be concatenated along the first feature " - "dimension. Core 0 has size %s, whereas Core %d " - "has size %s" % (first_core_list, i, core_list)) - - def _build(self, inputs, prev_state): - """Connects the DeepRNN module into the graph. - - If this is not the first time the module has been connected to the graph, - the Tensors provided as input_ and state must have the same final - dimension, in order for the existing variables to be the correct size for - their corresponding multiplications. The batch size may differ for each - connection. - - Args: - inputs: a nested tuple of Tensors of arbitrary dimensionality, with at - least an initial batch dimension. - prev_state: a tuple of `prev_state`s that corresponds to the state - of each one of the cores of the `DeepCore`. - - Returns: - output: a nested tuple of Tensors of arbitrary dimensionality, with at - least an initial batch dimension. - next_state: a tuple of `next_state`s that corresponds to the updated state - of each one of the cores of the `DeepCore`. - - Raises: - ValueError: if connecting the module into the graph any time after the - first time, and the inferred size of the inputs does not match previous - invocations. This may happen if one connects a module any time after the - first time that does not have the configuration of skip connections as - the first time. 
- """ - current_input = inputs - next_states = [] - outputs = [] - recurrent_idx = 0 - for i, core in enumerate(self._cores): - if self._skip_connections and i > 0: - flat_input = (nest.flatten(inputs), nest.flatten(current_input)) - flat_input = [tf.concat(1, input_) for input_ in zip(*flat_input)] - current_input = nest.pack_sequence_as(structure=inputs, - flat_sequence=flat_input) - - # Determine if this core in the stack is recurrent or not and call - # accordingly. - if self._is_recurrent_list[i]: - current_input, next_state = core(current_input, - prev_state[recurrent_idx]) - next_states.append(next_state) - recurrent_idx += 1 - else: - current_input = core(current_input) - - if self._skip_connections: - outputs.append(current_input) - - if self._skip_connections: - flat_outputs = tuple(nest.flatten(output) for output in outputs) - flat_outputs = [tf.concat(1, output) for output in zip(*flat_outputs)] - output = nest.pack_sequence_as(structure=outputs[0], - flat_sequence=flat_outputs) - else: - output = current_input - - return output, tuple(next_states) - - def initial_state(self, batch_size, dtype=tf.float32, trainable=False, - trainable_initializers=None): - """Builds the default start state for a DeepRNN. - - Args: - batch_size: An int, float or scalar Tensor representing the batch size. - dtype: The data type to use for the state. - trainable: Boolean that indicates whether to learn the initial state. - trainable_initializers: An initializer function or nested structure of - functions with same structure as the `state_size` property of the - core, to be used as initializers of the initial state variable. - - Returns: - A tensor or nested tuple of tensors with same structure and shape as the - `state_size` property of the core. - - Raises: - ValueError: if the number of passed initializers is not the same as the - number of recurrent cores. - """ - initial_state = [] - if trainable_initializers is None: - trainable_initializers = [None] * self._num_recurrent - - num_initializers = len(trainable_initializers) - - if num_initializers != self._num_recurrent: - raise ValueError("The number of initializers and recurrent cores should " - "be the same. Received %d initializers for %d specified " - "recurrent cores." 
- % (num_initializers, self._num_recurrent)) - - recurrent_idx = 0 - for is_recurrent, core in zip(self._is_recurrent_list, self._cores): - if is_recurrent: - with tf.variable_scope("%s-rec_core%d" % (self.name, recurrent_idx)): - core_initial_state = core.initial_state( - batch_size, dtype=dtype, trainable=trainable, - trainable_initializers=trainable_initializers[recurrent_idx]) - initial_state.append(core_initial_state) - recurrent_idx += 1 - return tuple(initial_state) - - @property - def state_size(self): - sizes = [] - for is_recurrent, core in zip(self._is_recurrent_list, self._cores): - if is_recurrent: - sizes.append(core.state_size) - return tuple(sizes) - - @property - def output_size(self): - if self._skip_connections: - output_size = [] - for core_sizes in zip(*tuple(_get_flat_core_sizes(self._cores))): - added_core_size = core_sizes[0] - added_core_size[0] = sum([size[0] for size in core_sizes]) - output_size.append(tf.TensorShape(added_core_size)) - return nest.pack_sequence_as(structure=self._cores[0].output_size, - flat_sequence=output_size) - else: - return self._cores[-1].output_size diff --git a/nn/batch_norm.py b/nn/batch_norm.py deleted file mode 100644 index f7a8b5b..0000000 --- a/nn/batch_norm.py +++ /dev/null @@ -1,499 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Batch normalization module for nn. - -This contains the module BatchNorm, which performs batch normalization on -its inputs. It has an optional post-normalization scale and offset, and it -maintains moving averages of the statistics for use at test time. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from six.moves import xrange -import tensorflow as tf - -from tensorflow.contrib.layers.python.layers import utils -from tensorflow.python.training import moving_averages -from nn import base -from nn import util - - -class BatchNorm(base.AbstractModule): - """Batch normalization module, including optional affine transformation. - - This module maintains exponential moving averages of the mean and - variance, used for calculating more accurate shifted statistics at training - time and optionally used to normalize at test time. - - In order to update the moving averages, the user must run the - ops in the tf.GraphKeys.UPDATE_OPS TensorFlow collection. For example: - - bn = BatchNorm() - train_net = bn(train_inputs, is_training=True) - test_net = bn(test_inputs, is_training=False, test_local_stats=False) - - ... - - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - train_op = tf.group(train_op) - - Then, whenever `train_op` is run so also are the moving average update ops. - - At training time, batch statistics (mean, variance) are not shared between - separate connections. The moving averages are shared between separate - connections. 
At both training and test time, the optional affine - transformations are shared between separate connections. - - Local batch statistics are used by default at test time, but the moving - averages can be used by specifying a flag when connecting. One often wants - to use local batch statistics at test time to track the progress while the - model is trained as it would ensure that moving average updates do not affect - the training curves. Once the training is finished, it's often advantageous - to use moving average statistics, since it would make evaluation agnostic to - the batch size, and might even lead to small improvements over the local - batch statistics. - """ - - GAMMA = "gamma" - BETA = "beta" - POSSIBLE_INITIALIZER_KEYS = {GAMMA, BETA} - - def __init__(self, reduction_indices=None, offset=True, scale=False, - decay_rate=0.999, eps=1e-3, initializers=None, - use_legacy_moving_second_moment=False, - name="batch_norm"): - """Constructs a BatchNorm module. - - By default reduces over all input tensor dimensions apart from the final - dimension. This has the effect of treating pixels in 1D/2D/3D images as - additional elements of the minibatch. - - If this is not the desired behaviour, the user can specify the tensor - indices to reduce over with `reduction_indices`. - - Args: - reduction_indices: Optional indices of dimensions to reduce over. - offset: Optional boolean to specify whether or not to apply a trained - component-wise bias after the batch normalization and scaling. - scale: Optional boolean to specify whether or not to apply a trained - component-wise scale after the batch normalization. - decay_rate: Decay rate of the exponential moving averages of the mean - and variance. - eps: Small number to avoid dividing by zero when diving by the standard - deviation. - initializers: Optional dict containing ops to initialize the weights of - the affine transform (`gamma` and `beta`). - use_legacy_moving_second_moment: Keep a moving second moment, rather than - the moving variance. This is deprecated, but is kept for backwards - compatability with old checkpoints. By default `False`. - name: Name of the module. - - Raises: - base.Error: If initializers contains any keys other - than `gamma` or `beta`. - ValueError: If `use_legacy_moving_second_moment` is not `True`. - """ - super(BatchNorm, self).__init__(name) - - self._reduction_indices = reduction_indices - self._offset = offset - self._scale = scale - self._decay_rate = decay_rate - self._eps = eps - self._use_legacy_moving_second_moment = use_legacy_moving_second_moment - - self._initializers = util.check_initializers( - initializers, self.POSSIBLE_INITIALIZER_KEYS) - - def _set_default_initializer(self, var_name): - """Sets up a default initializer for a variable if one doesn't exist. - - For the offset (beta), a zeros initializer is used by default. - For the scale (gamma), a ones initializer is used by default. - - Args: - var_name: name of variable as a string. - """ - if var_name not in self._initializers: - if var_name == self.GAMMA: - self._initializers[self.GAMMA] = tf.ones_initializer() - elif var_name == self.BETA: - self._initializers[self.BETA] = tf.zeros_initializer() - - def _build_statistics_variance(self, input_batch, - reduction_indices, use_batch_stats): - """Builds the statistics part of the graph when using moving variance. - - Args: - input_batch: Input batch Tensor. - reduction_indices: Indices of `input_batch` to reduce over. 
- use_batch_stats: Boolean to indicate if batch statistics should be - calculated, otherwise moving averages are returned. - - Returns: - Tuple of (mean, variance). - """ - # Set up our moving statistics. When connecting in parallel, this is shared. - self._moving_mean = tf.get_variable( - "moving_mean", - shape=self._mean_shape, - collections=[tf.GraphKeys.MOVING_AVERAGE_VARIABLES, - tf.GraphKeys.GLOBAL_VARIABLES], - initializer=tf.zeros_initializer(), - trainable=False) - - self._moving_variance = tf.get_variable( - "moving_variance", - shape=self._mean_shape, - collections=[tf.GraphKeys.MOVING_AVERAGE_VARIABLES, - tf.GraphKeys.GLOBAL_VARIABLES], - initializer=tf.ones_initializer(), - trainable=False) - - def build_batch_stats(): - """Builds the batch statistics calculation ops.""" - - # We use the moving mean as an estimate of the mean in order to perform - # a more numerically stable calculation of the batch mean. - # Copy for better stability. - shift = tf.add(self._moving_mean, 0) - counts, shifted_sum_x, shifted_sum_x2, _ = tf.nn.sufficient_statistics( - input_batch, - reduction_indices, - keep_dims=True, - shift=shift, - name="batch_norm_ss") - - mean, variance = tf.nn.normalize_moments(counts, - shifted_sum_x, - shifted_sum_x2, - shift, - name="normalize_moments") - - return mean, variance - - def build_moving_stats(): - return ( - tf.identity(self._moving_mean), - tf.identity(self._moving_variance), - ) - - mean, variance = utils.smart_cond( - use_batch_stats, - build_batch_stats, - build_moving_stats, - ) - - return mean, variance - - def _build_statistics_second_moment(self, input_batch, - reduction_indices, use_batch_stats): - """Builds the statistics part of the graph when using moving second moment. - - Args: - input_batch: Input batch Tensor. - reduction_indices: Indices of `input_batch` to reduce over. - use_batch_stats: Boolean to indicate if batch statistics should be - calculated, otherwise moving averages are returned. - - Returns: - Tuple of (mean, variance, second_moment). - """ - # Set up our moving statistics. When connecting in parallel, this is shared. - self._moving_mean = tf.get_variable( - "moving_mean", - shape=self._mean_shape, - collections=[tf.GraphKeys.MOVING_AVERAGE_VARIABLES, - tf.GraphKeys.GLOBAL_VARIABLES], - initializer=tf.zeros_initializer(), - trainable=False) - - self._moving_second_moment = tf.get_variable( - "moving_second_moment", - shape=self._mean_shape, - collections=[tf.GraphKeys.MOVING_AVERAGE_VARIABLES, - tf.GraphKeys.GLOBAL_VARIABLES], - initializer=tf.ones_initializer(), - trainable=False) - - self._moving_variance = tf.subtract(self._moving_second_moment, - tf.square(self._moving_mean), - name="moving_variance") - - def build_batch_stats(): - """Builds the batch statistics calculation ops.""" - - # Copy for better stability. - # We use the moving mean as an estimate of the mean in order to perform - # a more numerically stable calculation of the batch mean. 
- shift = tf.add(self._moving_mean, 0) - counts, shifted_sum_x, shifted_sum_x2, _ = tf.nn.sufficient_statistics( - input_batch, - reduction_indices, - keep_dims=True, - shift=shift, - name="batch_norm_ss") - - mean, variance = tf.nn.normalize_moments(counts, - shifted_sum_x, - shifted_sum_x2, - shift, - name="normalize_moments") - second_moment = variance + tf.square(mean) - - return mean, variance, second_moment - - def build_moving_stats(): - return ( - tf.identity(self._moving_mean), - tf.identity(self._moving_variance), - tf.identity(self._moving_second_moment), - ) - - mean, variance, second_moment = utils.smart_cond( - use_batch_stats, - build_batch_stats, - build_moving_stats, - ) - - return mean, variance, second_moment - - def _build_update_ops_variance(self, mean, variance, is_training): - """Builds the moving average update ops when using moving variance. - - Args: - mean: The mean value to update with. - variance: The variance value to update with. - is_training: Boolean Tensor to indicate if we're currently in - training mode. - """ - - def build_update_ops(): - """Builds the exponential moving average update ops.""" - - update_mean_op = moving_averages.assign_moving_average( - variable=self._moving_mean, - value=mean, - decay=self._decay_rate, - name="update_moving_mean").op - - update_variance_op = moving_averages.assign_moving_average( - variable=self._moving_variance, - value=variance, - decay=self._decay_rate, - name="update_moving_variance").op - - return update_mean_op, update_variance_op - - def build_no_ops(): - return (tf.no_op(), tf.no_op()) - - # Only make the ops if we know that `is_training=True`, or the value of - # `is_training` is unknown. - is_training_const = utils.constant_value(is_training) - if is_training_const is None or is_training_const: - update_mean_op, update_variance_op = utils.smart_cond( - is_training, - build_update_ops, - build_no_ops, - ) - - # Every new connection creates a new op which adds its contribution - # to the running average when ran. - tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean_op) - tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_variance_op) - - def _build_update_ops_second_moment(self, mean, second_moment, is_training): - """Builds the moving average update ops when using the moving second moment. - - Args: - mean: The mean value to update with. - second_moment: The second_moment value to update with. - is_training: Boolean Tensor to indicate if we're currently in - training mode. - """ - - def build_update_ops(): - """Builds the exponential moving average update ops.""" - - update_mean_op = moving_averages.assign_moving_average( - variable=self._moving_mean, - value=mean, - decay=self._decay_rate, - name="update_moving_mean").op - - update_second_moment_op = moving_averages.assign_moving_average( - variable=self._moving_second_moment, - value=second_moment, - decay=self._decay_rate, - name="update_moving_second_moment").op - - return update_mean_op, update_second_moment_op - - def build_no_ops(): - return (tf.no_op(), tf.no_op()) - - # Only make the ops if we know that `is_training=True`, or the value of - # `is_training` is unknown. - is_training_const = utils.constant_value(is_training) - if is_training_const is None or is_training_const: - update_mean_op, update_second_moment_op = utils.smart_cond( - is_training, - build_update_ops, - build_no_ops, - ) - - # Every new connection creates a new op which adds its contribution - # to the running average when ran. 
- tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean_op) - tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_second_moment_op) - - def _build(self, input_batch, is_training=True, test_local_stats=True): - """Connects the BatchNorm module into the graph. - - Args: - input_batch: A Tensor of arbitrary dimension. By default, the final - dimension is not reduced over when computing the minibatch statistics. - is_training: A boolean to indicate if the module should be connected in - training mode, meaning the moving averages are updated. By default - `True`. Can be a Tensor. - test_local_stats: A boolean to indicate if local batch statistics should - be used when `is_training=False`. If not, moving averages are used. - By default `True`. Can be a Tensor. - - Returns: - A tensor with the same shape as `input_batch`. - - Raises: - base.IncompatibleShapeError: If `reduction_indices` is not valid for the - input shape or has negative entries. - base.NotSupportedError: If `input_batch` has data type of `tf.float16`. - """ - input_shape = input_batch.get_shape() - - if self._reduction_indices is not None: - if len(self._reduction_indices) > len(input_shape): - raise base.IncompatibleShapeError( - "Too many reduction indices specified.") - - if max(self._reduction_indices) >= len(input_shape): - raise base.IncompatibleShapeError( - "Reduction index too large for input shape.") - - if min(self._reduction_indices) < 0: - raise base.IncompatibleShapeError( - "Reduction indeces must be non-negative.") - - reduction_indices = self._reduction_indices - else: - # Reduce over all dimensions except the last. - reduction_indices = range(len(input_shape))[:-1] - - if input_batch.dtype == tf.float16: - raise base.NotSupportedError( - "BatchNorm does not support `tf.float16`, insufficient " - "precision for calculating sufficient statistics.") - - self._mean_shape = input_batch.get_shape().as_list() - for index in reduction_indices: - self._mean_shape[index] = 1 - - use_batch_stats = is_training | test_local_stats - - # Use the legacy moving second moment if the flag is set. - if self._use_legacy_moving_second_moment: - tf.logging.warning( - "nn.BatchNorm `use_legacy_second_moment=True` is deprecated.") - - mean, variance, second_moment = self._build_statistics_second_moment( - input_batch, - reduction_indices, - use_batch_stats) - - self._build_update_ops_second_moment(mean, second_moment, is_training) - else: - mean, variance = self._build_statistics_variance( - input_batch, - reduction_indices, - use_batch_stats) - - self._build_update_ops_variance(mean, variance, is_training) - - # Set up optional scale and offset factors. 
- if self._offset: - self._set_default_initializer(self.BETA) - self._beta = tf.get_variable( - self.BETA, - shape=self._mean_shape, - initializer=self._initializers[self.BETA]) - else: - self._beta = None - - if self._scale: - self._set_default_initializer(self.GAMMA) - self._gamma = tf.get_variable( - self.GAMMA, - shape=self._mean_shape, - initializer=self._initializers[self.GAMMA]) - else: - self._gamma = None - - out = tf.nn.batch_normalization( - input_batch, - mean, - variance, - self._beta, - self._gamma, - self._eps, - name="batch_norm") - - return out - - @property - def moving_mean(self): - self._ensure_is_connected() - return self._moving_mean - - @property - def moving_second_moment(self): - self._ensure_is_connected() - return self._moving_second_moment - - @property - def moving_variance(self): - self._ensure_is_connected() - return self._moving_variance - - @property - def beta(self): - self._ensure_is_connected() - - if self._beta is None: - raise base.Error( - "Batch normalization doesn't have an offset, so no beta") - else: - return self._beta - - @property - def gamma(self): - self._ensure_is_connected() - - if self._gamma is None: - raise base.Error( - "Batch normalization doesn't have a scale, so no gamma") - else: - return self._gamma diff --git a/nn/conv.py b/nn/conv.py deleted file mode 100644 index c41dd75..0000000 --- a/nn/conv.py +++ /dev/null @@ -1,679 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implementation of convolutional nn modules. - -Classes defining convolutional operations, inheriting from `nn.Module`, with -easy weight sharing. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import math -import numbers - - -import numpy as np -import tensorflow as tf - -from nn import base -from nn import util - - -SAME = "SAME" -VALID = "VALID" -ALLOWED_PADDINGS = {SAME, VALID} - - -def _fill_shape(x, n): - """Idempotentally converts an integer to a tuple of integers of a given size. - - This is used to allow shorthand notation for various configuration parameters. - A user can provide either, for example, `2` or `[2, 2]` as a kernel shape, and - this function returns `(2, 2)` in both cases. Passing `[1, 2]` will return - `(1, 2)`. - - Args: - x: An integer or an iterable of integers - n: An integer, the size of the desired output list - - Returns: - If `x` is an integer, a tuple of size `n` containing `n` copies of `x`. - If `x` is an iterable of integers of size `n`, it returns `tuple(x)`. - - Raises: - TypeError: If n is not a positive integer; - or if x is neither integer nor an iterable of size n. 
- """ - if not isinstance(n, numbers.Integral) or n < 1: - raise TypeError("n must be a positive integer") - - if isinstance(x, numbers.Integral): - return (x,) * n - elif (isinstance(x, collections.Iterable) and len(x) == n and - all(isinstance(v, numbers.Integral) for v in x)): - return tuple(x) - else: - raise TypeError("x is {}, must be either an integer " - "or an iterable of integers of size {}".format(x, n)) - - -def _fill_and_verify_kernel_shape(x, n): - """Expands x if necessary into a `n`-D kernel shape and reports errors.""" - try: - return _fill_shape(x, n) - except TypeError as e: - raise base.IncompatibleShapeError("Invalid kernel shape: {}".format(e)) - - -def _verify_padding(padding): - """Verifies that the provided padding is supported. Returns padding.""" - if padding not in ALLOWED_PADDINGS: - raise ValueError( - "Padding must be member of '{}', not {}".format( - ALLOWED_PADDINGS, padding)) - return padding - - -def _fill_and_one_pad_stride(stride, n): - """Expands the provided stride to size n and pads it with 1s.""" - try: - return (1,) + _fill_shape(stride, n) + (1,) - except TypeError: - raise base.IncompatibleShapeError( - "stride is {} ({}), must be either an integer or an iterable of " - "integers of size {}".format(stride, type(stride), n)) - - -def create_weight_initializer(fan_in_shape): - """Returns a default initializer for the weights of a convolutional module.""" - stddev = 1 / math.sqrt(np.prod(fan_in_shape)) - return tf.truncated_normal_initializer(stddev=stddev) - - -def create_bias_initializer(bias_shape): - """Returns a default initializer for the biases of a convolutional module.""" - stddev = 1 / math.sqrt(np.prod(bias_shape)) - return tf.truncated_normal_initializer(stddev=stddev) - - -class Conv2D(base.AbstractModule, base.Transposable): - """Spatial convolution and dilated convolution module, including bias. - - This acts as a light wrapper around the TensorFlow ops `tf.nn.conv2d` and - `tf.nn.atrous_conv2d`, abstracting away variable creation and sharing. - - The current implementation of `tf.nn.atrous_conv2d` does not easily permit for - strides > 1 when performing dilated convolution (see b/29893301). Therefore, - strides > 1 are currently disabled if the rate is set > 1. - """ - - def __init__(self, output_channels, kernel_shape, stride=1, rate=1, - padding=SAME, use_bias=True, initializers=None, mask=None, - name="conv_2d"): - """Constructs a Conv2D module. - - See the following documentation for an explanation of VALID versus SAME - padding modes: - https://www.tensorflow.org/versions/r0.8/api_docs/python/nn.html#convolution - - Args: - output_channels: Number of output channels. `output_channels` can be - either a number or a callable. In the latter case, since the function - invocation is deferred to graph construction time, the user must only - ensure that output_channels can be called, returning an integer, - when `_build` is called. - kernel_shape: List of kernel sizes, or integer that is used to define - kernel size in all dimensions. - stride: List of kernel strides, or integer that is used to define - stride in all dimensions. - rate: A positive integer, `rate=1` corresponds to standard 2D convolution, - `rate > 1` corresponds to dilated convolution. - padding: Padding algorithm, either `nn.SAME` or `nn.VALID`. - use_bias: Whether to include bias parameters. Default `True`. - initializers: Optional dict containing ops to initialize the filters (with - key 'w') or biases (with key 'b'). 
The default initializers are - truncated normal initializers, which are commonly used when the inputs - are zero centered (see https://arxiv.org/pdf/1502.03167v3.pdf). - mask: Optional 2D or 4D array, tuple or numpy array containing values to - multiply the weights by component-wise. - name: Name of the module. - - Raises: - base.IncompatibleShapeError: If the given kernel shape is not an integer; - or if the given kernel shape is not a sequence of two integers. - base.IncompatibleShapeError: If the given stride is not an integer; or if - the given stride is not a sequence of two or four integers. - base.IncompatibleShapeError: If a mask is given and its rank is neither 2 - nor 4. - base.NotSupportedError: If the given dilation rate is not a positive - integer. - base.NotSupportedError: If rate > 1 and the stride in any dimension is - > 1. - ValueError: If the given padding is not `nn.VALID` or `nn.SAME`. - KeyError: If initializers contains any keys other than 'w' or 'b'. - TypeError: If any of the given initializers are not callable. - TypeError: If mask is given and is not an array, tuple or a numpy array. - """ - super(Conv2D, self).__init__(name=name) - - self._output_channels = output_channels - self._input_shape = None - self._kernel_shape = _fill_and_verify_kernel_shape(kernel_shape, 2) - try: - self._stride = (1,) + _fill_shape(stride, 2) + (1,) - except TypeError as e: - # We want to support passing native strides akin to [1, m, n, 1]. - if len(stride) == 4: - self._stride = tuple(stride) - else: - raise base.IncompatibleShapeError("Invalid stride: {}".format(e)) - - if not isinstance(rate, numbers.Integral) or rate < 1: - raise base.NotSupportedError( - "Rate, {}, must be integer >= 1".format(rate)) - elif any(x > 1 for x in self._stride) and rate > 1: - raise base.NotSupportedError( - "Cannot have stride > 1 with rate > 1") - else: - self._rate = rate - - self._padding = _verify_padding(padding) - self._use_bias = use_bias - self.possible_keys = self.get_possible_initializer_keys(use_bias=use_bias) - self._initializers = util.check_initializers( - initializers, self.possible_keys) - - if mask is not None: - if not isinstance(mask, (list, tuple, np.ndarray)): - raise TypeError("Invalid type for mask: {}".format(type(mask))) - self._mask = np.asanyarray(mask) - mask_rank = mask.ndim - if mask_rank != 2 and mask_rank != 4: - raise base.IncompatibleShapeError( - "Invalid mask rank: {}".format(mask_rank)) - else: - self._mask = None - - @classmethod - def get_possible_initializer_keys(cls, use_bias=True): - return {"w", "b"} if use_bias else {"w"} - - def _build(self, inputs): - """Connects the Conv2D module into the graph, with input Tensor `inputs`. - - If this is not the first time the module has been connected to the graph, - the input Tensor provided here must have the same final 3 dimensions, in - order for the existing variables to be the correct size for the - multiplication. The batch size may differ for each connection. - - Args: - inputs: A 4D Tensor of shape [batch_size, input_height, input_width, - input_channels]. - - Returns: - A 4D Tensor of shape [batch_size, output_height, output_width, - output_channels]. - - Raises: - ValueError: If connecting the module into the graph any time after the - first time and the inferred size of the input does not match previous - invocations. - base.IncompatibleShapeError: If the input tensor has the wrong number - of dimensions. 
- base.IncompatibleShapeError: If a mask is present and its shape is - incompatible with the shape of the weights. - base.UnderspecifiedError: If the input tensor has an unknown - `input_channels`. - base.UnderspecifiedError: If rate > 1 is used with an input tensor with - unknown `input_width` or `input_height`. - TypeError: If input Tensor dtype is not `tf.float32`. - """ - # Handle input whose shape is unknown during graph creation. - self._input_shape = tuple(inputs.get_shape().as_list()) - - if len(self._input_shape) != 4: - raise base.IncompatibleShapeError( - "Input Tensor must have shape (batch_size, input_height, input_" - "width, input_channels)") - - if self._input_shape[3] is None: - raise base.UnderSpecifiedError( - "Number of input channels must be known at module build time") - else: - input_channels = self._input_shape[3] - - if inputs.dtype != tf.float32: - raise TypeError( - "Input must have dtype tf.float32, but dtype was {}".format( - inputs.dtype)) - - weight_shape = ( - self._kernel_shape[0], - self._kernel_shape[1], - input_channels, - self.output_channels) - - bias_shape = (self.output_channels,) - - if "w" not in self._initializers: - self._initializers["w"] = create_weight_initializer(weight_shape[:3]) - - if "b" not in self._initializers and self._use_bias: - self._initializers["b"] = create_bias_initializer(bias_shape) - - self._w = tf.get_variable("w", - shape=weight_shape, - initializer=self._initializers["w"]) - - w = self._w - - if self._mask is not None: - mask_rank = self._mask.ndim - mask_shape = self._mask.shape - if mask_rank == 2: - if mask_shape != self._kernel_shape: - raise base.IncompatibleShapeError( - "Invalid mask shape: {}".format(mask_shape)) - mask = np.reshape(self._mask, self._kernel_shape + (1, 1)) - elif mask_rank == 4: - if mask_shape != tuple(weight_shape): - raise base.IncompatibleShapeError( - "Invalid mask shape: {}".format(mask_shape)) - mask = self._mask - mask_tensor, = tf.py_func(lambda: mask, [], [w.dtype], stateful=False) - mask_tensor.set_shape(weight_shape) - w *= mask - - if self._rate > 1: - if any(x is None for x in self._input_shape[1:-1]): - raise base.UnderspecifiedError( - "Can't use atrous convolutions with unknown input_width or " - "input_height at graph build time") - outputs = tf.nn.atrous_conv2d(inputs, - w, - rate=self._rate, - padding=self._padding) - else: - outputs = tf.nn.conv2d(inputs, - w, - strides=self._stride, - padding=self._padding) - - if self._use_bias: - self._b = tf.get_variable("b", - shape=bias_shape, - initializer=self._initializers["b"]) - outputs += self._b - - return outputs - - @property - def output_channels(self): - """Returns the number of output channels.""" - if callable(self._output_channels): - self._output_channels = self._output_channels() - return self._output_channels - - @property - def kernel_shape(self): - """Returns the kernel shape.""" - return self._kernel_shape - - @property - def stride(self): - """Returns the stride.""" - return self._stride - - @property - def rate(self): - """Returns the dilation rate.""" - return self._rate - - @property - def padding(self): - """Returns the padding algorithm.""" - return self._padding - - @property - def w(self): - """Returns the Variable containing the weight matrix.""" - self._ensure_is_connected() - return self._w - - @property - def b(self): - """Returns the Variable containing the bias. - - Returns: - Variable object containing the bias, from the most recent __call__. 
- - Raises: - base.NotConnectedError: If the module has not been connected to the graph - yet, meaning the variables do not exist. - AttributeError: If the module does not use bias. - """ - self._ensure_is_connected() - if not self._use_bias: - raise AttributeError( - "No bias Variable in Conv2D Module when `use_bias=False`.") - return self._b - - @property - def has_bias(self): - """Returns `True` if bias Variable is present in the module.""" - return self._use_bias - - @property - def initializers(self): - """Returns the initializers dictionary.""" - return self._initializers - - # Implements Transposable interface. - @property - def input_shape(self): - """Returns the input shape.""" - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface. - def transpose(self, name=None): - """Returns matching `Conv2DTranspose` module. - - Args: - name: Optional string assigning name of transpose module. The default name - is constructed by appending "_transpose" to `self.name`. - - Returns: - `Conv2DTranspose` module. - - Raises: - base.NotSupportedError: If `rate > 1`. - """ - if self._rate > 1: - raise base.NotSupportedError( - "Cannot transpose a dilated convolution module.") - - if name is None: - name = self.name + "_transpose" - return Conv2DTranspose(output_channels=lambda: self.input_shape[-1], - output_shape=lambda: self.input_shape[1:3], - kernel_shape=self.kernel_shape, - stride=self.stride, - padding=self.padding, - use_bias=self._use_bias, - initializers=self.initializers, - name=name) - - -class Conv2DTranspose(base.AbstractModule, base.Transposable): - """Spatial transposed / reverse / up 2D convolution module, including bias. - - This acts as a light wrapper around the TensorFlow op `tf.nn.conv2d_transpose` - abstracting away variable creation and sharing. - """ - - def __init__(self, output_channels, output_shape, kernel_shape, stride=1, - padding=SAME, use_bias=True, initializers=None, - name="conv_2d_transpose"): - """Constructs a `Conv2DTranspose module`. - - See the following documentation for an explanation of VALID versus SAME - padding modes: - https://www.tensorflow.org/versions/r0.8/api_docs/python/nn.html#convolution - - Args: - output_channels: Number of output channels. - Can be either a number or a callable. In the latter case, since the - function invocation is deferred to graph construction time, the user - must only ensure `output_channels` can be called, returning an - integer, when build is called. - output_shape: Output shape of transpose convolution. - Can be either an iterable of integers or a callable. In the latter - case, since the function invocation is deferred to graph construction - time, the user must only ensure that `output_shape` can be called, - returning an iterable of format `(out_height, out_width)` when - `_build` is called. Note that `output_shape` defines the size of - output signal domain, as opposed to the shape of the output `Tensor`. - kernel_shape: List of kernel sizes, must be length 2. - stride: List of kernel strides. - padding: Padding algorithm, either `nn.SAME` or `nn.VALID`. - use_bias: Whether to include bias parameters. Default `True`. - initializers: Optional dict containing ops to initialize the filters (with - key 'w') or biases (with key 'b'). - name: Name of the module. - - Raises: - base.IncompatibleShapeError: If the given kernel shape is neither an - integer nor a sequence of two integers. 
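The `Conv2D` class removed above implements the `Transposable` interface, so a connected convolution can manufacture its matching deconvolution. A short sketch of that pattern with the Sonnet replacement, assuming `snt.Conv2D` exposes the same `transpose()` method and `snt.SAME` padding constant as the deleted code; the shapes below are illustrative.

```python
import sonnet as snt
import tensorflow as tf

# Sketch: a convolution and its matching transposed module, mirroring the
# Transposable interface of the deleted nn.Conv2D.
images = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])

conv = snt.Conv2D(output_channels=16, kernel_shape=3, stride=2,
                  padding=snt.SAME, name="encoder_conv")
features = conv(images)            # [batch, 16, 16, 16]

# transpose() builds a Conv2DTranspose whose output shape and channels are
# resolved lazily from conv.input_shape, so conv is connected first.
deconv = conv.transpose(name="decoder_conv")
reconstruction = deconv(features)  # [batch, 32, 32, 3]
```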
- base.IncompatibleShapeError: If the given stride is neither an integer nor - a sequence of two or four integers. - ValueError: If the given padding is not `nn.VALID` or `nn.SAME`. - KeyError: If `initializers` contains any keys other than 'w' or 'b'. - TypeError: If any of the given initializers are not callable. - """ - super(Conv2DTranspose, self).__init__(name) - - self._output_channels = output_channels - if callable(output_shape): - self._output_shape = output_shape - else: - self._output_shape = tuple(output_shape) - self._input_shape = None - - self._kernel_shape = _fill_and_verify_kernel_shape(kernel_shape, 2) - # We want to support passing native strides akin to [1, m, n, 1]. - if isinstance(stride, collections.Iterable) and len(stride) == 4: - if not stride[0] == stride[3] == 1: - raise base.IncompatibleShapeError( - "Invalid stride: First and last element must be 1.") - self._stride = tuple(stride) - else: - self._stride = _fill_and_one_pad_stride(stride, 2) - - self._padding = _verify_padding(padding) - self._use_bias = use_bias - self.possible_keys = self.get_possible_initializer_keys(use_bias=use_bias) - self._initializers = util.check_initializers( - initializers, self.possible_keys) - - @classmethod - def get_possible_initializer_keys(cls, use_bias=True): - return {"w", "b"} if use_bias else {"w"} - - def _build(self, inputs): - """Connects the Conv2DTranspose module into the graph. - - If this is not the first time the module has been connected to the graph, - the input Tensor provided here must have the same final 3 dimensions, in - order for the existing variables to be the correct size for the - multiplication. The batch size may differ for each connection. - - Args: - inputs: A 4D Tensor of shape [batch_size, input_height, input_width, - input_channels]. - - Returns: - A 4D Tensor of shape [batch_size, output_height, output_width, - output_channels]. - - Raises: - ValueError: If connecting the module into the graph any time after the - first time and the inferred size of the input does not match previous - invocations. - base.IncompatibleShapeError: If the input tensor has the wrong number of - dimensions; or if the input tensor has an unknown `input_channels`; or - or if `output_shape` is an iterable and is not in the format - `(out_height, out_width)`. - TypeError: If input Tensor dtype is not `tf.float32`. - """ - # Handle input whose shape is unknown during graph creation. 
- self._input_shape = tuple(inputs.get_shape().as_list()) - - if len(self._input_shape) != 4: - raise base.IncompatibleShapeError( - "Input Tensor must have shape (batch_size, input_height, " - "input_width, input_channels)") - - if self._input_shape[3] is None: - raise base.IncompatibleShapeError( - "Number of input channels must be known at module build time") - input_channels = self._input_shape[3] - - if inputs.dtype != tf.float32: - raise TypeError("Input must have dtype tf.float32, but dtype was " + - inputs.dtype) - - if len(self.output_shape) != 2: - raise base.IncompatibleShapeError("Output shape must be specified as " - "(output_height, output_width)") - - weight_shape = (self._kernel_shape[0], self._kernel_shape[1], - self.output_channels, input_channels) - - bias_shape = (self.output_channels,) - - if "w" not in self._initializers: - fan_in_shape = weight_shape[:2] + (weight_shape[3],) - self._initializers["w"] = create_weight_initializer(fan_in_shape) - - if "b" not in self._initializers and self._use_bias: - self._initializers["b"] = create_bias_initializer(bias_shape) - - self._w = tf.get_variable("w", - shape=weight_shape, - initializer=self._initializers["w"]) - - # Use tensorflow shape op to manipulate inputs shape, so that unknown batch - # size - which can happen when using input placeholders - is handled - # correcly. - batch_size = tf.expand_dims(tf.shape(inputs)[0], 0) - conv_output_shape = tf.convert_to_tensor( - tuple(self.output_shape) + (self.output_channels,)) - output_shape = tf.concat(0, [batch_size, conv_output_shape]) - - outputs = tf.nn.conv2d_transpose(inputs, - self._w, - output_shape, - strides=self._stride, - padding=self._padding) - - if self._use_bias: - self._b = tf.get_variable("b", - shape=bias_shape, - initializer=self._initializers["b"]) - outputs += self._b - - # Recover output tensor shape value and pass it to set_shape in order to - # enable shape inference. - batch_size_value = inputs.get_shape()[0] - output_shape_value = ((batch_size_value,) + self.output_shape + - (self.output_channels,)) - outputs.set_shape(output_shape_value) - - return outputs - - @property - def output_channels(self): - """Returns the number of output channels.""" - if callable(self._output_channels): - self._output_channels = self._output_channels() - return self._output_channels - - @property - def kernel_shape(self): - """Returns the kernel shape.""" - return self._kernel_shape - - @property - def stride(self): - """Returns the stride.""" - return self._stride - - @property - def output_shape(self): - """Returns the output shape.""" - if callable(self._output_shape): - self._output_shape = tuple(self._output_shape()) - return self._output_shape - - @property - def padding(self): - """Returns the padding algorithm.""" - return self._padding - - @property - def w(self): - """Returns the Variable containing the weight matrix.""" - self._ensure_is_connected() - return self._w - - @property - def b(self): - """Returns the Variable containing the bias. - - Returns: - Variable object containing the bias, from the most recent __call__. - - Raises: - base.NotConnectedError: If the module has not been connected to the graph - yet, meaning the variables do not exist. - AttributeError: If the module does not use bias. 
- """ - self._ensure_is_connected() - if not self._use_bias: - raise AttributeError( - "No bias Variable in Conv2DTranspose Module when `use_bias=False`.") - return self._b - - @property - def has_bias(self): - """Returns `True` if bias Variable is present in the module.""" - return self._use_bias - - @property - def initializers(self): - """Returns the initializers dictionary.""" - return self._initializers - - # Implements Transposable interface. - @property - def input_shape(self): - """Returns the input shape.""" - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface. - def transpose(self, name=None): - """Returns matching `Conv2D` module. - - Args: - name: Optional string assigning name of transpose module. The default name - is constructed by appending "_transpose" to `self.name`. - - Returns: - `Conv2D` module. - """ - if name is None: - name = self.name + "_transpose" - return Conv2D(output_channels=lambda: self.input_shape[-1], - kernel_shape=self.kernel_shape, - stride=self.stride, - padding=self.padding, - use_bias=self._use_bias, - initializers=self.initializers, - name=name) diff --git a/nn/convnet.py b/nn/convnet.py deleted file mode 100644 index 7a43ddf..0000000 --- a/nn/convnet.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""A minimal interface convolutional networks module.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -from six.moves import xrange -import tensorflow as tf - -from nn import base -from nn import batch_norm -from nn import conv -from nn import util - - -def _replicate_elements(input_iterable, num_times): - """Replicates entry in `input_iterable` if `input_iterable` is of length 1.""" - if len(input_iterable) == 1: - return (input_iterable[0],) * num_times - return tuple(input_iterable) - - -class ConvNet2D(base.AbstractModule, base.Transposable): - """A 2D Convolutional Network module.""" - - POSSIBLE_INITIALIZER_KEYS = {"w", "b"} - - def __init__(self, - output_channels, - kernel_shapes, - strides, - paddings, - activation=tf.nn.relu, - activate_final=False, - initializers=None, - use_batch_norm=False, - use_bias=True, - batch_norm_config=None, - name="conv_net_2d"): - """Constructs a `ConvNet2D` module. - - By default, neither batch normalization nor activation are applied to the - output of the final layer. - - Args: - output_channels: Iterable of output channels, as defined in - `conv.Conv2D`. Output channels can be defined either as number or via a - callable. In the latter case, since the function invocation is deferred - to graph construction time, the user must only ensure that entries can - be called when build is called. Each entry in the iterable defines - properties in the corresponding convolutional layer. 
- kernel_shapes: Iterable of kernel sizes as defined in `conv.Conv2D`; if - the list contains one element only, the same kernel shape is used in - each layer of the network. - strides: Iterable of kernel strides as defined in `conv.Conv2D`; if the - list contains one element only, the same stride is used in each layer of - the network. - paddings: Iterable of padding options, either `nn.SAME` or - `nn.VALID`; if the Iterable contains one element only, the same padding - is used in each layer of the network. - activation: An activation op. - activate_final: Boolean determining if the activation and batch - normalization, if turned on, are applied to the final layer. - initializers: Optional dict containing ops to initialize the filters of - the whole network (with key 'w') or biases (with key 'b'). - use_batch_norm: Boolean determining if batch normalization is applied - after convolution. - use_bias: Whether to include bias parameters in the convolutional layers. - Default `True`. - batch_norm_config: Optional mapping of additional configuration for the - `nn.BatchNorm` modules. - name: Name of the module. - - Raises: - TypeError: If `output_channels` is not iterable; or if `kernel_shapes` is - not iterable; or `strides` is not iterable; or `paddings` is not - iterable; or if `activation` is not callable; or `batch_norm_config` is - not a mappable (e.g. `dict`). - ValueError: If `output_channels` is empty; or if `kernel_shapes` has not - length 1 or `len(output_channels)`; or if `strides` has not - length 1 or `len(output_channels)`; or if `paddings` has not - length 1 or `len(output_channels)`. - Error: If initializers contains any keys other than 'w' or 'b'. - """ - if not isinstance(output_channels, collections.Iterable): - raise TypeError("output_channels must be iterable") - output_channels = tuple(output_channels) - - if not isinstance(kernel_shapes, collections.Iterable): - raise TypeError("kernel_shapes must be iterable") - kernel_shapes = tuple(kernel_shapes) - - if not isinstance(strides, collections.Iterable): - raise TypeError("strides must be iterable") - strides = tuple(strides) - - if not isinstance(paddings, collections.Iterable): - raise TypeError("paddings must be iterable") - paddings = tuple(paddings) - - super(ConvNet2D, self).__init__(name) - - if not output_channels: - raise ValueError("output_channels must not be empty") - self._output_channels = tuple(output_channels) - self._num_layers = len(self._output_channels) - - self._input_shape = None - - self._initializers = util.check_initializers( - initializers, self.POSSIBLE_INITIALIZER_KEYS) - - if not callable(activation): - raise TypeError("Input 'activation' must be callable") - self._activation = activation - self._activate_final = activate_final - - self._kernel_shapes = _replicate_elements(kernel_shapes, self._num_layers) - if len(self._kernel_shapes) != self._num_layers: - raise ValueError( - "kernel_shapes must be of length 1 or len(output_channels)") - - self._strides = _replicate_elements(strides, self._num_layers) - if len(self._strides) != self._num_layers: - raise ValueError( - """strides must be of length 1 or len(output_channels)""") - - self._paddings = _replicate_elements(paddings, self._num_layers) - if len(self._paddings) != self._num_layers: - raise ValueError( - """paddings must be of length 1 or len(output_channels)""") - - self._use_batch_norm = use_batch_norm - - if batch_norm_config is not None: - if not isinstance(batch_norm_config, collections.Mapping): - raise 
TypeError("`batch_norm_config` must be a mapping, e.g. `dict`.") - self._batch_norm_config = batch_norm_config - else: - self._batch_norm_config = {} - - self._use_bias = use_bias - self._instantiate_layers() - - def _instantiate_layers(self): - """Instantiates all the convolutional modules used in the network.""" - - with tf.variable_scope(self._template.variable_scope): - self._layers = tuple(conv.Conv2D(name="conv_2d_{}".format(i), - output_channels=self._output_channels[i], - kernel_shape=self._kernel_shapes[i], - stride=self._strides[i], - padding=self._paddings[i], - use_bias=self._use_bias, - initializers=self._initializers) - for i in xrange(self._num_layers)) - - def _build(self, inputs, is_training=True, test_local_stats=True): - """Assembles the `ConvNet2D` and connects it to the graph. - - Args: - inputs: A 4D Tensor of shape `[batch_size, input_height, input_width, - input_channels]`. - is_training: Boolean to indicate to `nn.BatchNorm` if we are - currently training. By default `True`. - test_local_stats: Boolean to indicate to `nn.BatchNorm` if batch - normalization should use local batch statistics at test time. - By default `True`. - - Returns: - A 4D Tensor of shape `[batch_size, output_height, output_width, - output_channels[-1]]`. - """ - self._input_shape = tuple(inputs.get_shape().as_list()) - net = inputs - - final_index = len(self._layers) - 1 - for i, layer in enumerate(self._layers): - net = layer(net) - - if i != final_index or self._activate_final: - if self._use_batch_norm: - bn = batch_norm.BatchNorm(name="batch_norm_{}".format(i), - **self._batch_norm_config) - net = bn(net, - is_training=is_training, - test_local_stats=test_local_stats) - - net = self._activation(net) - - return net - - @property - def layers(self): - """Returns a tuple containing the convolutional layers of the network.""" - return self._layers - - @property - def strides(self): - return self._strides - - @property - def paddings(self): - return self._paddings - - @property - def kernel_shapes(self): - return self._kernel_shapes - - @property - def output_channels(self): - return tuple([l() if callable(l) else l for l in self._output_channels]) - - @property - def use_bias(self): - return self._use_bias - - @property - def use_batch_norm(self): - return self._use_batch_norm - - @property - def activate_final(self): - return self._activate_final - - # Implements Transposable interface. - @property - def input_shape(self): - """Returns shape of input `Tensor` passed at last call to `_build`.""" - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface. - def transpose(self, name=None, output_channels=None): - """Returns transposed conv net. - - Args: - name: Optional string specifiying the name of the transposed module. The - default name is constructed by appending "_transpose" to `self.name`. - output_channels: Optional iterable of numbers of output channels. - - Returns: - Matching `ConvNetTranspose2D` module. - - Raises: - ValueError: If output_channels is specified and its length does not match - the number of layers. 
- """ - if name is None: - name = self.name + "_transpose" - - if output_channels is None: - output_channels = [] - for layer in reversed(self._layers): - output_channels.append(lambda l=layer: l.input_shape[-1]) - - elif len(output_channels) != len(self._layers): - raise ValueError("Iterable output_channels length must match the" - "number of layers ({}), but is {} instead.".format( - len(self._layers), len(output_channels))) - - output_shapes = [] - for layer in reversed(self._layers): - output_shapes.append(lambda l=layer: l.input_shape[1:-1]) - - return ConvNet2DTranspose(name=name, - output_channels=output_channels, - output_shapes=output_shapes, - kernel_shapes=reversed(self.kernel_shapes), - strides=reversed(self.strides), - paddings=reversed(self.paddings), - activation=self._activation, - activate_final=self._activate_final, - initializers=self._initializers, - use_batch_norm=self._use_batch_norm, - use_bias=self._use_bias, - batch_norm_config=self._batch_norm_config) - - -class ConvNet2DTranspose(ConvNet2D): - """A 2D Transpose-Convolutional Network module.""" - - def __init__(self, - output_channels, - output_shapes, - kernel_shapes, - strides, - paddings, - activation=tf.nn.relu, - activate_final=False, - initializers=None, - use_batch_norm=False, - use_bias=True, - batch_norm_config=None, - name="conv_net_2d_transpose"): - """Constructs a `ConvNetTranspose2D` module. - - `output_{shapes,channels}` can be defined either as iterable of - {iterables,integers} or via a callable. In the latter case, since the - function invocation is deferred to graph construction time, the user - must only ensure that entries can be called returning meaningful values when - build is called. Each entry in the iterable defines properties in the - corresponding convolutional layer. - - By default, neither batch normalization nor activation are applied to the - output of the final layer. - - Args: - output_channels: Iterable of numbers of output channels. - output_shapes: Iterable of output shapes as defined in - `conv.conv2DTranpose`; if the iterable contains one element only, the - same shape is used in each layer of the network. - kernel_shapes: Iterable of kernel sizes as defined in `conv.Conv2D`; if - the list contains one element only, the same kernel shape is used in - each layer of the network. - strides: Iterable of kernel strides as defined in `conv.Conv2D`; if the - list contains one element only, the same stride is used in each layer of - the network. - paddings: Iterable of padding options, either `nn.SAME` or - `nn.VALID`; if the Iterable contains one element only, the same padding - is used in each layer of the network. - activation: An activation op. - activate_final: Boolean determining if the activation and batch - normalization, if turned on, are applied to the final layer. - initializers: Optional dict containing ops to initialize the filters of - the whole network (with key 'w') or biases (with key 'b'). - use_batch_norm: Boolean determining if batch normalization is applied - after convolution. - use_bias: Whether to include bias parameters in the convolutional layers. - Default `True`. - batch_norm_config: Optional mapping of additional configuration for the - `nn.BatchNorm` modules. - name: Name of the module. - - Raises: - TypeError: If `output_channels` is not iterable; or if `output_channels` - is not iterable; or if `kernel_shapes` is not iterable; or `strides` is - not iterable; or `paddings` is not iterable; or if `activation` is not - callable. 
- ValueError: If `output_channels` is empty; or if `kernel_shapes` has not - length 1 or `len(output_channels)`; or if `strides` has not - length 1 or `len(output_channels)`; or if `paddings` has not - length 1 or `len(output_channels)`. - Error: If initializers contains any keys other than 'w' or 'b'. - """ - if not isinstance(output_channels, collections.Iterable): - raise TypeError("output_channels must be iterable") - output_channels = tuple(output_channels) - num_layers = len(output_channels) - - if not isinstance(output_shapes, collections.Iterable): - raise TypeError("output_shapes must be iterable") - output_shapes = tuple(output_shapes) - - self._output_shapes = _replicate_elements(output_shapes, num_layers) - if len(self._output_shapes) != num_layers: - raise ValueError( - "output_shapes must be of length 1 or len(output_channels)") - - super(ConvNet2DTranspose, self).__init__( - output_channels, - kernel_shapes, - strides, - paddings, - activation=activation, - activate_final=activate_final, - initializers=initializers, - use_batch_norm=use_batch_norm, - use_bias=use_bias, - batch_norm_config=batch_norm_config, - name=name) - - def _instantiate_layers(self): - """Instantiates all the convolutional modules used in the network.""" - - with tf.variable_scope(self._template.variable_scope): - self._layers = tuple( - conv.Conv2DTranspose(name="conv_2d_transpose_{}".format(i), - output_channels=self._output_channels[i], - output_shape=self._output_shapes[i], - kernel_shape=self._kernel_shapes[i], - stride=self._strides[i], - padding=self._paddings[i], - initializers=self._initializers, - use_bias=self._use_bias) - for i in xrange(self._num_layers)) - - @property - def output_shapes(self): - return tuple([l() if callable(l) else l for l in self._output_shapes]) - - # Implement Transposable interface. - def transpose(self, name=None, output_channels=None): - """Returns transposed conv net. - - Args: - name: Optional string specifiying the name of the transposed module. The - default name is constructed by appending "_transpose" to `self.name`. - output_channels: Optional iterable of numbers of output channels. - - Returns: - Matching `ConvNetTranspose2D` module. - - Raises: - ValueError: If output_channels is specified and its length does not match - the number of layers. - """ - if name is None: - name = self.name + "_transpose" - - if output_channels is None: - output_channels = [] - for layer in reversed(self._layers): - output_channels.append(lambda l=layer: l.input_shape[-1]) - - elif len(output_channels) != len(self._layers): - raise ValueError("Iterable output_channels length must match the" - "number of layers ({}), but is {} instead.".format( - len(self._layers), len(output_channels))) - - return ConvNet2D(name=name, - output_channels=output_channels, - kernel_shapes=reversed(self.kernel_shapes), - strides=reversed(self.strides), - paddings=reversed(self.paddings), - activation=self._activation, - activate_final=self._activate_final, - initializers=self._initializers, - use_batch_norm=self._use_batch_norm, - use_bias=self._use_bias, - batch_norm_config=self._batch_norm_config) diff --git a/nn/gated_rnn.py b/nn/gated_rnn.py deleted file mode 100644 index a84373d..0000000 --- a/nn/gated_rnn.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
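The `ConvNet2D` / `ConvNet2DTranspose` stacks removed above (note that single-element `kernel_shapes`, `strides`, and `paddings` iterables are replicated across all layers) also live on in Sonnet. A minimal sketch, assuming the port is exposed as `snt.nets.ConvNet2D` with the constructor shown above; layer sizes are illustrative.

```python
import sonnet as snt
import tensorflow as tf

# Sketch: a three-layer conv stack; one-element kernel/stride/padding lists
# are replicated across all layers, as in the deleted module.
net = snt.nets.ConvNet2D(output_channels=[16, 32, 64],
                         kernel_shapes=[3],
                         strides=[2],
                         paddings=[snt.SAME],
                         activation=tf.nn.relu,
                         activate_final=False,
                         name="encoder")

images = tf.placeholder(tf.float32, shape=[None, 64, 64, 3])
features = net(images)   # [batch, 8, 8, 64]

# The Transposable interface gives a matching decoder network.
decoder = net.transpose(name="decoder")
reconstruction = decoder(features)  # back to [batch, 64, 64, 3]
```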
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""LSTM based modules for TensorFlow nn. - -This python module contains LSTM-like cores that fall under the broader group -of RNN cores. In general, initializers for the gate weights and other -model parameters may be passed to the constructor. - -Typical usage example of the standard LSTM without peephole connections: - - ``` - import nn - - - hidden_size = 10 - batch_size = 2 - - # Simple LSTM op on some input - rnn = nn.LSTM(hidden_size) - input = tf.placeholder(tf.float32, shape=[batch_size, hidden_size]) - out, next_state = rnn(input, rnn.initial_state()) - ``` -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf - -from tensorflow.python.ops import array_ops -from tensorflow.contrib import rnn - -from nn import base -from nn import basic -from nn import batch_norm -from nn import rnn_core -from nn import util - - -class LSTM(rnn_core.RNNCore): - """LSTM recurrent network cell with optional peepholes & batch normalization. - - The base implementation is based on: http://arxiv.org/abs/1409.2329. We add - forget_bias (default: 1) to the biases of the forget gate in order to - reduce the scale of forgetting in the beginning of the training. - - #### Peep-hole connections - - Peep-hole connections may optionally be used by specifying a flag in the - constructor. These connections can aid increasing the precision of output - timing, for more details see: - - https://research.google.com/pubs/archive/43905.pdf - - #### Batch normalization - - The batch norm transformation (in training mode) is - batchnorm(x) = gamma * (x - mean(x)) / stddev(x) + beta, - where gamma is a learnt scaling factor and beta is a learnt offset. - - Batch normalization may optionally be used at different places in the LSTM by - specifying flag(s) in the constructor. These are applied when calculating - the gate activations and cell-to-hidden transformation. The set-up is based on - - https://arxiv.org/pdf/1603.09025.pdf - - ##### Batch normalization: where to apply? - - Batch norm can be applied in three different places in the LSTM: - - (h) To the W_h h_{t-1} contribution to the gates from the previous hiddens. - (x) To the W_x x_t contribution to the gates from the current input. - (c) To the cell value c_t when calculating the output h_t from the cell. - - (The notation here is consistent with the Recurrent Batch Normalization - paper). Each of these can be controlled individually, because batch norm is - expensive, and not all are necessary. The paper doesn't mention the relative - effects of these different batch norms; however, experimentation with a - shallow LSTM for the `permuted_mnist` sequence task suggests that (h) is the - most important and the other two can be left off. For other tasks or deeper - (stacked) LSTMs, other batch norm combinations may be more effective. 
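The usage snippet in the deleted module docstring above ports almost unchanged to Sonnet. A minimal sketch, assuming `snt.LSTM` keeps the call and `initial_state` interface of the class removed here (note that `initial_state` takes the batch size explicitly):

```python
import sonnet as snt
import tensorflow as tf

hidden_size = 10
batch_size = 2

# Sketch: one step of the Sonnet LSTM core, mirroring the deleted example.
rnn = snt.LSTM(hidden_size)
inputs = tf.placeholder(tf.float32, shape=[batch_size, hidden_size])
out, next_state = rnn(inputs, rnn.initial_state(batch_size))
```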
- - ##### Batch normalization: collecting stats (training vs test) - - When switching to testing (see `LSTM.with_batch_norm_control`), we can use a - mean and stddev learnt from the training data instead of using the statistics - from the test data. (This both increases test accuracy because the statistics - have less variance, and if the test data does not have the same distribution - as the training data then we must use the training statistics to ensure the - effective network does not change when switching to testing anyhow.) - - This does however introduces a slight subtlety. The first few time steps of - the RNN tend to have varying statistics (mean and variance) before settling - down to a steady value. Therefore in general, better performance is obtained - by using separate statistics for the first few time steps, and then using the - final set of statistics for all subsequent time steps. This is controlled by - the parameter `max_unique_stats`. (We can't have an unbounded number of - distinct statistics for both technical reasons and also for the case where - test sequences are longer than anything seen in training.) - - You may be fine leaving it at its default value of 1. Small values (like 10) - may achieve better performance on some tasks when testing with cached - statistics. - - Attributes: - state_size: Tuple of `tf.TensorShape`s indicating the size of state tensors. - output_size: `tf.TensorShape` indicating the size of the core output. - use_peepholes: Boolean indicating whether peephole connections are used. - use_batch_norm_h: Boolean indicating whether batch norm (h) is enabled. - use_batch_norm_x: Boolean indicating whether batch norm (x) is enabled. - use_batch_norm_c: Boolean indicating whether batch norm (c) is enabled. - """ - - # Keys that may be provided for parameter initializers. - W_GATES = "w_gates" # weight for gates - B_GATES = "b_gates" # bias of gates - W_F_DIAG = "w_f_diag" # weight for prev_cell -> forget gate peephole - W_I_DIAG = "w_i_diag" # weight for prev_cell -> input gate peephole - W_O_DIAG = "w_o_diag" # weight for prev_cell -> output gate peephole - GAMMA_H = "gamma_h" # batch norm scaling for previous_hidden -> gates - GAMMA_X = "gamma_x" # batch norm scaling for input -> gates - GAMMA_C = "gamma_c" # batch norm scaling for cell -> output - BETA_C = "beta_c" # (batch norm) bias for cell -> output - POSSIBLE_KEYS = {W_GATES, B_GATES, W_F_DIAG, W_I_DIAG, W_O_DIAG, GAMMA_H, - GAMMA_X, GAMMA_C, BETA_C} - - def __init__(self, - hidden_size, - forget_bias=1.0, - initializers=None, - use_peepholes=False, - use_batch_norm_h=False, - use_batch_norm_x=False, - use_batch_norm_c=False, - max_unique_stats=1, - name="lstm"): - """Construct LSTM. - - Args: - hidden_size: (int) Hidden size dimensionality. - forget_bias: (float) Bias for the forget activation. - initializers: Dict containing ops to initialize the weights. - This dictionary may contain any of the keys in POSSIBLE_KEYS. - The gamma and beta variables control batch normalization values for - different batch norm transformations inside the cell; see the paper for - details. - use_peepholes: Boolean that indicates whether peephole connections are - used. - use_batch_norm_h: Boolean that indicates whether to apply batch - normalization at the previous_hidden -> gates contribution. If you are - experimenting with batch norm then this may be the most effective to - turn on. - use_batch_norm_x: Boolean that indicates whether to apply batch - normalization at the input -> gates contribution. 
- use_batch_norm_c: Boolean that indicates whether to apply batch - normalization at the cell -> output contribution. - max_unique_stats: The maximum number of steps to use unique batch norm - statistics for. (See module description above for more details.) - name: name of the module. - - Raises: - KeyError: if `initializers` contains any keys not in POSSIBLE_KEYS. - ValueError: if a peephole initializer is passed in the initializer list, - but `use_peepholes` is False. - ValueError: if a batch norm initializer is passed in the initializer list, - but batch norm is disabled. - ValueError: if `max_unique_stats` is not the default value, but batch norm - is disabled. - ValueError: if `max_unique_stats` is < 1. - """ - super(LSTM, self).__init__(name=name) - - self._hidden_size = hidden_size - self._forget_bias = forget_bias - self._use_peepholes = use_peepholes - self._max_unique_stats = max_unique_stats - self._use_batch_norm_h = use_batch_norm_h - self._use_batch_norm_x = use_batch_norm_x - self._use_batch_norm_c = use_batch_norm_c - self.possible_keys = self.get_possible_initializer_keys( - use_peepholes=use_peepholes, use_batch_norm_h=use_batch_norm_h, - use_batch_norm_x=use_batch_norm_x, use_batch_norm_c=use_batch_norm_c) - self._initializers = util.check_initializers(initializers, - self.possible_keys) - if max_unique_stats < 1: - raise ValueError("max_unique_stats must be >= 1") - if max_unique_stats != 1 and not ( - use_batch_norm_h or use_batch_norm_x or use_batch_norm_c): - raise ValueError("max_unique_stats specified but batch norm disabled") - - if use_batch_norm_h: - self._batch_norm_h = LSTM.IndexedStatsBatchNorm(max_unique_stats, - "batch_norm_h") - if use_batch_norm_x: - self._batch_norm_x = LSTM.IndexedStatsBatchNorm(max_unique_stats, - "batch_norm_x") - if use_batch_norm_c: - self._batch_norm_c = LSTM.IndexedStatsBatchNorm(max_unique_stats, - "batch_norm_c") - - def with_batch_norm_control(self, is_training=True, test_local_stats=True): - """Wraps this RNNCore with the additional control input to the `BatchNorm`s. - - Example usage: - - lstm = nnd.LSTM(4) - is_training = tf.placeholder(tf.bool) - rnn_input = ... - my_rnn = rnn.rnn(lstm.with_batch_norm_control(is_training), rnn_input) - - Args: - is_training: Boolean that indicates whether we are in - training mode or testing mode. When in training mode, the batch norm - statistics are taken from the given batch, and moving statistics are - updated. When in testing mode, the moving statistics are not updated, - and in addition if `test_local_stats` is False then the moving - statistics are used for the batch statistics. See the `BatchNorm` module - for more details. - test_local_stats: Boolean scalar indicated whether to use local - batch statistics in test mode. - - Returns: - RNNCell wrapping this class with the extra input(s) added. 
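The `with_batch_norm_control` wrapper described above is what threads the `is_training` flag through to the batch-norm modules when the core is driven by a standard RNN loop. A sketch of that wiring, under the assumption that the Sonnet `snt.LSTM` preserves the `use_batch_norm_h` option and `with_batch_norm_control` method of the class deleted here; the placeholder shapes are illustrative.

```python
import sonnet as snt
import tensorflow as tf

# Sketch: thread the training flag through to the LSTM's batch-norm modules.
# Assumes the Sonnet LSTM keeps the batch-norm options and the
# with_batch_norm_control wrapper of the deleted class.
is_training = tf.placeholder(tf.bool, shape=[])
sequence = tf.placeholder(tf.float32, shape=[None, 20, 8])  # [batch, time, features]

lstm = snt.LSTM(hidden_size=16, use_batch_norm_h=True)
cell = lstm.with_batch_norm_control(is_training=is_training)

# Sonnet cores are RNNCells, so a wrapped core can be driven by dynamic_rnn.
outputs, final_state = tf.nn.dynamic_rnn(cell, sequence, dtype=tf.float32)
```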
- """ - return LSTM.CellWithExtraInput(self, - is_training=is_training, - test_local_stats=test_local_stats) - - @classmethod - def get_possible_initializer_keys( - cls, use_peepholes=False, use_batch_norm_h=False, use_batch_norm_x=False, - use_batch_norm_c=False): - possible_keys = cls.POSSIBLE_KEYS.copy() - if not use_peepholes: - possible_keys.difference_update( - {cls.W_F_DIAG, cls.W_I_DIAG, cls.W_O_DIAG}) - if not use_batch_norm_h: - possible_keys.remove(cls.GAMMA_H) - if not use_batch_norm_x: - possible_keys.remove(cls.GAMMA_X) - if not use_batch_norm_c: - possible_keys.difference_update({cls.GAMMA_C, cls.BETA_C}) - return possible_keys - - def _build(self, inputs, prev_state, is_training=True, test_local_stats=True): - """Connects the LSTM module into the graph. - - If this is not the first time the module has been connected to the graph, - the Tensors provided as inputs and state must have the same final - dimension, in order for the existing variables to be the correct size for - their corresponding multiplications. The batch size may differ for each - connection. - - Args: - inputs: Tensor of size `[batch_size, input_size]`. - prev_state: Tuple (prev_hidden, prev_cell), or if batch norm is enabled - and `max_unique_stats > 1`, then (prev_hidden, prev_cell, time_step). - Here, prev_hidden and prev_cell are tensors of size - `[batch_size, hidden_size]`, and time_step is used to indicate the - current RNN step. - is_training: Boolean indicating whether we are in training mode (as - opposed to testing mode), passed to the batch norm - modules. Note to use this you must wrap the cell via the - `with_batch_norm_control` function. - test_local_stats: Boolean indicating whether to use local batch statistics - in test mode. See the `BatchNorm` documentation for more on this. - - Returns: - A tuple (output, next_state) where 'output' is a Tensor of size - `[batch_size, hidden_size]` and 'next_state' is a tuple - (next_hidden, next_cell) or (next_hidden, next_cell, time_step + 1), - where next_hidden and next_cell have size `[batch_size, hidden_size]`. - - Raises: - ValueError: If connecting the module into the graph any time after the - first time, and the inferred size of the inputs does not match previous - invocations. - """ - if self._max_unique_stats == 1: - prev_hidden, prev_cell = prev_state - time_step = None - else: - prev_hidden, prev_cell, time_step = prev_state - - self._create_gate_variables(inputs.get_shape(), inputs.dtype) - self._create_batch_norm_variables(inputs.dtype) - - # pylint false positive: calling module of same file; see b/29989864 - # pylint: disable=not-callable - - if self._use_batch_norm_h or self._use_batch_norm_x: - gates_h = tf.matmul(prev_hidden, self._w_h) - gates_x = tf.matmul(inputs, self._w_x) - if self._use_batch_norm_h: - gates_h = self._gamma_h * self._batch_norm_h(gates_h, - time_step, - is_training, - test_local_stats) - if self._use_batch_norm_x: - gates_x = self._gamma_x * self._batch_norm_x(gates_x, - time_step, - is_training, - test_local_stats) - gates = gates_h + gates_x + self._b - else: - # Parameters of gates are concatenated into one multiply for efficiency. 
- inputs_and_hidden = tf.concat([inputs, prev_hidden], 1) - gates = tf.matmul(inputs_and_hidden, self._w_xh) + self._b - - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - i, j, f, o = array_ops.split(gates, 4, 1) - - if self._use_peepholes: # diagonal connections - self._create_peephole_variables(inputs.dtype) - f += self._w_f_diag * prev_cell - i += self._w_i_diag * prev_cell - - forget_mask = tf.sigmoid(f + self._forget_bias) - new_cell = forget_mask * prev_cell + tf.sigmoid(i) * tf.tanh(j) - cell_output = new_cell - if self._use_batch_norm_c: - cell_output = (self._beta_c - + self._gamma_c * self._batch_norm_c(cell_output, - time_step, - is_training, - test_local_stats)) - if self._use_peepholes: - cell_output += self._w_o_diag * cell_output - new_hidden = tf.tanh(cell_output) * tf.sigmoid(o) - - if self._max_unique_stats == 1: - return new_hidden, (new_hidden, new_cell) - else: - return new_hidden, (new_hidden, new_cell, time_step + 1) - - def _create_batch_norm_variables(self, dtype): - """Initialize the variables used for the `BatchNorm`s (if any).""" - # The paper recommends a value of 0.1 for good gradient flow through the - # tanh nonlinearity (although doesn't say whether this is for all gammas, - # or just some). - gamma_initializer = tf.constant_initializer(0.1) - - if self._use_batch_norm_h: - self._gamma_h = tf.get_variable( - LSTM.GAMMA_H, - shape=[4 * self._hidden_size], - dtype=dtype, - initializer=(self._initializers.get(LSTM.GAMMA_H, gamma_initializer))) - if self._use_batch_norm_x: - self._gamma_x = tf.get_variable( - LSTM.GAMMA_X, - shape=[4 * self._hidden_size], - dtype=dtype, - initializer=(self._initializers.get(LSTM.GAMMA_X, gamma_initializer))) - if self._use_batch_norm_c: - self._gamma_c = tf.get_variable( - LSTM.GAMMA_C, - shape=[self._hidden_size], - dtype=dtype, - initializer=(self._initializers.get(LSTM.GAMMA_C, gamma_initializer))) - self._beta_c = tf.get_variable( - LSTM.BETA_C, - shape=[self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.BETA_C)) - - def _create_gate_variables(self, input_shape, dtype): - """Initialize the variables used for the gates.""" - if len(input_shape) != 2: - raise ValueError( - "Rank of shape must be {} not: {}".format(2, len(input_shape))) - input_size = input_shape.dims[1].value - - b_shape = [4 * self._hidden_size] - - equiv_input_size = self._hidden_size + input_size - initializer = basic.create_linear_initializer(equiv_input_size) - - if self._use_batch_norm_h or self._use_batch_norm_x: - self._w_h = tf.get_variable( - LSTM.W_GATES + "_H", - shape=[self._hidden_size, 4 * self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_GATES, initializer)) - self._w_x = tf.get_variable( - LSTM.W_GATES + "_X", - shape=[input_size, 4 * self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_GATES, initializer)) - else: - self._w_xh = tf.get_variable( - LSTM.W_GATES, - shape=[self._hidden_size + input_size, 4 * self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_GATES, initializer)) - self._b = tf.get_variable( - LSTM.B_GATES, - shape=b_shape, - dtype=dtype, - initializer=self._initializers.get(LSTM.B_GATES, initializer)) - - def _create_peephole_variables(self, dtype): - """Initialize the variables used for the peephole connections.""" - self._w_f_diag = tf.get_variable( - LSTM.W_F_DIAG, - shape=[self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_F_DIAG)) - self._w_i_diag = tf.get_variable( - 
LSTM.W_I_DIAG, - shape=[self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_I_DIAG)) - self._w_o_diag = tf.get_variable( - LSTM.W_O_DIAG, - shape=[self._hidden_size], - dtype=dtype, - initializer=self._initializers.get(LSTM.W_O_DIAG)) - - def initial_state(self, batch_size, dtype=tf.float32, trainable=False, - trainable_initializers=None): - """Builds the default start state tensor of zeros. - - Args: - batch_size: An int, float or scalar Tensor representing the batch size. - dtype: The data type to use for the state. - trainable: Boolean that indicates whether to learn the initial state. - trainable_initializers: An optional pair of initializers for the - initial hidden state and cell state. - - Returns: - A tensor tuple `([batch_size x state_size], [batch_size x state_size], ?)` - filled with zeros, with the third entry present when batch norm is enabled - with `max_unique_stats > 1', with value `0` (representing the time step). - """ - if self._max_unique_stats == 1: - return super(LSTM, self).initial_state( - batch_size, dtype, trainable, trainable_initializers) - else: - if not trainable: - state = super(rnn_core.RNNCore, self).zero_state(batch_size, dtype) - else: - # We have to manually create the state ourselves so we don't create a - # variable that never gets used for the third entry. - state = rnn_core.trainable_initial_state( - batch_size, - (tf.TensorShape([self._hidden_size]), - tf.TensorShape([self._hidden_size])), - dtype, - trainable_initializers) - return (state[0], state[1], tf.constant(0, dtype=tf.int32)) - - @property - def state_size(self): - """Tuple of `tf.TensorShape`s indicating the size of state tensors.""" - if self._max_unique_stats == 1: - return (tf.TensorShape([self._hidden_size]), - tf.TensorShape([self._hidden_size])) - else: - return (tf.TensorShape([self._hidden_size]), - tf.TensorShape([self._hidden_size]), - tf.TensorShape(1)) - - @property - def output_size(self): - """`tf.TensorShape` indicating the size of the core output.""" - return tf.TensorShape([self._hidden_size]) - - @property - def use_peepholes(self): - """Boolean indicating whether peephole connections are used.""" - return self._use_peepholes - - @property - def use_batch_norm_h(self): - """Boolean indicating whether batch norm for hidden -> gates is enabled.""" - return self._use_batch_norm_h - - @property - def use_batch_norm_x(self): - """Boolean indicating whether batch norm for input -> gates is enabled.""" - return self._use_batch_norm_x - - @property - def use_batch_norm_c(self): - """Boolean indicating whether batch norm for cell -> output is enabled.""" - return self._use_batch_norm_c - - class IndexedStatsBatchNorm(base.AbstractModule): - """BatchNorm module where batch statistics are selected by an input index. - - This is used by LSTM+batchnorm, where we have distinct batch norm statistics - for the first `max_unique_stats` time steps, and then use the final set of - statistics for subsequent time steps. - - The module has as input (x, index, is_training, test_local_stats). During - training or when test_local_stats=True, the output is simply batchnorm(x) - (where mean(x) and stddev(x) are used), and during training the - `BatchNorm` module accumulates statistics in mean_i, etc, where - i = min(index, max_unique_stats - 1). - - During testing with test_local_stats=False, the output is batchnorm(x), - where mean_i and stddev_i are used instead of mean(x) and stddev(x). - - See the `BatchNorm` module for more on is_training and test_local_stats. 
- - No offset `beta` or scaling `gamma` are learnt. - """ - - def __init__(self, max_unique_stats, name=None): - """Create an IndexedStatsBatchNorm. - - Args: - max_unique_stats: number of different indices to have statistics for; - indices beyond this will use the final statistics. - name: Name of the module. - """ - super(LSTM.IndexedStatsBatchNorm, self).__init__(name=name) - self._max_unique_stats = max_unique_stats - - def _build(self, inputs, index, is_training, test_local_stats): - """Add the IndexedStatsBatchNorm module to the graph. - - Args: - inputs: Tensor to apply batch norm to. - index: Scalar TensorFlow int32 value to select the batch norm index. - is_training: Boolean to indicate to `nn.BatchNorm` if we are - currently training. - test_local_stats: Boolean to indicate to `nn.BatchNorm` if batch - normalization should use local batch statistics at test time. - - Returns: - Output of batch norm operation. - """ - def create_batch_norm(): - return batch_norm.BatchNorm(offset=False, scale=False)( - inputs, is_training, test_local_stats) - - if self._max_unique_stats > 1: - pred_fn_pairs = [(tf.equal(i, index), create_batch_norm) - for i in xrange(self._max_unique_stats - 1)] - out = tf.case(pred_fn_pairs, create_batch_norm) - out.set_shape(inputs.get_shape()) # needed for tf.case shape inference - return out - else: - return create_batch_norm() - - class CellWithExtraInput(rnn.RNNCell): - """Wraps an RNNCell to create a new RNNCell with extra input appended. - - This will pass the additional input `args` and `kwargs` to the __call__ - function of the RNNCell after the input and prev_state inputs. - """ - - def __init__(self, cell, *args, **kwargs): - """Construct the CellWithExtraInput. - - Args: - cell: The RNNCell to wrap (typically a nn.RNNCore). - *args: Extra arguments to pass to __call__. - **kwargs: Extra keyword arguments to pass to __call__. - """ - self._cell = cell - self._args = args - self._kwargs = kwargs - - def __call__(self, inputs, state): - return self._cell(inputs, state, *self._args, **self._kwargs) - - @property - def state_size(self): - """Tuple indicating the size of nested state tensors.""" - return self._cell.state_size - - @property - def output_size(self): - """`tf.TensorShape` indicating the size of the core output.""" - return self._cell.output_size diff --git a/nn/mlp.py b/nn/mlp.py deleted file mode 100644 index bfbb3c2..0000000 --- a/nn/mlp.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A minimal interface mlp module.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -from six.moves import xrange -import tensorflow as tf - -from nn import base -from nn import basic -from nn import util - - -class MLP(base.AbstractModule, base.Transposable): - """A Multi-Layer perceptron module.""" - - def __init__(self, - output_sizes, - activation=tf.nn.relu, - activate_final=False, - initializers=None, - use_bias=True, - name="mlp"): - """Constructs an MLP module. - - Args: - output_sizes: An iterable of output dimensionalities as defined in - `basic.Linear`. Output size can be defined either as number or via a - callable. In the latter case, since the function invocation is deferred - to graph construction time, the user must only ensure that entries can - be called when build is called. Each entry in the iterable defines - properties in the corresponding linear layer. - activation: An activation op. The activation is applied to intermediate - layers, and optionally to the output of the final layer. - activate_final: Boolean determining if the activation is applied to - the output of the final layer. Default `False`. - initializers: Optional dict containing ops to initialize the linear - layers' weights (with key 'w') or biases (with key 'b'). - use_bias: Whether to include bias parameters in the linear layers. - Default `True`. - name: Name of the module. - - Raises: - Error: If initializers contains any keys other than 'w' or 'b'. - ValueError: If output_sizes is empty. - TypeError: If `activation` is not callable; or if `output_sizes` is not - iterable. - """ - super(MLP, self).__init__(name=name) - - if not isinstance(output_sizes, collections.Iterable): - raise TypeError("output_sizes must be iterable") - output_sizes = tuple(output_sizes) - if not output_sizes: - raise ValueError("output_sizes must not be empty") - self._output_sizes = output_sizes - self._num_layers = len(self._output_sizes) - self._input_shape = None - - self.possible_keys = self.get_possible_initializer_keys(use_bias=use_bias) - self._initializers = util.check_initializers(initializers, - self.possible_keys) - if not callable(activation): - raise TypeError("Input 'activation' must be callable") - self._activation = activation - self._activate_final = activate_final - - self._use_bias = use_bias - self._instantiate_layers() - - def _instantiate_layers(self): - """Instantiates all the linear modules used in the network. - - Layers are instantiated in the constructor, as opposed to the build - function, because MLP implements the Transposable interface, and the - transpose function can be called before the module is actually connected - to the graph and build is called. - - Notice that this is safe since layers in the transposed module are - instantiated using a lambda returning input_size of the mlp layers, and - this doesn't have to return sensible values until the original module is - connected to the graph. 
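For readers tracking the migration, the Sonnet replacement is built and connected the same way as the module removed here, including the deferred transpose; a minimal sketch, assuming the Sonnet 1.x `snt.nets.MLP` API and example layer sizes:

import sonnet as snt
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 784])
encoder = snt.nets.MLP([128, 32], activation=tf.nn.relu)
code = encoder(images)         # linear layers are created on first connection
decoder = encoder.transpose()  # output sizes mirror the encoder's input sizes
reconstruction = decoder(code)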
- """ - - with tf.variable_scope(self._template.variable_scope): - self._layers = [basic.Linear(self._output_sizes[i], - name="linear_{}".format(i), - initializers=self._initializers, - use_bias=self.use_bias) - for i in xrange(self._num_layers)] - - @classmethod - def get_possible_initializer_keys(cls, use_bias=True): - return basic.Linear.get_possible_initializer_keys(use_bias=use_bias) - - def _build(self, inputs): - """Assembles the `MLP` and connects it to the graph. - - Args: - inputs: A 2D Tensor of size `[batch_size, input_size]`. - - Returns: - A 2D Tensor of size `[batch_size, output_sizes[-1]]`. - """ - self._input_shape = tuple(inputs.get_shape().as_list()) - net = inputs - - final_index = self._num_layers - 1 - for layer_id in xrange(self._num_layers): - net = self._layers[layer_id](net) - - if final_index != layer_id or self._activate_final: - net = self._activation(net) - - return net - - @property - def layers(self): - """Returns a tuple containing the linear layers of the `MLP`.""" - return self._layers - - @property - def output_sizes(self): - return tuple([l() if callable(l) else l for l in self._output_sizes]) - - @property - def use_bias(self): - return self._use_bias - - @property - def activate_final(self): - return self._activate_final - - # Implements Transposable interface - @property - def input_shape(self): - """Returns shape of input `Tensor` passed at last call to `build`.""" - self._ensure_is_connected() - return self._input_shape - - # Implements Transposable interface - def transpose(self, name=None): - """Returns transposed `MLP`. - - Args: - name: Optional string specifiying the name of the transposed module. The - default name is constructed by appending "_transpose" to `self.name`. - - Returns: - Matching transposed `MLP` module. - """ - if name is None: - name = self.name + "_transpose" - output_sizes = [lambda l=layer: l.input_shape[1] for layer in self._layers] - output_sizes.reverse() - return MLP(name=name, - output_sizes=output_sizes, - activation=self._activation, - activate_final=self._activate_final, - initializers=self._initializers, - use_bias=self._use_bias) diff --git a/nn/rnn_core.py b/nn/rnn_core.py deleted file mode 100644 index bc2c6e2..0000000 --- a/nn/rnn_core.py +++ /dev/null @@ -1,270 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Base class for TensorFlow nn recurrent cores. - -This file contains the Abstract Base Class for defining Recurrent Cores in -TensorFlow. A Recurrent Core is an object which holds the properties of other -`nn.Module`s and also satisfies the interface of any RNNCell in tensorflow. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import abc -import warnings - - -import six -from six.moves import xrange -import tensorflow as tf - -from tensorflow.python.framework import tensor_shape -from tensorflow.contrib import rnn -from tensorflow.python.util import nest - -from nn import base - - -def _single_learnable_state(state, state_id=0, learnable=True): - """Returns an initial (maybe learnable) state. - - This function does not create any variable scopes, and it should be called - from a nn module. This function also makes sure that all the rows of its - `state` argument have the same value. - - Args: - state: initial value of the initial state. It should be a tensor of at least - two dimensions, of which the first dimension corresponds to the - batch_size dimension. All rows of such tensor should have the same value. - state_id: integer that uniquely identifies this state. - learnable: boolean that indicates whether the state is learnable. - - Returns: - The initial learnable state `Tensor`. - """ - unpacked_state = tf.unpack(state) - # Assert that all rows have the same values. - assert_rows_equal = [tf.assert_equal(s, unpacked_state[0]) - for s in unpacked_state] - - # We wish to have all the graph assertions in the graph's critical path, - # so we include them even if the initial state is left unmodified (i.e. when - # the state is not learnable). - # Note: All these assertions will be run every time that data flows - # through the graph. At that point, the control_dependencies context manager - # makes sure that such assertions are run, and will raise an exception if any - # fails. - with tf.control_dependencies(assert_rows_equal): - if not learnable: - return state - else: - state_shape = state.get_shape() - state_shape.assert_is_fully_defined() - state_shape_list = state_shape.as_list() - batch_size, trailing_shape = state_shape_list[0], state_shape_list[1:] - - initial_value = tf.reshape(unpacked_state[0], [1] + trailing_shape) - initial_state_variable = tf.get_variable( - "initial_state_%d" % state_id, dtype=initial_value.dtype, - initializer=initial_value) - - trailing_size_repeat = [1] * len(trailing_shape) - return tf.tile(initial_state_variable, - tf.constant([batch_size] + trailing_size_repeat)) - - -def trainable_initial_state(batch_size, state_size, dtype, initializers=None): - """Creates an initial state consisting of trainable variables. - - The trainable variables are created with the same shapes as the elements of - `state_size` and are tiled to produce an initial state. - - Args: - batch_size: An int, or scalar int32 Tensor representing the batch size. - state_size: A `TensorShape` or nested tuple of `TensorShape`s to use for the - shape of the trainable variables. - dtype: The data type used to create the variables and thus initial state. - initializers: An optional container of the same structure as `state_size` - containing initializers for the variables. - - Returns: - A `Tensor` or nested tuple of `Tensor`s with the same size and structure - as `state_size`, where each `Tensor` is a tiled trainable `Variable`. - - Raises: - ValueError: if the user passes initializers that are not functions. 
- """ - flat_state_size = nest.flatten(state_size) - - if not initializers: - flat_initializer = tuple(tf.zeros_initializer for _ in flat_state_size) - else: - nest.assert_same_structure(initializers, state_size) - flat_initializer = nest.flatten(initializers) - if not all([callable(init) for init in flat_initializer]): - raise ValueError("Not all the passed initializers are callable objects.") - - # Produce names for the variables. In the case of a tuple or nested tuple, - # this is just a sequence of numbers, but for a flat `namedtuple`, we use - # the field names. NOTE: this could be extended to nested `namedtuple`s, - # but for now that's extra complexity that's not used anywhere. - try: - names = ["init_{}".format(state_size._fields[i]) - for i in xrange(len(flat_state_size))] - except (AttributeError, IndexError): - names = ["init_state_{}".format(i) for i in xrange(len(flat_state_size))] - - flat_initial_state = [] - - for name, size, init in zip(names, flat_state_size, flat_initializer): - shape_with_batch_dim = [1] + tensor_shape.as_shape(size).as_list() - initial_state_variable = tf.get_variable( - name, shape=shape_with_batch_dim, dtype=dtype, initializer=init) - - initial_state_variable_dims = initial_state_variable.get_shape().ndims - tile_dims = [batch_size] + [1] * (initial_state_variable_dims - 1) - flat_initial_state.append( - tf.tile(initial_state_variable, tile_dims, name=(name + "_tiled"))) - - return nest.pack_sequence_as(structure=state_size, - flat_sequence=flat_initial_state) - - -@six.add_metaclass(abc.ABCMeta) -class RNNCore(base.AbstractModule, rnn.RNNCell): - """Superclass for Recurrent Neural Network Cores. - - This class defines the basic functionality that every core should implement, - mainly the `initial_state` method which will return an example of their - initial state. - It also inherits from the two interfaces it should be compatible with, which - are `nn.Module` and `rnn_cell.RNNCell`. - - As with any other `nn.Module` any subclass must implement a `_build` method - that constructs the graph that corresponds to a core. Such a build method - should always have the same interface, which is the following: - - output, new_state = self._build(input, prev_state) - - where output, new_state, input, and prev_state are arbitrarily nested - tensors. Such structures can be defined according to the following - grammar: - - element = tuple(element*) | list(element*) | tf.Tensor - - This class is to be used with tensorflow containers such as `rnn` in - tensorflow.python.ops.rnn. These containers only accept `rnn_cell.RNNCell` - objects, hence the need to comply with its interface. This way, all the - RNNCores should expose a `state_size` and `output_size` properties. - """ - __metaclass__ = abc.ABCMeta - - def initial_state(self, batch_size, dtype=tf.float32, trainable=False, - trainable_initializers=None): - """Builds the default start state for an RNNCore. - - Args: - batch_size: An int, or scalar int32 Tensor representing the batch size. - dtype: The data type to use for the state. - trainable: Boolean that indicates whether to learn the initial state. - trainable_initializers: An initializer function or nested structure of - functions with same structure as the `state_size` property of the - core, to be used as initializers of the initial state variable. - - Returns: - A tensor or nested tuple of tensors with same structure and shape as the - `state_size` property of the core. - - Raises: - ValueError: if the user passes initializers that are not functions. 
- """ - if not trainable: - return super(RNNCore, self).zero_state(batch_size, dtype) - else: - return trainable_initial_state( - batch_size, self.state_size, dtype, trainable_initializers) - - -class TrainableInitialState(base.AbstractModule): - """Helper Module that creates a learnable initial state for an RNNCore. - - This class receives an example (possibly nested) initial state of an RNNCore, - and returns a state that has the same shape, structure, and values, but is - trainable. Additionally, the user may specify a boolean mask that - indicates which parts of the initial state should be trainable. - - This allows users to train an unrolled RNNCore with a learnable initial state - in the following way: - - core = ... # Any RNNCore module object. - initial_state = core.initial_state(batch_size, dtype) - trainable_initial_state = nn.TrainableInitialState(initial_state)() - output, final_state = tf.nn.dynamic_rnn( - core, input_sequence, initial_state=trainable_initial_state) - """ - - def __init__(self, initial_state, mask=None, name="trainable_initial_state"): - """Constructs the Module that introduces a trainable state in the graph. - - It receives an initial state that will be used as the intial values for the - trainable variables that the module contains, and optionally a mask that - indicates the parts of the initial state that should be learnable. - - Args: - initial_state: tensor or arbitrarily nested iterables of tensors. - mask: optional boolean mask. It should have the same nested structure as - the given initial_state. - name: module name. - - Raises: - TypeError: if mask is not a list of booleans or None. - """ - super(TrainableInitialState, self).__init__(name=name) - - # Since python 2.7, DeprecationWarning is ignored by default. - # Turn on the warning: - warnings.simplefilter("always", DeprecationWarning) - warnings.warn("Use the trainable flag in initial_state instead.", - DeprecationWarning, stacklevel=2) - - if mask is not None: - flat_mask = nest.flatten(mask) - if not all([isinstance(m, bool) for m in flat_mask]): - raise TypeError("Mask should be None or a list of boolean values.") - nest.assert_same_structure(initial_state, mask) - - self._mask = mask - self._initial_state = initial_state - - def _build(self): - """Connects the module to the graph. - - Returns: - The learnable state, which has the same type, structure and shape as - the `initial_state` passed to the constructor. - """ - flat_initial_state = nest.flatten(self._initial_state) - if self._mask is not None: - flat_mask = nest.flatten(self._mask) - flat_learnable_state = [ - _single_learnable_state(state, state_id=i, learnable=mask) - for i, (state, mask) in enumerate(zip(flat_initial_state, flat_mask))] - else: - flat_learnable_state = [_single_learnable_state(state, state_id=i) - for i, state in enumerate(flat_initial_state)] - - return nest.pack_sequence_as(structure=self._initial_state, - flat_sequence=flat_learnable_state) - diff --git a/nn/sequential.py b/nn/sequential.py deleted file mode 100644 index b30cd3a..0000000 --- a/nn/sequential.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Sequential Module for TensorFlow nn. - -A Module that wraps a list of other modules and ops, connecting the output of -each to the input of the next. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from nn import base - - -class Sequential(base.AbstractModule): - """Builds a module out of a sequence of callables.""" - - def __init__(self, layers, name="sequential"): - """Constructs a Sequential module. - - This feeds the output of each layer into the next and returns the output - of the final layer. - - If a layer returns a tuple, it is assumed that this must be unpacked into - the argument list of the next layer. If it is not a tuple, it is simply - passed through to the next layer unchanged. - - Args: - layers: Iterable of callables to stack together, which can be modules - or ops. - name: Name of the module. - - Raises: - TypeError: If `layers` is None or contains any non-callable items. - """ - super(Sequential, self).__init__(name=name) - - # Store a copy of the iterable in a tuple to ensure users cannot modify the - # iterable later, and protect against iterables which can only be read once. - self._layers = tuple(layers) - - is_not_callable = [(i, mod) for i, mod in enumerate(self._layers) - if not callable(mod)] - - if is_not_callable: - raise TypeError("Items {} not callable with types: {}".format( - ", ".join(str(i) for i, _ in is_not_callable), - ", ".join(type(layer).__name__ for _, layer in is_not_callable))) - - def _build(self, *args): - """Connects the Sequential module into the graph. - - Args: - *args: A tuple of inputs, to be unpacked as the arguments to the first - layer. - - Returns: - The output value of the last layer. - """ - net = args - - for layer in self._layers: - if isinstance(net, tuple): - net = layer(*net) - else: - net = layer(net) - - return net - - @property - def layers(self): - return self._layers diff --git a/nn/util.py b/nn/util.py deleted file mode 100644 index ae59e65..0000000 --- a/nn/util.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2016 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
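The call-chaining behaviour of the `Sequential` module removed above is what `problems.py` now gets from Sonnet; a minimal sketch, assuming the Sonnet 1.x API and example layer sizes:

import sonnet as snt
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 28, 28, 1])
network = snt.Sequential([snt.BatchFlatten(),
                          snt.nets.MLP([256, 10], activation=tf.nn.relu)])
logits = network(images)  # each callable's output feeds the next one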
-# ============================================================================== - -"""Utility functions for dealing with nn Modules.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re - -import tensorflow as tf - - -def get_variables_in_scope(scope, collection=tf.GraphKeys.TRAINABLE_VARIABLES): - """Returns a tuple `tf.Variable`s in a scope for a given collection. - - Args: - scope: `tf.VariableScope` instance to retrieve variables from. - collection: Collection to restrict query to. By default this is - `tf.Graphkeys.TRAINABLE_VARIABLES`, which doesn't include non-trainable - variables such as moving averages. - - Returns: - A tuple of `tf.Variable` objects. - """ - # Escape the name in case it contains any "." characters. Add a closing slash - # so we will not search any scopes that have this scope name as a prefix. - scope_name = re.escape(scope.name) + "/" - - return tuple(tf.get_collection(collection, scope_name)) - - -def get_variables_in_module(module, - collection=tf.GraphKeys.TRAINABLE_VARIABLES): - """Returns tuple of `tf.Variable`s declared inside an `nn.Module`. - - Note that this operates by searching the variable scope a module contains, - and so does not know about any modules which were constructed elsewhere but - used inside this module. - - Args: - module: `nn.Module` instance to query the scope of. - collection: Collection to restrict query to. By default this is - `tf.Graphkeys.TRAINABLE_VARIABLES`, which doesn't include non-trainable - variables such as moving averages. - - Returns: - A tuple of `tf.Variable` objects. - - Raises: - NotConnectedError: If the module is not connected to the Graph. - """ - return get_variables_in_scope(module.variable_scope, collection=collection) - - -def check_initializers(initializers, keys): - """Checks the given initializers. - - This checks that `initializers` is a dictionary that only contains keys in - `keys`, and furthermore the entries in `initializers` are functions or - further dictionaries (the latter used, for example, in passing initializers - to modules inside modules) which must satisfy the same constraints. - - Args: - initializers: Dictionary of initializers (allowing nested dictionaries) or - None. - keys: Iterable of valid keys for `initializers`. - - Returns: - Copy of checked dictionary of initializers. - - Raises: - KeyError: If an initializer is provided for a key not in `keys`. - TypeError: If a provided initializer is not a callable function, or if the - dict of initializers is not in fact a dict. - """ - if initializers is None: - return {} - - keys = set(keys) - - # If the user is creating modules that nests other modules, then it is - # possible that they might not nest the initializer dictionaries correctly. If - # that is the case, then we might find that initializers is not a dict here. - # We raise a helpful exception in this case. - if not issubclass(type(initializers), dict): - raise TypeError("A dict of initializers was expected, but not " - "given. 
You should double-check that you've nested the " - "initializers for any sub-modules correctly.") - - if not set(initializers) <= keys: - extra_keys = set(initializers) - keys - raise KeyError( - "Invalid initializer keys {}, initializers can only " - "be provided for {}".format( - ", ".join("'{}'".format(key) for key in extra_keys), - ", ".join("'{}'".format(key) for key in keys))) - - def check_nested_callables(dictionary): - for key, entry in dictionary.items(): - if isinstance(entry, dict): - check_nested_callables(entry) - elif not callable(entry): - raise TypeError( - "Initializer for '{}' is not a callable function or dictionary" - .format(key)) - - check_nested_callables(initializers) - - return dict(initializers) - - -def check_partitioners(partitioners, keys): - """Checks the given partitioners. - - This checks that `partitioners` is a dictionary that only contains keys in - `keys`, and furthermore the entries in `partitioners` are functions or - further dictionaries (the latter used, for example, in passing partitioners - to modules inside modules) which must satisfy the same constraints. - - Args: - partitioners: Dictionary of partitioners (allowing nested dictionaries) or - None. - keys: Iterable of valid keys for `partitioners`. - - Returns: - Checked dictionary of partitioners. - - Raises: - KeyError: If an partitioner is provided for a key not in `keys`. - TypeError: If a provided partitioner is not a callable function. - """ - if partitioners is None: - return {} - - keys = set(keys) - - if not set(partitioners) <= keys: - extra_keys = set(partitioners) - keys - raise KeyError( - "Invalid partitioner keys {}, partitioners can only " - "be provided for {}".format( - ", ".join("'{}'".format(key) for key in extra_keys), - ", ".join("'{}'".format(key) for key in keys))) - - def check_nested_callables(dictionary): - for key, entry in dictionary.items(): - if isinstance(entry, dict): - check_nested_callables(entry) - elif not callable(entry): - raise TypeError( - "Partitioner for '{}' is not a callable function or dictionary" - .format(key)) - - check_nested_callables(partitioners) - - return partitioners diff --git a/problems.py b/problems.py index f9bca57..553556f 100644 --- a/problems.py +++ b/problems.py @@ -24,12 +24,11 @@ from six.moves import urllib from six.moves import xrange # pylint: disable=redefined-builtin +import sonnet as snt import tensorflow as tf from tensorflow.contrib.learn.python.learn.datasets import mnist as mnist_dataset -import nn - _nn_initializers = { "w": tf.random_normal_initializer(mean=0, stddev=0.01), @@ -160,10 +159,10 @@ def mnist(layers, # pylint: disable=invalid-name labels = tf.constant(data.labels, dtype=tf.int64, name="MNIST_labels") # Network. 
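The hunk that follows swaps the bundled `nn` network for its Sonnet counterpart; a standalone sketch of the resulting MNIST network construction (Sonnet 1.x API, example layer sizes and activation):

import sonnet as snt
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 28, 28, 1])
mlp = snt.nets.MLP([20, 10], activation=tf.nn.sigmoid)
network = snt.Sequential([snt.BatchFlatten(), mlp])
logits = network(images)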
- mlp = nn.MLP(list(layers) + [10], - activation=activation_op, - initializers=_nn_initializers) - network = nn.Sequential([nn.BatchFlatten(), mlp]) + mlp = snt.nets.MLP(list(layers) + [10], + activation=activation_op, + initializers=_nn_initializers) + network = snt.Sequential([snt.BatchFlatten(), mlp]) def build(): indices = tf.random_uniform([batch_size], 0, data.num_examples, tf.int64) @@ -249,24 +248,24 @@ def _conv_activation(x): # pylint: disable=invalid-name strides=[1, 2, 2, 1], padding="SAME") - conv = nn.ConvNet2D(output_channels=conv_channels, - kernel_shapes=[5], - strides=[1], - paddings=[nn.SAME], - activation=_conv_activation, - activate_final=True, - initializers=_nn_initializers, - use_batch_norm=batch_norm) + conv = snt.nets.ConvNet2D(output_channels=conv_channels, + kernel_shapes=[5], + strides=[1], + paddings=[snt.SAME], + activation=_conv_activation, + activate_final=True, + initializers=_nn_initializers, + use_batch_norm=batch_norm) if batch_norm: - linear_activation = lambda x: tf.nn.relu(nn.BatchNorm()(x)) + linear_activation = lambda x: tf.nn.relu(snt.BatchNorm()(x)) else: linear_activation = tf.nn.relu - mlp = nn.MLP(list(linear_layers) + [10], - activation=linear_activation, - initializers=_nn_initializers) - network = nn.Sequential([conv, nn.BatchFlatten(), mlp]) + mlp = snt.nets.MLP(list(linear_layers) + [10], + activation=linear_activation, + initializers=_nn_initializers) + network = snt.Sequential([conv, snt.BatchFlatten(), mlp]) def build(): image_batch, label_batch = queue.dequeue_many(batch_size)
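The convolutional MNIST problem above is converted the same way; a standalone sketch of the Sonnet-based stack (Sonnet 1.x API assumed, example channel and layer sizes, batch norm omitted):

import sonnet as snt
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 28, 28, 1])
conv = snt.nets.ConvNet2D(output_channels=[16, 16],
                          kernel_shapes=[5],
                          strides=[1],
                          paddings=[snt.SAME],
                          activation=tf.nn.relu,
                          activate_final=True)
mlp = snt.nets.MLP([32, 10], activation=tf.nn.relu)
network = snt.Sequential([conv, snt.BatchFlatten(), mlp])
logits = network(images)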