diff --git a/.github/workflows/push_pr.yml b/.github/workflows/push_pr.yml
index 2c991e5dd..9b1595624 100644
--- a/.github/workflows/push_pr.yml
+++ b/.github/workflows/push_pr.yml
@@ -2,11 +2,9 @@ name: push and pull request testing
 on:
   push:
     branches:
-      - sockeye_2
       - master
   pull_request:
     branches:
-      - sockeye_2
       - master
 
 jobs:
diff --git a/.gitignore b/.gitignore
index 4069c48b0..99d7f0b7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,5 +18,3 @@
 .pytest_cache
 tags
 sockeye/__pycache__
-git_version.py
-
diff --git a/.travis.yml b/.travis.yml
index 3704e87e5..8d7989d31 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,6 @@ before_install:
   - docker pull ubuntu:16.04
 
 python:
-  - "3.4"
   - "3.5"
   - "3.6"
 
@@ -26,9 +25,7 @@ script:
   - mypy --version
   - mypy --ignore-missing-imports --follow-imports=silent @typechecked-files --no-strict-optional
   - check-manifest --ignore sockeye/git_version.py
-  - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:lstm:lstm" --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:transformer:transformer" --maxfail=1 test/system; fi
-  - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:cnn:cnn" --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then python -m pytest --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then python -m sockeye_contrib.autopilot.test; fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2d62c214e..20e8cac1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
 # Changelog
+
 All notable changes to the project are documented in this file.
 
 Version numbers are of the form `1.0.0`.
@@ -10,63 +11,120 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
-## [1.18.115]
-### Added
-- Added requirements for MXnet compatible with cuda 10.1.
+## [2.1.7]
 
-## [1.18.114]
-### Fixed
-- Fix bug in prepare_train_data arguments.
+### Changed
 
-## [1.18.113]
-### Fixed
-- Added logging arguments for prepare_data CLI.
+- Optimize prepare_data by saving the shards in parallel. The prepare_data script accepts a new parameter `--max-processes` to control the level of parallelism with which shards are written to disk.
+
+## [2.1.6]
+
+### Changed
+
+- Updated Dockerfiles optimized for CPU (intgemm int8 inference, full MKL support) and GPU (distributed training with Horovod). See [sockeye_contrib/docker](sockeye_contrib/docker).
 
-## [1.18.112]
 ### Added
-- Option to suppress creation of logfiles for CLIs (`--no-logfile`).
 
-## [1.18.111]
+- Official support for int8 quantization with [intgemm](https://github.com/kpu/intgemm):
+  - This requires the "intgemm" fork of MXNet ([kpuatamazon/incubator-mxnet/intgemm](https://github.com/kpuatamazon/incubator-mxnet/tree/intgemm)). This is the version of MXNet used in the Sockeye CPU docker image (see [sockeye_contrib/docker](sockeye_contrib/docker)).
+  - Use `sockeye.translate --dtype int8` to quantize a trained float32 model at runtime.
+  - Use the `sockeye.quantize` CLI to annotate a float32 model with int8 scaling factors for fast runtime quantization.
+
+## [2.1.5]
+
+### Changed
+
+- Changed state caching for transformer models during beam search to cache states with attention heads already separated out. This avoids repeated transpose operations during decoding, leading to faster inference.
+
+## [2.1.4]
+
 ### Added
-
+- Added Dockerfiles that build an experimental CPU-optimized Sockeye image:
+  - Uses the latest versions of [kpuatamazon/incubator-mxnet](https://github.com/kpuatamazon/incubator-mxnet) (supports [intgemm](https://github.com/kpu/intgemm) and makes full use of Intel MKL) and [kpuatamazon/sockeye](https://github.com/kpuatamazon/sockeye) (supports int8 quantization for inference).
+  - See [sockeye_contrib/docker](sockeye_contrib/docker).
+
+## [2.1.3]
 
 ### Changed
-- Excluded gradients from pickled fields of TrainState
 
-## [1.18.110]
+- Performance optimizations to beam search inference
+  - Remove unneeded take ops on encoder states
+  - Gather input data before sending it to the GPU, rather than sending each batch element individually
+  - All of beam search can be done in fp16, if specified by the model
+  - Other small miscellaneous optimizations
+- Model states are now a flat list in ensemble inference, structure of states provided by `state_structure()`
+
+## [2.1.2]
+
 ### Changed
-- We now guard against failures to run `nvidia-smi` for GPU memory monitoring.
 
-## [1.18.109]
-### Fixed
-- Fixed the metric names by prefixing training metrics with 'train-' and validation metrics with 'val-'. Also restricted the custom logging function to accept only a dictionary and a compulsory global_step parameter.
+- Updated to [MXNet 1.6.0](https://github.com/apache/incubator-mxnet/tree/1.6.0)
+
+### Added
+
+- Added support for CUDA 10.2
+
+### Removed
+
+- Removed support for CUDA<9.1 / CUDNN<7.5
+
+## [2.1.1]
+
+### Added
+- Ability to set environment variables from training/translate CLIs before MXNet is imported. For example, users can
+  configure MXNet as follows: `--env "OMP_NUM_THREADS=1;MXNET_ENGINE_TYPE=NaiveEngine"`
+
+## [2.1.0]
 
-## [1.18.108]
 ### Changed
-- More verbose log messages about target token counts.
 
-## [1.18.107]
+- Version bump, which should have been included in commit b0461b due to incompatible models.
+
+## [2.0.1]
+
 ### Changed
-- Updated to [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
 
-## [1.18.106]
-### Added
-- Added an optional time limit for stopping training. The training will stop at the next checkpoint after reaching the time limit.
+- Inference defaults to using the max input length observed in training (versus scaling down based on mean length ratio and standard deviations).
 
-## [1.18.105]
 ### Added
-- Added support for a possibility to have a custom metrics logger - a function passed as an extra parameter. If supplied, the logger is called during training.
 
-## [1.18.104]
+- Additional parameter fixing strategies:
+  - `all_except_feed_forward`: Only train feed forward layers.
+  - `encoder_and_source_embeddings`: Only train the decoder (decoder layers, output layer, and target embeddings).
+  - `encoder_half_and_source_embeddings`: Train the latter half of encoder layers and the decoder.
+- Option to specify the number of CPU threads without using an environment variable (`--omp-num-threads`).
+- More flexibility for combining source factors
+
+## [2.0.0]
+
 ### Changed
-- Implemented an attention-based copy mechanism as described in [Jia, Robin, and Percy Liang. "Data recombination for neural semantic parsing." (2016)](https://arxiv.org/abs/1606.03622).
-- Added a special symbol to explicitly point at an input token in the target sequence
-- Changed the decoder interface to pass both the decoder data and the pointer data.
-- Changed the AttentionState named tuple to add the raw attention scores.
+
+- Update to [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
+- Moved `SockeyeModel` implementation and all layers to [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html)
+- Removed support for Python 3.4.
+- Removed image captioning module
+- Removed outdated Autopilot module
+- Removed unused training options: Eve, Nadam, RMSProp, Nag, Adagrad, and Adadelta optimizers, `fixed-step` and `fixed-rate-inv-t` learning rate schedulers
+- Updated and renamed learning rate scheduler `fixed-rate-inv-sqrt-t` -> `inv-sqrt-decay`
+- Added script for plotting metrics files: [sockeye_contrib/plot_metrics.py](sockeye_contrib/plot_metrics.py)
+- Removed option `--weight-tying`. Weight tying is enabled by default, disable with `--weight-tying-type none`.
+
+### Added
+
+- Added distributed training support with Horovod/OpenMPI. Use `horovodrun` and the `--horovod` training flag.
+- Added Dockerfiles that build a Sockeye image with all features enabled. See [sockeye_contrib/docker](sockeye_contrib/docker).
+- Added `none` learning rate scheduler (use a fixed rate throughout training)
+- Added `linear-decay` learning rate scheduler
+- Added training option `--learning-rate-t-scale` for time-based decay schedulers
+- Added support for MXNet's [Automatic Mixed Precision](https://mxnet.incubator.apache.org/versions/master/tutorials/amp/amp_tutorial.html). Activate with the `--amp` training flag. For best results, make sure as many model dimensions as possible are multiples of 8.
+- Added options for making various model dimensions multiples of a given value. For example, use `--pad-vocab-to-multiple-of 8`, `--bucket-width 8 --no-bucket-scaling`, and `--round-batch-sizes-to-multiple-of 8` with AMP training.
+- Added [GluonNLP](http://gluon-nlp.mxnet.io/)'s BERTAdam optimizer, an implementation of the Adam variant used by Devlin et al. ([2018](https://arxiv.org/pdf/1810.04805.pdf)). Use `--optimizer bertadam`.
+- Added training option `--checkpoint-improvement-threshold` to set the amount of metric improvement required over the window of previous checkpoints to be considered actual model improvement (used with `--max-num-checkpoint-not-improved`).
 
 ## [1.18.103]
 ### Added
-- Added ability to score image-sentence pairs by extending the scoring feature originally implemented for machine
+- Added ability to score image-sentence pairs by extending the scoring feature originally implemented for machine translation to the image captioning module.
 
 ## [1.18.102]
@@ -95,7 +153,7 @@ Each version section may have subsections for: _Added_, _Changed_, _Removed
 
 ## [1.18.96]
 ### Changed
-- Extracted prepare vocab functionality in the build vocab step into its own function. This matches the pattern in prepare data and train where the main() function only has argparsing, and it invokes a separate function to do the work. This is to allow modules that import this one to circumvent the command line.
+- Extracted prepare vocab functionality in the build vocab step into its own function. This matches the pattern in prepare data and train where the main() function only has argparsing, and it invokes a separate function to do the work. This is to allow modules that import this one to circumvent the command line.
 ## [1.18.95]
 ### Changed
diff --git a/MANIFEST.in b/MANIFEST.in
index 5f8e3c773..f8ba0012b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,6 +8,7 @@ include .flake8
 include typechecked-files
 include test/data/config_with_missing_attributes.yaml
 include sockeye/git_version.py
+include *.bib
 recursive-include .github *
 include CONTRIBUTING.md
 exclude *.sh
@@ -21,8 +22,8 @@ recursive-include docs *.html
 recursive-include docs *.png
 recursive-include docs *.md
 recursive-include docs *.py
-recursive-include docs *.sh
 recursive-include docs *.yml
 recursive-include docs *.ico
 recursive-include docs *.css
 recursive-include test *.txt
+include docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh
diff --git a/README.md b/README.md
index 868b646c3..fbd42dd0d 100644
--- a/README.md
+++ b/README.md
@@ -6,29 +6,87 @@
 [![Build Status](https://travis-ci.org/awslabs/sockeye.svg?branch=master)](https://travis-ci.org/awslabs/sockeye)
 [![Documentation Status](https://readthedocs.org/projects/sockeye/badge/?version=latest)](http://sockeye.readthedocs.io/en/latest/?badge=latest)
 
-This package contains the Sockeye project, a sequence-to-sequence framework for Neural Machine Translation based on Apache MXNet (Incubating).
-It implements state-of-the-art encoder-decoder architectures, such as:
+This package contains the Sockeye project, an open-source sequence-to-sequence framework for Neural Machine Translation based on [Apache MXNet (Incubating)](http://mxnet.incubator.apache.org/). Sockeye powers several Machine Translation use cases, including [Amazon Translate](https://aws.amazon.com/translate/). The framework implements state-of-the-art machine translation models with Transformers ([Vaswani et al, 2017](https://arxiv.org/abs/1706.03762)). Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
 
-- Deep Recurrent Neural Networks with Attention [[Bahdanau, '14](https://arxiv.org/abs/1409.0473)]
-- Transformer Models with self-attention [[Vaswani et al, '17](https://arxiv.org/abs/1706.03762)]
-- Fully convolutional sequence-to-sequence models [[Gehring et al, '17](https://arxiv.org/abs/1705.03122)]
+If you have any questions or discover problems, please [file an issue](https://github.com/awslabs/sockeye/issues/new). You can also send questions to *sockeye-dev-at-amazon-dot-com*.
 
-In addition, it provides an experimental [image-to-description module](https://github.com/awslabs/sockeye/tree/master/sockeye/image_captioning) that can be used for image captioning.
-Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
+#### Version 2.0
 
-If you have any questions or discover problems, please [file an issue](https://github.com/awslabs/sockeye/issues/new).
-You can also send questions to *sockeye-dev-at-amazon-dot-com*.
+With version 2.0, we have updated the usage of MXNet by moving to the [Gluon API](https://mxnet.incubator.apache.org/api/python/docs/api/gluon/index.html) and adding support for several state-of-the-art features such as distributed training, low-precision training and decoding, as well as easier debugging of neural network architectures.
+In the context of this rewrite, we also trimmed down the large feature set of version 1.18.x to concentrate on the most important types of models and features, to provide a maintainable framework that is suitable for fast prototyping, research, and production.
+We welcome pull requests if you would like to help add back features as needed.
+
+## Installation
+
+The easiest way to run Sockeye is with [Docker](https://www.docker.com) or [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+To build a Sockeye image with all features enabled, run the build script:
+
+```bash
+python3 sockeye_contrib/docker/build.py
+```
+
+See the [Dockerfile documentation](sockeye_contrib/docker) for more information.
 
 ## Documentation
 
 For information on how to use Sockeye, please visit [our documentation](https://awslabs.github.io/sockeye/).
-Developers may be interested in our [developer guidelines](https://awslabs.github.io/sockeye/development.html).
+
+- For a quickstart guide to training a large data WMT model, see the [WMT 2018 German-English tutorial](https://awslabs.github.io/sockeye/tutorials/wmt_large.html).
+- Developers may be interested in our [developer guidelines](https://awslabs.github.io/sockeye/development.html).
 
 ## Citation
 
-For technical information about Sockeye, see our paper on the arXiv ([BibTeX](sockeye.bib)):
+For more information about Sockeye 2, see our paper ([BibTeX](sockeye2.bib)):
+
+> Felix Hieber, Tobias Domhan, Michael Denkowski, David Vilar. 2020.
+> [Sockeye 2: A Toolkit for Neural Machine Translation](https://www.amazon.science/publications/sockeye-2-a-toolkit-for-neural-machine-translation). To appear in EAMT 2020, project track.
+
+For technical information about Sockeye 1, see our paper on the arXiv ([BibTeX](sockeye.bib)):
 
 > Felix Hieber, Tobias Domhan, Michael Denkowski, David Vilar, Artem Sokolov, Ann Clifton and Matt Post. 2017.
 > [Sockeye: A Toolkit for Neural Machine Translation](https://arxiv.org/abs/1712.05690). ArXiv e-prints.
+
+## Research with Sockeye
+
+Sockeye has been used for both academic and industrial research. A list of known publications that use Sockeye is shown below.
+If you know more, please let us know or submit a pull request (last updated: April 2020).
+
+### 2020
+
+* Dinu, Georgiana, Prashant Mathur, Marcello Federico, Stanislas Lauly, Yaser Al-Onaizan. "Joint translation and unit conversion for end-to-end localization." arXiv preprint arXiv:2004.05219 (2020)
+* Hisamoto, Sorami, Matt Post, Kevin Duh. "Membership Inference Attacks on Sequence-to-Sequence Models: Is My Data In Your Machine Translation System?" Transactions of the Association for Computational Linguistics, Volume 8 (2020)
+* Naradowsky, Jason, Xuan Zhan, Kevin Duh. "Machine Translation System Selection from Bandit Feedback." arXiv preprint arXiv:2002.09646 (2020)
+* Niu, Xing, Marine Carpuat. "Controlling Neural Machine Translation Formality with Synthetic Supervision." Proceedings of AAAI (2020)
+
+### 2019
+
+* Agrawal, Sweta, Marine Carpuat. "Controlling Text Complexity in Neural Machine Translation." Proceedings of EMNLP (2019)
+* Beck, Daniel, Trevor Cohn, Gholamreza Haffari. "Neural Speech Translation using Lattice Transformations and Graph Networks." Proceedings of TextGraphs-13 (EMNLP 2019)
+* Currey, Anna, Kenneth Heafield. "Zero-Resource Neural Machine Translation with Monolingual Pivot Data." Proceedings of EMNLP (2019)
+* Gupta, Prabhakar, Mayank Sharma. "Unsupervised Translation Quality Estimation for Digital Entertainment Content Subtitles." IEEE International Journal of Semantic Computing (2019)
+* Hu, J. Edward, Huda Khayrallah, Ryan Culkin, Patrick Xia, Tongfei Chen, Matt Post, and Benjamin Van Durme. "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting." Proceedings of NAACL-HLT (2019)
+* Rosendahl, Jan, Christian Herold, Yunsu Kim, Miguel Graça, Weiyue Wang, Parnia Bahar, Yingbo Gao and Hermann Ney. "The RWTH Aachen University Machine Translation Systems for WMT 2019." Proceedings of the 4th WMT: Research Papers (2019)
+* Thompson, Brian, Jeremy Gwinnup, Huda Khayrallah, Kevin Duh, and Philipp Koehn. "Overcoming catastrophic forgetting during domain adaptation of neural machine translation." Proceedings of NAACL-HLT 2019 (2019)
+* Tättar, Andre, Elizaveta Korotkova, Mark Fishel. "University of Tartu's Multilingual Multi-domain WMT19 News Translation Shared Task Submission." Proceedings of the 4th WMT: Research Papers (2019)
+
+### 2018
+
+* Domhan, Tobias. "How Much Attention Do You Need? A Granular Analysis of Neural Machine Translation Architectures." Proceedings of 56th ACL (2018)
+* Kim, Yunsu, Yingbo Gao, and Hermann Ney. "Effective Cross-lingual Transfer of Neural Machine Translation Models without Shared Vocabularies." arXiv preprint arXiv:1905.05475 (2019)
+* Korotkova, Elizaveta, Maksym Del, and Mark Fishel. "Monolingual and Cross-lingual Zero-shot Style Transfer." arXiv preprint arXiv:1808.00179 (2018)
+* Niu, Xing, Michael Denkowski, and Marine Carpuat. "Bi-directional neural machine translation with synthetic parallel data." arXiv preprint arXiv:1805.11213 (2018)
+* Niu, Xing, Sudha Rao, and Marine Carpuat. "Multi-Task Neural Models for Translating Between Styles Within and Across Languages." COLING (2018)
+* Post, Matt and David Vilar. "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation." Proceedings of NAACL-HLT (2018)
+* Schamper, Julian, Jan Rosendahl, Parnia Bahar, Yunsu Kim, Arne Nix, and Hermann Ney. "The RWTH Aachen University Supervised Machine Translation Systems for WMT 2018." Proceedings of the 3rd WMT: Shared Task Papers (2018)
+* Schulz, Philip, Wilker Aziz, and Trevor Cohn. "A stochastic decoder for neural machine translation." arXiv preprint arXiv:1805.10844 (2018)
+* Alkhouli, Tamer, Gabriel Bretschner, and Hermann Ney. "On The Alignment Problem In Multi-Head Attention-Based Neural Machine Translation." Proceedings of the 3rd WMT: Research Papers (2018)
+* Tang, Gongbo, Rico Sennrich, and Joakim Nivre. "An Analysis of Attention Mechanisms: The Case of Word Sense Disambiguation in Neural Machine Translation." Proceedings of 3rd WMT: Research Papers (2018)
+* Thompson, Brian, Huda Khayrallah, Antonios Anastasopoulos, Arya McCarthy, Kevin Duh, Rebecca Marvin, Paul McNamee, Jeremy Gwinnup, Tim Anderson, and Philipp Koehn. "Freezing Subnetworks to Analyze Domain Adaptation in Neural Machine Translation." arXiv preprint arXiv:1809.05218 (2018)
+* Vilar, David. "Learning Hidden Unit Contribution for Adapting Neural Machine Translation Models." Proceedings of NAACL-HLT (2018)
+* Vyas, Yogarshi, Xing Niu and Marine Carpuat. "Identifying Semantic Divergences in Parallel Text without Annotations." Proceedings of NAACL-HLT (2018)
+* Wang, Weiyue, Derui Zhu, Tamer Alkhouli, Zixuan Gan, and Hermann Ney. "Neural Hidden Markov Model for Machine Translation." Proceedings of 56th ACL (2018)
+* Zhang, Xuan, Gaurav Kumar, Huda Khayrallah, Kenton Murray, Jeremy Gwinnup, Marianna J Martindale, Paul McNamee, Kevin Duh, and Marine Carpuat. "An Empirical Exploration of Curriculum Learning for Neural Machine Translation." arXiv preprint arXiv:1811.00739 (2018)
+
+### 2017
+
+* Domhan, Tobias and Felix Hieber. "Using target-side monolingual data for neural machine translation through multi-task learning." Proceedings of EMNLP (2017).
diff --git a/docs/development.md b/docs/development.md
index b75c9dcfb..4add22b33 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -32,7 +32,8 @@ def foo(bar: <type>) -> <type>:
     """
 ```
 
-- When using MXNet operators, preceding symbolic statements in the code with the resulting, expected shape of the tensor greatly improves readability of the code:
+- Sockeye 2 uses the [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html).
+- When using MXNet operators, preceding symbolic or hybridizable statements in the code with the resulting, expected shape of the tensor greatly improves readability of the code:
 
 ```python
 # (batch_size, num_hidden)
@@ -43,8 +44,6 @@ data = mx.sym.reshape(data=data, shape=(-1))
 
 - The desired line length of Python modules should not exceed 120 characters.
 
-- When writing symbol-generating classes (such as encoders/decoders), initialize variables in the constructor of the class and re-use them in the class methods.
-
 - Make sure to pass unit tests before submitting a pull request.
 
 - Whenever reasonable, write py.test unit tests covering your contribution.
diff --git a/docs/image_captioning.md b/docs/image_captioning.md
deleted file mode 100644
index 922ed8693..000000000
--- a/docs/image_captioning.md
+++ /dev/null
@@ -1,163 +0,0 @@
----
-layout: default
----
-# Image Captioning
-
-Sockeye provides also a module to perform image captioning.
-It follows the same logic of sequence-to-sequence frameworks, which consist of encoder-decoder models.
-In this case the encoder takes an image instead of a sentence and encodes it in a feature representation.
-This is decoded with attention (optionally) using exactly the same models of Sockeye (RNNs, transformers, or CNNs).
-This tutorial explains how to train image captioning models.
-
-
-## Citation
-
-For technical information about the image captioning module, see our paper on the arXiv ([BibTeX](sockeye_captioning.bib)):
-
-> Loris Bazzani, Tobias Domhan, and Felix Hieber. 2018.
-> [Image Captioning as Neural Machine Translation Task in SOCKEYE](https://arxiv.org/abs/1810.04101). ArXiv e-prints.
-
-
-## Installation
-
-Follow the instructions to install Sockeye, and install further dependencies:
-
-```bash
-> sudo pip3 install Pillow
-```
-
-Optionally you can also install matplotlib for visualization:
-```bash
-> sudo pip3 install matplotlib
-```
-
-
-## Train
-
-In order to train your first image captioning model you will need two sets of parallel files: one for training
-and one for validation. The latter will be used for computing various metrics during training.
-Each set should consist of two files: one with source images and one with target sentences (captions).
-Both files should have the same number of lines, each line containing the relative path of the image and a single
-sentence, respectively. Each sentence should be a whitespace delimited list of tokens.
-
-First, you need to obtain the mxnet image models from the model gallery: https://github.com/dmlc/mxnet-model-gallery
-
-Then, we can extract features from them:
-```bash
-> python -m sockeye.image_captioning.extract_features \
-    --image-root /path/to/image/dataset/folder/ \
-    --input training_set.images \
-    --output-root /path/to/feature/cache/folder/ \
-    --output training_set.features \
-    --device-id 0 \
-    --batch-size 128 \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --image-encoder-layer stage4_unit3_conv3
-
-> python -m sockeye.image_captioning.extract_features \
-    --image-root /path/to/image/dataset/folder/ \
-    --input validation_set.images \
-    --output-root /path/to/feature/cache/folder/ \
-    --output validation_set.features \
-    --device-id 0 \
-    --batch-size 128 \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --image-encoder-layer stage4_unit3_conv3
-```
-In the option `--image-encoder-model-path`, `filename_prefix` should be the prefix of the MXNet model without `-symbol.json` or `-0000.params`.
-
-The script above will generate the features stored in `/path/to/feature/cache/` and a file `training_set.features` which contains the path to the features relative to `/path/to/feature/cache/`.
-Note that finetuning of the image model is not supported yet.
-
-
-Now we can train an one-layer LSTM with attention for image captioning model as follows:
-```bash
-> python -m sockeye.image_captioning.train \
-    --source-root /path/to/feature/cache/folder/ \
-    --source training_set.features \
-    --target training_set.captions \
-    --validation-source-root /path/to/feature/cache/folder/ \
-    --validation-source validation_set.features \
-    --validation-target validation_set.captions \
-    --batch-size 64 \
-    --initial-learning-rate 0.0003 \
-    --gradient-clipping-threshold 1.0 \
-    --bucket-width 5 \
-    --max-seq-len 1:60 \
-    --fill-up replicate \
-    --output models/ \
-    --encoder image-pretrain-cnn \
-    --rnn-num-hidden 512 \
-    --rnn-decoder-state-init zero \
-    --checkpoint-interval 200 \
-    --weight-normalization
-```
-Use the option `--load-all-features-to-memory` to load all the features to memory. This is possible depending on the size of the dataset/features and amount of available CPU memory.
-There is an initial overhead to load the feature (training does not start immediately), but with the big advantage that training is 15X-20X faster.
-
-You can add the options `--decode-and-evaluate 200 --max-output-length 60` to perform captioning of the part of the validation set (200 samples in this case) during training.
-
-## Image to Text
-
-Assuming that features were pre-extracted, you can do image captioning as follows:
-
-```bash
-> python -m sockeye.image_captioning.captioner \
-    --models models/ \
-    --input validation_set.features \
-    --source-root /path/to/feature/cache/folder/ \
-    --max-output-length 60 \
-    --batch-size 1024 \
-    --chunk-size 2048 \
-    --beam-size 3 > validation_set.predictions
-```
-
-This will take the best set of parameters found during training and then load the image provided in the STDIN and
-write the caption to STDOUT, which is redirected using `>` to the file `validation_set.predictions` overwriting its content if it exists already.
-
-You can also caption directly from image with the option `--extract-image-features` as follows:
-
-```bash
-> python -m sockeye.image_captioning.captioner \
-    --extract-image-features \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --models models/ \
-    --input validation_set.images \
-    --source-root /path/to/image/dataset/folder/ \
-    --max-output-length 60 \
-    --batch-size 512 \
-    --chunk-size 1024 \
-    --beam-size 3 > validation_set.predictions
-```
-
-
-### Using Lexical Constrains
-
-It is also possible to use lexical constraints during inference as described [here](inference.html#lexical-constraints).
-The input JSON object needs to have the following form, with the image path in the `text` field, and constraints specified as usual:
-
-    { 'text': 'relative/path/of/image/given/in/validation_set/file/filename.jpg',
-      'constraints': ['constr@@ aint',
-                      'multi@@ word constr@@ aint'] }
-
-(*Note: Sockeye expects this text to be present on a single line*).
-You can use the `sockeye.lexical_constraints` module to generate this (for usage, run `python3 -m sockeye.lexical_constraints`).
-Once the file is generated, the CLI option `--json-input` needs to be passed to `sockeye.image_captioning.captioner`.
-
-## Visualization
-
-You can now visualize the results in a nice format as follows:
-
-```bash
-> python -m sockeye.image_captioning.visualize \
-    --image-root /path/to/image/dataset/folder/ \
-    --source validation_set.images \
-    --prediction validation_set.predictions \
-    --ground-truth validation_set.captions \
-    --save-to-folder validation_set/
-```
-This will save to disk plots containing images, predicted captions (white background) and optionally (mutiple) ground-truth captions (green background).
-It is possible to remove `--save-to-folder` and the plots will be visualized on screen.
diff --git a/docs/index.md b/docs/index.md
index 43ed555cf..6d48f7b6c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -13,15 +13,11 @@ layout: default
 This is the documentation for Sockeye, a sequence-to-sequence framework for Neural Machine Translation based on Apache MXNet Incubating.
 It implements state-of-the-art encoder-decoder architectures, such as
 
-- Deep Recurrent Neural Networks with Attention [[Bahdanau, '14](https://arxiv.org/abs/1409.0473)]
 - Transformer Models with self-attention [[Vaswani et al, '17](https://arxiv.org/abs/1706.03762)]
-- Fully convolutional sequence-to-sequence models [[Gehring et al, '17](https://arxiv.org/abs/1705.03122)]
-
-In addition, this framework provides an experimental [image-to-description module](https://github.com/awslabs/sockeye/tree/master/sockeye/image_captioning) that can be used for [image captioning](image_captioning.html).
 
 Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
 
-If you are interested in collaborating or have any questions, please submit a pull request or [issue](https://github.com/awslabs/sockeye/issues/new).
+If you are interested in collaborating or have any questions, please submit a pull request or [issue](https://github.com/awslabs/sockeye/issues/new). You can also send questions to *sockeye-dev-at-amazon-dot-com*.
 
 Developers may be interested in [our developer guidelines](development.html).
diff --git a/docs/setup.md b/docs/setup.md
index fbf59acfe..89297b162 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -4,7 +4,7 @@ Sockeye requires:
 
 - **Python3**
-- [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
+- [MXNet 1.6.0](https://github.com/apache/incubator-mxnet/tree/1.6.0)
 - numpy
 
 ## Installation
@@ -28,7 +28,7 @@ Depending on your version of CUDA, you can do this by running the following:
 > pip install sockeye --no-deps -r requirements.gpu-cu${CUDA_VERSION}.txt
 > rm requirements.gpu-cu${CUDA_VERSION}.txt
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 ### → via source...
 
@@ -47,7 +47,7 @@ running the following:
 > pip install -r requirements/requirements.gpu-cu${CUDA_VERSION}.txt
 > pip install .
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 Developers will be better served by pointing `$PYTHONPATH` to the root of the git-cloned source.
 
@@ -70,7 +70,7 @@ On an instance with a GPU, the following commands will work
 > pip install sockeye --no-deps -r requirements.gpu-cu${CUDA_VERSION}.txt
 rm requirements.gpu-cu${CUDA_VERSION}.txt
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 ### Optional dependencies
 In order to write training statistics to a Tensorboard event file for visualization, you can optionally install mxboard
diff --git a/docs/sockeye_captioning.bib b/docs/sockeye_captioning.bib
deleted file mode 100644
index 4c26cffb1..000000000
--- a/docs/sockeye_captioning.bib
+++ /dev/null
@@ -1,12 +0,0 @@
-@article{SockeyeCaptioning:18,
-  author = {Bazzani, Loris and Domhan, Tobias and Hieber, Felix},
-  title = "{Image Captioning as Neural Machine Translation Task in SOCKEYE}",
-  journal = {arXiv preprint arXiv:1810.04101},
-archivePrefix = "arXiv",
-  eprint = {1810.04101},
-  primaryClass = "cs.CV",
-  keywords = {Computer Science - Computer Vision and Pattern Recognition},
-  year = 2018,
-  month = oct,
-  url = {https://arxiv.org/abs/1810.04101}
-}
diff --git a/docs/training.md b/docs/training.md
index 7dabd49ec..f607555a5 100644
--- a/docs/training.md
+++ b/docs/training.md
@@ -4,12 +4,6 @@ layout: default
 
 # Training
 
-## Autopilot
-
-For easily training popular model types on known data sets, see the [Sockeye Autopilot documentation](https://github.com/awslabs/sockeye/tree/master/sockeye_contrib/autopilot).
-For manually training and running translation models on your data, read on.
-Autopilot also contains some other details you may find useful, such as recommended training parameters for [the RNN](https://github.com/awslabs/sockeye/blob/7fd7f152a2480ecf10683f71a89f7519fe7fbc06/sockeye_contrib/autopilot/models.py#L65) or [Transformer](https://github.com/awslabs/sockeye/blob/7fd7f152a2480ecf10683f71a89f7519fe7fbc06/sockeye_contrib/autopilot/models.py#L28) models.
-
 ## Data preparation
 
 Sockeye can read the raw data at training time in two sentence-parallel files via the `--source` and `--target` command-line options.
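As a minimal sketch of the raw-data path described above (the file names `train.de`, `train.en`, `dev.de`, and `dev.en` are hypothetical placeholders, and all other options are left at their defaults), training can be started directly on the two sentence-parallel files:

```bash
# Minimal sketch: train directly on raw sentence-parallel files without a
# separate data preparation step. File names here are hypothetical examples.
python -m sockeye.train --source train.de \
                        --target train.en \
                        --validation-source dev.de \
                        --validation-target dev.en \
                        --output model
```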
diff --git a/docs/tutorials.md b/docs/tutorials.md
index ee99ed51e..2187b20fa 100644
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@@ -13,4 +13,5 @@ introduce different concepts and parameters used for training and translation.
 1. [Sequence copy task](tutorials/seqcopy.html)
 1. [WMT German to English news translation](tutorials/wmt.html)
 1. [Domain adaptation of NMT models](tutorials/adapt.html)
-1. [Multilingual Zero-shot Translation IWSLT 2017](tutorials/multilingual.html)
+1. [Large data: WMT German-English 2018](tutorials/wmt_large.html)
+1. [Multilingual Zero-shot Translation IWSLT 2017](tutorials/multilingual.html)
\ No newline at end of file
diff --git a/docs/tutorials/adapt.md b/docs/tutorials/adapt.md
index ed61d6c29..97781474c 100644
--- a/docs/tutorials/adapt.md
+++ b/docs/tutorials/adapt.md
@@ -60,8 +60,6 @@ This argument accepts a (space separated) list of components where to apply the
 
 Again it may be beneficial to adjust the learning parameters for the adaptation run.
 
-**Note:** At the moment LHUC is not supported for convolutional models.
-
 ## References
 
 > Markus Freitag and Yaser Al-Onaizan. 2016.
diff --git a/docs/tutorials/multilingual.md b/docs/tutorials/multilingual.md
index df56952a5..1d82c0e31 100644
--- a/docs/tutorials/multilingual.md
+++ b/docs/tutorials/multilingual.md
@@ -64,9 +64,9 @@ git clone https://github.com/bricksdont/moses-scripts tools/moses-scripts
 
 # download helper scripts
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh -P tools
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/add_tag_to_lines.py -P tools
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/remove_tag_from_translations.py -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/add_tag_to_lines.py -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/remove_tag_from_translations.py -P tools
 ```
 
@@ -266,9 +266,6 @@ We can now kick off the training process:
 python -m sockeye.train -d train_data \
                         -vs $DATA/valid.tag.src \
                         -vt $DATA/valid.tag.trg \
-                        --encoder transformer \
-                        --decoder transformer \
-                        --weight-tying \
                         --shared-vocab \
                         --weight-tying-type src_trg_softmax \
                         --device-ids 0 \
diff --git a/docs/tutorials/seqcopy.md b/docs/tutorials/seqcopy.md
index 004012849..4b9afc085 100644
--- a/docs/tutorials/seqcopy.md
+++ b/docs/tutorials/seqcopy.md
@@ -44,42 +44,42 @@ python3 -m sockeye.train -s data/train.source \
                          -t data/train.target \
                          -vs data/dev.source \
                          -vt data/dev.target \
-                         --encoder rnn --decoder rnn \
+                         --encoder transformer --decoder transformer \
                          --num-layers 1:1 \
                          --num-embed 32 \
-                         --rnn-num-hidden 64 \
-                         --rnn-attention-type dot \
+                         --transformer-model-size 32 \
+                         --transformer-feed-forward-num-hidden 64 \
+                         --transformer-attention-heads 4 \
                          --use-cpu \
-                         --metrics perplexity accuracy \
                          --max-num-checkpoint-not-improved 3 \
                          -o seqcopy_model
 ```
 
-This will train a 1-layer RNN model with a bidirectional LSTM as the encoder and a uni-directional LSTM as the decoder.
-The RNNs have 64 hidden units and we learn embeddings of size 32.
+This will train a 1-layer Transformer model with an embedding and model size of 32 hidden units.
+The feed-forward sublayers have 64 hidden units and the attention mechanisms use 4 heads.
 
 Looking at the log we can see that our training data was assigned to buckets according to their lengths.
-Additionally, Sockeye will take care of correctly padding sequences and masking relevant parts of the network, in order to deal with sequences of variable length.
+Additionally, Sockeye will take care of correctly padding sequences and masking relevant parts of the network,
+in order to deal with sequences of variable length.
 
 ### Metrics and checkpointing
 
 During training Sockeye will print relevant metrics on both the training and the validation data.
-The metrics can be chosen using the `--metrics` parameter.
 Validation metrics are evaluated every time we create a checkpoint.
 During checkpointing the current model parameters are saved into the model directory and current validation scores are evaluated.
-By default Sockeye will create a checkpoint every 1000 updates.
+By default Sockeye will create a checkpoint every 4000 updates.
 This can be adjusted through the `--checkpoint-interval` parameter.
 
-From the log you can see that initially the accuracy is around 0.1:
+From the log you can see that initially the perplexity is around `20.0`:
 
 ```bash
 ...
+[INFO:sockeye.training] Early stopping by optimizing 'perplexity'
+[INFO:sockeye.model] Saved model config to "seqcopy_model/config"
 [INFO:sockeye.training] Training started.
-[INFO:sockeye.callback] Early stopping by optimizing 'perplexity'
-[INFO:root] Epoch[0] Batch [50] Speed: 683.23 samples/sec perplexity=14.104128 accuracy=0.092011
-[INFO:root] Epoch[0] Batch [100] Speed: 849.97 samples/sec perplexity=13.036482 accuracy=0.096760
+[INFO:sockeye.training] Epoch[0] Batch [50] Speed: 429.27 samples/sec 10879.00 tokens/sec 2.16 updates/sec perplexity=20.074619
+[INFO:sockeye.training] Epoch[0] Batch [100] Speed: 534.38 samples/sec 13846.37 tokens/sec 2.76 updates/sec perplexity=17.064554
 ...
 ```
-With a vocabulary of size 10 this essentially means that the model is guessing randomly.
-As training progresses we see that after around 14 epochs the accuracy goes up to ~1.0 and the perplexity down to ~1.0.
+As training progresses we see that after the first checkpoint (~7 epochs) the validation perplexity is at ~1.05.
 
 Sockeye performs early stopping based on the validation metrics tracked when checkpointing.
 Once the validation metrics have not improved for several checkpoints the training is stopped.
 The number of tolerated non-improving checkpoints can be adjusted (`--max-num-checkpoint-not-improved`).
@@ -111,8 +111,8 @@ If you open the file you can see that in addition to the digits Sockeye also add
 ```
 
-Note that the model was trained on sequences consisting of between 10 and 30 characters.
-Therefore, the model will most likely have some difficulties with sequences shorter than 10 characters.
+Note that the model was trained on sequences consisting of between 10 and 30 digits.
+Therefore, the model will most likely have some difficulties with sequences shorter than 10 digits.
 
 By default Sockeye will read sentence from stdin and print the translations on stdout.
 
 Internally Sockeye will run a beam search in order to (approximately) find the translation with the highest probability.
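To illustrate the stdin/stdout behavior described above, a model trained with the command from this tutorial (output directory `seqcopy_model`) could be queried as follows; the digit sequence is an arbitrary example:

```bash
# Pipe one space-delimited digit sequence to the trained sequence-copy model.
# A well-trained model should echo the input sequence back as its translation.
echo "1 4 6 2 9 7 3 5 8 0" | \
    python3 -m sockeye.translate -m seqcopy_model --use-cpu
```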
diff --git a/docs/tutorials/wmt.md b/docs/tutorials/wmt.md
index 52eb900b4..3e608c905 100644
--- a/docs/tutorials/wmt.md
+++ b/docs/tutorials/wmt.md
@@ -16,13 +16,7 @@ git clone https://github.com/rsennrich/subword-nmt.git
 export PYTHONPATH=$(pwd)/subword-nmt:$PYTHONPATH
 ```
 
-For visualizating alignments we will need `matplotlib`.
-If you haven't installed the library yet you can do so by running:
-```bash
-pip install matplotlib
-```
-
-We will visualize training progress using Tensorboard and its MXNet adaptor, `mxboard`.
+We will visualize training progress using Tensorboard and its MXNet adaptor, `mxboard`.
 Install it using:
 ```bash
 pip install tensorboard mxboard
@@ -101,24 +95,13 @@ We can now kick off the training process:
 python -m sockeye.train -d train_data \
                         -vs newstest2016.tc.BPE.de \
                         -vt newstest2016.tc.BPE.en \
-                        --encoder rnn \
-                        --decoder rnn \
-                        --num-embed 256 \
-                        --rnn-num-hidden 512 \
-                        --rnn-attention-type dot \
                         --max-seq-len 60 \
                         --decode-and-evaluate 500 \
                         --use-cpu \
                         -o wmt_model
 ```
 
-This will train a 1-layer bi-LSTM encoder, 1-layer LSTM decoder with dot attention.
-Sockeye offers a whole variety of different options regarding the model architecture,
-such as stacked RNNs with residual connections (`--num-layers`, `--rnn-residual-connections`),
-[Transformer](https://arxiv.org/abs/1706.03762) encoder and decoder (`--encoder transformer`, `--decoder transformer`),
-[ConvS2S](https://arxiv.org/pdf/1705.03122) (`--encoder cnn`, `--decoder cnn`),
-various RNN (`--rnn-cell-type`) and attention (`--attention-type`) types and more.
-
+This will train a "base" [Transformer](https://arxiv.org/abs/1706.03762) model.
 There are also several parameters controlling training itself.
 Unless you specify a different optimizer (`--optimizer`) [Adam](https://arxiv.org/abs/1412.6980) will be used.
 Additionally, you can control the batch size (`--batch-size`), the learning rate schedule (`--learning-rate-schedule`) and other parameters relevant for training.
@@ -180,26 +163,6 @@ he is a great guy and a family father .
 
 At decoding time Sockeye will run a beam search.
 You can set the size of the beam (`--beam-size`) or change other decoding parameters such as `--softmax-temperature` and `--length-penalty-alpha`.
 
-### Alignment visualization
-
-Sockeye not only supports text output, but also other output types.
-The following command for example will plot the alignment matrix:
-
-
-```bash
-echo "er ist so ein toller Kerl und ein Familienvater ." | \
-  python -m apply_bpe -c bpe.codes --vocabulary bpe.vocab.en \
-    --vocabulary-threshold 50 | \
-  python -m sockeye.translate -m wmt_model --output-type align_plot
-```
-
-This will create a file `align_1.png` that looks similar to this:
-
-![Alignment plot](wmt/align.png "Alignment plot")
-
-Note that the alignment plot shows the subword units instead of tokens, as this is the representation used by Sockeye during translation.
-Additionally you can see the special end-of-sentence symbol `<eos>` being added to the target sentence.
-
 ### Embedding inspection
diff --git a/docs/tutorials/wmt_large.md b/docs/tutorials/wmt_large.md
new file mode 100644
index 000000000..6a24cb22e
--- /dev/null
+++ b/docs/tutorials/wmt_large.md
@@ -0,0 +1,182 @@
+# Large Data: WMT 2018 German-English
+
+This tutorial covers training a Sockeye model using an arbitrarily large amount of data.
+We use the data provided for the [WMT 2018](http://www.statmt.org/wmt18/translation-task.html) German-English news task (41 million parallel sentences), though similar settings could be used for even larger data sets.
+
+## Setup
+
+**NOTE**: This build assumes that 4 local GPUs are available.
+
+For this tutorial, we use the Sockeye Docker image.
+
+1. Follow the linked instructions to install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+
+2. Build the Docker image and record the commit used as the tag:
+
+```bash
+python3 sockeye_contrib/docker/build.py
+
+export TAG=$(git rev-parse --short HEAD)
+```
+
+3. This tutorial uses two external pieces of software, the [subword-nmt](https://github.com/rsennrich/subword-nmt) tool that implements byte-pair encoding (BPE) and the [langid.py](https://github.com/saffsd/langid.py) tool that performs language identification:
+
+```bash
+git clone https://github.com/rsennrich/subword-nmt.git
+export PYTHONPATH=$(pwd)/subword-nmt:$PYTHONPATH
+
+git clone https://github.com/saffsd/langid.py.git
+export PYTHONPATH=$(pwd)/langid.py:$PYTHONPATH
+```
+
+4. We also recommend installing [GNU Parallel](https://www.gnu.org/software/parallel/) to speed up preprocessing steps (run `apt-get install parallel` or `yum install parallel`).
+
+## Data
+
+We use the preprocessed data provided for the WMT 2018 news translation shared task.
+Download and extract the data using the following commands:
+
+```bash
+wget http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/corpus.gz
+wget http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/dev.tgz
+zcat corpus.gz |cut -f1 >corpus.de
+zcat corpus.gz |cut -f2 >corpus.en
+tar xvzf dev.tgz '*.en' '*.de'
+```
+
+## Preprocessing
+
+The data has already been tokenized and true-cased, however no significant corpus cleaning is applied.
+The majority of the data is taken from inherently noisy web-crawls (sentence pairs are not always in the correct language, or even natural language text).
+If we were participating in the WMT evaluation, we would spend a substantial amount of effort selecting clean training data from the noisy corpus.
+For this tutorial, we run a simple cleaning step that retains sentence pairs for which a language identification model classifies the target side as English.
+The use of GNU Parallel is optional, but makes this step much faster:
+
+```bash
+parallel --pipe --keep-order \
+    python -m langid.langid --line -l en,de <corpus.en >corpus.en.langid
+
+paste corpus.en.langid corpus.de |grep "^('en" |cut -f2 >corpus.de.clean
+paste corpus.en.langid corpus.en |grep "^('en" |cut -f2 >corpus.en.clean
+```
+
+We next use BPE to learn a joint sub-word vocabulary from the clean training data.
+To speed up this step, we use random samples of the source and target data (note that these samples will not be parallel, but BPE training does not require parallel data).
+
+```bash
+shuf -n 1000000 corpus.de.clean >corpus.de.clean.sample
+shuf -n 1000000 corpus.en.clean >corpus.en.clean.sample
+
+python -m subword_nmt.learn_joint_bpe_and_vocab \
+    --input corpus.de.clean.sample corpus.en.clean.sample \
+    -s 32000 \
+    -o bpe.codes \
+    --write-vocabulary bpe.vocab.de bpe.vocab.en
+```
+
+We use this vocabulary to encode our training, validation, and test data.
+For simplicity, we use the 2016 data for validation and 2017 data for test.
+GNU Parallel can also significantly speed up this step.
+
+```bash
+parallel --pipe --keep-order \
+    python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <corpus.de.clean >corpus.de.clean.bpe
+parallel --pipe --keep-order \
+    python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <corpus.en.clean >corpus.en.clean.bpe
+
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <newstest2016.tc.de >newstest2016.tc.de.bpe
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <newstest2016.tc.en >newstest2016.tc.en.bpe
+
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <newstest2017.tc.de >newstest2017.tc.de.bpe
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <newstest2017.tc.en >newstest2017.tc.en.bpe
+```
+
+## Training
+
+Now that our data is cleaned and sub-word encoded, we are almost ready to start model training.
+We first run a data preparation step that splits the training data into shards and serializes it in MXNet's NDArray format.
+This allows us to train on data of any size by efficiently loading and unloading different pieces during training:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    python -m sockeye.prepare_data \
+        -s corpus.de.clean.bpe \
+        -t corpus.en.clean.bpe \
+        -o prepared_data \
+        --shared-vocab \
+        --word-min-count 2 \
+        --pad-vocab-to-multiple-of 8 \
+        --bucket-width 8 \
+        --no-bucket-scaling \
+        --max-seq-len 95 \
+        --num-samples-per-shard 10000000 \
+        --seed 1
+```
+
+We then start Sockeye training:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work -e OMP_NUM_THREADS=4 sockeye:$TAG \
+    python -m sockeye.train \
+        -d prepared_data \
+        -vs newstest2016.tc.de.bpe \
+        -vt newstest2016.tc.en.bpe \
+        -o model \
+        --num-layers 6 \
+        --transformer-model-size 512 \
+        --transformer-attention-heads 8 \
+        --transformer-feed-forward-num-hidden 2048 \
+        --weight-tying \
+        --weight-tying-type src_trg_softmax \
+        --optimizer adam \
+        --batch-size 8192 \
+        --update-interval 4 \
+        --round-batch-sizes-to-multiple-of 8 \
+        --checkpoint-interval 1000 \
+        --initial-learning-rate 0.0004 \
+        --learning-rate-reduce-factor 0.9 \
+        --learning-rate-reduce-num-not-improved 8 \
+        --max-num-checkpoint-not-improved 60 \
+        --decode-and-evaluate 500 \
+        --device-ids -4 \
+        --seed 1
+```
+
+**Faster training**:
+
+- To run FP16 training using a fixed loss scaling factor, add `--dtype float16`.
+- To use MXNet's Automatic Mixed Precision, add `--amp`.
+
+This trains a "base" [Transformer](https://arxiv.org/abs/1706.03762) model using the [Adam](https://arxiv.org/abs/1412.6980) optimizer with a batch size of 32,768 (8192 x 4) tokens.
+The learning rate will automatically reduce when validation perplexity does not improve for 8 checkpoints (1000 updates per checkpoint) and training will conclude when validation perplexity does not improve for 60 checkpoints.
+At each checkpoint, Sockeye runs a separate decoder process to evaluate metrics such as BLEU on a sample of the validation data (500 sentences).
+Note that these scores are calculated on the tokens provided to Sockeye, e.g. in this tutorial BLEU will be calculated on the sub-words we created above.
+
+## Evaluation
+
+Now the model is ready to translate data.
+Input should be preprocessed identically to the training data, including sub-word encoding (BPE).
+Run the following to translate the test set that we've already preprocessed:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    python -m sockeye.translate \
+        -i newstest2017.tc.de.bpe \
+        -o newstest2017.tc.hyp.bpe \
+        -m model \
+        --beam-size 5 \
+        --batch-size 64 \
+        --device-ids -1
+```
+
+To evaluate the translations, reverse the BPE sub-word encoding and run [sacreBLEU](https://github.com/mjpost/sacreBLEU) to compute the BLEU score:
+
+```bash
+sed -re 's/(@@ |@@$)//g' <newstest2017.tc.hyp.bpe >newstest2017.tc.hyp
+
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    sacrebleu newstest2017.tc.en -tok none -i newstest2017.tc.hyp
+```
+
+Note that this is tokenized, normalized, and true-cased data.
+If we were actually participating in WMT, the translations would need to be recased and detokenized for human evaluation.
diff --git a/pylintrc b/pylintrc
index d4c419405..7e7e6fd84 100644
--- a/pylintrc
+++ b/pylintrc
@@ -283,7 +283,7 @@ ignored-modules=mxnet,mxnet.*,numpy,numpy.*
 # List of class names for which member attributes should not be checked (useful
 # for classes with dynamically set attributes). This supports the use of
 # qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
+ignored-classes=optparse.Values,thread._local,_thread._local,AbstractContextManager
 
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E1101 when accessed. Python regular
diff --git a/pytest.ini b/pytest.ini
index f45f864b4..3cc6356bf 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,3 @@
 [pytest]
-addopts = --cov sockeye test/unit test/integration -v
+addopts = -v
+testpaths = test/unit test/integration
diff --git a/requirements/requirements.gpu-cu100.txt b/requirements/requirements.gpu-cu100.txt
index d49d93658..b77b876d0 100644
--- a/requirements/requirements.gpu-cu100.txt
+++ b/requirements/requirements.gpu-cu100.txt
@@ -1,6 +1,6 @@
 pyyaml>=5.1
-mxnet-cu100mkl==1.5.1
-numpy>=1.14
+mxnet-cu100mkl==1.6.0
+numpy>1.16.0,<2.0.0
 typing
 portalocker
-sacrebleu==1.3.6
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu101.txt b/requirements/requirements.gpu-cu101.txt
index 35db8d6cc..1a2ecf218 100644
--- a/requirements/requirements.gpu-cu101.txt
+++ b/requirements/requirements.gpu-cu101.txt
@@ -1,6 +1,6 @@
 pyyaml>=5.1
-mxnet-cu101mkl==1.5.1
-numpy>=1.14
+mxnet-cu101mkl==1.6.0
+numpy>1.16.0,<2.0.0
 typing
 portalocker
-sacrebleu==1.3.6
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu102.txt b/requirements/requirements.gpu-cu102.txt
new file mode 100644
index 000000000..dd670a45d
--- /dev/null
+++ b/requirements/requirements.gpu-cu102.txt
@@ -0,0 +1,6 @@
+pyyaml>=5.1
+mxnet-cu102mkl==1.6.0
+numpy>1.16.0,<2.0.0
+typing
+portalocker
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu80.txt b/requirements/requirements.gpu-cu80.txt
deleted file mode 100644
index c809b6656..000000000
--- a/requirements/requirements.gpu-cu80.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pyyaml>=5.1
-mxnet-cu80mkl==1.5.1
-numpy>=1.14
-typing
-portalocker
-sacrebleu==1.3.6
diff --git a/requirements/requirements.gpu-cu90.txt b/requirements/requirements.gpu-cu90.txt
deleted file mode 100644
index 9ad3732c2..000000000
--- a/requirements/requirements.gpu-cu90.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pyyaml>=5.1
-mxnet-cu90mkl==1.5.1
-numpy>=1.14
-typing
-portalocker
-sacrebleu==1.3.6
diff --git a/requirements/requirements.gpu-cu92.txt b/requirements/requirements.gpu-cu92.txt
index bc80d5ac6..585832235 100644
--- a/requirements/requirements.gpu-cu92.txt
a/requirements/requirements.gpu-cu92.txt +++ b/requirements/requirements.gpu-cu92.txt @@ -1,6 +1,6 @@ pyyaml>=5.1 -mxnet-cu92mkl==1.5.1 -numpy>=1.14 +mxnet-cu92mkl==1.6.0 +numpy>1.16.0,<2.0.0 typing portalocker -sacrebleu==1.3.6 +sacrebleu==1.4.3 diff --git a/requirements/requirements.horovod.txt b/requirements/requirements.horovod.txt new file mode 100644 index 000000000..9c74bec0d --- /dev/null +++ b/requirements/requirements.horovod.txt @@ -0,0 +1,2 @@ +horovod==0.19.1 +mpi4py diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a9d9217f0..0f5488dd9 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ pyyaml>=5.1 -mxnet-mkl==1.5.1 -numpy>=1.14 +mxnet-mkl==1.6.0 +numpy>1.16.0,<2.0.0 typing portalocker -sacrebleu==1.3.6 +sacrebleu==1.4.3 diff --git a/setup.py b/setup.py index ffa2a7b7c..21ac6031c 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ def get_requirements(filename): 'sockeye-lexicon = sockeye.lexicon:main', 'sockeye-init-embed = sockeye.init_embedding:main', 'sockeye-prepare-data = sockeye.prepare_data:main', + 'sockeye-quantize = sockeye.quantize:main', 'sockeye-score = sockeye.score:main', 'sockeye-train = sockeye.train:main', 'sockeye-translate = sockeye.translate:main', diff --git a/sockeye/__init__.py b/sockeye/__init__.py index 378b5dd0b..11040ebc8 100644 --- a/sockeye/__init__.py +++ b/sockeye/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -11,4 +11,4 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. -__version__ = '1.18.115' +__version__ = '2.1.7' diff --git a/sockeye/arguments.py b/sockeye/arguments.py index c583e8a08..0f11f1a22 100644 --- a/sockeye/arguments.py +++ b/sockeye/arguments.py @@ -1,4 +1,4 @@ -# Copyright 2017, 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -24,7 +24,6 @@ from . import constants as C from . import data_io -from .lr_scheduler import LearningRateSchedulerFixedStep from . import utils @@ -170,21 +169,22 @@ def check_greater_equal(value: str): return check_greater_equal -def learning_schedule() -> Callable: +def bool_str() -> Callable: """ - Returns a method that can be used in argument parsing to check that the argument is a valid learning rate schedule - string. + Returns a method that can be used in argument parsing to check that the argument is a valid representation of + a boolean value. :return: A method that can be used as a type in argparse. """ - - def parse(schedule_str): - try: - schedule = LearningRateSchedulerFixedStep.parse_schedule_str(schedule_str) - except ValueError: + def parse(value: str): + lower_value = value.lower() + if lower_value in ["true", "yes", "1"]: + return True + elif lower_value in ["false", "no", "0"]: + return False + else: raise argparse.ArgumentTypeError( - "Learning rate schedule string should have form rate1:num_updates1[,rate2:num_updates2,...]") - return schedule + "Invalid value for bool argument. 
Use true/false, yes/no or 1/0.") return parse @@ -201,11 +201,11 @@ def simple_dict() -> Callable: def parse(dict_str: str): def _parse(value: str): - if value == "True": + if value.lower() == "true": return True - if value == "False": + if value.lower() == "false": return False - if "." in value: + if "." in value or "e" in value: return float(value) return int(value) @@ -299,7 +299,7 @@ def add_extract_args(params): extract_params.add_argument("input", metavar="INPUT", type=str, - help="Either a model directory (using params.best) or a specific params.x file.") + help="Either a model directory (using its %s) or a specific params.x file." % C.PARAMS_BEST_NAME) extract_params.add_argument('--names', '-n', nargs='*', default=[], @@ -385,6 +385,13 @@ def add_training_data_args(params, required=False): type=regular_file(), default=[], help='File(s) containing additional token-parallel source side factors. Default: %(default)s.') + params.add_argument('--source-factors-use-source-vocab', + required=False, + nargs='+', + type=bool_str(), + default=[], + help='List of bools signaling whether to use the source vocabulary for the source factors. ' + 'If empty (default), each factor has its own vocabulary.') params.add_argument(C.TRAINING_ARG_TARGET, '-t', required=required, type=regular_file(), @@ -462,6 +469,10 @@ def add_bucketing_args(params): default=10, help='Width of buckets in tokens. Default: %(default)s.') + params.add_argument('--no-bucket-scaling', + action='store_true', + help='Disable scaling source/target buckets based on length ratio. Default: %(default)s.') + params.add_argument(C.TRAINING_ARG_MAX_SEQ_LEN, type=multiple_values(num_values=2, greater_or_equal=1), default=(99, 99), @@ -473,30 +484,32 @@ def add_prepare_data_cli_args(params): add_training_data_args(params, required=True) add_vocab_args(params) add_bucketing_args(params) - add_logging_args(params) - - data_prep = params.add_argument_group("Data preparation.") - data_prep.add_argument('--num-samples-per-shard', + params.add_argument('--num-samples-per-shard', type=int_greater_or_equal(1), - default=1000000, + default=10000000, help='The approximate number of samples per shard. Default: %(default)s.') - data_prep.add_argument('--min-num-shards', + params.add_argument('--min-num-shards', default=1, type=int_greater_or_equal(1), help='The minimum number of shards to use, even if they would not ' 'reach the desired number of samples per shard. Default: %(default)s.') - data_prep.add_argument('--seed', + params.add_argument('--seed', type=int, default=13, help='Random seed used that makes shard assignments deterministic. Default: %(default)s.') - data_prep.add_argument('--output', '-o', + params.add_argument('--output', '-o', required=True, help='Folder where the prepared and possibly sharded data is written to.') + params.add_argument('--max-processes', + type=int_greater_or_equal(1), + default=1, + help='Process the shards in parallel using max-processes processes.') + add_logging_args(params) def add_device_args(params): @@ -513,6 +526,14 @@ device_params.add_argument('--use-cpu', action='store_true', help='Use CPU device instead of GPU.') + device_params.add_argument('--omp-num-threads', + type=int, + help='Set the OMP_NUM_THREADS environment variable (CPU threads). Recommended: set to ' 'number of GPUs for training, number of physical CPU cores for inference. 
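A standalone sketch of the boolean parsing that the new `bool_str` argparse type implements (illustrative re-implementation, not imported from Sockeye):

```python
def parse_bool(value: str) -> bool:
    # Same rule as bool_str above: case-insensitive true/false, yes/no, 1/0.
    lower_value = value.lower()
    if lower_value in ("true", "yes", "1"):
        return True
    if lower_value in ("false", "no", "0"):
        return False
    raise ValueError("Invalid value for bool argument: %s" % value)

print(parse_bool("Yes"), parse_bool("0"))  # True False
```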
Default: ' + '%(default)s.') + device_params.add_argument('--env', + help='List of environment variables to be set before importing MXNet. Separated by ",", ' + 'e.g. --env=OMP_NUM_THREADS=4,MXNET_GPU_WORKER_NTHREADS=3 etc.') device_params.add_argument('--disable-device-locking', action='store_true', help='Just use the specified device ids without locking.') @@ -590,87 +611,6 @@ def add_model_parameters(params): help='Number of layers for encoder & decoder. ' 'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.') - model_params.add_argument('--conv-embed-output-dim', - type=int_greater_or_equal(1), - default=None, - help="Project segment embeddings to this size for ConvolutionalEmbeddingEncoder. Omit to" - " avoid projection, leaving segment embeddings total size of all filters. Default:" - " %(default)s.") - model_params.add_argument('--conv-embed-max-filter-width', - type=int_greater_or_equal(1), - default=8, - help="Maximum filter width for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-num-filters', - type=multiple_values(greater_or_equal=1), - default=(200, 200, 250, 250, 300, 300, 300, 300), - help="List of number of filters of each width 1..max for ConvolutionalEmbeddingEncoder. " - "Default: %(default)s.") - model_params.add_argument('--conv-embed-pool-stride', - type=int_greater_or_equal(1), - default=5, - help="Pooling stride for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-num-highway-layers', - type=int_greater_or_equal(0), - default=4, - help="Number of highway layers for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-add-positional-encodings', - action='store_true', - default=False, - help="Add positional encodings to final segment embeddings for" - " ConvolutionalEmbeddingEncoder. Default: %(default)s.") - - # convolutional encoder/decoder arguments arguments - model_params.add_argument('--cnn-kernel-width', - type=multiple_values(num_values=2, greater_or_equal=1, data_type=int), - default=(3, 3), - help='Kernel width of the convolutional encoder and decoder. Default: %(default)s.') - model_params.add_argument('--cnn-num-hidden', - type=int_greater_or_equal(1), - default=512, - help='Number of hidden units for the convolutional encoder and decoder. ' - 'Default: %(default)s.') - model_params.add_argument('--cnn-activation-type', - choices=C.CNN_ACTIVATION_TYPES, - default=C.GLU, - help="Type activation to use for each convolutional layer. Default: %(default)s.") - model_params.add_argument('--cnn-positional-embedding-type', - choices=C.POSITIONAL_EMBEDDING_TYPES, - default=C.LEARNED_POSITIONAL_EMBEDDING, - help='The type of positional embedding. Default: %(default)s.') - model_params.add_argument('--cnn-project-qkv', - action='store_true', - default=False, - help="Optionally apply query, key and value projections to the source and target hidden " - "vectors before applying the attention mechanism.") - - # rnn arguments - model_params.add_argument('--rnn-cell-type', - choices=C.CELL_TYPES, - default=C.LSTM_TYPE, - help='RNN cell type for encoder and decoder. Default: %(default)s.') - model_params.add_argument('--rnn-num-hidden', - type=int_greater_or_equal(1), - default=1024, - help='Number of RNN hidden units for encoder and decoder. Default: %(default)s.') - model_params.add_argument('--rnn-encoder-reverse-input', - action='store_true', - help='Reverse input sequence for RNN encoder. 
Default: %(default)s.') - model_params.add_argument('--rnn-decoder-state-init', - default=C.RNN_DEC_INIT_LAST, - choices=C.RNN_DEC_INIT_CHOICES, - help='How to initialize RNN decoder states. Default: %(default)s.') - model_params.add_argument('--rnn-residual-connections', - action="store_true", - default=False, - help="Add residual connections to stacked RNNs. (see Wu ETAL'16). Default: %(default)s.") - model_params.add_argument('--rnn-first-residual-layer', - type=int_greater_or_equal(2), - default=2, - help='First RNN layer to have a residual connection. Default: %(default)s.') - model_params.add_argument('--rnn-context-gating', action="store_true", - help="Enables a context gate which adaptively weighs the RNN decoder input against the " - "source context vector before each update of the decoder hidden state.") - # transformer arguments model_params.add_argument('--transformer-model-size', type=multiple_values(num_values=2, greater_or_equal=1), @@ -688,9 +628,11 @@ def add_model_parameters(params): help='Number of hidden units in transformers feed forward layers. ' 'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.') model_params.add_argument('--transformer-activation-type', - choices=C.TRANSFORMER_ACTIVATION_TYPES, - default=C.RELU, - help="Type activation to use for each feed forward layer. Default: %(default)s.") + type=multiple_values(num_values=2, greater_or_equal=None, data_type=str), + default=(C.RELU, C.RELU), + help='Type of activation to use for each feed forward layer. Use "x:x" to specify ' + 'different values for encoder & decoder. Supported: {}. Default: ' + '%(default)s.'.format(' '.join(C.TRANSFORMER_ACTIVATION_TYPES))) model_params.add_argument('--transformer-positional-embedding-type', choices=C.POSITIONAL_EMBEDDING_TYPES, default=C.FIXED_POSITIONAL_EMBEDDING, @@ -715,23 +657,16 @@ def add_model_parameters(params): 'You can specify separate sequences for encoder and decoder by separating with ":" ' 'For example: n:drn ' 'Default: %(default)s.') - model_params.add_argument('--attention-based-copying', action="store_true", - help="Enables an attention-based copying mechanism. Supported only by RNN decoders. " - "This allows to explicitly declare pointers to source tokens in the target " - "sequence (format: )." - "Each pointer on the target side can point to any of the d input tokens, " - "e.g. points to the first source token.") # LHUC - # TODO: The convolutional model does not support lhuc yet model_params.add_argument('--lhuc', nargs="+", default=None, choices=C.LHUC_CHOICES, metavar="COMPONENT", help="Use LHUC (Vilar 2018). Include an amplitude parameter to hidden units for" - " domain adaptation. Needs a pre-trained model. Valid values: {values}. Currently not" - " supported for convolutional models. Default: %(default)s.".format( + " domain adaptation. Needs a pre-trained model. Valid values: {values}." + " Default: %(default)s.".format( values=", ".join(C.LHUC_CHOICES))) # embedding arguments @@ -749,76 +684,30 @@ def add_model_parameters(params): '(validation) source factor files. Default: %(default)s.') model_params.add_argument('--source-factors-combine', '-sfc', choices=C.SOURCE_FACTORS_COMBINE_CHOICES, - default=C.SOURCE_FACTORS_COMBINE_CONCAT, - help='How to combine source factors. Default: %(default)s.') - - # attention arguments - model_params.add_argument('--rnn-attention-type', - choices=C.ATT_TYPES, - default=C.ATT_MLP, - help='Attention model for RNN decoders. Choices: {%(choices)s}. 
' - 'Default: %(default)s.') - model_params.add_argument('--rnn-attention-num-hidden', - default=None, - type=int, - help='Number of hidden units for attention layers. Default: equal to --rnn-num-hidden.') - model_params.add_argument('--rnn-attention-use-prev-word', action="store_true", - help="Feed the previous target embedding into the attention mechanism.") - - model_params.add_argument('--rnn-scale-dot-attention', - action='store_true', - help='Optional scale before dot product. Only applicable to \'dot\' attention type. ' - '[Vaswani et al, 2017]') - - model_params.add_argument('--rnn-attention-coverage-type', - choices=C.COVERAGE_TYPES, - default=C.COVERAGE_COUNT, - help="Type of model for updating coverage vectors. 'count' refers to an update method " - "that accumulates attention scores. 'fertility' accumulates attention scores as well " - "but also computes a fertility value for every source word. " - "'tanh', 'sigmoid', 'relu', 'softrelu' " - "use non-linear layers with the respective activation type, and 'gru' uses a " - "GRU to update the coverage vectors. Default: %(default)s.") - model_params.add_argument('--rnn-attention-coverage-max-fertility', - type=int, - default=2, - help="Maximum fertility for individual source words. Default: %(default)s.") - model_params.add_argument('--rnn-attention-coverage-num-hidden', - type=int, - default=1, - help="Number of hidden units for coverage vectors. Default: %(default)s.") - model_params.add_argument('--rnn-attention-in-upper-layers', - action="store_true", - help="Pass the attention to the upper layers of the RNN decoder, similar " - "to GNMT paper. Only applicable if more than one layer is used.") - model_params.add_argument('--rnn-attention-mhdot-heads', - type=int, default=None, - help='Number of heads for Multi-head dot attention. Default: %(default)s.') + default=[C.SOURCE_FACTORS_COMBINE_CONCAT], + nargs='+', + help='How to combine source factors. Can be either one value which will be applied to all ' + 'source factors, or a list of values. Default: %(default)s.') + model_params.add_argument('--source-factors-share-embedding', + type=bool_str(), + nargs='+', + default=[False], + help='Share the embeddings with the source language. Can be either one value which will be ' + 'applied to all source factors, or a list of values. Default: do not share.') - model_params.add_argument('--weight-tying', - action='store_true', - help='Turn on weight tying (see arxiv.org/abs/1608.05859). ' - 'The type of weight sharing is determined through ' - '--weight-tying-type. Default: %(default)s.') model_params.add_argument('--weight-tying-type', - default=C.WEIGHT_TYING_TRG_SOFTMAX, - choices=[C.WEIGHT_TYING_SRC_TRG_SOFTMAX, - C.WEIGHT_TYING_SRC_TRG, - C.WEIGHT_TYING_TRG_SOFTMAX], + default=C.WEIGHT_TYING_SRC_TRG_SOFTMAX, + choices=C.WEIGHT_TYING_TYPES, help='The type of weight tying. source embeddings=src, target embeddings=trg, ' 'target softmax weight matrix=softmax. Default: %(default)s.') - model_params.add_argument('--layer-normalization', action="store_true", - help="Adds layer normalization before non-linear activations. " - "This includes MLP attention, RNN decoder state initialization, " - "RNN decoder hidden state, and cnn layers." - "It does not normalize RNN cell activations " - "(this can be done using the '%s' or '%s' rnn-cell-type." 
% (C.LNLSTM_TYPE, - C.LNGLSTM_TYPE)) + model_params.add_argument('--dtype', default=C.DTYPE_FP32, choices=[C.DTYPE_FP32, C.DTYPE_FP16], + help="Data type.") - model_params.add_argument('--weight-normalization', action="store_true", - help="Adds weight normalization to decoder output layers " - "(and all convolutional weight matrices for CNN decoders). Default: %(default)s.") + model_params.add_argument('--amp', action='store_true', help='Use MXNet\'s automatic mixed precision (AMP).') + model_params.add_argument('--amp-scale-interval', type=int, default=2000, + help='Attempt to increase loss scale after this many updates without overflow. ' + 'Default: %(default)s.') def add_batch_args(params, default_batch_size=4096): @@ -837,6 +726,19 @@ def add_batch_args(params, default_batch_size=4096): help="Sentence: each batch contains X sentences, number of words varies." "Word: each batch contains (approximately) X target words, " "number of sentences varies. Default: %(default)s.") + params.add_argument('--round-batch-sizes-to-multiple-of', + type=int, + default=1, + help='For word-based batches, round each bucket\'s batch size (measured in sentences) to a ' + 'multiple of this integer. Default: %(default)s.') + + + +def add_hybridization_arg(params): + params.add_argument('--no-hybridization', + action='store_true', + help='Turn off hybridization. Hybridization builds a static computation graph and computations will therefore be faster. ' + 'The downside is that one can not set breakpoints to inspect intermediate results. Default: %(default)s.') def add_training_args(params): @@ -844,11 +746,6 @@ def add_training_args(params): add_batch_args(train_params) - train_params.add_argument('--decoder-only', - action='store_true', - help='Pre-train a decoder. This is currently for RNN decoders only. ' - 'Default: %(default)s.') - train_params.add_argument('--loss', default=C.CROSS_ENTROPY, choices=[C.CROSS_ENTROPY], @@ -857,11 +754,6 @@ def add_training_args(params): default=0.1, type=float, help='Smoothing constant for label smoothing. Default: %(default)s.') - train_params.add_argument('--loss-normalization-type', - default=C.LOSS_NORM_VALID, - choices=[C.LOSS_NORM_VALID, C.LOSS_NORM_BATCH], - help='How to normalize the loss. By default loss is normalized by the number ' - 'of valid (non-PAD) tokens (%s).' % C.LOSS_NORM_VALID) train_params.add_argument('--length-task', type=str, @@ -878,33 +770,21 @@ def add_training_args(params): default=1, help='Number of fully-connected layers for predicting the length ratio. Default %(default)s.') - train_params.add_argument('--metrics', - nargs='+', - default=[C.PERPLEXITY], - choices=[C.PERPLEXITY, C.ACCURACY, C.LENRATIO_MSE], - help='Names of metrics to track on training and validation data. Default: %(default)s.') train_params.add_argument('--optimized-metric', default=C.PERPLEXITY, choices=C.METRICS, help='Metric to optimize with early stopping {%(choices)s}. Default: %(default)s.') - train_params.add_argument('--min-updates', - type=int, - default=None, - help='Minimum number of updates before training can stop. Default: %(default)s.') - train_params.add_argument('--max-updates', - type=int, - default=None, - help='Maximum number of updates. Default: %(default)s.') - train_params.add_argument('--max-seconds', - type=int, - default=None, - help='Training will stop on the next checkpoint after reaching the maximum seconds. 
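For context on the new `--no-hybridization` flag: hybridizing a Gluon block compiles a static graph, which speeds up computation but bypasses the Python code where one would set breakpoints. A minimal sketch using the standard MXNet Gluon API:

```python
import mxnet as mx

class TinyBlock(mx.gluon.HybridBlock):
    def hybrid_forward(self, F, x):
        # F is mx.nd before hybridization and mx.sym afterwards; breakpoints
        # here stop firing once the static graph has been built.
        return F.relu(x)

net = TinyBlock()
net.initialize()
net.hybridize()  # omit this call to mimic the effect of --no-hybridization
print(net(mx.nd.array([-1.0, 2.0])))  # [0. 2.]
```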
' 'Default: %(default)s.') train_params.add_argument('--update-interval', type=int, default=1, help="Number of batch gradients to accumulate before updating. Default: %(default)s.") + train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL, + type=int_greater_or_equal(1), + default=4000, + help='Checkpoint and evaluate every x updates (update-interval * batches). ' + 'Default: %(default)s.') + train_params.add_argument('--min-samples', type=int, default=None, @@ -913,29 +793,38 @@ type=int, default=None, help='Maximum number of samples. Default: %(default)s.') - train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL, - type=int_greater_or_equal(1), - default=4000, - help='Checkpoint and evaluate every x updates/batches. Default: %(default)s.') - train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_FREQUENCY, - type=int_greater_or_equal(1), - dest="checkpoint_interval", - deprecated_dest="checkpoint_frequency", - action=StoreDeprecatedAction, - default=argparse.SUPPRESS, - help=argparse.SUPPRESS) - train_params.add_argument('--max-num-checkpoint-not-improved', + train_params.add_argument('--min-updates', type=int, - default=32, - help='Maximum number of checkpoints the model is allowed to not improve in ' - '<optimized-metric> on validation data before training is stopped. ' + default=None, + help='Minimum number of updates before training can stop. Default: %(default)s.') + train_params.add_argument('--max-updates', + type=int, + default=None, + help='Maximum number of updates. Default: %(default)s.') + train_params.add_argument('--max-seconds', + type=int, + default=None, + help='Training will stop on the next checkpoint after reaching the maximum seconds. ' 'Default: %(default)s.') + train_params.add_argument('--max-checkpoints', type=int, default=None, help='Maximum number of checkpoints to continue training the model ' 'before training is stopped. ' 'Default: %(default)s.') + train_params.add_argument('--max-num-checkpoint-not-improved', + type=int, + default=None, + help='Maximum number of checkpoints the model is allowed to not improve in ' + '<optimized-metric> on validation data before training is stopped. ' + 'Default: %(default)s.') + train_params.add_argument('--checkpoint-improvement-threshold', + type=float, + default=0., + help='Improvement in <optimized-metric> over specified number of checkpoints must exceed ' + 'this value to be considered actual improvement. Default: %(default)s.') + train_params.add_argument('--min-num-epochs', type=int, default=None, @@ -949,53 +838,23 @@ train_params.add_argument('--embed-dropout', type=multiple_values(2, data_type=float), default=(.0, .0), - help='Dropout probability for source & target embeddings. Use "x:x" to specify ' - 'separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-inputs', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='RNN variational dropout probability for encoder & decoder RNN inputs. (Gal, 2015)' - 'Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-states', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='RNN variational dropout probability for encoder & decoder RNN states. (Gal, 2015)' - 'Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-recurrent', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='Recurrent dropout without memory loss (Semeniuta, 2016) for encoder & decoder ' - 'LSTMs. 
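A toy illustration of how `--max-num-checkpoint-not-improved` and the new `--checkpoint-improvement-threshold` interact; `should_stop` is a hypothetical helper, not Sockeye's actual early-stopping code:

```python
def should_stop(history, patience, threshold=0.0):
    """history: one validation perplexity per checkpoint (lower is better)."""
    if patience is None or len(history) <= patience:
        return False
    best_recent = min(history[-patience:])
    best_before = min(history[:-patience])
    # Stop if the last `patience` checkpoints improved by no more than `threshold`.
    return best_before - best_recent <= threshold

print(should_stop([10.0, 9.5, 9.49, 9.48, 9.47], patience=3, threshold=0.05))  # True
```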
Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-enc-last-hidden-concat-to-embedding', - action="store_true", - help='Concatenate the last hidden layer of the encoder to the input of the decoder, ' - 'instead of the previous state of the decoder. Default: %(default)s.') - - train_params.add_argument('--rnn-decoder-hidden-dropout', - type=float, - default=.2, - help='Dropout probability for hidden state that combines the context with the ' - 'RNN hidden state in the decoder. Default: %(default)s.') + help='Dropout probability for source & target embeddings. Use "x:x" to specify separate ' + 'values. Default: %(default)s.') train_params.add_argument('--transformer-dropout-attention', - type=float, - default=0.1, - help='Dropout probability for multi-head attention. Default: %(default)s.') + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability for multi-head attention. Use "x:x" to specify separate ' + 'values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--transformer-dropout-act', - type=float, - default=0.1, - help='Dropout probability before activation in feed-forward block. Default: %(default)s.') + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability before activation in feed-forward block. Use "x:x" to specify ' + 'separate values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--transformer-dropout-prepost', - type=float, - default=0.1, - help='Dropout probability for pre/postprocessing blocks. Default: %(default)s.') - train_params.add_argument('--conv-embed-dropout', - type=float, - default=.0, - help="Dropout probability for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - train_params.add_argument('--cnn-hidden-dropout', - type=float, - default=.2, - help="Dropout probability for dropout between convolutional layers. Default: %(default)s.") + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability for pre/postprocessing blocks. Use "x:x" to specify separate ' + 'values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--optimizer', default=C.OPTIMIZER_ADAM, @@ -1006,6 +865,12 @@ def add_training_args(params): default=None, help='Additional optimizer params as dictionary. Format: key1:value1,key2:value2,...') + train_params.add_argument('--horovod', + action='store_true', + help='Use Horovod/OpenMPI for distributed training (Sergeev and Del Balso 2018, ' + 'arxiv.org/abs/1802.05799). When using this option, run Sockeye with `horovodrun ' + '-np ... -H ... python`.') + train_params.add_argument("--kvstore", type=str, default=C.KVSTORE_DEVICE, @@ -1013,15 +878,6 @@ def add_training_args(params): help="The MXNet kvstore to use. 'device' is recommended for single process training. " "Use any of 'dist_sync', 'dist_device_sync' and 'dist_async' for distributed " "training. Default: %(default)s.") - train_params.add_argument("--gradient-compression-type", - type=str, - default=C.GRADIENT_COMPRESSION_NONE, - choices=C.GRADIENT_COMPRESSION_TYPES, - help='Type of gradient compression to use. Default: %(default)s.') - train_params.add_argument("--gradient-compression-threshold", - type=float, - default=0.5, - help="Threshold for gradient compression if --gctype is '2bit'. 
Default: %(default)s.") train_params.add_argument('--weight-init', type=str, @@ -1043,13 +899,6 @@ def add_training_args(params): default=C.RAND_TYPE_UNIFORM, choices=[C.RAND_TYPE_UNIFORM, C.RAND_TYPE_GAUSSIAN], help='Xavier random number generator type. Default: %(default)s.') - train_params.add_argument('--embed-weight-init', - type=str, - default=C.EMBED_INIT_DEFAULT, - choices=C.EMBED_INIT_TYPES, - help='Type of embedding matrix weight initialization. If normal, initializes embedding ' - 'weights using a normal distribution with std=1/srqt(vocab_size). ' - 'Default: %(default)s.') train_params.add_argument('--initial-learning-rate', type=float, default=0.0002, @@ -1076,9 +925,14 @@ def add_training_args(params): default=C.LR_SCHEDULER_PLATEAU_REDUCE, choices=C.LR_SCHEDULERS, help='Learning rate scheduler type. Default: %(default)s.') + train_params.add_argument('--learning-rate-t-scale', + type=float, + default=1.0, + help="Step number is multiplied by this value when determining learning rate for the " + "current step. Default: %(default)s.") train_params.add_argument('--learning-rate-reduce-factor', type=float, - default=0.7, + default=0.9, help="Factor to multiply learning rate with " "(for 'plateau-reduce' learning rate scheduler). Default: %(default)s.") train_params.add_argument('--learning-rate-reduce-num-not-improved', @@ -1086,40 +940,11 @@ def add_training_args(params): default=8, help="For 'plateau-reduce' learning rate scheduler. Adjust learning rate " "if did not improve for x checkpoints. Default: %(default)s.") - train_params.add_argument('--learning-rate-schedule', - type=learning_schedule(), - default=None, - help="For 'fixed-step' scheduler. Fully specified learning schedule in the form" - " \"rate1:num_updates1[,rate2:num_updates2,...]\". Overrides all other args related" - " to learning rate and stopping conditions. Default: %(default)s.") - train_params.add_argument('--learning-rate-half-life', - type=float, - default=10, - help="Half-life of learning rate in checkpoints. For 'fixed-rate-*' " - "learning rate schedulers. Default: %(default)s.") train_params.add_argument('--learning-rate-warmup', type=int, default=0, help="Number of warmup steps. If set to x, linearly increases learning rate from 10%% " "to 100%% of the initial learning rate. Default: %(default)s.") - train_params.add_argument('--learning-rate-decay-param-reset', - action='store_true', - help='Resets model parameters to current best when learning rate is reduced due to the ' - 'value of --learning-rate-reduce-num-not-improved. Default: %(default)s.') - train_params.add_argument('--learning-rate-decay-optimizer-states-reset', - choices=C.LR_DECAY_OPT_STATES_RESET_CHOICES, - default=C.LR_DECAY_OPT_STATES_RESET_OFF, - help="Action to take on optimizer states (e.g. Adam states) when learning rate is " - "reduced due to the value of --learning-rate-reduce-num-not-improved. " - "Default: %(default)s.") - - train_params.add_argument('--rnn-forget-bias', - default=0.0, - type=float, - help='Initial value of RNN forget biases.') - train_params.add_argument('--rnn-h2h-init', type=str, default=C.RNN_INIT_ORTHOGONAL, - choices=[C.RNN_INIT_ORTHOGONAL, C.RNN_INIT_ORTHOGONAL_STACKED, C.RNN_INIT_DEFAULT], - help="Initialization method for RNN parameters. Default: %(default)s.") train_params.add_argument('--fixed-param-strategy', default=None, @@ -1137,10 +962,7 @@ def add_training_args(params): type=int, help='x>0: decode x sampled sentences from validation data and ' 'compute evaluation metrics. 
x==-1: use full validation data. Default: %(default)s.') - train_params.add_argument('--decode-and-evaluate-use-cpu', - action='store_true', - help='Use CPU for decoding validation data. Overrides --decode-and-evaluate-device-id. ' - 'Default: %(default)s.') + train_params.add_argument('--decode-and-evaluate-device-id', default=None, type=int, @@ -1155,7 +977,7 @@ def add_training_args(params): train_params.add_argument('--seed', type=int, - default=13, + default=1, help='Random seed. Default: %(default)s.') train_params.add_argument('--keep-last-params', @@ -1179,12 +1001,14 @@ def add_train_cli_args(params): add_training_args(params) add_device_args(params) add_logging_args(params) + add_hybridization_arg(params) def add_translate_cli_args(params): add_inference_args(params) add_device_args(params) add_logging_args(params) + add_hybridization_arg(params) def add_score_cli_args(params): @@ -1192,24 +1016,19 @@ def add_score_cli_args(params): add_vocab_args(params) add_device_args(params) add_batch_args(params, default_batch_size=500) + add_hybridization_arg(params) params = params.add_argument_group("Scoring parameters") params.add_argument("--model", "-m", required=True, help="Model directory containing trained model.") - params.add_argument('--max-seq-len', + params.add_argument(C.TRAINING_ARG_MAX_SEQ_LEN, type=multiple_values(num_values=2, greater_or_equal=1), default=None, help='Maximum sequence length in tokens.' 'Use "x:x" to specify separate values for src&tgt. Default: Read from model.') - params.add_argument('--softmax-temperature', - type=float, - default=None, - help='Controls peakiness of model predictions. Values < 1.0 produce ' - 'peaked predictions, values > 1.0 produce smoothed distributions.') - # common params with translate CLI add_length_penalty_args(params) add_brevity_penalty_args(params) @@ -1227,15 +1046,10 @@ def add_score_cli_args(params): default=C.SCORING_TYPE_DEFAULT, help='Score type to output. Default: %(default)s') - add_logging_args(params) - + params.add_argument('--dtype', default=None, choices=[None, C.DTYPE_FP32, C.DTYPE_FP16, C.DTYPE_INT8], + help="Data type. Default: %(default)s infers from saved model.") -def add_max_output_cli_args(params): - params.add_argument('--max-output-length', - type=int, - default=None, - help='Maximum number of words to generate during translation. ' - 'If None, it will be computed automatically. Default: %(default)s.') + add_logging_args(params) def add_inference_args(params): @@ -1288,12 +1102,6 @@ def add_inference_args(params): default=5, help='Size of the beam. Default: %(default)s.') - decode_params.add_argument('--beam-prune', '-p', - type=float, - default=0, - help='Pruning threshold for beam search. All hypotheses with scores not within ' - 'this amount of the best finished hypothesis are discarded (0 = off). ' - 'Default: %(default)s.') decode_params.add_argument('--beam-search-stop', choices=[C.BEAM_SEARCH_STOP_ALL, C.BEAM_SEARCH_STOP_FIRST], default=C.BEAM_SEARCH_STOP_ALL, @@ -1313,11 +1121,6 @@ def add_inference_args(params): ' Default: %d without batching ' 'and %d * batch_size with batching.' % (C.CHUNK_SIZE_NO_BATCHING, C.CHUNK_SIZE_PER_BATCH_SEGMENT)) - decode_params.add_argument('--skip-topk', - default=False, - action='store_true', - help='Use argmax instead of topk for greedy decoding (when --beam-size 1).' 
- 'Default: %(default)s.') decode_params.add_argument('--sample', type=int_greater_or_equal(0), default=None, @@ -1338,21 +1141,21 @@ def add_inference_args(params): type=int_greater_or_equal(0), default=10, help='Bucket width for encoder steps. 0 means no bucketing. Default: %(default)s.') - decode_params.add_argument('--max-input-len', '-n', - type=int, + decode_params.add_argument('--max-input-length', + type=int_greater_or_equal(1), default=None, help='Maximum input sequence length. Default: value from model(s).') - decode_params.add_argument('--softmax-temperature', - type=float, - default=None, - help='Controls peakiness of model predictions. Values < 1.0 produce ' - 'peaked predictions, values > 1.0 produce smoothed distributions.') decode_params.add_argument('--max-output-length-num-stds', type=int, default=C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH, help='Number of target-to-source length ratio standard deviations from training to add ' 'to calculate maximum output length for beam search for each sentence. ' 'Default: %(default)s.') + decode_params.add_argument('--max-output-length', + type=int_greater_or_equal(1), + default=None, + help='Maximum number of words to generate during translation. ' + 'If None, it will be computed automatically. Default: %(default)s.') decode_params.add_argument('--restrict-lexicon', nargs='+', type=multiple_values(num_values=2, data_type=str), @@ -1383,20 +1186,13 @@ def add_inference_args(params): default='translation', choices=C.OUTPUT_HANDLERS, help='Output type. Default: %(default)s.') - decode_params.add_argument('--sure-align-threshold', - default=0.9, - type=float, - help='Threshold to consider a soft alignment a sure alignment. Default: %(default)s.') # common params with score CLI add_length_penalty_args(decode_params) add_brevity_penalty_args(decode_params) - decode_params.add_argument('--override-dtype', - default=None, - type=str, - help='EXPERIMENTAL: may be changed or removed in future. Overrides training dtype of ' - 'encoders and decoders during inference. Default: %(default)s.') + decode_params.add_argument('--dtype', default=None, choices=[None, C.DTYPE_FP32, C.DTYPE_FP16, C.DTYPE_INT8], + help="Data type. Default: %(default)s infers from saved model.") def add_length_penalty_args(params): diff --git a/sockeye/average.py b/sockeye/average.py index 465a2ddd9..9c45d2356 100644 --- a/sockeye/average.py +++ b/sockeye/average.py @@ -1,4 +1,4 @@ -# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -41,30 +41,21 @@ def average(param_paths: Iterable[str]) -> Dict[str, mx.nd.NDArray]: :param param_paths: List of paths to parameter files. :return: Averaged parameter dictionary. 
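The simplified averaging below can be pictured with plain numpy; a toy stand-in for the `mx.nd` parameter dictionaries that `average()` actually loads:

```python
import numpy as np

checkpoints = [
    {"w": np.array([1.0, 2.0]), "b": np.array([0.0])},
    {"w": np.array([3.0, 4.0]), "b": np.array([2.0])},
]
# Element-wise mean per parameter name, as in the loop below.
avg = {name: sum(c[name] for c in checkpoints) / len(checkpoints)
       for name in checkpoints[0]}
print(avg["w"], avg["b"])  # [2. 3.] [1.]
```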
""" - all_arg_params = [] - all_aux_params = [] + all_params = [] # type: List[Dict[str, mx.nd.NDArray]] for path in param_paths: logger.info("Loading parameters from '%s'", path) - arg_params, aux_params = utils.load_params(path) - all_arg_params.append(arg_params) - all_aux_params.append(aux_params) + params = mx.nd.load(path) + all_params.append(params) - logger.info("%d models loaded", len(all_arg_params)) - utils.check_condition(all(all_arg_params[0].keys() == p.keys() for p in all_arg_params), - "arg_param names do not match across models") - utils.check_condition(all(all_aux_params[0].keys() == p.keys() for p in all_aux_params), - "aux_param names do not match across models") + logger.info("%d models loaded", len(all_params)) + utils.check_condition(all(all_params[0].keys() == p.keys() for p in all_params), + "param names do not match across models") avg_params = {} # average arg_params - for k in all_arg_params[0]: - arrays = [p[k] for p in all_arg_params] - avg_params["arg:" + k] = utils.average_arrays(arrays) - # average aux_params - for k in all_aux_params[0]: - arrays = [p[k] for p in all_aux_params] - avg_params["aux:" + k] = utils.average_arrays(arrays) - + for k in all_params[0]: + arrays = [p[k] for p in all_params] + avg_params[k] = utils.average_arrays(arrays) return avg_params diff --git a/sockeye/beam_search.py b/sockeye/beam_search.py new file mode 100644 index 000000000..9fb818878 --- /dev/null +++ b/sockeye/beam_search.py @@ -0,0 +1,784 @@ +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not +# use this file except in compliance with the License. A copy of the License +# is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +import logging +import functools +import operator +from abc import abstractmethod, ABC +from typing import Tuple, Optional, List, Union + +import mxnet as mx +import numpy as np + +from . import constants as C +from . import lexical_constraints as constrained +from . import lexicon +from . import utils +from . 
import vocab +from .model import SockeyeModel + +logger = logging.getLogger(__name__) + + +class _Inference(ABC): + + @abstractmethod + def state_structure(self): + raise NotImplementedError() + + @abstractmethod + def encode_and_initialize(self, + inputs: mx.nd.NDArray, + valid_length: Optional[mx.nd.NDArray] = None): + raise NotImplementedError() + + @abstractmethod + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + raise NotImplementedError() + + +class _SingleModelInference(_Inference): + + def __init__(self, + model: SockeyeModel, + skip_softmax: bool = False, + constant_length_ratio: float = 0.0) -> None: + self._model = model + self._skip_softmax = skip_softmax + self._const_lr = constant_length_ratio + + def state_structure(self) -> List: + return [self._model.state_structure()] + + def encode_and_initialize(self, inputs: mx.nd.NDArray, valid_length: Optional[mx.nd.NDArray] = None): + states, predicted_output_length = self._model.encode_and_initialize(inputs, valid_length, self._const_lr) + predicted_output_length = predicted_output_length.expand_dims(axis=1) + return states, predicted_output_length + + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + logits, states, _ = self._model.decode_step(step_input, states, vocab_slice_ids) + if not self._skip_softmax: + logits = logits.log_softmax(axis=-1) + scores = -logits + return scores, states + + +class _EnsembleInference(_Inference): + + def __init__(self, + models: List[SockeyeModel], + ensemble_mode: str = 'linear', + constant_length_ratio: float = 0.0) -> None: + self._models = models + if ensemble_mode == 'linear': + self._interpolation = self.linear_interpolation + elif ensemble_mode == 'log_linear': + self._interpolation = self.log_linear_interpolation + else: + raise ValueError() + self._const_lr = constant_length_ratio + + def state_structure(self) -> List: + structure = [] + for model in self._models: + structure.append(model.state_structure()) + return structure + + def encode_and_initialize(self, inputs: mx.nd.NDArray, valid_length: Optional[mx.nd.NDArray] = None): + model_states = [] # type: List[mx.nd.NDArray] + predicted_output_lengths = [] # type: List[mx.nd.NDArray] + for model in self._models: + states, predicted_output_length = model.encode_and_initialize(inputs, valid_length, self._const_lr) + predicted_output_lengths.append(predicted_output_length) + model_states += states + # average predicted output lengths, (batch, 1) + predicted_output_lengths = mx.nd.mean(mx.nd.stack(*predicted_output_lengths, axis=1), axis=1, keepdims=True) + return model_states, predicted_output_lengths + + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + outputs = [] # type: List[mx.nd.NDArray] + new_states = [] # type: List[mx.nd.NDArray] + state_index = 0 + for model, model_state_structure in zip(self._models, self.state_structure()): + model_states = states[state_index:state_index+len(model_state_structure)] + state_index += len(model_state_structure) + logits, model_states, _ = model.decode_step(step_input, model_states, vocab_slice_ids) + probs = logits.softmax(axis=-1) + outputs.append(probs) + new_states += model_states + scores = self._interpolation(outputs) + return scores, new_states + + @staticmethod + def linear_interpolation(predictions): + return -mx.nd.log(utils.average_arrays(predictions)) # pylint: 
disable=invalid-unary-operand-type + + @staticmethod + def log_linear_interpolation(predictions): + log_probs = utils.average_arrays([p.log() for p in predictions]) + return -log_probs.log_softmax() # pylint: disable=invalid-unary-operand-type + + +class UpdateScores(mx.gluon.HybridBlock): + """ + A HybridBlock that updates the scores from the decoder step with accumulated scores. + Inactive hypotheses receive score inf. Finished hypotheses receive their accumulated score for C.PAD_ID. + Hypotheses at maximum length are forced to produce C.EOS_ID. + All other options are set to infinity. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + assert C.PAD_ID == 0, "This block only works with PAD_ID == 0" + + def hybrid_forward(self, F, + target_dists, finished, inactive, + scores_accumulated, lengths, max_lengths, + pad_dist, eos_dist): + # broadcast hypothesis score to each prediction. + # scores_accumulated. Shape: (batch*beam, 1) + # target_dists. Shape: (batch*beam, vocab_size) + scores = F.broadcast_add(target_dists, scores_accumulated) + + # Special treatment for finished and inactive rows. Inactive rows are inf everywhere; + # finished rows are inf everywhere except column zero (pad_id), which holds the accumulated model score. + # Items that are finished (but not inactive) get their previous accumulated score for the <pad> symbol, + # infinity otherwise. + # pad_dist. Shape: (batch*beam, vocab_size) + pad_dist = F.concat(scores_accumulated, pad_dist) + scores = F.where(F.broadcast_logical_or(finished, inactive), pad_dist, scores) + + # Update lengths of all items, except those that were already finished. This updates + # the lengths for inactive items, too, but that doesn't matter since they are ignored anyway. + lengths = lengths + (1 - finished) + + # Items that are at their maximum length and not finished now are forced to produce the <eos> symbol. + # That is, we keep scores for hypotheses below max length or finished, and 'force-eos' the rest. + below_max_length = lengths < max_lengths + scores = F.where(F.broadcast_logical_or(below_max_length, finished), scores, eos_dist + scores) + + return scores, lengths + + +class LengthPenalty(mx.gluon.HybridBlock): + """ + Calculates the length penalty as: + (beta + len(Y))**alpha / (beta + 1)**alpha + + See Wu et al. 2016 (note that in the paper beta has a different meaning, + and a fixed value 5 was used for this parameter) + + :param alpha: The alpha factor for the length penalty (see above). + :param beta: The beta factor for the length penalty (see above). + """ + + def __init__(self, alpha: float = 1.0, beta: float = 0.0, **kwargs) -> None: + super().__init__(**kwargs) + self.alpha = alpha + self.beta = beta + self.denominator = (self.beta + 1.) ** self.alpha + + def forward(self, lengths): + if isinstance(lengths, mx.nd.NDArray) or isinstance(lengths, mx.sym.Symbol): + return super().forward(lengths) + else: + return self.hybrid_forward(None, lengths) + + def hybrid_forward(self, F, lengths): + if self.alpha == 0.0: + if F is None: + return 1.0 + else: + return F.ones_like(lengths) + else: + numerator = self.beta + lengths if self.beta != 0.0 else lengths + numerator = numerator ** self.alpha if self.alpha != 1.0 else numerator + return numerator / self.denominator + + +class BrevityPenalty(mx.gluon.HybridBlock): + """ + Calculates the logarithmic brevity penalty as: + weight * log min(1, exp(1 - ref_len / hyp_len)) = weight * min(0, 1 - ref_len / hyp_len). + + :param weight: Linear weight. 
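Plugging numbers into the two penalty formulas above (a plain-Python check, with alpha=1.0, beta=0.0 and weight=1.0 chosen for illustration):

```python
alpha, beta, weight = 1.0, 0.0, 1.0

def length_penalty(n):
    return (beta + n) ** alpha / (beta + 1.0) ** alpha

print([length_penalty(n) for n in (1, 10, 20)])  # [1.0, 10.0, 20.0]

# Log brevity penalty: weight * min(0, 1 - ref_len / hyp_len).
print(weight * min(0.0, 1.0 - 12 / 10))  # -0.2 for a too-short hypothesis
```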
+ """ + + def __init__(self, weight: float = 0.0, **kwargs) -> None: + super().__init__(**kwargs) + self.weight = weight + + def forward(self, hyp_lengths, reference_lengths): + if isinstance(hyp_lengths, mx.nd.NDArray) or isinstance(hyp_lengths, mx.sym.Symbol): + return super().forward(hyp_lengths, reference_lengths) + else: + return self.hybrid_forward(None, hyp_lengths, reference_lengths) + + def hybrid_forward(self, F, hyp_lengths, reference_lengths): + if self.weight == 0.0: + if F is None: + return 0.0 + else: + # subtract to avoid MxNet's warning of not using both arguments + # this branch should not and is not used during inference + return F.zeros_like(hyp_lengths - reference_lengths) + else: + # log_bp is always <= 0.0 + if F is None: + log_bp = min(0.0, 1.0 - reference_lengths / hyp_lengths) + else: + log_bp = F.minimum(F.zeros_like(hyp_lengths), 1.0 - reference_lengths / hyp_lengths) + return self.weight * log_bp + + +class CandidateScorer(mx.gluon.HybridBlock): + + def __init__(self, + length_penalty_alpha: float = 1.0, + length_penalty_beta: float = 0.0, + brevity_penalty_weight: float = 0.0, + **kwargs) -> None: + super().__init__(**kwargs) + with self.name_scope(): + self._lp = LengthPenalty(alpha=length_penalty_alpha, beta=length_penalty_beta) + self._bp = None # type: Optional[BrevityPenalty] + if brevity_penalty_weight > 0.0: + self._bp = BrevityPenalty(weight=brevity_penalty_weight) + + def forward(self, scores, lengths, reference_lengths): + if isinstance(scores, mx.nd.NDArray) or isinstance(scores, mx.sym.Symbol): + return super().forward(scores, lengths, reference_lengths) + else: + return self.hybrid_forward(None, scores, lengths, reference_lengths) + + def hybrid_forward(self, F, scores, lengths, reference_lengths): + lp = self._lp(lengths) + if self._bp is not None: + bp = self._bp(lengths, reference_lengths) + else: + if F is None: + bp = 0.0 + else: + # avoid warning for unused input + bp = F.zeros_like(reference_lengths) if reference_lengths is not None else 0.0 + return scores / lp - bp + + def unnormalize(self, scores, lengths, reference_lengths): + bp = 0.0 if self._bp is None else self._bp(lengths, reference_lengths) + return (scores + bp) * self._lp(lengths) + + +class SortNormalizeAndUpdateFinished(mx.gluon.HybridBlock): + """ + A HybridBlock for normalizing newly finished hypotheses scores with LengthPenalty. + """ + + def __init__(self, + dtype: str, + pad_id: int, + eos_id: int, + scorer: CandidateScorer, + **kwargs) -> None: + super().__init__(**kwargs) + self.dtype = dtype + self.pad_id = pad_id + self.eos_id = eos_id + self._scorer = scorer + + def hybrid_forward(self, F, best_hyp_indices, best_word_indices, + finished, scores_accumulated, lengths, reference_lengths): + + # Reorder fixed-size beam data according to best_hyp_indices (ascending) + finished = F.take(finished, best_hyp_indices) + lengths = F.take(lengths, best_hyp_indices) + reference_lengths = F.take(reference_lengths, best_hyp_indices) + + # Normalize hypotheses that JUST finished + all_finished = F.broadcast_logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id) + newly_finished = F.broadcast_logical_xor(all_finished, finished) + scores_accumulated = F.where(newly_finished, + self._scorer(scores_accumulated, + F.cast(F.expand_dims(lengths, axis=1), self.dtype), + reference_lengths), + scores_accumulated) + + # Recompute finished. 
Hypotheses are finished if they are extended with <pad> or <eos> + finished = F.broadcast_logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id) + + return finished, scores_accumulated, lengths, reference_lengths + + +class TopK(mx.gluon.HybridBlock): + """ + Batch-wise topk operation. + Forward method uses imperative shape inference, since both batch_size and vocab_size are dynamic + during translation (due to variable batch size and potential vocabulary selection). + """ + + def __init__(self, k: int, **kwargs) -> None: + """ + :param k: The number of smallest scores to return. + """ + super().__init__(**kwargs) + self.k = k + + def forward(self, scores, offset): + """ + Get the lowest k elements per sentence from a `scores` matrix. + + :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size) + :param offset: Array to add to the hypothesis indices for offsetting in batch decoding. + :return: The row indices, column indices and values of the k smallest items in matrix. + """ + vocab_size = scores.shape[1] + batch_size = int(offset.shape[-1] / self.k) + # Shape: (batch size, beam_size * vocab_size) + batchwise_scores = scores.reshape(shape=(batch_size, self.k * vocab_size)) + indices, values = super().forward(batchwise_scores) + best_hyp_indices, best_word_indices = mx.nd.unravel_index(indices, shape=(batch_size * self.k, vocab_size)) + if batch_size > 1: + # Offsetting the indices to match the shape of the scores matrix + best_hyp_indices += offset + return best_hyp_indices, best_word_indices, values + + def hybrid_forward(self, F, scores): + values, indices = F.topk(scores, axis=1, k=self.k, ret_typ='both', is_ascend=True) + # Project indices back into original shape (which is different for t==1 and t>1) + return F.reshape(F.cast(indices, 'int32'), shape=(-1,)), F.reshape(values, shape=(-1, 1)) + + +class SampleK(mx.gluon.HybridBlock): + """ + A HybridBlock for selecting a random word from each hypothesis according to its distribution. + """ + def __init__(self, n, **kwargs) -> None: + super().__init__(**kwargs) + self.n = n + + def hybrid_forward(self, F, scores, target_dists, finished, best_hyp_indices): + """ + Choose an extension of each hypothesis from its softmax distribution. + + :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size) + :param target_dists: The non-cumulative target distributions (ignored). + :param finished: The list of finished hypotheses. + :param best_hyp_indices: Best hypothesis indices constant. + :return: The row indices, column indices, and values of the sampled words. + """ + # Map the negative logprobs to probabilities so as to have a distribution + target_dists = F.exp(-target_dists) + + # n == 0 means sample from the full vocabulary. Otherwise, we sample from the top n. + if self.n != 0: + # select the top n in each row, via a mask + masked_items = F.topk(target_dists, k=self.n, ret_typ='mask', axis=1, is_ascend=False) + # set unmasked items to 0 + masked_items = F.where(masked_items, target_dists, masked_items) + # renormalize + target_dists = F.broadcast_div(masked_items, F.sum(masked_items, axis=1, keepdims=True)) + + # Sample from the target distributions over words, then get the corresponding values from the cumulative scores + best_word_indices = F.random.multinomial(target_dists, get_prob=False) + # Zeroes for finished hypotheses. 
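The batch-wise trick in the `TopK` block above, sketched in numpy with illustrative shapes (not Sockeye's exact code):

```python
import numpy as np

batch, beam, vocab = 2, 3, 5
scores = np.random.rand(batch * beam, vocab)      # stand-in for neg. log-probs
flat = scores.reshape(batch, beam * vocab)        # one row per sentence
best = np.argsort(flat, axis=1)[:, :beam]         # beam smallest per sentence
hyp, word = np.unravel_index(best.ravel(), (beam, vocab))
hyp = hyp + np.repeat(np.arange(batch) * beam, beam)  # offset into batch*beam rows
```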
+ best_word_indices = F.where(finished, F.zeros_like(best_word_indices), best_word_indices) + values = F.pick(scores, best_word_indices, axis=1, keepdims=True) + + best_hyp_indices = F.slice_like(best_hyp_indices, best_word_indices, axes=(0,)) + + return best_hyp_indices, best_word_indices, values + + +def _repeat_states(states: List, beam_size: int, state_structure: List) -> List: + repeated_states = [] + flat_structure = functools.reduce(operator.add, state_structure) + assert len(states) == len(flat_structure), "Number of states do not match the defined state structure" + for state, state_format in zip(states, flat_structure): + if state_format == C.STEP_STATE or state_format == C.BIAS_STATE: + repeat_axis = 0 + elif state_format == C.DECODER_STATE or state_format == C.ENCODER_STATE: + # TODO: Change repeat axis to 1 when interleaved multihead attention is implemented + repeat_axis = 0 + else: + raise ValueError("Provided state format %s not recognized." % state_format) + repeated_state = state.repeat(repeats=beam_size, axis=repeat_axis) + repeated_states.append(repeated_state) + return repeated_states + + +class SortStates(mx.gluon.HybridBlock): + + def __init__(self, state_structure, prefix): + mx.gluon.HybridBlock.__init__(self, prefix=prefix) + self.flat_structure = functools.reduce(operator.add, state_structure) + + def hybrid_forward(self, F, best_hyp_indices, *states): + sorted_states = [] + assert len(states) == len(self.flat_structure), "Number of states do not match the defined state structure" + for state, state_format in zip(states, self.flat_structure): + if state_format == C.STEP_STATE or state_format == C.BIAS_STATE: + sorted_state = F.take(state, best_hyp_indices) + elif state_format == C.DECODER_STATE: + # TODO: Change take axis to 1 when interleaved multihead attention is implemented + sorted_state = F.take(state, best_hyp_indices) + elif state_format == C.ENCODER_STATE: + # No need for takes on encoder layer states + sorted_state = state + else: + raise ValueError("Provided state format %s not recognized." 
% state_format) + sorted_states.append(sorted_state) + return sorted_states + + +class BeamSearch(mx.gluon.Block): + """ + Features: + - beam search stop + - constraints (pos & neg) + - ensemble decoding + - vocabulary selection + - sampling (TODO: check if its working correctly) + + Not supported: + - beam pruning + - beam history + """ + + def __init__(self, + beam_size: int, + dtype: str, + bos_id: int, + eos_id: int, + context: Union[mx.Context, List[mx.Context]], + output_vocab_size: int, + scorer: CandidateScorer, + num_source_factors: int, + inference: _Inference, + beam_search_stop: str = C.BEAM_SEARCH_STOP_ALL, + global_avoid_trie: Optional[constrained.AvoidTrie] = None, + sample: Optional[int] = None) -> None: + super().__init__(prefix='beam_search_') + self.beam_size = beam_size + self.dtype = dtype + self.bos_id = bos_id + self.eos_id = eos_id + self.output_vocab_size = output_vocab_size + self.context = context + self._inference = inference + self.beam_search_stop = beam_search_stop + self.num_source_factors = num_source_factors + self.global_avoid_trie = global_avoid_trie + + with self.name_scope(): + self._sort_states = SortStates(state_structure=self._inference.state_structure(), + prefix='sort_states_') + self._update_scores = UpdateScores(prefix='update_scores_') + self._scorer = scorer + self._sort_norm_and_update_finished = SortNormalizeAndUpdateFinished( + prefix='sort_norm_and_update_finished_', + dtype=self.dtype, + pad_id=C.PAD_ID, + eos_id=eos_id, + scorer=scorer) + + self._sample = None # type: Optional[mx.gluon.HybridBlock] + self._top = None # type: Optional[mx.gluon.HybridBlock] + if sample is not None: + self._sample = SampleK(sample) + else: + self._top = TopK(self.beam_size) + + def forward(self, + source: mx.nd.NDArray, + source_length: mx.nd.NDArray, + restrict_lexicon: Optional[lexicon.TopKLexicon], + raw_constraint_list: List[Optional[constrained.RawConstraintList]], + raw_avoid_list: List[Optional[constrained.RawConstraintList]], + max_output_lengths: mx.nd.NDArray) -> Tuple[np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + List[Optional[np.ndarray]], + List[Optional[constrained.ConstrainedHypothesis]]]: + """ + Translates multiple sentences using beam search. + + :param source: Source ids. Shape: (batch_size, bucket_key, num_factors). + :param source_length: Valid source lengths. Shape: (batch_size,). + :param restrict_lexicon: Lexicon to use for vocabulary restriction. + :param raw_constraint_list: A list of optional lists containing phrases (as lists of target word IDs) + that must appear in each output. + :param raw_avoid_list: A list of optional lists containing phrases (as lists of target word IDs) + that must NOT appear in each output. + :param max_output_lengths: NDArray of maximum output lengths per input in source. + Shape: (batch_size,). Dtype: int32. + :return List of best hypotheses indices, list of best word indices, + array of accumulated length-normalized negative log-probs, hypotheses lengths, + predicted lengths of references (if any), constraints (if any). 
+ """ + batch_size = source.shape[0] + logger.debug("beam_search batch size: %d", batch_size) + + # Maximum beam search iterations (determined by longest input with eos) + max_iterations = max_output_lengths.max().asscalar() + logger.debug("max beam search iterations: %d", max_iterations) + + sample_best_hyp_indices = None + if self._sample is not None: + utils.check_condition(restrict_lexicon is None, + "Sampling is not available when working with a restricted lexicon.") + sample_best_hyp_indices = mx.nd.arange(0, batch_size * self.beam_size, dtype='int32') + + # General data structure: batch_size * beam_size blocks in total; + # a full beam for each sentence, followed by the next beam-block for the next sentence and so on + + best_word_indices = mx.nd.full((batch_size * self.beam_size,), val=self.bos_id, ctx=self.context, + dtype='int32') + + # offset for hypothesis indices in batch decoding + offset = mx.nd.repeat(mx.nd.arange(0, batch_size * self.beam_size, self.beam_size, + dtype='int32', ctx=self.context), self.beam_size) + + # locations of each batch item when first dimension is (batch * beam) + batch_indices = mx.nd.arange(0, batch_size * self.beam_size, self.beam_size, dtype='int32', ctx=self.context) + first_step_mask = mx.nd.full((batch_size * self.beam_size, 1), val=np.inf, ctx=self.context, dtype=self.dtype) + first_step_mask[batch_indices] = 1.0 + pad_dist = mx.nd.full((batch_size * self.beam_size, self.output_vocab_size - 1), val=np.inf, + ctx=self.context, dtype=self.dtype) + eos_dist = mx.nd.full((batch_size * self.beam_size, self.output_vocab_size), val=np.inf, + ctx=self.context, dtype=self.dtype) + eos_dist[:, C.EOS_ID] = 0 + + # Best word and hypotheses indices across beam search steps from topk operation. + best_hyp_indices_list = [] # type: List[mx.nd.NDArray] + best_word_indices_list = [] # type: List[mx.nd.NDArray] + + lengths = mx.nd.zeros((batch_size * self.beam_size,), ctx=self.context, dtype='int32') + finished = mx.nd.zeros((batch_size * self.beam_size,), ctx=self.context, dtype='int32') + + # Extending max_output_lengths to shape (batch_size * beam_size,) + max_output_lengths = mx.nd.repeat(max_output_lengths, self.beam_size) + + # scores_accumulated: chosen smallest scores in scores (ascending). + scores_accumulated = mx.nd.zeros((batch_size * self.beam_size, 1), ctx=self.context, dtype=self.dtype) + + # If using a top-k lexicon, select param rows for logit computation that correspond to the + # target vocab for this sentence. + vocab_slice_ids = None # type: Optional[mx.nd.NDArray] + if restrict_lexicon: + source_words = utils.split(source, num_outputs=self.num_source_factors, axis=2, squeeze_axis=True)[0] + vocab_slice_ids = restrict_lexicon.get_trg_ids(source_words.astype("int32").asnumpy()) + if any(raw_constraint_list): + # Add the constraint IDs to the list of permissibled IDs, and then project them into the reduced space + constraint_ids = np.array([word_id for sent in raw_constraint_list for phr in sent for word_id in phr]) + vocab_slice_ids = np.lib.arraysetops.union1d(vocab_slice_ids, constraint_ids) + full_to_reduced = dict((val, i) for i, val in enumerate(vocab_slice_ids)) + raw_constraint_list = [[[full_to_reduced[x] for x in phr] for phr in sent] for sent in + raw_constraint_list] + # Pad to a multiple of 8. 
+            vocab_slice_ids = np.pad(vocab_slice_ids, (0, 7 - ((len(vocab_slice_ids) - 1) % 8)),
+                                     mode='constant', constant_values=self.eos_id)
+            vocab_slice_ids = mx.nd.array(vocab_slice_ids, ctx=self.context, dtype='int32')
+
+            if vocab_slice_ids.shape[0] < self.beam_size + 1:
+                # This fixes an edge case for toy models, where the number of vocab ids from the lexicon is
+                # smaller than the beam size.
+                logger.warning("Padding vocab_slice_ids (%d) with EOS to have at least %d+1 elements to expand",
+                               vocab_slice_ids.shape[0], self.beam_size)
+                n = self.beam_size - vocab_slice_ids.shape[0] + 1
+                vocab_slice_ids = mx.nd.concat(vocab_slice_ids,
+                                               mx.nd.full((n,), val=self.eos_id, ctx=self.context, dtype='int32'),
+                                               dim=0)
+
+            pad_dist = mx.nd.full((batch_size * self.beam_size, vocab_slice_ids.shape[0] - 1),
+                                  val=np.inf, ctx=self.context, dtype=self.dtype)
+            eos_dist = mx.nd.full((batch_size * self.beam_size, vocab_slice_ids.shape[0]),
+                                  val=np.inf, ctx=self.context, dtype=self.dtype)
+            eos_dist[:, C.EOS_ID] = 0
+
+        # Initialize the beam to track constraint sets, where target-side lexical constraints are present
+        constraints = constrained.init_batch(raw_constraint_list, self.beam_size, self.bos_id, self.eos_id)
+
+        if self.global_avoid_trie or any(raw_avoid_list):
+            avoid_states = constrained.AvoidBatch(batch_size, self.beam_size,
+                                                  avoid_list=raw_avoid_list,
+                                                  global_avoid_trie=self.global_avoid_trie)
+            avoid_states.consume(best_word_indices)
+
+        # (0) encode source sentence, returns a list
+        model_states, estimated_reference_lengths = self._inference.encode_and_initialize(source, source_length)
+        # repeat states to beam_size
+        model_states = _repeat_states(model_states, self.beam_size, self._inference.state_structure())
+
+        # Records items in the beam that are inactive. At the beginning (t==1), there is only one valid or active
+        # item on the beam for each sentence
+        inactive = mx.nd.zeros((batch_size * self.beam_size), dtype='int32', ctx=self.context)
+        t = 1
+        for t in range(1, max_iterations + 1):  # TODO: max_iterations + 1 is the MINIMUM to get correct results right now
+            # (1) obtain next predictions and advance models' state
+            # target_dists: (batch_size * beam_size, target_vocab_size)
+            target_dists, model_states = self._inference.decode_step(best_word_indices, model_states, vocab_slice_ids)
+
+            # (2) Produces the accumulated cost of target words in each row.
+            # There is special treatment for finished and inactive rows: inactive rows are inf everywhere;
+            # finished rows are inf everywhere except column zero, which holds the accumulated model score
+            scores, lengths = self._update_scores(target_dists,
+                                                  finished,
+                                                  inactive,
+                                                  scores_accumulated,
+                                                  lengths,
+                                                  max_output_lengths,
+                                                  pad_dist,
+                                                  eos_dist)
+
+            # Mark entries that should be blocked as having a score of np.inf
+            if self.global_avoid_trie or any(raw_avoid_list):
+                block_indices = avoid_states.avoid()
+                if len(block_indices) > 0:
+                    scores[block_indices] = np.inf
+                    if self._sample is not None:
+                        target_dists[block_indices] = np.inf
+
+            # (3) Get beam_size winning hypotheses for each sentence block separately. Only look as
+            # far as the active beam size for each sentence.
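# --- Editor's illustrative sketch (not part of this patch) --------------------
# What the offset-based topk in step (3) computes, reproduced in plain numpy
# (hypothetical names; only numpy is assumed). Each sentence's scores are viewed
# as one (beam_size * vocab)-wide row; the k smallest flat indices are unraveled
# into (hypothesis row, word id), and `offset` shifts the rows back into the
# global (batch * beam) numbering.

import numpy as np

def topk_per_sentence(scores: np.ndarray, batch_size: int, beam_size: int):
    vocab = scores.shape[1]
    folded = scores.reshape(batch_size, beam_size * vocab)
    flat = np.argsort(folded, axis=1)[:, :beam_size]  # k smallest costs per sentence
    hyp, word = np.divmod(flat, vocab)                # unravel flat indices
    offset = np.repeat(np.arange(batch_size) * beam_size, beam_size)
    best_hyp_indices = hyp.reshape(-1) + offset       # global beam rows
    best_word_indices = word.reshape(-1)
    best_scores = np.take_along_axis(folded, flat, axis=1).reshape(-1, 1)
    return best_hyp_indices, best_word_indices, best_scores

# At t == 1 all rows of a sentence's beam are identical, so masking all but the
# first row with inf (cf. first_step_mask) avoids k duplicate extensions:
scores = np.array([[1.0, 2.0, 3.0],
                   [1.0, 2.0, 3.0]])  # batch_size=1, beam_size=2, vocab=3
scores[1:] = np.inf
print(topk_per_sentence(scores, batch_size=1, beam_size=2))
# -> (array([0, 0]), array([0, 1]), array([[1.], [2.]]))
# ------------------------------------------------------------------------------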
+            if self._sample is not None:
+                best_hyp_indices, best_word_indices, scores_accumulated = self._sample(scores,
+                                                                                       target_dists,
+                                                                                       finished,
+                                                                                       sample_best_hyp_indices)
+            else:
+                # On the first timestep, all hypotheses have identical histories, so force topk() to choose extensions
+                # of the first row only by setting all other rows to inf
+                if t == 1:
+                    scores *= first_step_mask
+
+                best_hyp_indices, best_word_indices, scores_accumulated = self._top(scores, offset)
+
+            # Constraints for constrained decoding are processed sentence by sentence
+            if any(raw_constraint_list):
+                best_hyp_indices, best_word_indices, scores_accumulated, constraints, inactive = constrained.topk(
+                    t,
+                    batch_size,
+                    self.beam_size,
+                    inactive,
+                    scores,
+                    constraints,
+                    best_hyp_indices,
+                    best_word_indices,
+                    scores_accumulated)
+
+            # Map from restricted to full vocab ids if needed
+            if restrict_lexicon:
+                best_word_indices = vocab_slice_ids.take(best_word_indices)
+
+            # (4) Normalize the scores of newly finished hypotheses. Note that after this until the
+            # next call to topk(), hypotheses may not be in sorted order.
+            finished, scores_accumulated, lengths, estimated_reference_lengths = self._sort_norm_and_update_finished(
+                best_hyp_indices,
+                best_word_indices,
+                finished,
+                scores_accumulated,
+                lengths,
+                estimated_reference_lengths)
+
+            # Collect best hypotheses, best word indices
+            best_hyp_indices_list.append(best_hyp_indices)
+            best_word_indices_list.append(best_word_indices)
+
+            if self._should_stop(finished, batch_size):
+                break
+
+            # (5) update models' state with winning hypotheses (ascending)
+            model_states = self._sort_states(best_hyp_indices, *model_states)
+
+        logger.debug("Finished after %d out of %d steps.", t, max_iterations)
+
+        # (6) Sort the hypotheses within each sentence (normalization for finished hyps may have unsorted them).
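# --- Editor's illustrative sketch (not part of this patch) --------------------
# The final within-sentence sort implemented below, in plain numpy (only numpy
# is assumed). Accumulated scores are folded to one row per sentence, argsorted,
# and the flat order is mapped back to global (batch * beam) row indices with
# the same offset trick used for topk.

import numpy as np

scores_accumulated = np.array([[2.5], [0.7], [1.9], [1.2]])  # batch=2, beam=2
batch_size, beam_size = 2, 2

folded = scores_accumulated.reshape(batch_size, beam_size)
indices = np.argsort(folded, axis=1).reshape(-1)  # best-first within each row
offset = np.repeat(np.arange(batch_size) * beam_size, beam_size)
best_hyp_indices = np.unravel_index(indices, scores_accumulated.shape)[0] + offset

print(best_hyp_indices)                                  # -> [1 0 3 2]
print(scores_accumulated[best_hyp_indices].reshape(-1))  # -> [0.7 2.5 1.2 1.9]
# each sentence's beam is now ordered by score (ascending, i.e. best first)
# ------------------------------------------------------------------------------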
+ folded_accumulated_scores = scores_accumulated.reshape((batch_size, + self.beam_size * scores_accumulated.shape[-1])) + indices = mx.nd.cast(mx.nd.argsort(folded_accumulated_scores.astype('float32'), axis=1), dtype='int32').reshape((-1,)) + best_hyp_indices, _ = mx.nd.unravel_index(indices, scores_accumulated.shape) + offset + scores_accumulated = scores_accumulated.take(best_hyp_indices) + best_hyp_indices_list.append(best_hyp_indices) + lengths = lengths.take(best_hyp_indices) + all_best_hyp_indices = mx.nd.stack(*best_hyp_indices_list, axis=1) + all_best_word_indices = mx.nd.stack(*best_word_indices_list, axis=1) + constraints = [constraints[x] for x in best_hyp_indices.asnumpy()] + + return all_best_hyp_indices.asnumpy(), \ + all_best_word_indices.asnumpy(), \ + scores_accumulated.asnumpy(), \ + lengths.asnumpy().astype('int32'), \ + estimated_reference_lengths.asnumpy(), \ + constraints + + def _should_stop(self, finished, batch_size): + if self.beam_search_stop == C.BEAM_SEARCH_STOP_FIRST: + at_least_one_finished = finished.reshape((batch_size, self.beam_size)).sum(axis=1) > 0 + return at_least_one_finished.sum().asscalar() == batch_size + else: + return finished.sum().asscalar() == batch_size * self.beam_size # all finished + + +def get_beam_search(models: List[SockeyeModel], + beam_size: int, + context: Union[mx.Context, List[mx.Context]], + vocab_target: vocab.Vocab, + output_scores: bool, + scorer: CandidateScorer, + ensemble_mode: str = 'linear', + beam_search_stop: str = C.BEAM_SEARCH_STOP_ALL, + constant_length_ratio: float = 0.0, + avoid_list: Optional[str] = None, + sample: Optional[int] = None, + hybridize: bool = True) -> BeamSearch: + + inference = None # type: Optional[_Inference] + if len(models) == 1: + skip_softmax = beam_size == 1 and not output_scores and not sample + if skip_softmax: + logger.info("Enabled skipping softmax for a single model and greedy decoding.") + inference = _SingleModelInference(model=models[0], + skip_softmax=skip_softmax, constant_length_ratio=constant_length_ratio) + else: + inference = _EnsembleInference(models=models, + ensemble_mode=ensemble_mode, + constant_length_ratio=constant_length_ratio) + + global_avoid_trie = None if avoid_list is None else constrained.get_avoid_trie(avoid_list, vocab_target) + bs = BeamSearch( + beam_size=beam_size, + dtype=C.DTYPE_FP32 if models[0].dtype == C.DTYPE_INT8 else models[0].dtype, + bos_id=C.BOS_ID, + eos_id=C.EOS_ID, + context=context, + output_vocab_size=models[0].output_layer_vocab_size, + beam_search_stop=beam_search_stop, + scorer=scorer, + sample=sample, + num_source_factors=models[0].num_source_factors, + global_avoid_trie=global_avoid_trie, + inference=inference + ) + bs.initialize() + if hybridize: + bs.hybridize(static_alloc=True) + return bs diff --git a/sockeye/checkpoint_decoder.py b/sockeye/checkpoint_decoder.py index 1c9d9d0a4..340c5b553 100644 --- a/sockeye/checkpoint_decoder.py +++ b/sockeye/checkpoint_decoder.py @@ -1,4 +1,4 @@ -# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -23,6 +23,7 @@ import mxnet as mx +import sockeye.model import sockeye.output_handler import sockeye.translate from . import constants as C @@ -30,6 +31,7 @@ from . import evaluate from . import inference from . import utils +from . 
import vocab

 logger = logging.getLogger(__name__)

@@ -38,29 +40,34 @@ class CheckpointDecoder:
     """
     Decodes a (random sample of a) dataset using parameters at given checkpoint and computes BLEU against references.

-    :param context: MXNet context to bind the model to.
+    :param model_folder: The model folder to which checkpoint decoder outputs are written.
     :param inputs: Path(s) to file containing input sentences (and their factors).
     :param references: Path to file containing references.
-    :param model: Model to load.
+    :param source_vocabs: The source vocabularies.
+    :param target_vocab: The target vocabulary.
+    :param context: The devices to use for decoding.
+    :param model: The translation model.
     :param max_input_len: Maximum input length.
     :param batch_size: Batch size.
     :param beam_size: Size of the beam.
     :param nbest_size: Size of nbest lists.
-    :param bucket_width_source: Source bucket width.
     :param length_penalty_alpha: Alpha factor for the length penalty
     :param length_penalty_beta: Beta factor for the length penalty
-    :param softmax_temperature: Optional parameter to control steepness of softmax distribution.
     :param max_output_length_num_stds: Number of standard deviations as safety margin for maximum output length.
     :param ensemble_mode: Ensemble mode: linear or log_linear combination.
     :param sample_size: Maximum number of sentences to sample and decode. If <=0, all sentences are used.
     :param random_seed: Random seed for sampling. Default: 42.
+    :param hybridize: Turn on hybridization of the translator.
     """

     def __init__(self,
-                 context: mx.context.Context,
+                 model_folder: str,
                  inputs: List[str],
                  references: str,
-                 model: str,
+                 source_vocabs: List[vocab.Vocab],
+                 target_vocab: vocab.Vocab,
+                 model: sockeye.model.SockeyeModel,
+                 context: mx.Context,
                  max_input_len: Optional[int] = None,
                  batch_size: int = 16,
                  beam_size: int = C.DEFAULT_BEAM_SIZE,
@@ -68,12 +75,11 @@ def __init__(self,
                  bucket_width_source: int = 10,
                  length_penalty_alpha: float = 1.0,
                  length_penalty_beta: float = 0.0,
-                 softmax_temperature: Optional[float] = None,
                  max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
                  ensemble_mode: str = 'linear',
                  sample_size: int = -1,
-                 random_seed: int = 42) -> None:
-        self.context = context
+                 random_seed: int = 42,
+                 hybridize: bool = True) -> None:
         self.max_input_len = max_input_len
         self.max_output_length_num_stds = max_output_length_num_stds
         self.ensemble_mode = ensemble_mode
@@ -83,18 +89,19 @@ def __init__(self,
         self.bucket_width_source = bucket_width_source
         self.length_penalty_alpha = length_penalty_alpha
         self.length_penalty_beta = length_penalty_beta
-        self.softmax_temperature = softmax_temperature
         self.model = model

         with ExitStack() as exit_stack:
-            inputs_fins = [exit_stack.enter_context(data_io.smart_open(f)) for f in inputs]  # pylint: disable=no-member
-            references_fin = exit_stack.enter_context(data_io.smart_open(references))  # pylint: disable=no-member
+            inputs_fins = [exit_stack.enter_context(data_io.smart_open(f)) for f in inputs]
+            references_fin = exit_stack.enter_context(data_io.smart_open(references))

             inputs_sentences = [f.readlines() for f in inputs_fins]
             target_sentences = references_fin.readlines()

             utils.check_condition(all(len(l) == len(target_sentences) for l in inputs_sentences),
                                   "Sentences differ in length")
+            utils.check_condition(all(len(sentence.strip()) > 0 for sentence in target_sentences),
+                                  "Empty target validation sentence.")

             if sample_size <= 0:
                 sample_size = len(inputs_sentences[0])
@@ -108,47 +115,44 @@
self.batch_size = sample_size for i, factor in enumerate(self.inputs_sentences): - write_to_file(factor, os.path.join(self.model, C.DECODE_IN_NAME % i)) - write_to_file(self.target_sentences, os.path.join(self.model, C.DECODE_REF_NAME)) + write_to_file(factor, os.path.join(model_folder, C.DECODE_IN_NAME % i)) + write_to_file(self.target_sentences, os.path.join(model_folder, C.DECODE_REF_NAME)) self.inputs_sentences = list(zip(*self.inputs_sentences)) # type: List[List[str]] - logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d, context=%s)", - max_input_len if max_input_len is not None else -1, beam_size, model, len(self.target_sentences), - context) + scorer = inference.CandidateScorer( + length_penalty_alpha=length_penalty_alpha, + length_penalty_beta=length_penalty_beta, + brevity_penalty_weight=0.0, + prefix='scorer_') + + # TODO: possibly support decoding on multiple GPUs + self.translator = inference.Translator( + batch_size=self.batch_size, + context=context, + ensemble_mode=self.ensemble_mode, + scorer=scorer, + beam_search_stop='all', + nbest_size=self.nbest_size, + models=[self.model], + source_vocabs=source_vocabs, + target_vocab=target_vocab, + restrict_lexicon=None, + hybridize=hybridize) + + logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, num_sentences=%d)", + max_input_len if max_input_len is not None else -1, beam_size, len(self.target_sentences)) def decode_and_evaluate(self, - checkpoint: Optional[int] = None, output_name: str = os.devnull) -> Dict[str, float]: """ Decodes data set and evaluates given a checkpoint. - :param checkpoint: Checkpoint to load parameters from. :param output_name: Filename to write translations to. Defaults to /dev/null. :return: Mapping of metric names to scores. """ - models, source_vocabs, target_vocab = inference.load_models( - self.context, - self.max_input_len, - self.beam_size, - self.batch_size, - [self.model], - [checkpoint], - softmax_temperature=self.softmax_temperature, - max_output_length_num_stds=self.max_output_length_num_stds) - translator = inference.Translator(context=self.context, - ensemble_mode=self.ensemble_mode, - bucket_source_width=self.bucket_width_source, - length_penalty=inference.LengthPenalty(self.length_penalty_alpha, self.length_penalty_beta), - brevity_penalty=inference.BrevityPenalty(weight=0.0), - beam_prune=0.0, - beam_search_stop='all', - nbest_size=self.nbest_size, - models=models, - source_vocabs=source_vocabs, - target_vocab=target_vocab, - restrict_lexicon=None, - store_beam=False) + + # 1. Translate trans_wall_time = 0.0 translations = [] with data_io.smart_open(output_name, 'w') as output: @@ -157,27 +161,27 @@ def decode_and_evaluate(self, trans_inputs = [] # type: List[inference.TranslatorInput] for i, inputs in enumerate(self.inputs_sentences): trans_inputs.append(sockeye.inference.make_input_from_multiple_strings(i, inputs)) - trans_outputs = translator.translate(trans_inputs) + trans_outputs = self.translator.translate(trans_inputs) trans_wall_time = time.time() - tic for trans_input, trans_output in zip(trans_inputs, trans_outputs): handler.handle(trans_input, trans_output) translations.append(trans_output.translation) avg_time = trans_wall_time / len(self.target_sentences) - # TODO(fhieber): eventually add more metrics (METEOR etc.) - return {C.BLEU_VAL: evaluate.raw_corpus_bleu(hypotheses=translations, - references=self.target_sentences, - offset=0.01), - C.CHRF_VAL: evaluate.raw_corpus_chrf(hypotheses=translations, + # 2. 
Evaluate
+        return {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations,
+                                                 references=self.target_sentences,
+                                                 offset=0.01),
+                C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations,
+                                                 references=self.target_sentences),
+                C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations,
+                                                     references=self.target_sentences),
+                C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations,
+                                                     references=self.target_sentences),
+                C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations,
                                                      references=self.target_sentences),
-                C.ROUGE_1_VAL: evaluate.raw_corpus_rouge1(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.ROUGE_2_VAL: evaluate.raw_corpus_rouge2(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.ROUGE_L_VAL: evaluate.raw_corpus_rougel(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.LENRATIO_VAL: evaluate.raw_corpus_length_ratio(hypotheses=translations,
-                                                                 references=self.target_sentences),
+                C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations,
+                                                             references=self.target_sentences),
                 C.AVG_TIME: avg_time,
                 C.DECODING_TIME: trans_wall_time}
diff --git a/sockeye/config.py b/sockeye/config.py
index dcbf99140..31adeb0a4 100644
--- a/sockeye/config.py
+++ b/sockeye/config.py
@@ -1,4 +1,4 @@
-# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You may not
 # use this file except in compliance with the License. A copy of the License
@@ -31,17 +31,12 @@ def __init__(cls, name, bases, kwds):

 class Config(yaml.YAMLObject, metaclass=TaggedYamlObjectMetaclass):
     """
-    Base configuration object that supports freezing of members and YAML (de-)serialization.
+    Base configuration object supporting YAML (de-)serialization.
     Actual Configuration should subclass this object.
     """

     yaml_loader = yaml.UnsafeLoader  # type: ignore

-    def __init__(self):
-        self.__add_frozen()
-
     def __setattr__(self, key, value):
-        if hasattr(self, '_frozen') and getattr(self, '_frozen'):
-            raise AttributeError("Cannot set '%s' in frozen config" % key)
         if value == self:
             raise AttributeError("Cannot set self as attribute")
         object.__setattr__(self, key, value)
@@ -58,17 +53,6 @@ def __setstate__(self, state):
             if not hasattr(self, param_name):
                 object.__setattr__(self, param_name, param.default)

-    def freeze(self):
-        """
-        Freezes this Config object, disallowing modification or addition of any parameters.
-        """
-        if getattr(self, '_frozen'):
-            return
-        object.__setattr__(self, "_frozen", True)
-        for k, v in self.__dict__.items():
-            if isinstance(v, Config) and k != "self":
-                v.freeze()  # pylint: disable= no-member
-
     def __repr__(self):
         return "Config[%s]" % ", ".join("%s=%s" % (str(k), str(v)) for k, v in sorted(self.__dict__.items()))

@@ -83,46 +67,26 @@ def __eq__(self, other):
                 return False
         return True

-    def __del_frozen(self):
-        """
-        Removes _frozen attribute from this instance and all its child configurations.
-        """
-        self.__delattr__('_frozen')
-        for attr, val in self.__dict__.items():
-            if isinstance(val, Config) and hasattr(val, '_frozen'):
-                val.__del_frozen()  # pylint: disable= no-member
-
-    def __add_frozen(self):
-        """
-        Adds _frozen attribute to this instance and all its child configurations.
- """ - setattr(self, "_frozen", False) - for attr, val in self.__dict__.items(): - if isinstance(val, Config): - val.__add_frozen() # pylint: disable= no-member - def save(self, fname: str): """ - Saves this Config (without the frozen state) to a file called fname. + Saves this Config to a file called fname. :param fname: Name of file to store this Config in. """ obj = copy.deepcopy(self) - obj.__del_frozen() with open(fname, 'w') as out: yaml.dump(obj, out, default_flow_style=False) @staticmethod def load(fname: str) -> 'Config': """ - Returns a Config object loaded from a file. The loaded object is not frozen. + Returns a Config object loaded from a file. :param fname: Name of file to load the Config from. :return: Configuration. """ with open(fname) as inp: obj = yaml.load(inp, Loader=yaml.UnsafeLoader) # type: ignore - obj.__add_frozen() return obj def copy(self, **kwargs): diff --git a/sockeye/constants.py b/sockeye/constants.py index ad5c07cbf..5fd57db1c 100644 --- a/sockeye/constants.py +++ b/sockeye/constants.py @@ -19,17 +19,24 @@ import mxnet as mx import numpy as np +# MXNet environment variables +MXNET_SAFE_ACCUMULATION = 'MXNET_SAFE_ACCUMULATION' + +# Horovod environment variables +HOROVOD_HIERARCHICAL_ALLREDUCE = 'HOROVOD_HIERARCHICAL_ALLREDUCE' +HOROVOD_HIERARCHICAL_ALLGATHER = 'HOROVOD_HIERARCHICAL_ALLGATHER' + BOS_SYMBOL = "" EOS_SYMBOL = "" UNK_SYMBOL = "" PAD_SYMBOL = "" PAD_ID = 0 PAD_FORMAT = "" -POINTER_FORMAT = "" -POINTER_PATTERN = "\d+)>" - TOKEN_SEPARATOR = " " VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL] +UNK_ID = VOCAB_SYMBOLS.index(UNK_SYMBOL) +BOS_ID = VOCAB_SYMBOLS.index(BOS_SYMBOL) +EOS_ID = VOCAB_SYMBOLS.index(EOS_SYMBOL) # reserve extra space for the EOS or BOS symbol that is added to both source and target SPACE_FOR_XOS = 1 @@ -40,13 +47,7 @@ EMBEDDING_PREFIX = "embed_" ATTENTION_PREFIX = "att_" COVERAGE_PREFIX = "cov_" -BIDIRECTIONALRNN_PREFIX = ENCODER_PREFIX + "birnn_" -STACKEDRNN_PREFIX = ENCODER_PREFIX + "rnn_" -FORWARD_PREFIX = "forward_" -REVERSE_PREFIX = "reverse_" TRANSFORMER_ENCODER_PREFIX = ENCODER_PREFIX + "transformer_" -CNN_ENCODER_PREFIX = ENCODER_PREFIX + "cnn_" -CHAR_SEQ_ENCODER_PREFIX = ENCODER_PREFIX + "char_" DEFAULT_OUTPUT_LAYER_PREFIX = "target_output_" LENRATIOS_OUTPUT_LAYER_PREFIX = "length_ratio_layer_" @@ -59,31 +60,20 @@ # source factors SOURCE_FACTORS_COMBINE_SUM = 'sum' +SOURCE_FACTORS_COMBINE_AVERAGE = 'average' SOURCE_FACTORS_COMBINE_CONCAT = 'concat' -SOURCE_FACTORS_COMBINE_CHOICES = [SOURCE_FACTORS_COMBINE_SUM, SOURCE_FACTORS_COMBINE_CONCAT] +SOURCE_FACTORS_COMBINE_CHOICES = [SOURCE_FACTORS_COMBINE_SUM, + SOURCE_FACTORS_COMBINE_AVERAGE, + SOURCE_FACTORS_COMBINE_CONCAT] # encoder names (arguments) -RNN_NAME = "rnn" -RNN_WITH_CONV_EMBED_NAME = "rnn-with-conv-embed" TRANSFORMER_TYPE = "transformer" -CONVOLUTION_TYPE = "cnn" -TRANSFORMER_WITH_CONV_EMBED_TYPE = "transformer-with-conv-embed" -IMAGE_PRETRAIN_TYPE = "image-pretrain-cnn" # available encoders -ENCODERS = [RNN_NAME, RNN_WITH_CONV_EMBED_NAME, TRANSFORMER_TYPE, TRANSFORMER_WITH_CONV_EMBED_TYPE, CONVOLUTION_TYPE, IMAGE_PRETRAIN_TYPE] +ENCODERS = [TRANSFORMER_TYPE] # available decoder -DECODERS = [RNN_NAME, TRANSFORMER_TYPE, CONVOLUTION_TYPE] - -# rnn types -LSTM_TYPE = 'lstm' -LNLSTM_TYPE = 'lnlstm' -LNGLSTM_TYPE = 'lnglstm' -GRU_TYPE = 'gru' -LNGRU_TYPE = 'lngru' -LNGGRU_TYPE = 'lnggru' -CELL_TYPES = [LSTM_TYPE, LNLSTM_TYPE, LNGLSTM_TYPE, GRU_TYPE, LNGRU_TYPE, LNGGRU_TYPE] +DECODERS = [TRANSFORMER_TYPE] # positional embeddings 
NO_POSITIONAL_EMBEDDING = "none" @@ -113,71 +103,27 @@ EMBED_INIT_TYPES = [EMBED_INIT_DEFAULT, EMBED_INIT_NORMAL] DEFAULT_NUM_EMBED = 512 -# RNN init types -RNN_INIT_PATTERN = ".*h2h.*" -RNN_INIT_ORTHOGONAL = 'orthogonal' -RNN_INIT_ORTHOGONAL_STACKED = 'orthogonal_stacked' -# use the default initializer used also for all other weights -RNN_INIT_DEFAULT = 'default' - -# RNN decoder state init types -RNN_DEC_INIT_ZERO = "zero" -RNN_DEC_INIT_LAST = "last" -RNN_DEC_INIT_AVG = "avg" -RNN_DEC_INIT_CHOICES = [RNN_DEC_INIT_ZERO, RNN_DEC_INIT_LAST, RNN_DEC_INIT_AVG] - -# attention types -ATT_BILINEAR = 'bilinear' -ATT_DOT = 'dot' -ATT_MH_DOT = 'mhdot' -ATT_FIXED = 'fixed' -ATT_LOC = 'location' -ATT_MLP = 'mlp' -ATT_COV = "coverage" -ATT_TYPES = [ATT_BILINEAR, ATT_DOT, ATT_MH_DOT, ATT_FIXED, ATT_LOC, ATT_MLP, ATT_COV] - # weight tying components WEIGHT_TYING_SRC = 'src' WEIGHT_TYING_TRG = 'trg' WEIGHT_TYING_SOFTMAX = 'softmax' # weight tying types (combinations of above components): +WEIGHT_TYING_NONE = 'none' WEIGHT_TYING_TRG_SOFTMAX = 'trg_softmax' WEIGHT_TYING_SRC_TRG = 'src_trg' WEIGHT_TYING_SRC_TRG_SOFTMAX = 'src_trg_softmax' +WEIGHT_TYING_TYPES = [WEIGHT_TYING_NONE, WEIGHT_TYING_SRC_TRG_SOFTMAX, WEIGHT_TYING_SRC_TRG, WEIGHT_TYING_TRG_SOFTMAX] # default decoder prefixes -RNN_DECODER_PREFIX = DECODER_PREFIX + "rnn_" TRANSFORMER_DECODER_PREFIX = DECODER_PREFIX + "transformer_" -CNN_DECODER_PREFIX = DECODER_PREFIX + "cnn_" # Activation types -# Gaussian Error Linear Unit (https://arxiv.org/pdf/1606.08415.pdf) -GELU = "gelu" -# Gated Linear Unit (https://arxiv.org/pdf/1705.03122.pdf) -GLU = "glu" RELU = "relu" -SIGMOID = "sigmoid" -SOFT_RELU = "softrelu" # Swish-1/SiLU (https://arxiv.org/pdf/1710.05941.pdf, https://arxiv.org/pdf/1702.03118.pdf) SWISH1 = "swish1" -TANH = "tanh" -TRANSFORMER_ACTIVATION_TYPES = [GELU, RELU, SWISH1] -CNN_ACTIVATION_TYPES = [GLU, RELU, SIGMOID, SOFT_RELU, TANH] - -# Convolutional block pad types: -CNN_PAD_LEFT = "left" -CNN_PAD_CENTERED = "centered" - -# coverage types -COVERAGE_COUNT = "count" -COVERAGE_FERTILITY = "fertility" -COVERAGE_TYPES = [TANH, - SIGMOID, - RELU, - SOFT_RELU, - GRU_TYPE, - COVERAGE_COUNT, - COVERAGE_FERTILITY] +# Gaussian Error Linear Unit (https://arxiv.org/pdf/1606.08415.pdf) +GELU = "gelu" +TRANSFORMER_ACTIVATION_TYPES = [RELU, SWISH1, GELU] # default I/O variable names SOURCE_NAME = "source" @@ -220,6 +166,12 @@ BEAM_SEARCH_STOP_FIRST = 'first' BEAM_SEARCH_STOP_ALL = 'all' +# State structure constants +STEP_STATE = 's' +BIAS_STATE = 'b' +ENCODER_STATE = 'e' +DECODER_STATE = 'd' + # Inference Input JSON constants JSON_TEXT_KEY = "text" JSON_FACTORS_KEY = "factors" @@ -233,6 +185,7 @@ VERSION_NAME = "version" CONFIG_NAME = "config" +CONFIG_NAME_FLOAT32 = CONFIG_NAME + ".float32" LOG_NAME = "log" JSON_SUFFIX = ".json" VOCAB_SRC_PREFIX = "vocab.src" @@ -243,6 +196,7 @@ PARAMS_PREFIX = "params." 
PARAMS_NAME = PARAMS_PREFIX + "%05d" PARAMS_BEST_NAME = "params.best" +PARAMS_BEST_NAME_FLOAT32 = PARAMS_BEST_NAME + ".float32" DECODE_OUT_NAME = "decode.output.%05d" DECODE_IN_NAME = "decode.source.%d" DECODE_REF_NAME = "decode.target" @@ -260,6 +214,7 @@ BUCKET_ITER_STATE_NAME = "bucket.pkl" RNG_STATE_NAME = "rng.pkl" TRAINING_STATE_NAME = "training.pkl" +AMP_LOSS_SCALER_STATE_NAME = "amp_loss_scaler.pkl" SCHEDULER_STATE_NAME = "scheduler.pkl" TRAINING_STATE_PARAMS_NAME = "params" ARGS_STATE_NAME = "args.yaml" @@ -269,9 +224,8 @@ "align_plot_prefix", "sure_align_threshold", "keep_last_params", "seed", "max_updates", "min_updates", - "max_seconds", "max_num_epochs", "min_num_epochs", - "max_samples", "min_samples", "max_checkpoints"] + "max_samples", "min_samples", "max_checkpoints", "max_seconds"] # Other argument constants TRAINING_ARG_SOURCE = "--source" @@ -295,11 +249,6 @@ # Used to delimit factors on STDIN for inference DEFAULT_FACTOR_DELIMITER = '|' -# data layout strings -BATCH_MAJOR_IMAGE = "NCHW" -BATCH_MAJOR = "NTC" -TIME_MAJOR = "TNC" - BATCH_TYPE_SENTENCE = "sentence" BATCH_TYPE_WORD = "word" @@ -315,32 +264,18 @@ # Training constants OPTIMIZER_ADAM = "adam" -OPTIMIZER_EVE = "eve" -OPTIMIZER_NADAM = "nadam" -OPTIMIZER_RMSPROP = "rmsprop" OPTIMIZER_SGD = "sgd" -OPTIMIZER_NAG = "nag" -OPTIMIZER_ADAGRAD = "adagrad" -OPTIMIZER_ADADELTA = "adadelta" -OPTIMIZERS = [OPTIMIZER_ADAM, OPTIMIZER_EVE, OPTIMIZER_NADAM, OPTIMIZER_RMSPROP, OPTIMIZER_SGD, OPTIMIZER_NAG, - OPTIMIZER_ADAGRAD, OPTIMIZER_ADADELTA] - -LR_SCHEDULER_FIXED_RATE_INV_SQRT_T = "fixed-rate-inv-sqrt-t" -LR_SCHEDULER_FIXED_RATE_INV_T = "fixed-rate-inv-t" -LR_SCHEDULER_FIXED_STEP = "fixed-step" -LR_SCHEDULER_PLATEAU_REDUCE = "plateau-reduce" -LR_SCHEDULERS = [LR_SCHEDULER_FIXED_RATE_INV_SQRT_T, - LR_SCHEDULER_FIXED_RATE_INV_T, - LR_SCHEDULER_FIXED_STEP, +OPTIMIZERS = [OPTIMIZER_ADAM, OPTIMIZER_SGD] + +LR_SCHEDULER_NONE = 'none' +LR_SCHEDULER_INV_SQRT_DECAY = 'inv-sqrt-decay' +LR_SCHEDULER_LINEAR_DECAY = 'linear-decay' +LR_SCHEDULER_PLATEAU_REDUCE = 'plateau-reduce' +LR_SCHEDULERS = [LR_SCHEDULER_NONE, + LR_SCHEDULER_INV_SQRT_DECAY, + LR_SCHEDULER_LINEAR_DECAY, LR_SCHEDULER_PLATEAU_REDUCE] -LR_DECAY_OPT_STATES_RESET_OFF = 'off' -LR_DECAY_OPT_STATES_RESET_INITIAL = 'initial' -LR_DECAY_OPT_STATES_RESET_BEST = 'best' -LR_DECAY_OPT_STATES_RESET_CHOICES = [LR_DECAY_OPT_STATES_RESET_OFF, - LR_DECAY_OPT_STATES_RESET_INITIAL, - LR_DECAY_OPT_STATES_RESET_BEST] - GRADIENT_CLIPPING_TYPE_ABS = 'abs' GRADIENT_CLIPPING_TYPE_NORM = 'norm' GRADIENT_CLIPPING_TYPE_NONE = 'none' @@ -350,27 +285,20 @@ GRADIENT_COMPRESSION_2BIT = "2bit" GRADIENT_COMPRESSION_TYPES = [GRADIENT_CLIPPING_TYPE_NONE, GRADIENT_COMPRESSION_2BIT] +HOROVOD_SECONDARY_WORKERS_DIRNAME = 'secondary_workers' + # output handler OUTPUT_HANDLER_TRANSLATION = "translation" OUTPUT_HANDLER_TRANSLATION_WITH_SCORE = "translation_with_score" -OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENTS = "translation_with_alignments" -OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENT_MATRIX = "translation_with_alignment_matrix" OUTPUT_HANDLER_SCORE = "score" OUTPUT_HANDLER_PAIR_WITH_SCORE = "pair_with_score" OUTPUT_HANDLER_BENCHMARK = "benchmark" -OUTPUT_HANDLER_ALIGN_PLOT = "align_plot" -OUTPUT_HANDLER_ALIGN_TEXT = "align_text" OUTPUT_HANDLER_BEAM_STORE = "beam_store" OUTPUT_HANDLER_JSON = "json" OUTPUT_HANDLERS = [OUTPUT_HANDLER_TRANSLATION, OUTPUT_HANDLER_SCORE, OUTPUT_HANDLER_TRANSLATION_WITH_SCORE, - OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENTS, - OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENT_MATRIX, 
OUTPUT_HANDLER_BENCHMARK, - OUTPUT_HANDLER_ALIGN_PLOT, - OUTPUT_HANDLER_ALIGN_TEXT, - OUTPUT_HANDLER_BEAM_STORE, OUTPUT_HANDLER_JSON] OUTPUT_HANDLERS_SCORING = [OUTPUT_HANDLER_SCORE, OUTPUT_HANDLER_PAIR_WITH_SCORE] @@ -385,15 +313,9 @@ ROUGE1 = 'rouge1' ROUGE2 = 'rouge2' ROUGEL = 'rougel' -BLEU_VAL = BLEU + "-val" -CHRF_VAL = CHRF + "-val" -ROUGE_VAL = ROUGE + "-val" -ROUGE_1_VAL = ROUGE1 + "-val" -ROUGE_2_VAL = ROUGE2 + "-val" -ROUGE_L_VAL = ROUGEL + "-val" -LENRATIO_VAL = 'length-ratio-mse' -AVG_TIME = "avg-sec-per-sent-val" -DECODING_TIME = "decode-walltime-val" +LENRATIO = 'length-ratio-mse' +AVG_TIME = "avg-sec-per-sent" +DECODING_TIME = "decode-walltime" METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1] METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False} METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf} @@ -415,6 +337,7 @@ TARGET_MAX_LENGTH_FACTOR = 2 DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH = 2 +DTYPE_INT8 = 'int8' DTYPE_FP16 = 'float16' DTYPE_FP32 = 'float32' LARGE_POSITIVE_VALUE = 99999999. @@ -423,31 +346,42 @@ # Something at the middle of 32768