diff --git a/.github/workflows/push_pr.yml b/.github/workflows/push_pr.yml
index 2c991e5dd..9b1595624 100644
--- a/.github/workflows/push_pr.yml
+++ b/.github/workflows/push_pr.yml
@@ -2,11 +2,9 @@ name: push and pull request testing
 on:
   push:
     branches:
-      - sockeye_2
       - master
   pull_request:
     branches:
-      - sockeye_2
       - master
 
 jobs:
diff --git a/.gitignore b/.gitignore
index 4069c48b0..99d7f0b7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,5 +18,3 @@
 .pytest_cache
 tags
 sockeye/__pycache__
-git_version.py
-
diff --git a/.travis.yml b/.travis.yml
index 3704e87e5..8d7989d31 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,6 @@ before_install:
   - docker pull ubuntu:16.04
 
 python:
-  - "3.4"
   - "3.5"
   - "3.6"
 
@@ -26,9 +25,7 @@ script:
   - mypy --version
   - mypy --ignore-missing-imports --follow-imports=silent @typechecked-files --no-strict-optional
   - check-manifest --ignore sockeye/git_version.py
-  - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:lstm:lstm" --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:transformer:transformer" --maxfail=1 test/system; fi
-  - if [ "$TRAVIS_EVENT_TYPE" != "cron" ]; then python -m pytest -k "Copy:cnn:cnn" --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then python -m pytest --maxfail=1 test/system; fi
   - if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then python -m sockeye_contrib.autopilot.test; fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2d62c214e..20e8cac1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
 # Changelog
+
 All notable changes to the project are documented in this file.
 
 Version numbers are of the form `1.0.0`.
@@ -10,63 +11,120 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
-## [1.18.115]
-### Added
-- Added requirements for MXnet compatible with cuda 10.1.
+## [2.1.7]
 
-## [1.18.114]
-### Fixed
-- Fix bug in prepare_train_data arguments.
+### Changed
 
-## [1.18.113]
-### Fixed
-- Added logging arguments for prepare_data CLI.
+- Optimize prepare_data by saving the shards in parallel. The prepare_data script accepts a new parameter `--max-processes` to control the level of parallelism with which shards are written to disk.
+
+## [2.1.6]
+
+### Changed
+
+- Updated Dockerfiles optimized for CPU (intgemm int8 inference, full MKL support) and GPU (distributed training with Horovod). See [sockeye_contrib/docker](sockeye_contrib/docker).
 
-## [1.18.112]
 ### Added
-- Option to suppress creation of logfiles for CLIs (`--no-logfile`).
 
-## [1.18.111]
+- Official support for int8 quantization with [intgemm](https://github.com/kpu/intgemm):
+  - This requires the "intgemm" fork of MXNet ([kpuatamazon/incubator-mxnet/intgemm](https://github.com/kpuatamazon/incubator-mxnet/tree/intgemm)). This is the version of MXNet used in the Sockeye CPU docker image (see [sockeye_contrib/docker](sockeye_contrib/docker)).
+  - Use `sockeye.translate --dtype int8` to quantize a trained float32 model at runtime.
+  - Use the `sockeye.quantize` CLI to annotate a float32 model with int8 scaling factors for fast runtime quantization.
+
+## [2.1.5]
+
+### Changed
+
+- Changed state caching for transformer models during beam search to cache states with attention heads already separated out. This avoids repeated transpose operations during decoding, leading to faster inference.
+
+## [2.1.4]
+
 ### Added
-
+- Added Dockerfiles that build an experimental CPU-optimized Sockeye image:
+  - Uses the latest versions of [kpuatamazon/incubator-mxnet](https://github.com/kpuatamazon/incubator-mxnet) (supports [intgemm](https://github.com/kpu/intgemm) and makes full use of Intel MKL) and [kpuatamazon/sockeye](https://github.com/kpuatamazon/sockeye) (supports int8 quantization for inference).
+  - See [sockeye_contrib/docker](sockeye_contrib/docker).
+
+## [2.1.3]
 
 ### Changed
-- Excluded gradients from pickled fields of TrainState
 
-## [1.18.110]
+- Performance optimizations to beam search inference
+  - Remove unneeded take ops on encoder states
+  - Gather input data before sending it to the GPU, rather than sending each batch element individually
+  - All of beam search can be done in fp16, if specified by the model
+  - Other small miscellaneous optimizations
+- Model states are now a flat list in ensemble inference, structure of states provided by `state_structure()`
+
+## [2.1.2]
+
 ### Changed
-- We now guard against failures to run `nvidia-smi` for GPU memory monitoring.
 
-## [1.18.109]
-### Fixed
-- Fixed the metric names by prefixing training metrics with 'train-' and validation metrics with 'val-'. Also restricted the custom logging function to accept only a dictionary and a compulsory global_step parameter.
+- Updated to [MXNet 1.6.0](https://github.com/apache/incubator-mxnet/tree/1.6.0)
+
+### Added
+
+- Added support for CUDA 10.2
+
+### Removed
+
+- Removed support for CUDA<9.1 / CUDNN<7.5
+
+## [2.1.1]
+
+### Added
+- Ability to set environment variables from training/translate CLIs before MXNet is imported. For example, users can
+  configure MXNet as follows: `--env "OMP_NUM_THREADS=1;MXNET_ENGINE_TYPE=NaiveEngine"`
+
+## [2.1.0]
 
-## [1.18.108]
 ### Changed
-- More verbose log messages about target token counts.
 
-## [1.18.107]
+- Version bump, which should have been included in commit b0461b due to incompatible models.
+
+## [2.0.1]
+
 ### Changed
-- Updated to [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
 
-## [1.18.106]
-### Added
-- Added an optional time limit for stopping training. The training will stop at the next checkpoint after reaching the time limit.
+- Inference defaults to using the max input length observed in training (versus scaling down based on mean length ratio and standard deviations).
 
-## [1.18.105]
 ### Added
-- Added support for a possibility to have a custom metrics logger - a function passed as an extra parameter. If supplied, the logger is called during training.
 
-## [1.18.104]
+- Additional parameter fixing strategies:
+  - `all_except_feed_forward`: Only train feed forward layers.
+  - `encoder_and_source_embeddings`: Only train the decoder (decoder layers, output layer, and target embeddings).
+  - `encoder_half_and_source_embeddings`: Train the latter half of encoder layers and the decoder.
+- Option to specify the number of CPU threads without using an environment variable (`--omp-num-threads`).
+- More flexibility for combining source factors
+
+## [2.0.0]
+
 ### Changed
-- Implemented an attention-based copy mechanism as described in [Jia, Robin, and Percy Liang. "Data recombination for neural semantic parsing." (2016)](https://arxiv.org/abs/1606.03622).
-- Added a special symbol to explicitly point at an input token in the target sequence
-- Changed the decoder interface to pass both the decoder data and the pointer data.
-- Changed the AttentionState named tuple to add the raw attention scores.
+
+- Update to [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
+- Moved `SockeyeModel` implementation and all layers to [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html)
+- Removed support for Python 3.4.
+- Removed image captioning module
+- Removed outdated Autopilot module
+- Removed unused training options: Eve, Nadam, RMSProp, Nag, Adagrad, and Adadelta optimizers, `fixed-step` and `fixed-rate-inv-t` learning rate schedulers
+- Updated and renamed learning rate scheduler `fixed-rate-inv-sqrt-t` -> `inv-sqrt-decay`
+- Added script for plotting metrics files: [sockeye_contrib/plot_metrics.py](sockeye_contrib/plot_metrics.py)
+- Removed option `--weight-tying`. Weight tying is enabled by default, disable with `--weight-tying-type none`.
+
+### Added
+
+- Added distributed training support with Horovod/OpenMPI. Use `horovodrun` and the `--horovod` training flag.
+- Added Dockerfiles that build a Sockeye image with all features enabled. See [sockeye_contrib/docker](sockeye_contrib/docker).
+- Added `none` learning rate scheduler (use a fixed rate throughout training)
+- Added `linear-decay` learning rate scheduler
+- Added training option `--learning-rate-t-scale` for time-based decay schedulers
+- Added support for MXNet's [Automatic Mixed Precision](https://mxnet.incubator.apache.org/versions/master/tutorials/amp/amp_tutorial.html). Activate with the `--amp` training flag. For best results, make sure as many model dimensions as possible are multiples of 8.
+- Added options for making various model dimensions multiples of a given value. For example, use `--pad-vocab-to-multiple-of 8`, `--bucket-width 8 --no-bucket-scaling`, and `--round-batch-sizes-to-multiple-of 8` with AMP training.
+- Added [GluonNLP](http://gluon-nlp.mxnet.io/)'s BERTAdam optimizer, an implementation of the Adam variant used by Devlin et al. ([2018](https://arxiv.org/pdf/1810.04805.pdf)). Use `--optimizer bertadam`.
+- Added training option `--checkpoint-improvement-threshold` to set the amount of metric improvement required over the window of previous checkpoints to be considered actual model improvement (used with `--max-num-checkpoint-not-improved`).
 
 ## [1.18.103]
 ### Added
-- Added ability to score image-sentence pairs by extending the scoring feature originally implemented for machine
+- Added ability to score image-sentence pairs by extending the scoring feature originally implemented for machine translation to the image captioning module.
 
 ## [1.18.102]
@@ -95,7 +153,7 @@ Each version section may have subsections for: _Added_, _Changed_, _Removed
 
 ## [1.18.96]
 ### Changed
-- Extracted prepare vocab functionality in the build vocab step into its own function. This matches the pattern in prepare data and train where the main() function only has argparsing, and it invokes a separate function to do the work. This is to allow modules that import this one to circumvent the command line.
+- Extracted prepare vocab functionality in the build vocab step into its own function. This matches the pattern in prepare data and train where the main() function only has argparsing, and it invokes a separate function to do the work. This is to allow modules that import this one to circumvent the command line.
 ## [1.18.95]
 ### Changed
diff --git a/MANIFEST.in b/MANIFEST.in
index 5f8e3c773..f8ba0012b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,6 +8,7 @@ include .flake8
 include typechecked-files
 include test/data/config_with_missing_attributes.yaml
 include sockeye/git_version.py
+include *.bib
 recursive-include .github *
 include CONTRIBUTING.md
 exclude *.sh
@@ -21,8 +22,8 @@ recursive-include docs *.html
 recursive-include docs *.png
 recursive-include docs *.md
 recursive-include docs *.py
-recursive-include docs *.sh
 recursive-include docs *.yml
 recursive-include docs *.ico
 recursive-include docs *.css
 recursive-include test *.txt
+include docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh
diff --git a/README.md b/README.md
index 868b646c3..fbd42dd0d 100644
--- a/README.md
+++ b/README.md
@@ -6,29 +6,87 @@
 [![Build Status](https://travis-ci.org/awslabs/sockeye.svg?branch=master)](https://travis-ci.org/awslabs/sockeye)
 [![Documentation Status](https://readthedocs.org/projects/sockeye/badge/?version=latest)](http://sockeye.readthedocs.io/en/latest/?badge=latest)
 
-This package contains the Sockeye project, a sequence-to-sequence framework for Neural Machine Translation based on Apache MXNet (Incubating).
-It implements state-of-the-art encoder-decoder architectures, such as:
+This package contains the Sockeye project, an open-source sequence-to-sequence framework for Neural Machine Translation based on [Apache MXNet (Incubating)](http://mxnet.incubator.apache.org/). Sockeye powers several Machine Translation use cases, including [Amazon Translate](https://aws.amazon.com/translate/). The framework implements state-of-the-art machine translation models with Transformers ([Vaswani et al, 2017](https://arxiv.org/abs/1706.03762)). Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
 
-- Deep Recurrent Neural Networks with Attention [[Bahdanau, '14](https://arxiv.org/abs/1409.0473)]
-- Transformer Models with self-attention [[Vaswani et al, '17](https://arxiv.org/abs/1706.03762)]
-- Fully convolutional sequence-to-sequence models [[Gehring et al, '17](https://arxiv.org/abs/1705.03122)]
+If you have any questions or discover problems, please [file an issue](https://github.com/awslabs/sockeye/issues/new). You can also send questions to *sockeye-dev-at-amazon-dot-com*.
 
-In addition, it provides an experimental [image-to-description module](https://github.com/awslabs/sockeye/tree/master/sockeye/image_captioning) that can be used for image captioning.
-Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
+#### Version 2.0
 
-If you have any questions or discover problems, please [file an issue](https://github.com/awslabs/sockeye/issues/new).
-You can also send questions to *sockeye-dev-at-amazon-dot-com*.
+With version 2.0, we have updated the usage of MXNet by moving to the [Gluon API](https://mxnet.incubator.apache.org/api/python/docs/api/gluon/index.html) and adding support for several state-of-the-art features such as distributed training, low-precision training and decoding, as well as easier debugging of neural network architectures.
+In the context of this rewrite, we also trimmed down the large feature set of version 1.18.x to concentrate on the most important types of models and features, to provide a maintainable framework that is suitable for fast prototyping, research, and production.
+We welcome pull requests if you would like to help add back features as needed.
+
+## Installation
+
+The easiest way to run Sockeye is with [Docker](https://www.docker.com) or [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+To build a Sockeye image with all features enabled, run the build script:
+
+```bash
+python3 sockeye_contrib/docker/build.py
+```
+
+See the [Dockerfile documentation](sockeye_contrib/docker) for more information.
 
 ## Documentation
 
 For information on how to use Sockeye, please visit [our documentation](https://awslabs.github.io/sockeye/).
-Developers may be interested in our [developer guidelines](https://awslabs.github.io/sockeye/development.html).
+
+- For a quickstart guide to training a large data WMT model, see the [WMT 2018 German-English tutorial](https://awslabs.github.io/sockeye/tutorials/wmt_large.html).
+- Developers may be interested in our [developer guidelines](https://awslabs.github.io/sockeye/development.html).
 
 ## Citation
 
-For technical information about Sockeye, see our paper on the arXiv ([BibTeX](sockeye.bib)):
+For more information about Sockeye 2, see our paper ([BibTeX](sockeye2.bib)):
+
+> Felix Hieber, Tobias Domhan, Michael Denkowski, David Vilar. 2020.
+> [Sockeye 2: A Toolkit for Neural Machine Translation](https://www.amazon.science/publications/sockeye-2-a-toolkit-for-neural-machine-translation). To appear in EAMT 2020, project track.
+
+For technical information about Sockeye 1, see our paper on the arXiv ([BibTeX](sockeye.bib)):
 
 > Felix Hieber, Tobias Domhan, Michael Denkowski, David Vilar, Artem Sokolov, Ann Clifton and Matt Post. 2017.
 > [Sockeye: A Toolkit for Neural Machine Translation](https://arxiv.org/abs/1712.05690). ArXiv e-prints.
+
+## Research with Sockeye
+
+Sockeye has been used for both academic and industrial research. A list of known publications that use Sockeye is shown below.
+If you know more, please let us know or submit a pull request (last updated: April 2020).
+
+### 2020
+
+* Dinu, Georgiana, Prashant Mathur, Marcello Federico, Stanislas Lauly, Yaser Al-Onaizan. "Joint translation and unit conversion for end-to-end localization." arXiv preprint arXiv:2004.05219 (2020)
+* Hisamoto, Sorami, Matt Post, Kevin Duh. "Membership Inference Attacks on Sequence-to-Sequence Models: Is My Data In Your Machine Translation System?" Transactions of the Association for Computational Linguistics, Volume 8 (2020)
+* Naradowsky, Jason, Xuan Zhan, Kevin Duh. "Machine Translation System Selection from Bandit Feedback." arXiv preprint arXiv:2002.09646 (2020)
+* Niu, Xing, Marine Carpuat. "Controlling Neural Machine Translation Formality with Synthetic Supervision." Proceedings of AAAI (2020)
+
+### 2019
+
+* Agrawal, Sweta, Marine Carpuat. "Controlling Text Complexity in Neural Machine Translation." Proceedings of EMNLP (2019)
+* Beck, Daniel, Trevor Cohn, Gholamreza Haffari. "Neural Speech Translation using Lattice Transformations and Graph Networks." Proceedings of TextGraphs-13 (EMNLP 2019)
+* Currey, Anna, Kenneth Heafield. "Zero-Resource Neural Machine Translation with Monolingual Pivot Data." Proceedings of EMNLP (2019)
+* Gupta, Prabhakar, Mayank Sharma. "Unsupervised Translation Quality Estimation for Digital Entertainment Content Subtitles." IEEE International Journal of Semantic Computing (2019)
+* Hu, J. Edward, Huda Khayrallah, Ryan Culkin, Patrick Xia, Tongfei Chen, Matt Post, and Benjamin Van Durme. "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting." Proceedings of NAACL-HLT (2019)
+* Rosendahl, Jan, Christian Herold, Yunsu Kim, Miguel Graça, Weiyue Wang, Parnia Bahar, Yingbo Gao and Hermann Ney. "The RWTH Aachen University Machine Translation Systems for WMT 2019." Proceedings of the 4th WMT: Research Papers (2019)
+* Thompson, Brian, Jeremy Gwinnup, Huda Khayrallah, Kevin Duh, and Philipp Koehn. "Overcoming catastrophic forgetting during domain adaptation of neural machine translation." Proceedings of NAACL-HLT 2019 (2019)
+* Tättar, Andre, Elizaveta Korotkova, Mark Fishel. "University of Tartu's Multilingual Multi-domain WMT19 News Translation Shared Task Submission." Proceedings of the 4th WMT: Research Papers (2019)
+
+### 2018
+
+* Domhan, Tobias. "How Much Attention Do You Need? A Granular Analysis of Neural Machine Translation Architectures." Proceedings of 56th ACL (2018)
+* Kim, Yunsu, Yingbo Gao, and Hermann Ney. "Effective Cross-lingual Transfer of Neural Machine Translation Models without Shared Vocabularies." arXiv preprint arXiv:1905.05475 (2019)
+* Korotkova, Elizaveta, Maksym Del, and Mark Fishel. "Monolingual and Cross-lingual Zero-shot Style Transfer." arXiv preprint arXiv:1808.00179 (2018)
+* Niu, Xing, Michael Denkowski, and Marine Carpuat. "Bi-directional neural machine translation with synthetic parallel data." arXiv preprint arXiv:1805.11213 (2018)
+* Niu, Xing, Sudha Rao, and Marine Carpuat. "Multi-Task Neural Models for Translating Between Styles Within and Across Languages." COLING (2018)
+* Post, Matt and David Vilar. "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation." Proceedings of NAACL-HLT (2018)
+* Schamper, Julian, Jan Rosendahl, Parnia Bahar, Yunsu Kim, Arne Nix, and Hermann Ney. "The RWTH Aachen University Supervised Machine Translation Systems for WMT 2018." Proceedings of the 3rd WMT: Shared Task Papers (2018)
+* Schulz, Philip, Wilker Aziz, and Trevor Cohn. "A stochastic decoder for neural machine translation." arXiv preprint arXiv:1805.10844 (2018)
+* Alkhouli, Tamer, Gabriel Bretschner, and Hermann Ney. "On The Alignment Problem In Multi-Head Attention-Based Neural Machine Translation." Proceedings of the 3rd WMT: Research Papers (2018)
+* Tang, Gongbo, Rico Sennrich, and Joakim Nivre. "An Analysis of Attention Mechanisms: The Case of Word Sense Disambiguation in Neural Machine Translation." Proceedings of 3rd WMT: Research Papers (2018)
+* Thompson, Brian, Huda Khayrallah, Antonios Anastasopoulos, Arya McCarthy, Kevin Duh, Rebecca Marvin, Paul McNamee, Jeremy Gwinnup, Tim Anderson, and Philipp Koehn. "Freezing Subnetworks to Analyze Domain Adaptation in Neural Machine Translation." arXiv preprint arXiv:1809.05218 (2018)
+* Vilar, David. "Learning Hidden Unit Contribution for Adapting Neural Machine Translation Models." Proceedings of NAACL-HLT (2018)
+* Vyas, Yogarshi, Xing Niu and Marine Carpuat. "Identifying Semantic Divergences in Parallel Text without Annotations." Proceedings of NAACL-HLT (2018)
+* Wang, Weiyue, Derui Zhu, Tamer Alkhouli, Zixuan Gan, and Hermann Ney. "Neural Hidden Markov Model for Machine Translation." Proceedings of 56th ACL (2018)
+* Zhang, Xuan, Gaurav Kumar, Huda Khayrallah, Kenton Murray, Jeremy Gwinnup, Marianna J Martindale, Paul McNamee, Kevin Duh, and Marine Carpuat. "An Empirical Exploration of Curriculum Learning for Neural Machine Translation." arXiv preprint arXiv:1811.00739 (2018)
+
+### 2017
+
+* Domhan, Tobias and Felix Hieber. "Using target-side monolingual data for neural machine translation through multi-task learning." Proceedings of EMNLP (2017).
diff --git a/docs/development.md b/docs/development.md
index b75c9dcfb..4add22b33 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -32,7 +32,8 @@ def foo(bar: <type>) -> <type>:
     """
 ```
 
-- When using MXNet operators, preceding symbolic statements in the code with the resulting, expected shape of the tensor greatly improves readability of the code:
+- Sockeye 2 uses the [Gluon API](http://mxnet.incubator.apache.org/versions/master/gluon/index.html).
+- When using MXNet operators, preceding symbolic or hybridizable statements in the code with the resulting, expected shape of the tensor greatly improves readability of the code:
 
 ```python
 # (batch_size, num_hidden)
@@ -43,8 +44,6 @@ data = mx.sym.reshape(data=data, shape=(-1))
 
 - The desired line length of Python modules should not exceed 120 characters.
 
-- When writing symbol-generating classes (such as encoders/decoders), initialize variables in the constructor of the class and re-use them in the class methods.
-
 - Make sure to pass unit tests before submitting a pull request.
 
 - Whenever reasonable, write py.test unit tests covering your contribution.
diff --git a/docs/image_captioning.md b/docs/image_captioning.md
deleted file mode 100644
index 922ed8693..000000000
--- a/docs/image_captioning.md
+++ /dev/null
@@ -1,163 +0,0 @@
----
-layout: default
----
-# Image Captioning
-
-Sockeye provides also a module to perform image captioning.
-It follows the same logic of sequence-to-sequence frameworks, which consist of encoder-decoder models.
-In this case the encoder takes an image instead of a sentence and encodes it in a feature representation.
-This is decoded with attention (optionally) using exactly the same models of Sockeye (RNNs, transformers, or CNNs).
-This tutorial explains how to train image captioning models.
-
-
-## Citation
-
-For technical information about the image captioning module, see our paper on the arXiv ([BibTeX](sockeye_captioning.bib)):
-
-> Loris Bazzani, Tobias Domhan, and Felix Hieber. 2018.
-> [Image Captioning as Neural Machine Translation Task in SOCKEYE](https://arxiv.org/abs/1810.04101). ArXiv e-prints.
-
-
-## Installation
-
-Follow the instructions to install Sockeye, and install further dependencies:
-
-```bash
-> sudo pip3 install Pillow
-```
-
-Optionally you can also install matplotlib for visualization:
-```bash
-> sudo pip3 install matplotlib
-```
-
-
-## Train
-
-In order to train your first image captioning model you will need two sets of parallel files: one for training
-and one for validation. The latter will be used for computing various metrics during training.
-Each set should consist of two files: one with source images and one with target sentences (captions).
-Both files should have the same number of lines, each line containing the relative path of the image and a single
-sentence, respectively. Each sentence should be a whitespace delimited list of tokens.
-
-First, you need to obtain the mxnet image models from the model gallery: https://github.com/dmlc/mxnet-model-gallery
-
-Then, we can extract features from them:
-```bash
-> python -m sockeye.image_captioning.extract_features \
-    --image-root /path/to/image/dataset/folder/ \
-    --input training_set.images \
-    --output-root /path/to/feature/cache/folder/ \
-    --output training_set.features \
-    --device-id 0 \
-    --batch-size 128 \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --image-encoder-layer stage4_unit3_conv3
-
-> python -m sockeye.image_captioning.extract_features \
-    --image-root /path/to/image/dataset/folder/ \
-    --input validation_set.images \
-    --output-root /path/to/feature/cache/folder/ \
-    --output validation_set.features \
-    --device-id 0 \
-    --batch-size 128 \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --image-encoder-layer stage4_unit3_conv3
-```
-In the option `--image-encoder-model-path`, `filename_prefix` should be the prefix of the MXNet model without `-symbol.json` or `-0000.params`.
-
-The script above will generate the features stored in `/path/to/feature/cache/` and a file `training_set.features` which contains the path to the features relative to `/path/to/feature/cache/`.
-Note that finetuning of the image model is not supported yet.
-
-
-Now we can train an one-layer LSTM with attention for image captioning model as follows:
-```bash
-> python -m sockeye.image_captioning.train \
-    --source-root /path/to/feature/cache/folder/ \
-    --source training_set.features \
-    --target training_set.captions \
-    --validation-source-root /path/to/feature/cache/folder/ \
-    --validation-source validation_set.features \
-    --validation-target validation_set.captions \
-    --batch-size 64 \
-    --initial-learning-rate 0.0003 \
-    --gradient-clipping-threshold 1.0 \
-    --bucket-width 5 \
-    --max-seq-len 1:60 \
-    --fill-up replicate \
-    --output models/ \
-    --encoder image-pretrain-cnn \
-    --rnn-num-hidden 512 \
-    --rnn-decoder-state-init zero \
-    --checkpoint-interval 200 \
-    --weight-normalization
-```
-Use the option `--load-all-features-to-memory` to load all the features to memory. This is possible depending on the size of the dataset/features and amount of available CPU memory.
-There is an initial overhead to load the feature (training does not start immediately), but with the big advantage that training is 15X-20X faster.
-
-You can add the options `--decode-and-evaluate 200 --max-output-length 60` to perform captioning of the part of the validation set (200 samples in this case) during training.
-
-## Image to Text
-
-Assuming that features were pre-extracted, you can do image captioning as follows:
-
-```bash
-> python -m sockeye.image_captioning.captioner \
-    --models models/ \
-    --input validation_set.features \
-    --source-root /path/to/feature/cache/folder/ \
-    --max-output-length 60 \
-    --batch-size 1024 \
-    --chunk-size 2048 \
-    --beam-size 3 > validation_set.predictions
-```
-
-This will take the best set of parameters found during training and then load the image provided in the STDIN and
-write the caption to STDOUT, which is redirected using `>` to the file `validation_set.predictions` overwriting its content if it exists already.
-
-You can also caption directly from image with the option `--extract-image-features` as follows:
-
-```bash
-> python -m sockeye.image_captioning.captioner \
-    --extract-image-features \
-    --source-image-size 3 224 224 \
-    --image-encoder-model-path /path/to/mxnet/model/filename_prefix \
-    --models models/ \
-    --input validation_set.images \
-    --source-root /path/to/image/dataset/folder/ \
-    --max-output-length 60 \
-    --batch-size 512 \
-    --chunk-size 1024 \
-    --beam-size 3 > validation_set.predictions
-```
-
-
-### Using Lexical Constrains
-
-It is also possible to use lexical constraints during inference as described [here](inference.html#lexical-constraints).
-The input JSON object needs to have the following form, with the image path in the `text` field, and constraints specified as usual:
-
-    { 'text': 'relative/path/of/image/given/in/validation_set/file/filename.jpg',
-      'constraints': ['constr@@ aint',
-                      'multi@@ word constr@@ aint'] }
-
-(*Note: Sockeye expects this text to be present on a single line*).
-You can use the `sockeye.lexical_constraints` module to generate this (for usage, run `python3 -m sockeye.lexical_constraints`).
-Once the file is generated, the CLI option `--json-input` needs to be passed to `sockeye.image_captioning.captioner`.
-
-## Visualization
-
-You can now visualize the results in a nice format as follows:
-
-```bash
-> python -m sockeye.image_captioning.visualize \
-    --image-root /path/to/image/dataset/folder/ \
-    --source validation_set.images \
-    --prediction validation_set.predictions \
-    --ground-truth validation_set.captions \
-    --save-to-folder validation_set/
-```
-This will save to disk plots containing images, predicted captions (white background) and optionally (mutiple) ground-truth captions (green background).
-It is possible to remove `--save-to-folder` and the plots will be visualized on screen.
diff --git a/docs/index.md b/docs/index.md
index 43ed555cf..6d48f7b6c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -13,15 +13,11 @@ layout: default
 This is the documentation for Sockeye, a sequence-to-sequence framework for Neural Machine Translation based on Apache MXNet Incubating.
 It implements state-of-the-art encoder-decoder architectures, such as
 
-- Deep Recurrent Neural Networks with Attention [[Bahdanau, '14](https://arxiv.org/abs/1409.0473)]
 - Transformer Models with self-attention [[Vaswani et al, '17](https://arxiv.org/abs/1706.03762)]
-- Fully convolutional sequence-to-sequence models [[Gehring et al, '17](https://arxiv.org/abs/1705.03122)]
-
-In addition, this framework provides an experimental [image-to-description module](https://github.com/awslabs/sockeye/tree/master/sockeye/image_captioning) that can be used for [image captioning](image_captioning.html).
 
 Recent developments and changes are tracked in our [CHANGELOG](https://github.com/awslabs/sockeye/blob/master/CHANGELOG.md).
 
-If you are interested in collaborating or have any questions, please submit a pull request or [issue](https://github.com/awslabs/sockeye/issues/new).
+If you are interested in collaborating or have any questions, please submit a pull request or [issue](https://github.com/awslabs/sockeye/issues/new). You can also send questions to *sockeye-dev-at-amazon-dot-com*.
 
 Developers may be interested in [our developer guidelines](development.html).
diff --git a/docs/setup.md b/docs/setup.md
index fbf59acfe..89297b162 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -4,7 +4,7 @@ Sockeye requires:
 
 - **Python3**
-- [MXNet 1.5.0](https://github.com/apache/incubator-mxnet/tree/1.5.0)
+- [MXNet 1.6.0](https://github.com/apache/incubator-mxnet/tree/1.6.0)
 - numpy
 
 ## Installation
@@ -28,7 +28,7 @@ Depending on your version of CUDA, you can do this by running the following:
 > pip install sockeye --no-deps -r requirements.gpu-cu${CUDA_VERSION}.txt
 > rm requirements.gpu-cu${CUDA_VERSION}.txt
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 ### → via source...
 
@@ -47,7 +47,7 @@ running the following:
 > pip install -r requirements/requirements.gpu-cu${CUDA_VERSION}.txt
 > pip install .
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 Developers will be better served by pointing `$PYTHONPATH` to the root of the git-cloned source.
 
@@ -70,7 +70,7 @@ On an instance with a GPU, the following commands will work
 > pip install sockeye --no-deps -r requirements.gpu-cu${CUDA_VERSION}.txt
 rm requirements.gpu-cu${CUDA_VERSION}.txt
 ```
-where `${CUDA_VERSION}` can be `80` (8.0), `90` (9.0), `92` (9.2), `100` (10.0) or `101` (10.1).
+where `${CUDA_VERSION}` can be `92` (9.2), `100` (10.0), `101` (10.1), or `102` (10.2).
 
 ### Optional dependencies
 In order to write training statistics to a Tensorboard event file for visualization, you can optionally install mxboard
diff --git a/docs/sockeye_captioning.bib b/docs/sockeye_captioning.bib
deleted file mode 100644
index 4c26cffb1..000000000
--- a/docs/sockeye_captioning.bib
+++ /dev/null
@@ -1,12 +0,0 @@
-@article{SockeyeCaptioning:18,
-  author = {Bazzani, Loris and Domhan, Tobias and Hieber, Felix},
-  title = "{Image Captioning as Neural Machine Translation Task in SOCKEYE}",
-  journal = {arXiv preprint arXiv:1810.04101},
-archivePrefix = "arXiv",
-  eprint = {1810.04101},
-  primaryClass = "cs.CV",
-  keywords = {Computer Science - Computer Vision and Pattern Recognition},
-  year = 2018,
-  month = oct,
-  url = {https://arxiv.org/abs/1810.04101}
-}
diff --git a/docs/training.md b/docs/training.md
index 7dabd49ec..f607555a5 100644
--- a/docs/training.md
+++ b/docs/training.md
@@ -4,12 +4,6 @@ layout: default
 
 # Training
 
-## Autopilot
-
-For easily training popular model types on known data sets, see the [Sockeye Autopilot documentation](https://github.com/awslabs/sockeye/tree/master/sockeye_contrib/autopilot).
-For manually training and running translation models on your data, read on.
-Autopilot also contains some other details you may find useful, such as recommended training parameters for [the RNN](https://github.com/awslabs/sockeye/blob/7fd7f152a2480ecf10683f71a89f7519fe7fbc06/sockeye_contrib/autopilot/models.py#L65) or [Transformer](https://github.com/awslabs/sockeye/blob/7fd7f152a2480ecf10683f71a89f7519fe7fbc06/sockeye_contrib/autopilot/models.py#L28) models.
-
 ## Data preparation
 
 Sockeye can read the raw data at training time in two sentence-parallel files via the `--source` and `--target` command-line options.
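As a minimal sketch of the raw-data path described above (the file names `train.de`, `train.en`, `dev.de`, and `dev.en` are hypothetical placeholders, and all other options are left at their defaults), training can be started directly on the two sentence-parallel files:

```bash
# Minimal sketch: train directly on raw sentence-parallel files without a
# separate data preparation step. File names here are hypothetical examples.
python -m sockeye.train --source train.de \
                        --target train.en \
                        --validation-source dev.de \
                        --validation-target dev.en \
                        --output model
```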
diff --git a/docs/tutorials.md b/docs/tutorials.md
index ee99ed51e..2187b20fa 100644
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@@ -13,4 +13,5 @@ introduce different concepts and parameters used for training and translation.
 1. [Sequence copy task](tutorials/seqcopy.html)
 1. [WMT German to English news translation](tutorials/wmt.html)
 1. [Domain adaptation of NMT models](tutorials/adapt.html)
-1. [Multilingual Zero-shot Translation IWSLT 2017](tutorials/multilingual.html)
+1. [Large data: WMT German-English 2018](tutorials/wmt_large.html)
+1. [Multilingual Zero-shot Translation IWSLT 2017](tutorials/multilingual.html)
\ No newline at end of file
diff --git a/docs/tutorials/adapt.md b/docs/tutorials/adapt.md
index ed61d6c29..97781474c 100644
--- a/docs/tutorials/adapt.md
+++ b/docs/tutorials/adapt.md
@@ -60,8 +60,6 @@ This argument accepts a (space separated) list of components where to apply the
 
 Again it may be beneficial to adjust the learning parameters for the adaptation run.
 
-**Note:** At the moment LHUC is not supported for convolutional models.
-
 ## References
 
 > Markus Freitag and Yaser Al-Onaizan. 2016.
diff --git a/docs/tutorials/multilingual.md b/docs/tutorials/multilingual.md
index df56952a5..1d82c0e31 100644
--- a/docs/tutorials/multilingual.md
+++ b/docs/tutorials/multilingual.md
@@ -64,9 +64,9 @@ git clone https://github.com/bricksdont/moses-scripts tools/moses-scripts
 
 # download helper scripts
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh -P tools
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/add_tag_to_lines.py -P tools
-wget https://raw.githubusercontent.com/awslabs/sockeye/master/docs/tutorials/multilingual/remove_tag_from_translations.py -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/prepare-iwslt17-multilingual.sh -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/add_tag_to_lines.py -P tools
+wget https://raw.githubusercontent.com/awslabs/sockeye/sockeye_2/docs/tutorials/multilingual/remove_tag_from_translations.py -P tools
 ```
 
@@ -266,9 +266,6 @@ We can now kick off the training process:
 python -m sockeye.train -d train_data \
                         -vs $DATA/valid.tag.src \
                         -vt $DATA/valid.tag.trg \
-                        --encoder transformer \
-                        --decoder transformer \
-                        --weight-tying \
                         --shared-vocab \
                         --weight-tying-type src_trg_softmax \
                         --device-ids 0 \
diff --git a/docs/tutorials/seqcopy.md b/docs/tutorials/seqcopy.md
index 004012849..4b9afc085 100644
--- a/docs/tutorials/seqcopy.md
+++ b/docs/tutorials/seqcopy.md
@@ -44,42 +44,42 @@ python3 -m sockeye.train -s data/train.source \
                          -t data/train.target \
                          -vs data/dev.source \
                          -vt data/dev.target \
-                         --encoder rnn --decoder rnn \
+                         --encoder transformer --decoder transformer \
                          --num-layers 1:1 \
                          --num-embed 32 \
-                         --rnn-num-hidden 64 \
-                         --rnn-attention-type dot \
+                         --transformer-model-size 32 \
+                         --transformer-feed-forward-num-hidden 64 \
+                         --transformer-attention-heads 4 \
                          --use-cpu \
-                         --metrics perplexity accuracy \
                          --max-num-checkpoint-not-improved 3 \
                          -o seqcopy_model
 ```
 
-This will train a 1-layer RNN model with a bidirectional LSTM as the encoder and a uni-directional LSTM as the decoder.
-The RNNs have 64 hidden units and we learn embeddings of size 32.
+This will train a 1-layer Transformer model with an embedding and model size of 32 hidden units.
+The feed-forward sublayers have 64 hidden units and the attention mechanisms use 4 heads.
 
 Looking at the log we can see that our training data was assigned to buckets according to their lengths.
-Additionally, Sockeye will take care of correctly padding sequences and masking relevant parts of the network, in order to deal with sequences of variable length.
+Additionally, Sockeye will take care of correctly padding sequences and masking relevant parts of the network,
+in order to deal with sequences of variable length.
 
 ### Metrics and checkpointing
 
 During training Sockeye will print relevant metrics on both the training and the validation data.
-The metrics can be chosen using the `--metrics` parameter.
 Validation metrics are evaluated every time we create a checkpoint.
 During checkpointing the current model parameters are saved into the model directory and current validation scores are evaluated.
-By default Sockeye will create a checkpoint every 1000 updates.
+By default Sockeye will create a checkpoint every 4000 updates.
 This can be adjusted through the `--checkpoint-interval` parameter.
 
-From the log you can see that initially the accuracy is around 0.1:
+From the log you can see that initially the perplexity is around `20.0`:
 
 ```bash
 ...
+[INFO:sockeye.training] Early stopping by optimizing 'perplexity'
+[INFO:sockeye.model] Saved model config to "seqcopy_model/config"
 [INFO:sockeye.training] Training started.
-[INFO:sockeye.callback] Early stopping by optimizing 'perplexity'
-[INFO:root] Epoch[0] Batch [50] Speed: 683.23 samples/sec perplexity=14.104128 accuracy=0.092011
-[INFO:root] Epoch[0] Batch [100] Speed: 849.97 samples/sec perplexity=13.036482 accuracy=0.096760
+[INFO:sockeye.training] Epoch[0] Batch [50] Speed: 429.27 samples/sec 10879.00 tokens/sec 2.16 updates/sec perplexity=20.074619
+[INFO:sockeye.training] Epoch[0] Batch [100] Speed: 534.38 samples/sec 13846.37 tokens/sec 2.76 updates/sec perplexity=17.064554
 ...
 ```
-With a vocabulary of size 10 this essentially means that the model is guessing randomly.
-As training progresses we see that after around 14 epochs the accuracy goes up to ~1.0 and the perplexity down to ~1.0.
+As training progresses we see that after the first checkpoint (~7 epochs) the validation perplexity is at ~1.05.
 
 Sockeye performs early stopping based on the validation metrics tracked when checkpointing.
 Once the validation metrics have not improved for several checkpoints the training is stopped.
 The number of tolerated non-improving checkpoints can be adjusted (`--max-num-checkpoint-not-improved`).
@@ -111,8 +111,8 @@ If you open the file you can see that in addition to the digits Sockeye also add
 ```
 
-Note that the model was trained on sequences consisting of between 10 and 30 characters.
-Therefore, the model will most likely have some difficulties with sequences shorter than 10 characters.
+Note that the model was trained on sequences consisting of between 10 and 30 digits.
+Therefore, the model will most likely have some difficulties with sequences shorter than 10 digits.
 
 By default Sockeye will read sentence from stdin and print the translations on stdout.
 
 Internally Sockeye will run a beam search in order to (approximately) find the translation with the highest probability.
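To illustrate the stdin/stdout behavior described above, a model trained with the command from this tutorial (output directory `seqcopy_model`) could be queried as follows; the digit sequence is an arbitrary example:

```bash
# Pipe one space-delimited digit sequence to the trained sequence-copy model.
# A well-trained model should echo the input sequence back as its translation.
echo "1 4 6 2 9 7 3 5 8 0" | \
    python3 -m sockeye.translate -m seqcopy_model --use-cpu
```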
diff --git a/docs/tutorials/wmt.md b/docs/tutorials/wmt.md
index 52eb900b4..3e608c905 100644
--- a/docs/tutorials/wmt.md
+++ b/docs/tutorials/wmt.md
@@ -16,13 +16,7 @@ git clone https://github.com/rsennrich/subword-nmt.git
 export PYTHONPATH=$(pwd)/subword-nmt:$PYTHONPATH
 ```
 
-For visualizating alignments we will need `matplotlib`.
-If you haven't installed the library yet you can do so by running:
-```bash
-pip install matplotlib
-```
-
-We will visualize training progress using Tensorboard and its MXNet adaptor, `mxboard`.
+We will visualize training progress using Tensorboard and its MXNet adaptor, `mxboard`.
 Install it using:
 ```bash
 pip install tensorboard mxboard
@@ -101,24 +95,13 @@ We can now kick off the training process:
 python -m sockeye.train -d train_data \
                         -vs newstest2016.tc.BPE.de \
                         -vt newstest2016.tc.BPE.en \
-                        --encoder rnn \
-                        --decoder rnn \
-                        --num-embed 256 \
-                        --rnn-num-hidden 512 \
-                        --rnn-attention-type dot \
                         --max-seq-len 60 \
                         --decode-and-evaluate 500 \
                         --use-cpu \
                         -o wmt_model
 ```
 
-This will train a 1-layer bi-LSTM encoder, 1-layer LSTM decoder with dot attention.
-Sockeye offers a whole variety of different options regarding the model architecture,
-such as stacked RNNs with residual connections (`--num-layers`, `--rnn-residual-connections`),
-[Transformer](https://arxiv.org/abs/1706.03762) encoder and decoder (`--encoder transformer`, `--decoder transformer`),
-[ConvS2S](https://arxiv.org/pdf/1705.03122) (`--encoder cnn`, `--decoder cnn`),
-various RNN (`--rnn-cell-type`) and attention (`--attention-type`) types and more.
-
+This will train a "base" [Transformer](https://arxiv.org/abs/1706.03762) model.
 There are also several parameters controlling training itself.
 Unless you specify a different optimizer (`--optimizer`) [Adam](https://arxiv.org/abs/1412.6980) will be used.
 Additionally, you can control the batch size (`--batch-size`), the learning rate schedule (`--learning-rate-schedule`) and other parameters relevant for training.
@@ -180,26 +163,6 @@ he is a great guy and a family father .
 
 At decoding time Sockeye will run a beam search.
 You can set the size of the beam (`--beam-size`) or change other decoding parameters such as `--softmax-temperature` and `--length-penalty-alpha`.
 
-### Alignment visualization
-
-Sockeye not only supports text output, but also other output types.
-The following command for example will plot the alignment matrix:
-
-
-```bash
-echo "er ist so ein toller Kerl und ein Familienvater ." | \
-  python -m apply_bpe -c bpe.codes --vocabulary bpe.vocab.en \
-    --vocabulary-threshold 50 | \
-  python -m sockeye.translate -m wmt_model --output-type align_plot
-```
-
-This will create a file `align_1.png` that looks similar to this:
-
-![Alignment plot](wmt/align.png "Alignment plot")
-
-Note that the alignment plot shows the subword units instead of tokens, as this is the representation used by Sockeye during translation.
-Additionally you can see the special end-of-sentence symbol `<eos>` being added to the target sentence.
-
 ### Embedding inspection
diff --git a/docs/tutorials/wmt_large.md b/docs/tutorials/wmt_large.md
new file mode 100644
index 000000000..6a24cb22e
--- /dev/null
+++ b/docs/tutorials/wmt_large.md
@@ -0,0 +1,182 @@
+# Large Data: WMT 2018 German-English
+
+This tutorial covers training a Sockeye model using an arbitrarily large amount of data.
+We use the data provided for the [WMT 2018](http://www.statmt.org/wmt18/translation-task.html) German-English news task (41 million parallel sentences), though similar settings could be used for even larger data sets.
+
+## Setup
+
+**NOTE**: This build assumes that 4 local GPUs are available.
+
+For this tutorial, we use the Sockeye Docker image.
+
+1. Follow the linked instructions to install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+
+2. Build the Docker image and record the commit used as the tag:
+
+```bash
+python3 sockeye_contrib/docker/build.py
+
+export TAG=$(git rev-parse --short HEAD)
+```
+
+3. This tutorial uses two external pieces of software, the [subword-nmt](https://github.com/rsennrich/subword-nmt) tool that implements byte-pair encoding (BPE) and the [langid.py](https://github.com/saffsd/langid.py) tool that performs language identification:
+
+```bash
+git clone https://github.com/rsennrich/subword-nmt.git
+export PYTHONPATH=$(pwd)/subword-nmt:$PYTHONPATH
+
+git clone https://github.com/saffsd/langid.py.git
+export PYTHONPATH=$(pwd)/langid.py:$PYTHONPATH
+```
+
+4. We also recommend installing [GNU Parallel](https://www.gnu.org/software/parallel/) to speed up preprocessing steps (run `apt-get install parallel` or `yum install parallel`).
+
+## Data
+
+We use the preprocessed data provided for the WMT 2018 news translation shared task.
+Download and extract the data using the following commands:
+
+```bash
+wget http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/corpus.gz
+wget http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/dev.tgz
+zcat corpus.gz |cut -f1 >corpus.de
+zcat corpus.gz |cut -f2 >corpus.en
+tar xvzf dev.tgz '*.en' '*.de'
+```
+
+## Preprocessing
+
+The data has already been tokenized and true-cased, however no significant corpus cleaning is applied.
+The majority of the data is taken from inherently noisy web-crawls (sentence pairs are not always in the correct language, or even natural language text).
+If we were participating in the WMT evaluation, we would spend a substantial amount of effort selecting clean training data from the noisy corpus.
+For this tutorial, we run a simple cleaning step that retains sentence pairs for which a language identification model classifies the target side as English.
+The use of GNU Parallel is optional, but makes this step much faster:
+
+```bash
+parallel --pipe --keep-order \
+    python -m langid.langid --line -l en,de <corpus.en >corpus.en.langid
+
+paste corpus.en.langid corpus.de |grep "^('en" |cut -f2 >corpus.de.clean
+paste corpus.en.langid corpus.en |grep "^('en" |cut -f2 >corpus.en.clean
+```
+
+We next use BPE to learn a joint sub-word vocabulary from the clean training data.
+To speed up this step, we use random samples of the source and target data (note that these samples will not be parallel, but BPE training does not require parallel data).
+
+```bash
+shuf -n 1000000 corpus.de.clean >corpus.de.clean.sample
+shuf -n 1000000 corpus.en.clean >corpus.en.clean.sample
+
+python -m subword_nmt.learn_joint_bpe_and_vocab \
+    --input corpus.de.clean.sample corpus.en.clean.sample \
+    -s 32000 \
+    -o bpe.codes \
+    --write-vocabulary bpe.vocab.de bpe.vocab.en
+```
+
+We use this vocabulary to encode our training, validation, and test data.
+For simplicity, we use the 2016 data for validation and 2017 data for test.
+GNU Parallel can also significantly speed up this step.
+
+```bash
+parallel --pipe --keep-order \
+    python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <corpus.de.clean >corpus.de.clean.bpe
+parallel --pipe --keep-order \
+    python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <corpus.en.clean >corpus.en.clean.bpe
+
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <newstest2016.tc.de >newstest2016.tc.de.bpe
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <newstest2016.tc.en >newstest2016.tc.en.bpe
+
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.de --vocabulary-threshold 50 <newstest2017.tc.de >newstest2017.tc.de.bpe
+python -m subword_nmt.apply_bpe -c bpe.codes --vocabulary bpe.vocab.en --vocabulary-threshold 50 <newstest2017.tc.en >newstest2017.tc.en.bpe
+```
+
+## Training
+
+Now that our data is cleaned and sub-word encoded, we are almost ready to start model training.
+We first run a data preparation step that splits the training data into shards and serializes it in MXNet's NDArray format.
+This allows us to train on data of any size by efficiently loading and unloading different pieces during training:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    python -m sockeye.prepare_data \
+        -s corpus.de.clean.bpe \
+        -t corpus.en.clean.bpe \
+        -o prepared_data \
+        --shared-vocab \
+        --word-min-count 2 \
+        --pad-vocab-to-multiple-of 8 \
+        --bucket-width 8 \
+        --no-bucket-scaling \
+        --max-seq-len 95 \
+        --num-samples-per-shard 10000000 \
+        --seed 1
+```
+
+We then start Sockeye training:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work -e OMP_NUM_THREADS=4 sockeye:$TAG \
+    python -m sockeye.train \
+        -d prepared_data \
+        -vs newstest2016.tc.de.bpe \
+        -vt newstest2016.tc.en.bpe \
+        -o model \
+        --num-layers 6 \
+        --transformer-model-size 512 \
+        --transformer-attention-heads 8 \
+        --transformer-feed-forward-num-hidden 2048 \
+        --weight-tying \
+        --weight-tying-type src_trg_softmax \
+        --optimizer adam \
+        --batch-size 8192 \
+        --update-interval 4 \
+        --round-batch-sizes-to-multiple-of 8 \
+        --checkpoint-interval 1000 \
+        --initial-learning-rate 0.0004 \
+        --learning-rate-reduce-factor 0.9 \
+        --learning-rate-reduce-num-not-improved 8 \
+        --max-num-checkpoint-not-improved 60 \
+        --decode-and-evaluate 500 \
+        --device-ids -4 \
+        --seed 1
+```
+
+**Faster training**:
+
+- To run FP16 training using a fixed loss scaling factor, add `--dtype float16`.
+- To use MXNet's Automatic Mixed Precision, add `--amp`.
+
+This trains a "base" [Transformer](https://arxiv.org/abs/1706.03762) model using the [Adam](https://arxiv.org/abs/1412.6980) optimizer with a batch size of 32,768 (8192 x 4) tokens.
+The learning rate will automatically reduce when validation perplexity does not improve for 8 checkpoints (1000 updates per checkpoint) and training will conclude when validation perplexity does not improve for 60 checkpoints.
+At each checkpoint, Sockeye runs a separate decoder process to evaluate metrics such as BLEU on a sample of the validation data (500 sentences).
+Note that these scores are calculated on the tokens provided to Sockeye, e.g. in this tutorial BLEU will be calculated on the sub-words we created above.
+
+## Evaluation
+
+Now the model is ready to translate data.
+Input should be preprocessed identically to the training data, including sub-word encoding (BPE).
+Run the following to translate the test set that we've already preprocessed:
+
+```bash
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    python -m sockeye.translate \
+        -i newstest2017.tc.de.bpe \
+        -o newstest2017.tc.hyp.bpe \
+        -m model \
+        --beam-size 5 \
+        --batch-size 64 \
+        --device-ids -1
+```
+
+To evaluate the translations, reverse the BPE sub-word encoding and run [sacreBLEU](https://github.com/mjpost/sacreBLEU) to compute the BLEU score:
+
+```bash
+sed -re 's/(@@ |@@$)//g' <newstest2017.tc.hyp.bpe >newstest2017.tc.hyp
+
+nvidia-docker run --rm -i -v $(pwd):/work -w /work sockeye:$TAG \
+    sacrebleu newstest2017.tc.en -tok none -i newstest2017.tc.hyp
+```
+
+Note that this is tokenized, normalized, and true-cased data.
+If we were actually participating in WMT, the translations would need to be recased and detokenized for human evaluation.
diff --git a/pylintrc b/pylintrc
index d4c419405..7e7e6fd84 100644
--- a/pylintrc
+++ b/pylintrc
@@ -283,7 +283,7 @@ ignored-modules=mxnet,mxnet.*,numpy,numpy.*
 # List of class names for which member attributes should not be checked (useful
 # for classes with dynamically set attributes). This supports the use of
 # qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
+ignored-classes=optparse.Values,thread._local,_thread._local,AbstractContextManager
 
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E1101 when accessed. Python regular
diff --git a/pytest.ini b/pytest.ini
index f45f864b4..3cc6356bf 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,3 @@
 [pytest]
-addopts = --cov sockeye test/unit test/integration -v
+addopts = -v
+testpaths = test/unit test/integration
diff --git a/requirements/requirements.gpu-cu100.txt b/requirements/requirements.gpu-cu100.txt
index d49d93658..b77b876d0 100644
--- a/requirements/requirements.gpu-cu100.txt
+++ b/requirements/requirements.gpu-cu100.txt
@@ -1,6 +1,6 @@
 pyyaml>=5.1
-mxnet-cu100mkl==1.5.1
-numpy>=1.14
+mxnet-cu100mkl==1.6.0
+numpy>1.16.0,<2.0.0
 typing
 portalocker
-sacrebleu==1.3.6
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu101.txt b/requirements/requirements.gpu-cu101.txt
index 35db8d6cc..1a2ecf218 100644
--- a/requirements/requirements.gpu-cu101.txt
+++ b/requirements/requirements.gpu-cu101.txt
@@ -1,6 +1,6 @@
 pyyaml>=5.1
-mxnet-cu101mkl==1.5.1
-numpy>=1.14
+mxnet-cu101mkl==1.6.0
+numpy>1.16.0,<2.0.0
 typing
 portalocker
-sacrebleu==1.3.6
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu102.txt b/requirements/requirements.gpu-cu102.txt
new file mode 100644
index 000000000..dd670a45d
--- /dev/null
+++ b/requirements/requirements.gpu-cu102.txt
@@ -0,0 +1,6 @@
+pyyaml>=5.1
+mxnet-cu102mkl==1.6.0
+numpy>1.16.0,<2.0.0
+typing
+portalocker
+sacrebleu==1.4.3
diff --git a/requirements/requirements.gpu-cu80.txt b/requirements/requirements.gpu-cu80.txt
deleted file mode 100644
index c809b6656..000000000
--- a/requirements/requirements.gpu-cu80.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pyyaml>=5.1
-mxnet-cu80mkl==1.5.1
-numpy>=1.14
-typing
-portalocker
-sacrebleu==1.3.6
diff --git a/requirements/requirements.gpu-cu90.txt b/requirements/requirements.gpu-cu90.txt
deleted file mode 100644
index 9ad3732c2..000000000
--- a/requirements/requirements.gpu-cu90.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pyyaml>=5.1
-mxnet-cu90mkl==1.5.1
-numpy>=1.14
-typing
-portalocker
-sacrebleu==1.3.6
diff --git a/requirements/requirements.gpu-cu92.txt b/requirements/requirements.gpu-cu92.txt
index bc80d5ac6..585832235 100644
--- a/requirements/requirements.gpu-cu92.txt
a/requirements/requirements.gpu-cu92.txt +++ b/requirements/requirements.gpu-cu92.txt @@ -1,6 +1,6 @@ pyyaml>=5.1 -mxnet-cu92mkl==1.5.1 -numpy>=1.14 +mxnet-cu92mkl==1.6.0 +numpy>1.16.0,<2.0.0 typing portalocker -sacrebleu==1.3.6 +sacrebleu==1.4.3 diff --git a/requirements/requirements.horovod.txt b/requirements/requirements.horovod.txt new file mode 100644 index 000000000..9c74bec0d --- /dev/null +++ b/requirements/requirements.horovod.txt @@ -0,0 +1,2 @@ +horovod==0.19.1 +mpi4py diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a9d9217f0..0f5488dd9 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ pyyaml>=5.1 -mxnet-mkl==1.5.1 -numpy>=1.14 +mxnet-mkl==1.6.0 +numpy>1.16.0,<2.0.0 typing portalocker -sacrebleu==1.3.6 +sacrebleu==1.4.3 diff --git a/setup.py b/setup.py index ffa2a7b7c..21ac6031c 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ def get_requirements(filename): 'sockeye-lexicon = sockeye.lexicon:main', 'sockeye-init-embed = sockeye.init_embedding:main', 'sockeye-prepare-data = sockeye.prepare_data:main', + 'sockeye-quantize = sockeye.quantize:main', 'sockeye-score = sockeye.score:main', 'sockeye-train = sockeye.train:main', 'sockeye-translate = sockeye.translate:main', diff --git a/sockeye/__init__.py b/sockeye/__init__.py index 378b5dd0b..11040ebc8 100644 --- a/sockeye/__init__.py +++ b/sockeye/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -11,4 +11,4 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. -__version__ = '1.18.115' +__version__ = '2.1.7' diff --git a/sockeye/arguments.py b/sockeye/arguments.py index c583e8a08..0f11f1a22 100644 --- a/sockeye/arguments.py +++ b/sockeye/arguments.py @@ -1,4 +1,4 @@ -# Copyright 2017, 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -24,7 +24,6 @@ from . import constants as C from . import data_io -from .lr_scheduler import LearningRateSchedulerFixedStep from . import utils @@ -170,21 +169,22 @@ def check_greater_equal(value: str): return check_greater_equal -def learning_schedule() -> Callable: +def bool_str() -> Callable: """ - Returns a method that can be used in argument parsing to check that the argument is a valid learning rate schedule - string. + Returns a method that can be used in argument parsing to check that the argument is a valid representation of + a boolean value. :return: A method that can be used as a type in argparse. """ - - def parse(schedule_str): - try: - schedule = LearningRateSchedulerFixedStep.parse_schedule_str(schedule_str) - except ValueError: + def parse(value: str): + lower_value = value.lower() + if lower_value in ["true", "yes", "1"]: + return True + elif lower_value in ["false", "no", "0"]: + return False + else: raise argparse.ArgumentTypeError( - "Learning rate schedule string should have form rate1:num_updates1[,rate2:num_updates2,...]") - return schedule + "Invalid value for bool argument. 
Use true/false, yes/no or 1/0.") return parse @@ -201,11 +201,11 @@ def simple_dict() -> Callable: def parse(dict_str: str): def _parse(value: str): - if value == "True": + if value.lower() == "true": return True - if value == "False": + if value.lower() == "false": return False - if "." in value: + if "." in value or "e" in value: return float(value) return int(value) @@ -299,7 +299,7 @@ def add_extract_args(params): extract_params.add_argument("input", metavar="INPUT", type=str, - help="Either a model directory (using params.best) or a specific params.x file.") + help="Either a model directory (using its %s) or a specific params.x file." % C.PARAMS_BEST_NAME) extract_params.add_argument('--names', '-n', nargs='*', default=[], @@ -385,6 +385,13 @@ def add_training_data_args(params, required=False): type=regular_file(), default=[], help='File(s) containing additional token-parallel source side factors. Default: %(default)s.') + params.add_argument('--source-factors-use-source-vocab', + required=False, + nargs='+', + type=bool_str(), + default=[], + help='List of bools signaling whether to use the source vocabulary for the source factors. ' + 'If empty (default), each factor has its own vocabulary.') params.add_argument(C.TRAINING_ARG_TARGET, '-t', required=required, type=regular_file(), @@ -462,6 +469,10 @@ def add_bucketing_args(params): default=10, help='Width of buckets in tokens. Default: %(default)s.') + params.add_argument('--no-bucket-scaling', + action='store_true', + help='Disable scaling source/target buckets based on length ratio. Default: %(default)s.') + params.add_argument(C.TRAINING_ARG_MAX_SEQ_LEN, type=multiple_values(num_values=2, greater_or_equal=1), default=(99, 99), @@ -473,30 +484,32 @@ def add_prepare_data_cli_args(params): add_training_data_args(params, required=True) add_vocab_args(params) add_bucketing_args(params) - add_logging_args(params) - - data_prep = params.add_argument_group("Data preparation.") - data_prep.add_argument('--num-samples-per-shard', + params.add_argument('--num-samples-per-shard', type=int_greater_or_equal(1), - default=1000000, + default=10000000, help='The approximate number of samples per shard. Default: %(default)s.') - data_prep.add_argument('--min-num-shards', + params.add_argument('--min-num-shards', default=1, type=int_greater_or_equal(1), help='The minimum number of shards to use, even if they would not ' 'reach the desired number of samples per shard. Default: %(default)s.') - data_prep.add_argument('--seed', + params.add_argument('--seed', type=int, default=13, help='Random seed used that makes shard assignments deterministic. Default: %(default)s.') - data_prep.add_argument('--output', '-o', + params.add_argument('--output', '-o', required=True, help='Folder where the prepared and possibly sharded data is written to.') + params.add_argument('--max-processes', + type=int_greater_or_equal(1), + default=1, + help='Process the shards in parallel using max-processes processes.') + add_logging_args(params) def add_device_args(params): @@ -513,6 +526,14 @@ device_params.add_argument('--use-cpu', action='store_true', help='Use CPU device instead of GPU.') + device_params.add_argument('--omp-num-threads', + type=int, + help='Set the OMP_NUM_THREADS environment variable (CPU threads). Recommended: set to ' 'number of GPUs for training, number of physical CPU cores for inference. 
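A standalone sketch of the boolean parsing that the new `bool_str` argparse type implements (illustrative re-implementation, not imported from Sockeye):

```python
def parse_bool(value: str) -> bool:
    # Same rule as bool_str above: case-insensitive true/false, yes/no, 1/0.
    lower_value = value.lower()
    if lower_value in ("true", "yes", "1"):
        return True
    if lower_value in ("false", "no", "0"):
        return False
    raise ValueError("Invalid value for bool argument: %s" % value)

print(parse_bool("Yes"), parse_bool("0"))  # True False
```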
Default: ' + '%(default)s.') + device_params.add_argument('--env', + help='List of environment variables to be set before importing MXNet. Separated by ",", ' + 'e.g. --env=OMP_NUM_THREADS=4,MXNET_GPU_WORKER_NTHREADS=3 etc.') device_params.add_argument('--disable-device-locking', action='store_true', help='Just use the specified device ids without locking.') @@ -590,87 +611,6 @@ def add_model_parameters(params): help='Number of layers for encoder & decoder. ' 'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.') - model_params.add_argument('--conv-embed-output-dim', - type=int_greater_or_equal(1), - default=None, - help="Project segment embeddings to this size for ConvolutionalEmbeddingEncoder. Omit to" - " avoid projection, leaving segment embeddings total size of all filters. Default:" - " %(default)s.") - model_params.add_argument('--conv-embed-max-filter-width', - type=int_greater_or_equal(1), - default=8, - help="Maximum filter width for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-num-filters', - type=multiple_values(greater_or_equal=1), - default=(200, 200, 250, 250, 300, 300, 300, 300), - help="List of number of filters of each width 1..max for ConvolutionalEmbeddingEncoder. " - "Default: %(default)s.") - model_params.add_argument('--conv-embed-pool-stride', - type=int_greater_or_equal(1), - default=5, - help="Pooling stride for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-num-highway-layers', - type=int_greater_or_equal(0), - default=4, - help="Number of highway layers for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - model_params.add_argument('--conv-embed-add-positional-encodings', - action='store_true', - default=False, - help="Add positional encodings to final segment embeddings for" - " ConvolutionalEmbeddingEncoder. Default: %(default)s.") - - # convolutional encoder/decoder arguments arguments - model_params.add_argument('--cnn-kernel-width', - type=multiple_values(num_values=2, greater_or_equal=1, data_type=int), - default=(3, 3), - help='Kernel width of the convolutional encoder and decoder. Default: %(default)s.') - model_params.add_argument('--cnn-num-hidden', - type=int_greater_or_equal(1), - default=512, - help='Number of hidden units for the convolutional encoder and decoder. ' - 'Default: %(default)s.') - model_params.add_argument('--cnn-activation-type', - choices=C.CNN_ACTIVATION_TYPES, - default=C.GLU, - help="Type activation to use for each convolutional layer. Default: %(default)s.") - model_params.add_argument('--cnn-positional-embedding-type', - choices=C.POSITIONAL_EMBEDDING_TYPES, - default=C.LEARNED_POSITIONAL_EMBEDDING, - help='The type of positional embedding. Default: %(default)s.') - model_params.add_argument('--cnn-project-qkv', - action='store_true', - default=False, - help="Optionally apply query, key and value projections to the source and target hidden " - "vectors before applying the attention mechanism.") - - # rnn arguments - model_params.add_argument('--rnn-cell-type', - choices=C.CELL_TYPES, - default=C.LSTM_TYPE, - help='RNN cell type for encoder and decoder. Default: %(default)s.') - model_params.add_argument('--rnn-num-hidden', - type=int_greater_or_equal(1), - default=1024, - help='Number of RNN hidden units for encoder and decoder. Default: %(default)s.') - model_params.add_argument('--rnn-encoder-reverse-input', - action='store_true', - help='Reverse input sequence for RNN encoder. 
Default: %(default)s.') - model_params.add_argument('--rnn-decoder-state-init', - default=C.RNN_DEC_INIT_LAST, - choices=C.RNN_DEC_INIT_CHOICES, - help='How to initialize RNN decoder states. Default: %(default)s.') - model_params.add_argument('--rnn-residual-connections', - action="store_true", - default=False, - help="Add residual connections to stacked RNNs. (see Wu ETAL'16). Default: %(default)s.") - model_params.add_argument('--rnn-first-residual-layer', - type=int_greater_or_equal(2), - default=2, - help='First RNN layer to have a residual connection. Default: %(default)s.') - model_params.add_argument('--rnn-context-gating', action="store_true", - help="Enables a context gate which adaptively weighs the RNN decoder input against the " - "source context vector before each update of the decoder hidden state.") - # transformer arguments model_params.add_argument('--transformer-model-size', type=multiple_values(num_values=2, greater_or_equal=1), @@ -688,9 +628,11 @@ def add_model_parameters(params): help='Number of hidden units in transformers feed forward layers. ' 'Use "x:x" to specify separate values for encoder & decoder. Default: %(default)s.') model_params.add_argument('--transformer-activation-type', - choices=C.TRANSFORMER_ACTIVATION_TYPES, - default=C.RELU, - help="Type activation to use for each feed forward layer. Default: %(default)s.") + type=multiple_values(num_values=2, greater_or_equal=None, data_type=str), + default=(C.RELU, C.RELU), + help='Type of activation to use for each feed forward layer. Use "x:x" to specify ' + 'different values for encoder & decoder. Supported: {}. Default: ' + '%(default)s.'.format(' '.join(C.TRANSFORMER_ACTIVATION_TYPES))) model_params.add_argument('--transformer-positional-embedding-type', choices=C.POSITIONAL_EMBEDDING_TYPES, default=C.FIXED_POSITIONAL_EMBEDDING, @@ -715,23 +657,16 @@ def add_model_parameters(params): 'You can specify separate sequences for encoder and decoder by separating with ":" ' 'For example: n:drn ' 'Default: %(default)s.') - model_params.add_argument('--attention-based-copying', action="store_true", - help="Enables an attention-based copying mechanism. Supported only by RNN decoders. " - "This allows to explicitly declare pointers to source tokens in the target " - "sequence (format: )." - "Each pointer on the target side can point to any of the d input tokens, " - "e.g. points to the first source token.") # LHUC - # TODO: The convolutional model does not support lhuc yet model_params.add_argument('--lhuc', nargs="+", default=None, choices=C.LHUC_CHOICES, metavar="COMPONENT", help="Use LHUC (Vilar 2018). Include an amplitude parameter to hidden units for" - " domain adaptation. Needs a pre-trained model. Valid values: {values}. Currently not" - " supported for convolutional models. Default: %(default)s.".format( + " domain adaptation. Needs a pre-trained model. Valid values: {values}." + " Default: %(default)s.".format( values=", ".join(C.LHUC_CHOICES))) # embedding arguments @@ -749,76 +684,30 @@ def add_model_parameters(params): '(validation) source factor files. Default: %(default)s.') model_params.add_argument('--source-factors-combine', '-sfc', choices=C.SOURCE_FACTORS_COMBINE_CHOICES, - default=C.SOURCE_FACTORS_COMBINE_CONCAT, - help='How to combine source factors. Default: %(default)s.') - - # attention arguments - model_params.add_argument('--rnn-attention-type', - choices=C.ATT_TYPES, - default=C.ATT_MLP, - help='Attention model for RNN decoders. Choices: {%(choices)s}. 
' - 'Default: %(default)s.') - model_params.add_argument('--rnn-attention-num-hidden', - default=None, - type=int, - help='Number of hidden units for attention layers. Default: equal to --rnn-num-hidden.') - model_params.add_argument('--rnn-attention-use-prev-word', action="store_true", - help="Feed the previous target embedding into the attention mechanism.") - - model_params.add_argument('--rnn-scale-dot-attention', - action='store_true', - help='Optional scale before dot product. Only applicable to \'dot\' attention type. ' - '[Vaswani et al, 2017]') - - model_params.add_argument('--rnn-attention-coverage-type', - choices=C.COVERAGE_TYPES, - default=C.COVERAGE_COUNT, - help="Type of model for updating coverage vectors. 'count' refers to an update method " - "that accumulates attention scores. 'fertility' accumulates attention scores as well " - "but also computes a fertility value for every source word. " - "'tanh', 'sigmoid', 'relu', 'softrelu' " - "use non-linear layers with the respective activation type, and 'gru' uses a " - "GRU to update the coverage vectors. Default: %(default)s.") - model_params.add_argument('--rnn-attention-coverage-max-fertility', - type=int, - default=2, - help="Maximum fertility for individual source words. Default: %(default)s.") - model_params.add_argument('--rnn-attention-coverage-num-hidden', - type=int, - default=1, - help="Number of hidden units for coverage vectors. Default: %(default)s.") - model_params.add_argument('--rnn-attention-in-upper-layers', - action="store_true", - help="Pass the attention to the upper layers of the RNN decoder, similar " - "to GNMT paper. Only applicable if more than one layer is used.") - model_params.add_argument('--rnn-attention-mhdot-heads', - type=int, default=None, - help='Number of heads for Multi-head dot attention. Default: %(default)s.') + default=[C.SOURCE_FACTORS_COMBINE_CONCAT], + nargs='+', + help='How to combine source factors. Can be either one value which will be applied to all ' + 'source factors, or a list of values. Default: %(default)s.') + model_params.add_argument('--source-factors-share-embedding', + type=bool_str(), + nargs='+', + default=[False], + help='Share the embeddings with the source language. Can be either one value which will be ' + 'applied to all source factors, or a list of values. Default: do not share.') - model_params.add_argument('--weight-tying', - action='store_true', - help='Turn on weight tying (see arxiv.org/abs/1608.05859). ' - 'The type of weight sharing is determined through ' - '--weight-tying-type. Default: %(default)s.') model_params.add_argument('--weight-tying-type', - default=C.WEIGHT_TYING_TRG_SOFTMAX, - choices=[C.WEIGHT_TYING_SRC_TRG_SOFTMAX, - C.WEIGHT_TYING_SRC_TRG, - C.WEIGHT_TYING_TRG_SOFTMAX], + default=C.WEIGHT_TYING_SRC_TRG_SOFTMAX, + choices=C.WEIGHT_TYING_TYPES, help='The type of weight tying. source embeddings=src, target embeddings=trg, ' 'target softmax weight matrix=softmax. Default: %(default)s.') - model_params.add_argument('--layer-normalization', action="store_true", - help="Adds layer normalization before non-linear activations. " - "This includes MLP attention, RNN decoder state initialization, " - "RNN decoder hidden state, and cnn layers." - "It does not normalize RNN cell activations " - "(this can be done using the '%s' or '%s' rnn-cell-type." 
% (C.LNLSTM_TYPE, - C.LNGLSTM_TYPE)) + model_params.add_argument('--dtype', default=C.DTYPE_FP32, choices=[C.DTYPE_FP32, C.DTYPE_FP16], + help="Data type.") - model_params.add_argument('--weight-normalization', action="store_true", - help="Adds weight normalization to decoder output layers " - "(and all convolutional weight matrices for CNN decoders). Default: %(default)s.") + model_params.add_argument('--amp', action='store_true', help='Use MXNet\'s automatic mixed precision (AMP).') + model_params.add_argument('--amp-scale-interval', type=int, default=2000, + help='Attempt to increase loss scale after this many updates without overflow. ' + 'Default: %(default)s.') def add_batch_args(params, default_batch_size=4096): @@ -837,6 +726,19 @@ def add_batch_args(params, default_batch_size=4096): help="Sentence: each batch contains X sentences, number of words varies." "Word: each batch contains (approximately) X target words, " "number of sentences varies. Default: %(default)s.") + params.add_argument('--round-batch-sizes-to-multiple-of', + type=int, + default=1, + help='For word-based batches, round each bucket\'s batch size (measured in sentences) to a ' + 'multiple of this integer. Default: %(default)s.') + + + +def add_hybridization_arg(params): + params.add_argument('--no-hybridization', + action='store_true', + help='Turn off hybridization. Hybridization builds a static computation graph and computations will therefore be faster. ' + 'The downside is that one can not set breakpoints to inspect intermediate results. Default: %(default)s.') def add_training_args(params): @@ -844,11 +746,6 @@ def add_training_args(params): add_batch_args(train_params) - train_params.add_argument('--decoder-only', - action='store_true', - help='Pre-train a decoder. This is currently for RNN decoders only. ' - 'Default: %(default)s.') - train_params.add_argument('--loss', default=C.CROSS_ENTROPY, choices=[C.CROSS_ENTROPY], @@ -857,11 +754,6 @@ def add_training_args(params): default=0.1, type=float, help='Smoothing constant for label smoothing. Default: %(default)s.') - train_params.add_argument('--loss-normalization-type', - default=C.LOSS_NORM_VALID, - choices=[C.LOSS_NORM_VALID, C.LOSS_NORM_BATCH], - help='How to normalize the loss. By default loss is normalized by the number ' - 'of valid (non-PAD) tokens (%s).' % C.LOSS_NORM_VALID) train_params.add_argument('--length-task', type=str, @@ -878,33 +770,21 @@ def add_training_args(params): default=1, help='Number of fully-connected layers for predicting the length ratio. Default %(default)s.') - train_params.add_argument('--metrics', - nargs='+', - default=[C.PERPLEXITY], - choices=[C.PERPLEXITY, C.ACCURACY, C.LENRATIO_MSE], - help='Names of metrics to track on training and validation data. Default: %(default)s.') train_params.add_argument('--optimized-metric', default=C.PERPLEXITY, choices=C.METRICS, help='Metric to optimize with early stopping {%(choices)s}. Default: %(default)s.') - train_params.add_argument('--min-updates', - type=int, - default=None, - help='Minimum number of updates before training can stop. Default: %(default)s.') - train_params.add_argument('--max-updates', - type=int, - default=None, - help='Maximum number of updates. Default: %(default)s.') - train_params.add_argument('--max-seconds', - type=int, - default=None, - help='Training will stop on the next checkpoint after reaching the maximum seconds. 
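For context on the new `--no-hybridization` flag: hybridizing a Gluon block compiles a static graph, which speeds up computation but bypasses the Python code where one would set breakpoints. A minimal sketch using the standard MXNet Gluon API:

```python
import mxnet as mx

class TinyBlock(mx.gluon.HybridBlock):
    def hybrid_forward(self, F, x):
        # F is mx.nd before hybridization and mx.sym afterwards; breakpoints
        # here stop firing once the static graph has been built.
        return F.relu(x)

net = TinyBlock()
net.initialize()
net.hybridize()  # omit this call to mimic the effect of --no-hybridization
print(net(mx.nd.array([-1.0, 2.0])))  # [0. 2.]
```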
' 'Default: %(default)s.') train_params.add_argument('--update-interval', type=int, default=1, help="Number of batch gradients to accumulate before updating. Default: %(default)s.") + train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL, + type=int_greater_or_equal(1), + default=4000, + help='Checkpoint and evaluate every x updates (update-interval * batches). ' + 'Default: %(default)s.') + train_params.add_argument('--min-samples', type=int, default=None, @@ -913,29 +793,38 @@ type=int, default=None, help='Maximum number of samples. Default: %(default)s.') - train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_INTERVAL, - type=int_greater_or_equal(1), - default=4000, - help='Checkpoint and evaluate every x updates/batches. Default: %(default)s.') - train_params.add_argument(C.TRAIN_ARGS_CHECKPOINT_FREQUENCY, - type=int_greater_or_equal(1), - dest="checkpoint_interval", - deprecated_dest="checkpoint_frequency", - action=StoreDeprecatedAction, - default=argparse.SUPPRESS, - help=argparse.SUPPRESS) - train_params.add_argument('--max-num-checkpoint-not-improved', + train_params.add_argument('--min-updates', type=int, - default=32, - help='Maximum number of checkpoints the model is allowed to not improve in ' - '<optimized-metric> on validation data before training is stopped. ' + default=None, + help='Minimum number of updates before training can stop. Default: %(default)s.') + train_params.add_argument('--max-updates', + type=int, + default=None, + help='Maximum number of updates. Default: %(default)s.') + train_params.add_argument('--max-seconds', + type=int, + default=None, + help='Training will stop on the next checkpoint after reaching the maximum seconds. ' 'Default: %(default)s.') + train_params.add_argument('--max-checkpoints', type=int, default=None, help='Maximum number of checkpoints to continue training the model ' 'before training is stopped. ' 'Default: %(default)s.') + train_params.add_argument('--max-num-checkpoint-not-improved', + type=int, + default=None, + help='Maximum number of checkpoints the model is allowed to not improve in ' + '<optimized-metric> on validation data before training is stopped. ' + 'Default: %(default)s.') + train_params.add_argument('--checkpoint-improvement-threshold', + type=float, + default=0., + help='Improvement in <optimized-metric> over specified number of checkpoints must exceed ' + 'this value to be considered actual improvement. Default: %(default)s.') + train_params.add_argument('--min-num-epochs', type=int, default=None, @@ -949,53 +838,23 @@ train_params.add_argument('--embed-dropout', type=multiple_values(2, data_type=float), default=(.0, .0), - help='Dropout probability for source & target embeddings. Use "x:x" to specify ' - 'separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-inputs', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='RNN variational dropout probability for encoder & decoder RNN inputs. (Gal, 2015)' - 'Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-states', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='RNN variational dropout probability for encoder & decoder RNN states. (Gal, 2015)' - 'Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-dropout-recurrent', - type=multiple_values(2, data_type=float), - default=(.0, .0), - help='Recurrent dropout without memory loss (Semeniuta, 2016) for encoder & decoder ' - 'LSTMs. 
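A toy illustration of how `--max-num-checkpoint-not-improved` and the new `--checkpoint-improvement-threshold` interact; `should_stop` is a hypothetical helper, not Sockeye's actual early-stopping code:

```python
def should_stop(history, patience, threshold=0.0):
    """history: one validation perplexity per checkpoint (lower is better)."""
    if patience is None or len(history) <= patience:
        return False
    best_recent = min(history[-patience:])
    best_before = min(history[:-patience])
    # Stop if the last `patience` checkpoints improved by no more than `threshold`.
    return best_before - best_recent <= threshold

print(should_stop([10.0, 9.5, 9.49, 9.48, 9.47], patience=3, threshold=0.05))  # True
```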
Use "x:x" to specify separate values. Default: %(default)s.') - train_params.add_argument('--rnn-enc-last-hidden-concat-to-embedding', - action="store_true", - help='Concatenate the last hidden layer of the encoder to the input of the decoder, ' - 'instead of the previous state of the decoder. Default: %(default)s.') - - train_params.add_argument('--rnn-decoder-hidden-dropout', - type=float, - default=.2, - help='Dropout probability for hidden state that combines the context with the ' - 'RNN hidden state in the decoder. Default: %(default)s.') + help='Dropout probability for source & target embeddings. Use "x:x" to specify separate ' + 'values. Default: %(default)s.') train_params.add_argument('--transformer-dropout-attention', - type=float, - default=0.1, - help='Dropout probability for multi-head attention. Default: %(default)s.') + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability for multi-head attention. Use "x:x" to specify separate ' + 'values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--transformer-dropout-act', - type=float, - default=0.1, - help='Dropout probability before activation in feed-forward block. Default: %(default)s.') + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability before activation in feed-forward block. Use "x:x" to specify ' + 'separate values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--transformer-dropout-prepost', - type=float, - default=0.1, - help='Dropout probability for pre/postprocessing blocks. Default: %(default)s.') - train_params.add_argument('--conv-embed-dropout', - type=float, - default=.0, - help="Dropout probability for ConvolutionalEmbeddingEncoder. Default: %(default)s.") - train_params.add_argument('--cnn-hidden-dropout', - type=float, - default=.2, - help="Dropout probability for dropout between convolutional layers. Default: %(default)s.") + type=multiple_values(2, data_type=float), + default=(0.1, 0.1), + help='Dropout probability for pre/postprocessing blocks. Use "x:x" to specify separate ' + 'values for encoder & decoder. Default: %(default)s.') train_params.add_argument('--optimizer', default=C.OPTIMIZER_ADAM, @@ -1006,6 +865,12 @@ def add_training_args(params): default=None, help='Additional optimizer params as dictionary. Format: key1:value1,key2:value2,...') + train_params.add_argument('--horovod', + action='store_true', + help='Use Horovod/OpenMPI for distributed training (Sergeev and Del Balso 2018, ' + 'arxiv.org/abs/1802.05799). When using this option, run Sockeye with `horovodrun ' + '-np ... -H ... python`.') + train_params.add_argument("--kvstore", type=str, default=C.KVSTORE_DEVICE, @@ -1013,15 +878,6 @@ def add_training_args(params): help="The MXNet kvstore to use. 'device' is recommended for single process training. " "Use any of 'dist_sync', 'dist_device_sync' and 'dist_async' for distributed " "training. Default: %(default)s.") - train_params.add_argument("--gradient-compression-type", - type=str, - default=C.GRADIENT_COMPRESSION_NONE, - choices=C.GRADIENT_COMPRESSION_TYPES, - help='Type of gradient compression to use. Default: %(default)s.') - train_params.add_argument("--gradient-compression-threshold", - type=float, - default=0.5, - help="Threshold for gradient compression if --gctype is '2bit'. 
Default: %(default)s.") train_params.add_argument('--weight-init', type=str, @@ -1043,13 +899,6 @@ def add_training_args(params): default=C.RAND_TYPE_UNIFORM, choices=[C.RAND_TYPE_UNIFORM, C.RAND_TYPE_GAUSSIAN], help='Xavier random number generator type. Default: %(default)s.') - train_params.add_argument('--embed-weight-init', - type=str, - default=C.EMBED_INIT_DEFAULT, - choices=C.EMBED_INIT_TYPES, - help='Type of embedding matrix weight initialization. If normal, initializes embedding ' - 'weights using a normal distribution with std=1/srqt(vocab_size). ' - 'Default: %(default)s.') train_params.add_argument('--initial-learning-rate', type=float, default=0.0002, @@ -1076,9 +925,14 @@ def add_training_args(params): default=C.LR_SCHEDULER_PLATEAU_REDUCE, choices=C.LR_SCHEDULERS, help='Learning rate scheduler type. Default: %(default)s.') + train_params.add_argument('--learning-rate-t-scale', + type=float, + default=1.0, + help="Step number is multiplied by this value when determining learning rate for the " + "current step. Default: %(default)s.") train_params.add_argument('--learning-rate-reduce-factor', type=float, - default=0.7, + default=0.9, help="Factor to multiply learning rate with " "(for 'plateau-reduce' learning rate scheduler). Default: %(default)s.") train_params.add_argument('--learning-rate-reduce-num-not-improved', @@ -1086,40 +940,11 @@ def add_training_args(params): default=8, help="For 'plateau-reduce' learning rate scheduler. Adjust learning rate " "if did not improve for x checkpoints. Default: %(default)s.") - train_params.add_argument('--learning-rate-schedule', - type=learning_schedule(), - default=None, - help="For 'fixed-step' scheduler. Fully specified learning schedule in the form" - " \"rate1:num_updates1[,rate2:num_updates2,...]\". Overrides all other args related" - " to learning rate and stopping conditions. Default: %(default)s.") - train_params.add_argument('--learning-rate-half-life', - type=float, - default=10, - help="Half-life of learning rate in checkpoints. For 'fixed-rate-*' " - "learning rate schedulers. Default: %(default)s.") train_params.add_argument('--learning-rate-warmup', type=int, default=0, help="Number of warmup steps. If set to x, linearly increases learning rate from 10%% " "to 100%% of the initial learning rate. Default: %(default)s.") - train_params.add_argument('--learning-rate-decay-param-reset', - action='store_true', - help='Resets model parameters to current best when learning rate is reduced due to the ' - 'value of --learning-rate-reduce-num-not-improved. Default: %(default)s.') - train_params.add_argument('--learning-rate-decay-optimizer-states-reset', - choices=C.LR_DECAY_OPT_STATES_RESET_CHOICES, - default=C.LR_DECAY_OPT_STATES_RESET_OFF, - help="Action to take on optimizer states (e.g. Adam states) when learning rate is " - "reduced due to the value of --learning-rate-reduce-num-not-improved. " - "Default: %(default)s.") - - train_params.add_argument('--rnn-forget-bias', - default=0.0, - type=float, - help='Initial value of RNN forget biases.') - train_params.add_argument('--rnn-h2h-init', type=str, default=C.RNN_INIT_ORTHOGONAL, - choices=[C.RNN_INIT_ORTHOGONAL, C.RNN_INIT_ORTHOGONAL_STACKED, C.RNN_INIT_DEFAULT], - help="Initialization method for RNN parameters. Default: %(default)s.") train_params.add_argument('--fixed-param-strategy', default=None, @@ -1137,10 +962,7 @@ def add_training_args(params): type=int, help='x>0: decode x sampled sentences from validation data and ' 'compute evaluation metrics. 
x==-1: use full validation data. Default: %(default)s.') - train_params.add_argument('--decode-and-evaluate-use-cpu', - action='store_true', - help='Use CPU for decoding validation data. Overrides --decode-and-evaluate-device-id. ' - 'Default: %(default)s.') + train_params.add_argument('--decode-and-evaluate-device-id', default=None, type=int, @@ -1155,7 +977,7 @@ def add_training_args(params): train_params.add_argument('--seed', type=int, - default=13, + default=1, help='Random seed. Default: %(default)s.') train_params.add_argument('--keep-last-params', @@ -1179,12 +1001,14 @@ def add_train_cli_args(params): add_training_args(params) add_device_args(params) add_logging_args(params) + add_hybridization_arg(params) def add_translate_cli_args(params): add_inference_args(params) add_device_args(params) add_logging_args(params) + add_hybridization_arg(params) def add_score_cli_args(params): @@ -1192,24 +1016,19 @@ def add_score_cli_args(params): add_vocab_args(params) add_device_args(params) add_batch_args(params, default_batch_size=500) + add_hybridization_arg(params) params = params.add_argument_group("Scoring parameters") params.add_argument("--model", "-m", required=True, help="Model directory containing trained model.") - params.add_argument('--max-seq-len', + params.add_argument(C.TRAINING_ARG_MAX_SEQ_LEN, type=multiple_values(num_values=2, greater_or_equal=1), default=None, help='Maximum sequence length in tokens.' 'Use "x:x" to specify separate values for src&tgt. Default: Read from model.') - params.add_argument('--softmax-temperature', - type=float, - default=None, - help='Controls peakiness of model predictions. Values < 1.0 produce ' - 'peaked predictions, values > 1.0 produce smoothed distributions.') - # common params with translate CLI add_length_penalty_args(params) add_brevity_penalty_args(params) @@ -1227,15 +1046,10 @@ def add_score_cli_args(params): default=C.SCORING_TYPE_DEFAULT, help='Score type to output. Default: %(default)s') - add_logging_args(params) - + params.add_argument('--dtype', default=None, choices=[None, C.DTYPE_FP32, C.DTYPE_FP16, C.DTYPE_INT8], + help="Data type. Default: %(default)s infers from saved model.") -def add_max_output_cli_args(params): - params.add_argument('--max-output-length', - type=int, - default=None, - help='Maximum number of words to generate during translation. ' - 'If None, it will be computed automatically. Default: %(default)s.') + add_logging_args(params) def add_inference_args(params): @@ -1288,12 +1102,6 @@ def add_inference_args(params): default=5, help='Size of the beam. Default: %(default)s.') - decode_params.add_argument('--beam-prune', '-p', - type=float, - default=0, - help='Pruning threshold for beam search. All hypotheses with scores not within ' - 'this amount of the best finished hypothesis are discarded (0 = off). ' - 'Default: %(default)s.') decode_params.add_argument('--beam-search-stop', choices=[C.BEAM_SEARCH_STOP_ALL, C.BEAM_SEARCH_STOP_FIRST], default=C.BEAM_SEARCH_STOP_ALL, @@ -1313,11 +1121,6 @@ def add_inference_args(params): ' Default: %d without batching ' 'and %d * batch_size with batching.' % (C.CHUNK_SIZE_NO_BATCHING, C.CHUNK_SIZE_PER_BATCH_SEGMENT)) - decode_params.add_argument('--skip-topk', - default=False, - action='store_true', - help='Use argmax instead of topk for greedy decoding (when --beam-size 1).' 
- 'Default: %(default)s.') decode_params.add_argument('--sample', type=int_greater_or_equal(0), default=None, @@ -1338,21 +1141,21 @@ def add_inference_args(params): type=int_greater_or_equal(0), default=10, help='Bucket width for encoder steps. 0 means no bucketing. Default: %(default)s.') - decode_params.add_argument('--max-input-len', '-n', - type=int, + decode_params.add_argument('--max-input-length', + type=int_greater_or_equal(1), default=None, help='Maximum input sequence length. Default: value from model(s).') - decode_params.add_argument('--softmax-temperature', - type=float, - default=None, - help='Controls peakiness of model predictions. Values < 1.0 produce ' - 'peaked predictions, values > 1.0 produce smoothed distributions.') decode_params.add_argument('--max-output-length-num-stds', type=int, default=C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH, help='Number of target-to-source length ratio standard deviations from training to add ' 'to calculate maximum output length for beam search for each sentence. ' 'Default: %(default)s.') + decode_params.add_argument('--max-output-length', + type=int_greater_or_equal(1), + default=None, + help='Maximum number of words to generate during translation. ' + 'If None, it will be computed automatically. Default: %(default)s.') decode_params.add_argument('--restrict-lexicon', nargs='+', type=multiple_values(num_values=2, data_type=str), @@ -1383,20 +1186,13 @@ def add_inference_args(params): default='translation', choices=C.OUTPUT_HANDLERS, help='Output type. Default: %(default)s.') - decode_params.add_argument('--sure-align-threshold', - default=0.9, - type=float, - help='Threshold to consider a soft alignment a sure alignment. Default: %(default)s.') # common params with score CLI add_length_penalty_args(decode_params) add_brevity_penalty_args(decode_params) - decode_params.add_argument('--override-dtype', - default=None, - type=str, - help='EXPERIMENTAL: may be changed or removed in future. Overrides training dtype of ' - 'encoders and decoders during inference. Default: %(default)s.') + decode_params.add_argument('--dtype', default=None, choices=[None, C.DTYPE_FP32, C.DTYPE_FP16, C.DTYPE_INT8], + help="Data type. Default: %(default)s infers from saved model.") def add_length_penalty_args(params): diff --git a/sockeye/average.py b/sockeye/average.py index 465a2ddd9..9c45d2356 100644 --- a/sockeye/average.py +++ b/sockeye/average.py @@ -1,4 +1,4 @@ -# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -41,30 +41,21 @@ def average(param_paths: Iterable[str]) -> Dict[str, mx.nd.NDArray]: :param param_paths: List of paths to parameter files. :return: Averaged parameter dictionary. 
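The simplified averaging below can be pictured with plain numpy; a toy stand-in for the `mx.nd` parameter dictionaries that `average()` actually loads:

```python
import numpy as np

checkpoints = [
    {"w": np.array([1.0, 2.0]), "b": np.array([0.0])},
    {"w": np.array([3.0, 4.0]), "b": np.array([2.0])},
]
# Element-wise mean per parameter name, as in the loop below.
avg = {name: sum(c[name] for c in checkpoints) / len(checkpoints)
       for name in checkpoints[0]}
print(avg["w"], avg["b"])  # [2. 3.] [1.]
```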
""" - all_arg_params = [] - all_aux_params = [] + all_params = [] # type: List[Dict[str, mx.nd.NDArray]] for path in param_paths: logger.info("Loading parameters from '%s'", path) - arg_params, aux_params = utils.load_params(path) - all_arg_params.append(arg_params) - all_aux_params.append(aux_params) + params = mx.nd.load(path) + all_params.append(params) - logger.info("%d models loaded", len(all_arg_params)) - utils.check_condition(all(all_arg_params[0].keys() == p.keys() for p in all_arg_params), - "arg_param names do not match across models") - utils.check_condition(all(all_aux_params[0].keys() == p.keys() for p in all_aux_params), - "aux_param names do not match across models") + logger.info("%d models loaded", len(all_params)) + utils.check_condition(all(all_params[0].keys() == p.keys() for p in all_params), + "param names do not match across models") avg_params = {} # average arg_params - for k in all_arg_params[0]: - arrays = [p[k] for p in all_arg_params] - avg_params["arg:" + k] = utils.average_arrays(arrays) - # average aux_params - for k in all_aux_params[0]: - arrays = [p[k] for p in all_aux_params] - avg_params["aux:" + k] = utils.average_arrays(arrays) - + for k in all_params[0]: + arrays = [p[k] for p in all_params] + avg_params[k] = utils.average_arrays(arrays) return avg_params diff --git a/sockeye/beam_search.py b/sockeye/beam_search.py new file mode 100644 index 000000000..9fb818878 --- /dev/null +++ b/sockeye/beam_search.py @@ -0,0 +1,784 @@ +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not +# use this file except in compliance with the License. A copy of the License +# is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +# express or implied. See the License for the specific language governing +# permissions and limitations under the License. + +import logging +import functools +import operator +from abc import abstractmethod, ABC +from typing import Tuple, Optional, List, Union + +import mxnet as mx +import numpy as np + +from . import constants as C +from . import lexical_constraints as constrained +from . import lexicon +from . import utils +from . 
import vocab +from .model import SockeyeModel + +logger = logging.getLogger(__name__) + + +class _Inference(ABC): + + @abstractmethod + def state_structure(self): + raise NotImplementedError() + + @abstractmethod + def encode_and_initialize(self, + inputs: mx.nd.NDArray, + valid_length: Optional[mx.nd.NDArray] = None): + raise NotImplementedError() + + @abstractmethod + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + raise NotImplementedError() + + +class _SingleModelInference(_Inference): + + def __init__(self, + model: SockeyeModel, + skip_softmax: bool = False, + constant_length_ratio: float = 0.0) -> None: + self._model = model + self._skip_softmax = skip_softmax + self._const_lr = constant_length_ratio + + def state_structure(self) -> List: + return [self._model.state_structure()] + + def encode_and_initialize(self, inputs: mx.nd.NDArray, valid_length: Optional[mx.nd.NDArray] = None): + states, predicted_output_length = self._model.encode_and_initialize(inputs, valid_length, self._const_lr) + predicted_output_length = predicted_output_length.expand_dims(axis=1) + return states, predicted_output_length + + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + logits, states, _ = self._model.decode_step(step_input, states, vocab_slice_ids) + if not self._skip_softmax: + logits = logits.log_softmax(axis=-1) + scores = -logits + return scores, states + + +class _EnsembleInference(_Inference): + + def __init__(self, + models: List[SockeyeModel], + ensemble_mode: str = 'linear', + constant_length_ratio: float = 0.0) -> None: + self._models = models + if ensemble_mode == 'linear': + self._interpolation = self.linear_interpolation + elif ensemble_mode == 'log_linear': + self._interpolation = self.log_linear_interpolation + else: + raise ValueError() + self._const_lr = constant_length_ratio + + def state_structure(self) -> List: + structure = [] + for model in self._models: + structure.append(model.state_structure()) + return structure + + def encode_and_initialize(self, inputs: mx.nd.NDArray, valid_length: Optional[mx.nd.NDArray] = None): + model_states = [] # type: List[mx.nd.NDArray] + predicted_output_lengths = [] # type: List[mx.nd.NDArray] + for model in self._models: + states, predicted_output_length = model.encode_and_initialize(inputs, valid_length, self._const_lr) + predicted_output_lengths.append(predicted_output_length) + model_states += states + # average predicted output lengths, (batch, 1) + predicted_output_lengths = mx.nd.mean(mx.nd.stack(*predicted_output_lengths, axis=1), axis=1, keepdims=True) + return model_states, predicted_output_lengths + + def decode_step(self, + step_input: mx.nd.NDArray, + states: List, + vocab_slice_ids: Optional[mx.nd.NDArray] = None): + outputs = [] # type: List[mx.nd.NDArray] + new_states = [] # type: List[mx.nd.NDArray] + state_index = 0 + for model, model_state_structure in zip(self._models, self.state_structure()): + model_states = states[state_index:state_index+len(model_state_structure)] + state_index += len(model_state_structure) + logits, model_states, _ = model.decode_step(step_input, model_states, vocab_slice_ids) + probs = logits.softmax(axis=-1) + outputs.append(probs) + new_states += model_states + scores = self._interpolation(outputs) + return scores, new_states + + @staticmethod + def linear_interpolation(predictions): + return -mx.nd.log(utils.average_arrays(predictions)) # pylint: 
disable=invalid-unary-operand-type + + @staticmethod + def log_linear_interpolation(predictions): + log_probs = utils.average_arrays([p.log() for p in predictions]) + return -log_probs.log_softmax() # pylint: disable=invalid-unary-operand-type + + +class UpdateScores(mx.gluon.HybridBlock): + """ + A HybridBlock that updates the scores from the decoder step with accumulated scores. + Inactive hypotheses receive score inf. Finished hypotheses receive their accumulated score for C.PAD_ID. + Hypotheses at maximum length are forced to produce C.EOS_ID. + All other options are set to infinity. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + assert C.PAD_ID == 0, "This block only works with PAD_ID == 0" + + def hybrid_forward(self, F, + target_dists, finished, inactive, + scores_accumulated, lengths, max_lengths, + pad_dist, eos_dist): + # broadcast hypothesis score to each prediction. + # scores_accumulated. Shape: (batch*beam, 1) + # target_dists. Shape: (batch*beam, vocab_size) + scores = F.broadcast_add(target_dists, scores_accumulated) + + # Special treatment for finished and inactive rows. Inactive rows are inf everywhere; + # finished rows are inf everywhere except column zero (pad_id), which holds the accumulated model score. + # Items that are finished (but not inactive) get their previous accumulated score for the <pad> symbol, + # infinity otherwise. + # pad_dist. Shape: (batch*beam, vocab_size) + pad_dist = F.concat(scores_accumulated, pad_dist) + scores = F.where(F.broadcast_logical_or(finished, inactive), pad_dist, scores) + + # Update lengths of all items, except those that were already finished. This updates + # the lengths for inactive items, too, but that doesn't matter since they are ignored anyway. + lengths = lengths + (1 - finished) + + # Items that are at their maximum length and not finished now are forced to produce the <eos> symbol. + # That is, we keep scores for hypotheses below max length or finished, and 'force-eos' the rest. + below_max_length = lengths < max_lengths + scores = F.where(F.broadcast_logical_or(below_max_length, finished), scores, eos_dist + scores) + + return scores, lengths + + +class LengthPenalty(mx.gluon.HybridBlock): + """ + Calculates the length penalty as: + (beta + len(Y))**alpha / (beta + 1)**alpha + + See Wu et al. 2016 (note that in the paper beta has a different meaning, + and a fixed value 5 was used for this parameter) + + :param alpha: The alpha factor for the length penalty (see above). + :param beta: The beta factor for the length penalty (see above). + """ + + def __init__(self, alpha: float = 1.0, beta: float = 0.0, **kwargs) -> None: + super().__init__(**kwargs) + self.alpha = alpha + self.beta = beta + self.denominator = (self.beta + 1.) ** self.alpha + + def forward(self, lengths): + if isinstance(lengths, mx.nd.NDArray) or isinstance(lengths, mx.sym.Symbol): + return super().forward(lengths) + else: + return self.hybrid_forward(None, lengths) + + def hybrid_forward(self, F, lengths): + if self.alpha == 0.0: + if F is None: + return 1.0 + else: + return F.ones_like(lengths) + else: + numerator = self.beta + lengths if self.beta != 0.0 else lengths + numerator = numerator ** self.alpha if self.alpha != 1.0 else numerator + return numerator / self.denominator + + +class BrevityPenalty(mx.gluon.HybridBlock): + """ + Calculates the logarithmic brevity penalty as: + weight * log min(1, exp(1 - ref_len / hyp_len)) = weight * min(0, 1 - ref_len / hyp_len). + + :param weight: Linear weight. 
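Plugging numbers into the two penalty formulas above (a plain-Python check, with alpha=1.0, beta=0.0 and weight=1.0 chosen for illustration):

```python
alpha, beta, weight = 1.0, 0.0, 1.0

def length_penalty(n):
    return (beta + n) ** alpha / (beta + 1.0) ** alpha

print([length_penalty(n) for n in (1, 10, 20)])  # [1.0, 10.0, 20.0]

# Log brevity penalty: weight * min(0, 1 - ref_len / hyp_len).
print(weight * min(0.0, 1.0 - 12 / 10))  # -0.2 for a too-short hypothesis
```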
+ """ + + def __init__(self, weight: float = 0.0, **kwargs) -> None: + super().__init__(**kwargs) + self.weight = weight + + def forward(self, hyp_lengths, reference_lengths): + if isinstance(hyp_lengths, mx.nd.NDArray) or isinstance(hyp_lengths, mx.sym.Symbol): + return super().forward(hyp_lengths, reference_lengths) + else: + return self.hybrid_forward(None, hyp_lengths, reference_lengths) + + def hybrid_forward(self, F, hyp_lengths, reference_lengths): + if self.weight == 0.0: + if F is None: + return 0.0 + else: + # subtract to avoid MxNet's warning of not using both arguments + # this branch should not and is not used during inference + return F.zeros_like(hyp_lengths - reference_lengths) + else: + # log_bp is always <= 0.0 + if F is None: + log_bp = min(0.0, 1.0 - reference_lengths / hyp_lengths) + else: + log_bp = F.minimum(F.zeros_like(hyp_lengths), 1.0 - reference_lengths / hyp_lengths) + return self.weight * log_bp + + +class CandidateScorer(mx.gluon.HybridBlock): + + def __init__(self, + length_penalty_alpha: float = 1.0, + length_penalty_beta: float = 0.0, + brevity_penalty_weight: float = 0.0, + **kwargs) -> None: + super().__init__(**kwargs) + with self.name_scope(): + self._lp = LengthPenalty(alpha=length_penalty_alpha, beta=length_penalty_beta) + self._bp = None # type: Optional[BrevityPenalty] + if brevity_penalty_weight > 0.0: + self._bp = BrevityPenalty(weight=brevity_penalty_weight) + + def forward(self, scores, lengths, reference_lengths): + if isinstance(scores, mx.nd.NDArray) or isinstance(scores, mx.sym.Symbol): + return super().forward(scores, lengths, reference_lengths) + else: + return self.hybrid_forward(None, scores, lengths, reference_lengths) + + def hybrid_forward(self, F, scores, lengths, reference_lengths): + lp = self._lp(lengths) + if self._bp is not None: + bp = self._bp(lengths, reference_lengths) + else: + if F is None: + bp = 0.0 + else: + # avoid warning for unused input + bp = F.zeros_like(reference_lengths) if reference_lengths is not None else 0.0 + return scores / lp - bp + + def unnormalize(self, scores, lengths, reference_lengths): + bp = 0.0 if self._bp is None else self._bp(lengths, reference_lengths) + return (scores + bp) * self._lp(lengths) + + +class SortNormalizeAndUpdateFinished(mx.gluon.HybridBlock): + """ + A HybridBlock for normalizing newly finished hypotheses scores with LengthPenalty. + """ + + def __init__(self, + dtype: str, + pad_id: int, + eos_id: int, + scorer: CandidateScorer, + **kwargs) -> None: + super().__init__(**kwargs) + self.dtype = dtype + self.pad_id = pad_id + self.eos_id = eos_id + self._scorer = scorer + + def hybrid_forward(self, F, best_hyp_indices, best_word_indices, + finished, scores_accumulated, lengths, reference_lengths): + + # Reorder fixed-size beam data according to best_hyp_indices (ascending) + finished = F.take(finished, best_hyp_indices) + lengths = F.take(lengths, best_hyp_indices) + reference_lengths = F.take(reference_lengths, best_hyp_indices) + + # Normalize hypotheses that JUST finished + all_finished = F.broadcast_logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id) + newly_finished = F.broadcast_logical_xor(all_finished, finished) + scores_accumulated = F.where(newly_finished, + self._scorer(scores_accumulated, + F.cast(F.expand_dims(lengths, axis=1), self.dtype), + reference_lengths), + scores_accumulated) + + # Recompute finished. 
Hypotheses are finished if they are extended with <pad> or <eos> + finished = F.broadcast_logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id) + + return finished, scores_accumulated, lengths, reference_lengths + + +class TopK(mx.gluon.HybridBlock): + """ + Batch-wise topk operation. + Forward method uses imperative shape inference, since both batch_size and vocab_size are dynamic + during translation (due to variable batch size and potential vocabulary selection). + """ + + def __init__(self, k: int, **kwargs) -> None: + """ + :param k: The number of smallest scores to return. + """ + super().__init__(**kwargs) + self.k = k + + def forward(self, scores, offset): + """ + Get the lowest k elements per sentence from a `scores` matrix. + + :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size) + :param offset: Array to add to the hypothesis indices for offsetting in batch decoding. + :return: The row indices, column indices and values of the k smallest items in matrix. + """ + vocab_size = scores.shape[1] + batch_size = int(offset.shape[-1] / self.k) + # Shape: (batch size, beam_size * vocab_size) + batchwise_scores = scores.reshape(shape=(batch_size, self.k * vocab_size)) + indices, values = super().forward(batchwise_scores) + best_hyp_indices, best_word_indices = mx.nd.unravel_index(indices, shape=(batch_size * self.k, vocab_size)) + if batch_size > 1: + # Offsetting the indices to match the shape of the scores matrix + best_hyp_indices += offset + return best_hyp_indices, best_word_indices, values + + def hybrid_forward(self, F, scores): + values, indices = F.topk(scores, axis=1, k=self.k, ret_typ='both', is_ascend=True) + # Project indices back into original shape (which is different for t==1 and t>1) + return F.reshape(F.cast(indices, 'int32'), shape=(-1,)), F.reshape(values, shape=(-1, 1)) + + +class SampleK(mx.gluon.HybridBlock): + """ + A HybridBlock for selecting a random word from each hypothesis according to its distribution. + """ + def __init__(self, n, **kwargs) -> None: + super().__init__(**kwargs) + self.n = n + + def hybrid_forward(self, F, scores, target_dists, finished, best_hyp_indices): + """ + Choose an extension of each hypothesis from its softmax distribution. + + :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size) + :param target_dists: The non-cumulative target distributions (ignored). + :param finished: The list of finished hypotheses. + :param best_hyp_indices: Best hypothesis indices constant. + :return: The row indices, column indices, and values of the sampled words. + """ + # Map the negative logprobs to probabilities so as to have a distribution + target_dists = F.exp(-target_dists) + + # n == 0 means sample from the full vocabulary. Otherwise, we sample from the top n. + if self.n != 0: + # select the top n in each row, via a mask + masked_items = F.topk(target_dists, k=self.n, ret_typ='mask', axis=1, is_ascend=False) + # set unmasked items to 0 + masked_items = F.where(masked_items, target_dists, masked_items) + # renormalize + target_dists = F.broadcast_div(masked_items, F.sum(masked_items, axis=1, keepdims=True)) + + # Sample from the target distributions over words, then get the corresponding values from the cumulative scores + best_word_indices = F.random.multinomial(target_dists, get_prob=False) + # Zeroes for finished hypotheses. 
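The batch-wise trick in the `TopK` block above, sketched in numpy with illustrative shapes (not Sockeye's exact code):

```python
import numpy as np

batch, beam, vocab = 2, 3, 5
scores = np.random.rand(batch * beam, vocab)      # stand-in for neg. log-probs
flat = scores.reshape(batch, beam * vocab)        # one row per sentence
best = np.argsort(flat, axis=1)[:, :beam]         # beam smallest per sentence
hyp, word = np.unravel_index(best.ravel(), (beam, vocab))
hyp = hyp + np.repeat(np.arange(batch) * beam, beam)  # offset into batch*beam rows
```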
+ best_word_indices = F.where(finished, F.zeros_like(best_word_indices), best_word_indices) + values = F.pick(scores, best_word_indices, axis=1, keepdims=True) + + best_hyp_indices = F.slice_like(best_hyp_indices, best_word_indices, axes=(0,)) + + return best_hyp_indices, best_word_indices, values + + +def _repeat_states(states: List, beam_size: int, state_structure: List) -> List: + repeated_states = [] + flat_structure = functools.reduce(operator.add, state_structure) + assert len(states) == len(flat_structure), "Number of states do not match the defined state structure" + for state, state_format in zip(states, flat_structure): + if state_format == C.STEP_STATE or state_format == C.BIAS_STATE: + repeat_axis = 0 + elif state_format == C.DECODER_STATE or state_format == C.ENCODER_STATE: + # TODO: Change repeat axis to 1 when interleaved multihead attention is implemented + repeat_axis = 0 + else: + raise ValueError("Provided state format %s not recognized." % state_format) + repeated_state = state.repeat(repeats=beam_size, axis=repeat_axis) + repeated_states.append(repeated_state) + return repeated_states + + +class SortStates(mx.gluon.HybridBlock): + + def __init__(self, state_structure, prefix): + mx.gluon.HybridBlock.__init__(self, prefix=prefix) + self.flat_structure = functools.reduce(operator.add, state_structure) + + def hybrid_forward(self, F, best_hyp_indices, *states): + sorted_states = [] + assert len(states) == len(self.flat_structure), "Number of states do not match the defined state structure" + for state, state_format in zip(states, self.flat_structure): + if state_format == C.STEP_STATE or state_format == C.BIAS_STATE: + sorted_state = F.take(state, best_hyp_indices) + elif state_format == C.DECODER_STATE: + # TODO: Change take axis to 1 when interleaved multihead attention is implemented + sorted_state = F.take(state, best_hyp_indices) + elif state_format == C.ENCODER_STATE: + # No need for takes on encoder layer states + sorted_state = state + else: + raise ValueError("Provided state format %s not recognized." 
% state_format) + sorted_states.append(sorted_state) + return sorted_states + + +class BeamSearch(mx.gluon.Block): + """ + Features: + - beam search stop + - constraints (pos & neg) + - ensemble decoding + - vocabulary selection + - sampling (TODO: check if its working correctly) + + Not supported: + - beam pruning + - beam history + """ + + def __init__(self, + beam_size: int, + dtype: str, + bos_id: int, + eos_id: int, + context: Union[mx.Context, List[mx.Context]], + output_vocab_size: int, + scorer: CandidateScorer, + num_source_factors: int, + inference: _Inference, + beam_search_stop: str = C.BEAM_SEARCH_STOP_ALL, + global_avoid_trie: Optional[constrained.AvoidTrie] = None, + sample: Optional[int] = None) -> None: + super().__init__(prefix='beam_search_') + self.beam_size = beam_size + self.dtype = dtype + self.bos_id = bos_id + self.eos_id = eos_id + self.output_vocab_size = output_vocab_size + self.context = context + self._inference = inference + self.beam_search_stop = beam_search_stop + self.num_source_factors = num_source_factors + self.global_avoid_trie = global_avoid_trie + + with self.name_scope(): + self._sort_states = SortStates(state_structure=self._inference.state_structure(), + prefix='sort_states_') + self._update_scores = UpdateScores(prefix='update_scores_') + self._scorer = scorer + self._sort_norm_and_update_finished = SortNormalizeAndUpdateFinished( + prefix='sort_norm_and_update_finished_', + dtype=self.dtype, + pad_id=C.PAD_ID, + eos_id=eos_id, + scorer=scorer) + + self._sample = None # type: Optional[mx.gluon.HybridBlock] + self._top = None # type: Optional[mx.gluon.HybridBlock] + if sample is not None: + self._sample = SampleK(sample) + else: + self._top = TopK(self.beam_size) + + def forward(self, + source: mx.nd.NDArray, + source_length: mx.nd.NDArray, + restrict_lexicon: Optional[lexicon.TopKLexicon], + raw_constraint_list: List[Optional[constrained.RawConstraintList]], + raw_avoid_list: List[Optional[constrained.RawConstraintList]], + max_output_lengths: mx.nd.NDArray) -> Tuple[np.ndarray, + np.ndarray, + np.ndarray, + np.ndarray, + List[Optional[np.ndarray]], + List[Optional[constrained.ConstrainedHypothesis]]]: + """ + Translates multiple sentences using beam search. + + :param source: Source ids. Shape: (batch_size, bucket_key, num_factors). + :param source_length: Valid source lengths. Shape: (batch_size,). + :param restrict_lexicon: Lexicon to use for vocabulary restriction. + :param raw_constraint_list: A list of optional lists containing phrases (as lists of target word IDs) + that must appear in each output. + :param raw_avoid_list: A list of optional lists containing phrases (as lists of target word IDs) + that must NOT appear in each output. + :param max_output_lengths: NDArray of maximum output lengths per input in source. + Shape: (batch_size,). Dtype: int32. + :return List of best hypotheses indices, list of best word indices, + array of accumulated length-normalized negative log-probs, hypotheses lengths, + predicted lengths of references (if any), constraints (if any). 
+ """ + batch_size = source.shape[0] + logger.debug("beam_search batch size: %d", batch_size) + + # Maximum beam search iterations (determined by longest input with eos) + max_iterations = max_output_lengths.max().asscalar() + logger.debug("max beam search iterations: %d", max_iterations) + + sample_best_hyp_indices = None + if self._sample is not None: + utils.check_condition(restrict_lexicon is None, + "Sampling is not available when working with a restricted lexicon.") + sample_best_hyp_indices = mx.nd.arange(0, batch_size * self.beam_size, dtype='int32') + + # General data structure: batch_size * beam_size blocks in total; + # a full beam for each sentence, followed by the next beam-block for the next sentence and so on + + best_word_indices = mx.nd.full((batch_size * self.beam_size,), val=self.bos_id, ctx=self.context, + dtype='int32') + + # offset for hypothesis indices in batch decoding + offset = mx.nd.repeat(mx.nd.arange(0, batch_size * self.beam_size, self.beam_size, + dtype='int32', ctx=self.context), self.beam_size) + + # locations of each batch item when first dimension is (batch * beam) + batch_indices = mx.nd.arange(0, batch_size * self.beam_size, self.beam_size, dtype='int32', ctx=self.context) + first_step_mask = mx.nd.full((batch_size * self.beam_size, 1), val=np.inf, ctx=self.context, dtype=self.dtype) + first_step_mask[batch_indices] = 1.0 + pad_dist = mx.nd.full((batch_size * self.beam_size, self.output_vocab_size - 1), val=np.inf, + ctx=self.context, dtype=self.dtype) + eos_dist = mx.nd.full((batch_size * self.beam_size, self.output_vocab_size), val=np.inf, + ctx=self.context, dtype=self.dtype) + eos_dist[:, C.EOS_ID] = 0 + + # Best word and hypotheses indices across beam search steps from topk operation. + best_hyp_indices_list = [] # type: List[mx.nd.NDArray] + best_word_indices_list = [] # type: List[mx.nd.NDArray] + + lengths = mx.nd.zeros((batch_size * self.beam_size,), ctx=self.context, dtype='int32') + finished = mx.nd.zeros((batch_size * self.beam_size,), ctx=self.context, dtype='int32') + + # Extending max_output_lengths to shape (batch_size * beam_size,) + max_output_lengths = mx.nd.repeat(max_output_lengths, self.beam_size) + + # scores_accumulated: chosen smallest scores in scores (ascending). + scores_accumulated = mx.nd.zeros((batch_size * self.beam_size, 1), ctx=self.context, dtype=self.dtype) + + # If using a top-k lexicon, select param rows for logit computation that correspond to the + # target vocab for this sentence. + vocab_slice_ids = None # type: Optional[mx.nd.NDArray] + if restrict_lexicon: + source_words = utils.split(source, num_outputs=self.num_source_factors, axis=2, squeeze_axis=True)[0] + vocab_slice_ids = restrict_lexicon.get_trg_ids(source_words.astype("int32").asnumpy()) + if any(raw_constraint_list): + # Add the constraint IDs to the list of permissibled IDs, and then project them into the reduced space + constraint_ids = np.array([word_id for sent in raw_constraint_list for phr in sent for word_id in phr]) + vocab_slice_ids = np.lib.arraysetops.union1d(vocab_slice_ids, constraint_ids) + full_to_reduced = dict((val, i) for i, val in enumerate(vocab_slice_ids)) + raw_constraint_list = [[[full_to_reduced[x] for x in phr] for phr in sent] for sent in + raw_constraint_list] + # Pad to a multiple of 8. 
+            vocab_slice_ids = np.pad(vocab_slice_ids, (0, 7 - ((len(vocab_slice_ids) - 1) % 8)),
+                                     mode='constant', constant_values=self.eos_id)
+            vocab_slice_ids = mx.nd.array(vocab_slice_ids, ctx=self.context, dtype='int32')
+
+            if vocab_slice_ids.shape[0] < self.beam_size + 1:
+                # This fixes an edge case for toy models, where the number of vocab ids from the lexicon is
+                # smaller than the beam size.
+                logger.warning("Padding vocab_slice_ids (%d) with EOS to have at least %d+1 elements to expand",
+                               vocab_slice_ids.shape[0], self.beam_size)
+                n = self.beam_size - vocab_slice_ids.shape[0] + 1
+                vocab_slice_ids = mx.nd.concat(vocab_slice_ids,
+                                               mx.nd.full((n,), val=self.eos_id, ctx=self.context, dtype='int32'),
+                                               dim=0)
+
+            pad_dist = mx.nd.full((batch_size * self.beam_size, vocab_slice_ids.shape[0] - 1),
+                                  val=np.inf, ctx=self.context, dtype=self.dtype)
+            eos_dist = mx.nd.full((batch_size * self.beam_size, vocab_slice_ids.shape[0]),
+                                  val=np.inf, ctx=self.context, dtype=self.dtype)
+            eos_dist[:, C.EOS_ID] = 0
+
+        # Initialize the beam to track constraint sets, where target-side lexical constraints are present
+        constraints = constrained.init_batch(raw_constraint_list, self.beam_size, self.bos_id, self.eos_id)
+
+        if self.global_avoid_trie or any(raw_avoid_list):
+            avoid_states = constrained.AvoidBatch(batch_size, self.beam_size,
+                                                  avoid_list=raw_avoid_list,
+                                                  global_avoid_trie=self.global_avoid_trie)
+            avoid_states.consume(best_word_indices)
+
+        # (0) encode source sentence, returns a list
+        model_states, estimated_reference_lengths = self._inference.encode_and_initialize(source, source_length)
+        # repeat states to beam_size
+        model_states = _repeat_states(model_states, self.beam_size, self._inference.state_structure())
+
+        # Records items in the beam that are inactive. At the beginning (t==1), there is only one valid or active
+        # item on the beam for each sentence
+        inactive = mx.nd.zeros((batch_size * self.beam_size), dtype='int32', ctx=self.context)
+        t = 1
+        for t in range(1, max_iterations + 1):  # TODO: max_iterations + 1 is the MINIMUM to get correct results right now
+            # (1) obtain next predictions and advance models' state
+            # target_dists: (batch_size * beam_size, target_vocab_size)
+            target_dists, model_states = self._inference.decode_step(best_word_indices, model_states, vocab_slice_ids)
+
+            # (2) Produces the accumulated cost of target words in each row.
+            # There is special treatment for finished and inactive rows: inactive rows are inf everywhere;
+            # finished rows are inf everywhere except column zero, which holds the accumulated model score
+            scores, lengths = self._update_scores(target_dists,
+                                                  finished,
+                                                  inactive,
+                                                  scores_accumulated,
+                                                  lengths,
+                                                  max_output_lengths,
+                                                  pad_dist,
+                                                  eos_dist)
+
+            # Mark entries that should be blocked as having a score of np.inf
+            if self.global_avoid_trie or any(raw_avoid_list):
+                block_indices = avoid_states.avoid()
+                if len(block_indices) > 0:
+                    scores[block_indices] = np.inf
+                    if self._sample is not None:
+                        target_dists[block_indices] = np.inf
+
+            # (3) Get beam_size winning hypotheses for each sentence block separately. Only look as
+            # far as the active beam size for each sentence.
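# --- Editor's illustrative sketch (not part of this patch) --------------------
# What the offset-based topk in step (3) computes, reproduced in plain numpy
# (hypothetical names; only numpy is assumed). Each sentence's scores are viewed
# as one (beam_size * vocab)-wide row; the k smallest flat indices are unraveled
# into (hypothesis row, word id), and `offset` shifts the rows back into the
# global (batch * beam) numbering.

import numpy as np

def topk_per_sentence(scores: np.ndarray, batch_size: int, beam_size: int):
    vocab = scores.shape[1]
    folded = scores.reshape(batch_size, beam_size * vocab)
    flat = np.argsort(folded, axis=1)[:, :beam_size]  # k smallest costs per sentence
    hyp, word = np.divmod(flat, vocab)                # unravel flat indices
    offset = np.repeat(np.arange(batch_size) * beam_size, beam_size)
    best_hyp_indices = hyp.reshape(-1) + offset       # global beam rows
    best_word_indices = word.reshape(-1)
    best_scores = np.take_along_axis(folded, flat, axis=1).reshape(-1, 1)
    return best_hyp_indices, best_word_indices, best_scores

# At t == 1 all rows of a sentence's beam are identical, so masking all but the
# first row with inf (cf. first_step_mask) avoids k duplicate extensions:
scores = np.array([[1.0, 2.0, 3.0],
                   [1.0, 2.0, 3.0]])  # batch_size=1, beam_size=2, vocab=3
scores[1:] = np.inf
print(topk_per_sentence(scores, batch_size=1, beam_size=2))
# -> (array([0, 0]), array([0, 1]), array([[1.], [2.]]))
# ------------------------------------------------------------------------------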
+            if self._sample is not None:
+                best_hyp_indices, best_word_indices, scores_accumulated = self._sample(scores,
+                                                                                       target_dists,
+                                                                                       finished,
+                                                                                       sample_best_hyp_indices)
+            else:
+                # On the first timestep, all hypotheses have identical histories, so force topk() to choose extensions
+                # of the first row only by setting all other rows to inf
+                if t == 1:
+                    scores *= first_step_mask
+
+                best_hyp_indices, best_word_indices, scores_accumulated = self._top(scores, offset)
+
+            # Constraints for constrained decoding are processed sentence by sentence
+            if any(raw_constraint_list):
+                best_hyp_indices, best_word_indices, scores_accumulated, constraints, inactive = constrained.topk(
+                    t,
+                    batch_size,
+                    self.beam_size,
+                    inactive,
+                    scores,
+                    constraints,
+                    best_hyp_indices,
+                    best_word_indices,
+                    scores_accumulated)
+
+            # Map from restricted to full vocab ids if needed
+            if restrict_lexicon:
+                best_word_indices = vocab_slice_ids.take(best_word_indices)
+
+            # (4) Normalize the scores of newly finished hypotheses. Note that after this until the
+            # next call to topk(), hypotheses may not be in sorted order.
+            finished, scores_accumulated, lengths, estimated_reference_lengths = self._sort_norm_and_update_finished(
+                best_hyp_indices,
+                best_word_indices,
+                finished,
+                scores_accumulated,
+                lengths,
+                estimated_reference_lengths)
+
+            # Collect best hypotheses, best word indices
+            best_hyp_indices_list.append(best_hyp_indices)
+            best_word_indices_list.append(best_word_indices)
+
+            if self._should_stop(finished, batch_size):
+                break
+
+            # (5) update models' state with winning hypotheses (ascending)
+            model_states = self._sort_states(best_hyp_indices, *model_states)
+
+        logger.debug("Finished after %d out of %d steps.", t, max_iterations)
+
+        # (6) Sort the hypotheses within each sentence (normalization for finished hyps may have unsorted them).
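# --- Editor's illustrative sketch (not part of this patch) --------------------
# The final within-sentence sort implemented below, in plain numpy (only numpy
# is assumed). Accumulated scores are folded to one row per sentence, argsorted,
# and the flat order is mapped back to global (batch * beam) row indices with
# the same offset trick used for topk.

import numpy as np

scores_accumulated = np.array([[2.5], [0.7], [1.9], [1.2]])  # batch=2, beam=2
batch_size, beam_size = 2, 2

folded = scores_accumulated.reshape(batch_size, beam_size)
indices = np.argsort(folded, axis=1).reshape(-1)  # best-first within each row
offset = np.repeat(np.arange(batch_size) * beam_size, beam_size)
best_hyp_indices = np.unravel_index(indices, scores_accumulated.shape)[0] + offset

print(best_hyp_indices)                                  # -> [1 0 3 2]
print(scores_accumulated[best_hyp_indices].reshape(-1))  # -> [0.7 2.5 1.2 1.9]
# each sentence's beam is now ordered by score (ascending, i.e. best first)
# ------------------------------------------------------------------------------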
+ folded_accumulated_scores = scores_accumulated.reshape((batch_size, + self.beam_size * scores_accumulated.shape[-1])) + indices = mx.nd.cast(mx.nd.argsort(folded_accumulated_scores.astype('float32'), axis=1), dtype='int32').reshape((-1,)) + best_hyp_indices, _ = mx.nd.unravel_index(indices, scores_accumulated.shape) + offset + scores_accumulated = scores_accumulated.take(best_hyp_indices) + best_hyp_indices_list.append(best_hyp_indices) + lengths = lengths.take(best_hyp_indices) + all_best_hyp_indices = mx.nd.stack(*best_hyp_indices_list, axis=1) + all_best_word_indices = mx.nd.stack(*best_word_indices_list, axis=1) + constraints = [constraints[x] for x in best_hyp_indices.asnumpy()] + + return all_best_hyp_indices.asnumpy(), \ + all_best_word_indices.asnumpy(), \ + scores_accumulated.asnumpy(), \ + lengths.asnumpy().astype('int32'), \ + estimated_reference_lengths.asnumpy(), \ + constraints + + def _should_stop(self, finished, batch_size): + if self.beam_search_stop == C.BEAM_SEARCH_STOP_FIRST: + at_least_one_finished = finished.reshape((batch_size, self.beam_size)).sum(axis=1) > 0 + return at_least_one_finished.sum().asscalar() == batch_size + else: + return finished.sum().asscalar() == batch_size * self.beam_size # all finished + + +def get_beam_search(models: List[SockeyeModel], + beam_size: int, + context: Union[mx.Context, List[mx.Context]], + vocab_target: vocab.Vocab, + output_scores: bool, + scorer: CandidateScorer, + ensemble_mode: str = 'linear', + beam_search_stop: str = C.BEAM_SEARCH_STOP_ALL, + constant_length_ratio: float = 0.0, + avoid_list: Optional[str] = None, + sample: Optional[int] = None, + hybridize: bool = True) -> BeamSearch: + + inference = None # type: Optional[_Inference] + if len(models) == 1: + skip_softmax = beam_size == 1 and not output_scores and not sample + if skip_softmax: + logger.info("Enabled skipping softmax for a single model and greedy decoding.") + inference = _SingleModelInference(model=models[0], + skip_softmax=skip_softmax, constant_length_ratio=constant_length_ratio) + else: + inference = _EnsembleInference(models=models, + ensemble_mode=ensemble_mode, + constant_length_ratio=constant_length_ratio) + + global_avoid_trie = None if avoid_list is None else constrained.get_avoid_trie(avoid_list, vocab_target) + bs = BeamSearch( + beam_size=beam_size, + dtype=C.DTYPE_FP32 if models[0].dtype == C.DTYPE_INT8 else models[0].dtype, + bos_id=C.BOS_ID, + eos_id=C.EOS_ID, + context=context, + output_vocab_size=models[0].output_layer_vocab_size, + beam_search_stop=beam_search_stop, + scorer=scorer, + sample=sample, + num_source_factors=models[0].num_source_factors, + global_avoid_trie=global_avoid_trie, + inference=inference + ) + bs.initialize() + if hybridize: + bs.hybridize(static_alloc=True) + return bs diff --git a/sockeye/checkpoint_decoder.py b/sockeye/checkpoint_decoder.py index 1c9d9d0a4..340c5b553 100644 --- a/sockeye/checkpoint_decoder.py +++ b/sockeye/checkpoint_decoder.py @@ -1,4 +1,4 @@ -# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not # use this file except in compliance with the License. A copy of the License @@ -23,6 +23,7 @@ import mxnet as mx +import sockeye.model import sockeye.output_handler import sockeye.translate from . import constants as C @@ -30,6 +31,7 @@ from . import evaluate from . import inference from . import utils +from . 
import vocab

 logger = logging.getLogger(__name__)

@@ -38,29 +40,34 @@ class CheckpointDecoder:
     """
     Decodes a (random sample of a) dataset using parameters at given checkpoint and computes BLEU against references.

-    :param context: MXNet context to bind the model to.
+    :param model_folder: The model folder to which checkpoint decoder outputs are written.
     :param inputs: Path(s) to file containing input sentences (and their factors).
     :param references: Path to file containing references.
-    :param model: Model to load.
+    :param source_vocabs: The source vocabularies.
+    :param target_vocab: The target vocabulary.
+    :param context: The devices to use for decoding.
+    :param model: The translation model.
     :param max_input_len: Maximum input length.
     :param batch_size: Batch size.
     :param beam_size: Size of the beam.
     :param nbest_size: Size of nbest lists.
-    :param bucket_width_source: Source bucket width.
     :param length_penalty_alpha: Alpha factor for the length penalty
     :param length_penalty_beta: Beta factor for the length penalty
-    :param softmax_temperature: Optional parameter to control steepness of softmax distribution.
     :param max_output_length_num_stds: Number of standard deviations as safety margin for maximum output length.
     :param ensemble_mode: Ensemble mode: linear or log_linear combination.
     :param sample_size: Maximum number of sentences to sample and decode. If <=0, all sentences are used.
     :param random_seed: Random seed for sampling. Default: 42.
+    :param hybridize: Turn on hybridization of the translator.
     """

     def __init__(self,
-                 context: mx.context.Context,
+                 model_folder: str,
                  inputs: List[str],
                  references: str,
-                 model: str,
+                 source_vocabs: List[vocab.Vocab],
+                 target_vocab: vocab.Vocab,
+                 model: sockeye.model.SockeyeModel,
+                 context: mx.Context,
                  max_input_len: Optional[int] = None,
                  batch_size: int = 16,
                  beam_size: int = C.DEFAULT_BEAM_SIZE,
@@ -68,12 +75,11 @@ def __init__(self,
                  bucket_width_source: int = 10,
                  length_penalty_alpha: float = 1.0,
                  length_penalty_beta: float = 0.0,
-                 softmax_temperature: Optional[float] = None,
                  max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
                  ensemble_mode: str = 'linear',
                  sample_size: int = -1,
-                 random_seed: int = 42) -> None:
-        self.context = context
+                 random_seed: int = 42,
+                 hybridize: bool = True) -> None:
         self.max_input_len = max_input_len
         self.max_output_length_num_stds = max_output_length_num_stds
         self.ensemble_mode = ensemble_mode
@@ -83,18 +89,19 @@ def __init__(self,
         self.bucket_width_source = bucket_width_source
         self.length_penalty_alpha = length_penalty_alpha
         self.length_penalty_beta = length_penalty_beta
-        self.softmax_temperature = softmax_temperature
         self.model = model

         with ExitStack() as exit_stack:
-            inputs_fins = [exit_stack.enter_context(data_io.smart_open(f)) for f in inputs]  # pylint: disable=no-member
-            references_fin = exit_stack.enter_context(data_io.smart_open(references))  # pylint: disable=no-member
+            inputs_fins = [exit_stack.enter_context(data_io.smart_open(f)) for f in inputs]
+            references_fin = exit_stack.enter_context(data_io.smart_open(references))

             inputs_sentences = [f.readlines() for f in inputs_fins]
             target_sentences = references_fin.readlines()

             utils.check_condition(all(len(l) == len(target_sentences) for l in inputs_sentences),
                                   "Sentences differ in length")
+            utils.check_condition(all(len(sentence.strip()) > 0 for sentence in target_sentences),
+                                  "Empty target validation sentence.")

             if sample_size <= 0:
                 sample_size = len(inputs_sentences[0])
@@ -108,47 +115,44 @@
self.batch_size = sample_size for i, factor in enumerate(self.inputs_sentences): - write_to_file(factor, os.path.join(self.model, C.DECODE_IN_NAME % i)) - write_to_file(self.target_sentences, os.path.join(self.model, C.DECODE_REF_NAME)) + write_to_file(factor, os.path.join(model_folder, C.DECODE_IN_NAME % i)) + write_to_file(self.target_sentences, os.path.join(model_folder, C.DECODE_REF_NAME)) self.inputs_sentences = list(zip(*self.inputs_sentences)) # type: List[List[str]] - logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d, context=%s)", - max_input_len if max_input_len is not None else -1, beam_size, model, len(self.target_sentences), - context) + scorer = inference.CandidateScorer( + length_penalty_alpha=length_penalty_alpha, + length_penalty_beta=length_penalty_beta, + brevity_penalty_weight=0.0, + prefix='scorer_') + + # TODO: possibly support decoding on multiple GPUs + self.translator = inference.Translator( + batch_size=self.batch_size, + context=context, + ensemble_mode=self.ensemble_mode, + scorer=scorer, + beam_search_stop='all', + nbest_size=self.nbest_size, + models=[self.model], + source_vocabs=source_vocabs, + target_vocab=target_vocab, + restrict_lexicon=None, + hybridize=hybridize) + + logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, num_sentences=%d)", + max_input_len if max_input_len is not None else -1, beam_size, len(self.target_sentences)) def decode_and_evaluate(self, - checkpoint: Optional[int] = None, output_name: str = os.devnull) -> Dict[str, float]: """ Decodes data set and evaluates given a checkpoint. - :param checkpoint: Checkpoint to load parameters from. :param output_name: Filename to write translations to. Defaults to /dev/null. :return: Mapping of metric names to scores. """ - models, source_vocabs, target_vocab = inference.load_models( - self.context, - self.max_input_len, - self.beam_size, - self.batch_size, - [self.model], - [checkpoint], - softmax_temperature=self.softmax_temperature, - max_output_length_num_stds=self.max_output_length_num_stds) - translator = inference.Translator(context=self.context, - ensemble_mode=self.ensemble_mode, - bucket_source_width=self.bucket_width_source, - length_penalty=inference.LengthPenalty(self.length_penalty_alpha, self.length_penalty_beta), - brevity_penalty=inference.BrevityPenalty(weight=0.0), - beam_prune=0.0, - beam_search_stop='all', - nbest_size=self.nbest_size, - models=models, - source_vocabs=source_vocabs, - target_vocab=target_vocab, - restrict_lexicon=None, - store_beam=False) + + # 1. Translate trans_wall_time = 0.0 translations = [] with data_io.smart_open(output_name, 'w') as output: @@ -157,27 +161,27 @@ def decode_and_evaluate(self, trans_inputs = [] # type: List[inference.TranslatorInput] for i, inputs in enumerate(self.inputs_sentences): trans_inputs.append(sockeye.inference.make_input_from_multiple_strings(i, inputs)) - trans_outputs = translator.translate(trans_inputs) + trans_outputs = self.translator.translate(trans_inputs) trans_wall_time = time.time() - tic for trans_input, trans_output in zip(trans_inputs, trans_outputs): handler.handle(trans_input, trans_output) translations.append(trans_output.translation) avg_time = trans_wall_time / len(self.target_sentences) - # TODO(fhieber): eventually add more metrics (METEOR etc.) - return {C.BLEU_VAL: evaluate.raw_corpus_bleu(hypotheses=translations, - references=self.target_sentences, - offset=0.01), - C.CHRF_VAL: evaluate.raw_corpus_chrf(hypotheses=translations, + # 2. 
Evaluate
+        return {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations,
+                                                 references=self.target_sentences,
+                                                 offset=0.01),
+                C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations,
+                                                 references=self.target_sentences),
+                C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations,
+                                                     references=self.target_sentences),
+                C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations,
+                                                     references=self.target_sentences),
+                C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations,
                                                      references=self.target_sentences),
-                C.ROUGE_1_VAL: evaluate.raw_corpus_rouge1(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.ROUGE_2_VAL: evaluate.raw_corpus_rouge2(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.ROUGE_L_VAL: evaluate.raw_corpus_rougel(hypotheses=translations,
-                                                          references=self.target_sentences),
-                C.LENRATIO_VAL: evaluate.raw_corpus_length_ratio(hypotheses=translations,
-                                                                 references=self.target_sentences),
+                C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations,
+                                                             references=self.target_sentences),
                 C.AVG_TIME: avg_time,
                 C.DECODING_TIME: trans_wall_time}
diff --git a/sockeye/config.py b/sockeye/config.py
index dcbf99140..31adeb0a4 100644
--- a/sockeye/config.py
+++ b/sockeye/config.py
@@ -1,4 +1,4 @@
-# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Copyright 2017--2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"). You may not
 # use this file except in compliance with the License. A copy of the License
@@ -31,17 +31,12 @@ def __init__(cls, name, bases, kwds):

 class Config(yaml.YAMLObject, metaclass=TaggedYamlObjectMetaclass):
     """
-    Base configuration object that supports freezing of members and YAML (de-)serialization.
+    Base configuration object supporting YAML (de-)serialization.
     Actual Configuration should subclass this object.
     """

     yaml_loader = yaml.UnsafeLoader  # type: ignore

-    def __init__(self):
-        self.__add_frozen()
-
     def __setattr__(self, key, value):
-        if hasattr(self, '_frozen') and getattr(self, '_frozen'):
-            raise AttributeError("Cannot set '%s' in frozen config" % key)
         if value == self:
             raise AttributeError("Cannot set self as attribute")
         object.__setattr__(self, key, value)
@@ -58,17 +53,6 @@ def __setstate__(self, state):
             if not hasattr(self, param_name):
                 object.__setattr__(self, param_name, param.default)

-    def freeze(self):
-        """
-        Freezes this Config object, disallowing modification or addition of any parameters.
-        """
-        if getattr(self, '_frozen'):
-            return
-        object.__setattr__(self, "_frozen", True)
-        for k, v in self.__dict__.items():
-            if isinstance(v, Config) and k != "self":
-                v.freeze()  # pylint: disable= no-member
-
     def __repr__(self):
         return "Config[%s]" % ", ".join("%s=%s" % (str(k), str(v)) for k, v in sorted(self.__dict__.items()))

@@ -83,46 +67,26 @@ def __eq__(self, other):
                 return False
         return True

-    def __del_frozen(self):
-        """
-        Removes _frozen attribute from this instance and all its child configurations.
-        """
-        self.__delattr__('_frozen')
-        for attr, val in self.__dict__.items():
-            if isinstance(val, Config) and hasattr(val, '_frozen'):
-                val.__del_frozen()  # pylint: disable= no-member
-
-    def __add_frozen(self):
-        """
-        Adds _frozen attribute to this instance and all its child configurations.
- """ - setattr(self, "_frozen", False) - for attr, val in self.__dict__.items(): - if isinstance(val, Config): - val.__add_frozen() # pylint: disable= no-member - def save(self, fname: str): """ - Saves this Config (without the frozen state) to a file called fname. + Saves this Config to a file called fname. :param fname: Name of file to store this Config in. """ obj = copy.deepcopy(self) - obj.__del_frozen() with open(fname, 'w') as out: yaml.dump(obj, out, default_flow_style=False) @staticmethod def load(fname: str) -> 'Config': """ - Returns a Config object loaded from a file. The loaded object is not frozen. + Returns a Config object loaded from a file. :param fname: Name of file to load the Config from. :return: Configuration. """ with open(fname) as inp: obj = yaml.load(inp, Loader=yaml.UnsafeLoader) # type: ignore - obj.__add_frozen() return obj def copy(self, **kwargs): diff --git a/sockeye/constants.py b/sockeye/constants.py index ad5c07cbf..5fd57db1c 100644 --- a/sockeye/constants.py +++ b/sockeye/constants.py @@ -19,17 +19,24 @@ import mxnet as mx import numpy as np +# MXNet environment variables +MXNET_SAFE_ACCUMULATION = 'MXNET_SAFE_ACCUMULATION' + +# Horovod environment variables +HOROVOD_HIERARCHICAL_ALLREDUCE = 'HOROVOD_HIERARCHICAL_ALLREDUCE' +HOROVOD_HIERARCHICAL_ALLGATHER = 'HOROVOD_HIERARCHICAL_ALLGATHER' + BOS_SYMBOL = "" EOS_SYMBOL = "" UNK_SYMBOL = "" PAD_SYMBOL = "" PAD_ID = 0 PAD_FORMAT = "" -POINTER_FORMAT = "" -POINTER_PATTERN = "\d+)>" - TOKEN_SEPARATOR = " " VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL] +UNK_ID = VOCAB_SYMBOLS.index(UNK_SYMBOL) +BOS_ID = VOCAB_SYMBOLS.index(BOS_SYMBOL) +EOS_ID = VOCAB_SYMBOLS.index(EOS_SYMBOL) # reserve extra space for the EOS or BOS symbol that is added to both source and target SPACE_FOR_XOS = 1 @@ -40,13 +47,7 @@ EMBEDDING_PREFIX = "embed_" ATTENTION_PREFIX = "att_" COVERAGE_PREFIX = "cov_" -BIDIRECTIONALRNN_PREFIX = ENCODER_PREFIX + "birnn_" -STACKEDRNN_PREFIX = ENCODER_PREFIX + "rnn_" -FORWARD_PREFIX = "forward_" -REVERSE_PREFIX = "reverse_" TRANSFORMER_ENCODER_PREFIX = ENCODER_PREFIX + "transformer_" -CNN_ENCODER_PREFIX = ENCODER_PREFIX + "cnn_" -CHAR_SEQ_ENCODER_PREFIX = ENCODER_PREFIX + "char_" DEFAULT_OUTPUT_LAYER_PREFIX = "target_output_" LENRATIOS_OUTPUT_LAYER_PREFIX = "length_ratio_layer_" @@ -59,31 +60,20 @@ # source factors SOURCE_FACTORS_COMBINE_SUM = 'sum' +SOURCE_FACTORS_COMBINE_AVERAGE = 'average' SOURCE_FACTORS_COMBINE_CONCAT = 'concat' -SOURCE_FACTORS_COMBINE_CHOICES = [SOURCE_FACTORS_COMBINE_SUM, SOURCE_FACTORS_COMBINE_CONCAT] +SOURCE_FACTORS_COMBINE_CHOICES = [SOURCE_FACTORS_COMBINE_SUM, + SOURCE_FACTORS_COMBINE_AVERAGE, + SOURCE_FACTORS_COMBINE_CONCAT] # encoder names (arguments) -RNN_NAME = "rnn" -RNN_WITH_CONV_EMBED_NAME = "rnn-with-conv-embed" TRANSFORMER_TYPE = "transformer" -CONVOLUTION_TYPE = "cnn" -TRANSFORMER_WITH_CONV_EMBED_TYPE = "transformer-with-conv-embed" -IMAGE_PRETRAIN_TYPE = "image-pretrain-cnn" # available encoders -ENCODERS = [RNN_NAME, RNN_WITH_CONV_EMBED_NAME, TRANSFORMER_TYPE, TRANSFORMER_WITH_CONV_EMBED_TYPE, CONVOLUTION_TYPE, IMAGE_PRETRAIN_TYPE] +ENCODERS = [TRANSFORMER_TYPE] # available decoder -DECODERS = [RNN_NAME, TRANSFORMER_TYPE, CONVOLUTION_TYPE] - -# rnn types -LSTM_TYPE = 'lstm' -LNLSTM_TYPE = 'lnlstm' -LNGLSTM_TYPE = 'lnglstm' -GRU_TYPE = 'gru' -LNGRU_TYPE = 'lngru' -LNGGRU_TYPE = 'lnggru' -CELL_TYPES = [LSTM_TYPE, LNLSTM_TYPE, LNGLSTM_TYPE, GRU_TYPE, LNGRU_TYPE, LNGGRU_TYPE] +DECODERS = [TRANSFORMER_TYPE] # positional embeddings 
NO_POSITIONAL_EMBEDDING = "none" @@ -113,71 +103,27 @@ EMBED_INIT_TYPES = [EMBED_INIT_DEFAULT, EMBED_INIT_NORMAL] DEFAULT_NUM_EMBED = 512 -# RNN init types -RNN_INIT_PATTERN = ".*h2h.*" -RNN_INIT_ORTHOGONAL = 'orthogonal' -RNN_INIT_ORTHOGONAL_STACKED = 'orthogonal_stacked' -# use the default initializer used also for all other weights -RNN_INIT_DEFAULT = 'default' - -# RNN decoder state init types -RNN_DEC_INIT_ZERO = "zero" -RNN_DEC_INIT_LAST = "last" -RNN_DEC_INIT_AVG = "avg" -RNN_DEC_INIT_CHOICES = [RNN_DEC_INIT_ZERO, RNN_DEC_INIT_LAST, RNN_DEC_INIT_AVG] - -# attention types -ATT_BILINEAR = 'bilinear' -ATT_DOT = 'dot' -ATT_MH_DOT = 'mhdot' -ATT_FIXED = 'fixed' -ATT_LOC = 'location' -ATT_MLP = 'mlp' -ATT_COV = "coverage" -ATT_TYPES = [ATT_BILINEAR, ATT_DOT, ATT_MH_DOT, ATT_FIXED, ATT_LOC, ATT_MLP, ATT_COV] - # weight tying components WEIGHT_TYING_SRC = 'src' WEIGHT_TYING_TRG = 'trg' WEIGHT_TYING_SOFTMAX = 'softmax' # weight tying types (combinations of above components): +WEIGHT_TYING_NONE = 'none' WEIGHT_TYING_TRG_SOFTMAX = 'trg_softmax' WEIGHT_TYING_SRC_TRG = 'src_trg' WEIGHT_TYING_SRC_TRG_SOFTMAX = 'src_trg_softmax' +WEIGHT_TYING_TYPES = [WEIGHT_TYING_NONE, WEIGHT_TYING_SRC_TRG_SOFTMAX, WEIGHT_TYING_SRC_TRG, WEIGHT_TYING_TRG_SOFTMAX] # default decoder prefixes -RNN_DECODER_PREFIX = DECODER_PREFIX + "rnn_" TRANSFORMER_DECODER_PREFIX = DECODER_PREFIX + "transformer_" -CNN_DECODER_PREFIX = DECODER_PREFIX + "cnn_" # Activation types -# Gaussian Error Linear Unit (https://arxiv.org/pdf/1606.08415.pdf) -GELU = "gelu" -# Gated Linear Unit (https://arxiv.org/pdf/1705.03122.pdf) -GLU = "glu" RELU = "relu" -SIGMOID = "sigmoid" -SOFT_RELU = "softrelu" # Swish-1/SiLU (https://arxiv.org/pdf/1710.05941.pdf, https://arxiv.org/pdf/1702.03118.pdf) SWISH1 = "swish1" -TANH = "tanh" -TRANSFORMER_ACTIVATION_TYPES = [GELU, RELU, SWISH1] -CNN_ACTIVATION_TYPES = [GLU, RELU, SIGMOID, SOFT_RELU, TANH] - -# Convolutional block pad types: -CNN_PAD_LEFT = "left" -CNN_PAD_CENTERED = "centered" - -# coverage types -COVERAGE_COUNT = "count" -COVERAGE_FERTILITY = "fertility" -COVERAGE_TYPES = [TANH, - SIGMOID, - RELU, - SOFT_RELU, - GRU_TYPE, - COVERAGE_COUNT, - COVERAGE_FERTILITY] +# Gaussian Error Linear Unit (https://arxiv.org/pdf/1606.08415.pdf) +GELU = "gelu" +TRANSFORMER_ACTIVATION_TYPES = [RELU, SWISH1, GELU] # default I/O variable names SOURCE_NAME = "source" @@ -220,6 +166,12 @@ BEAM_SEARCH_STOP_FIRST = 'first' BEAM_SEARCH_STOP_ALL = 'all' +# State structure constants +STEP_STATE = 's' +BIAS_STATE = 'b' +ENCODER_STATE = 'e' +DECODER_STATE = 'd' + # Inference Input JSON constants JSON_TEXT_KEY = "text" JSON_FACTORS_KEY = "factors" @@ -233,6 +185,7 @@ VERSION_NAME = "version" CONFIG_NAME = "config" +CONFIG_NAME_FLOAT32 = CONFIG_NAME + ".float32" LOG_NAME = "log" JSON_SUFFIX = ".json" VOCAB_SRC_PREFIX = "vocab.src" @@ -243,6 +196,7 @@ PARAMS_PREFIX = "params." 
PARAMS_NAME = PARAMS_PREFIX + "%05d" PARAMS_BEST_NAME = "params.best" +PARAMS_BEST_NAME_FLOAT32 = PARAMS_BEST_NAME + ".float32" DECODE_OUT_NAME = "decode.output.%05d" DECODE_IN_NAME = "decode.source.%d" DECODE_REF_NAME = "decode.target" @@ -260,6 +214,7 @@ BUCKET_ITER_STATE_NAME = "bucket.pkl" RNG_STATE_NAME = "rng.pkl" TRAINING_STATE_NAME = "training.pkl" +AMP_LOSS_SCALER_STATE_NAME = "amp_loss_scaler.pkl" SCHEDULER_STATE_NAME = "scheduler.pkl" TRAINING_STATE_PARAMS_NAME = "params" ARGS_STATE_NAME = "args.yaml" @@ -269,9 +224,8 @@ "align_plot_prefix", "sure_align_threshold", "keep_last_params", "seed", "max_updates", "min_updates", - "max_seconds", "max_num_epochs", "min_num_epochs", - "max_samples", "min_samples", "max_checkpoints"] + "max_samples", "min_samples", "max_checkpoints", "max_seconds"] # Other argument constants TRAINING_ARG_SOURCE = "--source" @@ -295,11 +249,6 @@ # Used to delimit factors on STDIN for inference DEFAULT_FACTOR_DELIMITER = '|' -# data layout strings -BATCH_MAJOR_IMAGE = "NCHW" -BATCH_MAJOR = "NTC" -TIME_MAJOR = "TNC" - BATCH_TYPE_SENTENCE = "sentence" BATCH_TYPE_WORD = "word" @@ -315,32 +264,18 @@ # Training constants OPTIMIZER_ADAM = "adam" -OPTIMIZER_EVE = "eve" -OPTIMIZER_NADAM = "nadam" -OPTIMIZER_RMSPROP = "rmsprop" OPTIMIZER_SGD = "sgd" -OPTIMIZER_NAG = "nag" -OPTIMIZER_ADAGRAD = "adagrad" -OPTIMIZER_ADADELTA = "adadelta" -OPTIMIZERS = [OPTIMIZER_ADAM, OPTIMIZER_EVE, OPTIMIZER_NADAM, OPTIMIZER_RMSPROP, OPTIMIZER_SGD, OPTIMIZER_NAG, - OPTIMIZER_ADAGRAD, OPTIMIZER_ADADELTA] - -LR_SCHEDULER_FIXED_RATE_INV_SQRT_T = "fixed-rate-inv-sqrt-t" -LR_SCHEDULER_FIXED_RATE_INV_T = "fixed-rate-inv-t" -LR_SCHEDULER_FIXED_STEP = "fixed-step" -LR_SCHEDULER_PLATEAU_REDUCE = "plateau-reduce" -LR_SCHEDULERS = [LR_SCHEDULER_FIXED_RATE_INV_SQRT_T, - LR_SCHEDULER_FIXED_RATE_INV_T, - LR_SCHEDULER_FIXED_STEP, +OPTIMIZERS = [OPTIMIZER_ADAM, OPTIMIZER_SGD] + +LR_SCHEDULER_NONE = 'none' +LR_SCHEDULER_INV_SQRT_DECAY = 'inv-sqrt-decay' +LR_SCHEDULER_LINEAR_DECAY = 'linear-decay' +LR_SCHEDULER_PLATEAU_REDUCE = 'plateau-reduce' +LR_SCHEDULERS = [LR_SCHEDULER_NONE, + LR_SCHEDULER_INV_SQRT_DECAY, + LR_SCHEDULER_LINEAR_DECAY, LR_SCHEDULER_PLATEAU_REDUCE] -LR_DECAY_OPT_STATES_RESET_OFF = 'off' -LR_DECAY_OPT_STATES_RESET_INITIAL = 'initial' -LR_DECAY_OPT_STATES_RESET_BEST = 'best' -LR_DECAY_OPT_STATES_RESET_CHOICES = [LR_DECAY_OPT_STATES_RESET_OFF, - LR_DECAY_OPT_STATES_RESET_INITIAL, - LR_DECAY_OPT_STATES_RESET_BEST] - GRADIENT_CLIPPING_TYPE_ABS = 'abs' GRADIENT_CLIPPING_TYPE_NORM = 'norm' GRADIENT_CLIPPING_TYPE_NONE = 'none' @@ -350,27 +285,20 @@ GRADIENT_COMPRESSION_2BIT = "2bit" GRADIENT_COMPRESSION_TYPES = [GRADIENT_CLIPPING_TYPE_NONE, GRADIENT_COMPRESSION_2BIT] +HOROVOD_SECONDARY_WORKERS_DIRNAME = 'secondary_workers' + # output handler OUTPUT_HANDLER_TRANSLATION = "translation" OUTPUT_HANDLER_TRANSLATION_WITH_SCORE = "translation_with_score" -OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENTS = "translation_with_alignments" -OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENT_MATRIX = "translation_with_alignment_matrix" OUTPUT_HANDLER_SCORE = "score" OUTPUT_HANDLER_PAIR_WITH_SCORE = "pair_with_score" OUTPUT_HANDLER_BENCHMARK = "benchmark" -OUTPUT_HANDLER_ALIGN_PLOT = "align_plot" -OUTPUT_HANDLER_ALIGN_TEXT = "align_text" OUTPUT_HANDLER_BEAM_STORE = "beam_store" OUTPUT_HANDLER_JSON = "json" OUTPUT_HANDLERS = [OUTPUT_HANDLER_TRANSLATION, OUTPUT_HANDLER_SCORE, OUTPUT_HANDLER_TRANSLATION_WITH_SCORE, - OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENTS, - OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENT_MATRIX, 
OUTPUT_HANDLER_BENCHMARK, - OUTPUT_HANDLER_ALIGN_PLOT, - OUTPUT_HANDLER_ALIGN_TEXT, - OUTPUT_HANDLER_BEAM_STORE, OUTPUT_HANDLER_JSON] OUTPUT_HANDLERS_SCORING = [OUTPUT_HANDLER_SCORE, OUTPUT_HANDLER_PAIR_WITH_SCORE] @@ -385,15 +313,9 @@ ROUGE1 = 'rouge1' ROUGE2 = 'rouge2' ROUGEL = 'rougel' -BLEU_VAL = BLEU + "-val" -CHRF_VAL = CHRF + "-val" -ROUGE_VAL = ROUGE + "-val" -ROUGE_1_VAL = ROUGE1 + "-val" -ROUGE_2_VAL = ROUGE2 + "-val" -ROUGE_L_VAL = ROUGEL + "-val" -LENRATIO_VAL = 'length-ratio-mse' -AVG_TIME = "avg-sec-per-sent-val" -DECODING_TIME = "decode-walltime-val" +LENRATIO = 'length-ratio-mse' +AVG_TIME = "avg-sec-per-sent" +DECODING_TIME = "decode-walltime" METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1] METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False} METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf} @@ -415,6 +337,7 @@ TARGET_MAX_LENGTH_FACTOR = 2 DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH = 2 +DTYPE_INT8 = 'int8' DTYPE_FP16 = 'float16' DTYPE_FP32 = 'float32' LARGE_POSITIVE_VALUE = 99999999. @@ -423,31 +346,42 @@ # Something at the middle of 32768