This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit 003b3b9

merge updates from edge repository.

liuqiuhui2015 committed Sep 12, 2019
1 parent d7cb516 commit 003b3b9
Showing 40 changed files with 743 additions and 1,841 deletions.
84 changes: 29 additions & 55 deletions README.md
@@ -13,60 +13,26 @@ If you want to use [BPE](https://github.com/rsennrich/subword-nmt), to enable co

### BPE

Run `bash scripts/mkbpe.sh` to preprocess the data with BPE; the following variables in `mkbpe.sh` can be configured for your needs:

```
# "cachedir" is the directory for processed data files.
export cachedir=cache
# "srcd" is the path to the source data of both training and validation sets.
export srcd=wmt17/de-en
# "dataid" sets the ID of the generated data, all generated files will be saved into: "$cachedir/$dataid".
export dataid=de-en
# "bpeops" is the number of bpe actions for the joint bpe on source language and its translation. "minfreq" is the minimum frequency of words, words with lower frequencies will be seperated during bpe.
export bpeops=32000
export minfreq=50
# "maxtokens" is the maximum tokens allowed for the source sentence and target sentence in the training set. Longer sentences will be droped.
export maxtokens=256
# "srctf" is a plain text file which stores the source languages of training set.
export srctf=tok.de
# "tgttf" is the corresponding gold translation of "srctf".
export tgttf=tok.en
# "srcvf" is similar to "srctf", but for validation.
export srcvf=06.tok.de
# similar to "tgttf" for validation.
export tgtvf=06.tok.en
```
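For illustration, here is a minimal sketch of the `maxtokens` filtering described above, assuming whitespace-tokenized, line-aligned bitext; the function name and file handling are hypothetical, not taken from the repository:

```python
# Hypothetical sketch of the "maxtokens" rule: drop a sentence pair
# whenever either side exceeds the token budget.
def filter_by_length(srctf, tgttf, out_src, out_tgt, maxtokens=256):
    with open(srctf, encoding="utf-8") as fs, open(tgttf, encoding="utf-8") as ft, \
            open(out_src, "w", encoding="utf-8") as gs, open(out_tgt, "w", encoding="utf-8") as gt:
        for sline, tline in zip(fs, ft):
            # keep the pair only if both sides fit within maxtokens
            if len(sline.split()) <= maxtokens and len(tline.split()) <= maxtokens:
                gs.write(sline)
                gt.write(tline)
```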

### cleaning with BPE results

Run `bash scripts/cleanbpe.sh` to clean the BPE results; the following variables in `cleanbpe.sh` can be configured, in addition to those already described for `scripts/mkbpe.sh`:

```
# a fraction vratio of the words with the lowest frequencies are regarded as rare words; a sentence will be dropped if (1.0 - vratio) of its words are rare words
export vratio=0.2
# options for cleaning the data processed by bpe;
# advised values (except numrules) can be calculated on the development set by:
# python tools/check/charatio.py $tgtd/src.dev.bpe $tgtd/tgt.dev.bpe, and
# python tools/check/biratio.py $tgtd/src.dev.bpe $tgtd/tgt.dev.bpe
# As for numrules, choose from [1, 6]: less data will be dropped with larger values,
# and no data will be dropped if it is set to 6; details are described in:
# tools/check/chars.py
export charatio=0.751
export bperatio=4.01
export seperatio=0.8
export bibperatio=2.64
export bioratio=3.54
export numrules=1
```
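A minimal sketch of the `vratio` rule as described in the comment above; the helper names are illustrative, and how frequency ties are ranked is an assumption:

```python
# Hypothetical sketch of the "vratio" cleaning rule: the least frequent
# vratio fraction of the vocabulary counts as rare, and a sentence is
# dropped when (1.0 - vratio) or more of its words are rare.
from collections import Counter

def build_rare_words(lines, vratio=0.2):
    freq = Counter(w for line in lines for w in line.split())
    ranked = [w for w, _ in freq.most_common()]  # most frequent first
    return set(ranked[int(len(ranked) * (1.0 - vratio)):])  # bottom vratio fraction

def keep_sentence(line, rare_words, vratio=0.2):
    words = line.split()
    nrare = sum(w in rare_words for w in words)
    return nrare < (1.0 - vratio) * len(words)  # False means the sentence is dropped
```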
We provide scripts to apply Byte-Pair Encoding (BPE) under `scripts/bpe/`.

### convert plain text to tensors for training

Generate training data for `train.py` with `bash scripts/mktrain.sh`, configuring the following variables in `mktrain.sh` for your needs (the other variables should comply with those in `scripts/mkbpe.sh`):

```
# the path of datasets
export cachedir=cache
# the ID of a dataset (files should be saved in $cachedir/$dataid)
export dataid=w14ende
# the training file of the source language
export srctf=src.train.bpe
# the training file of the target language
export tgttf=tgt.train.bpe
# the validation file of the source language
export srcvf=src.dev.bpe
# the validation file of the target language
export tgtvf=tgt.dev.bpe
# "vsize" is the size of the vocabulary for both source language and its translation. Set a very large number to use the full vocabulary for BPE. The real vocabulary size will be 4 greater than this value because of special tags ("<sos>", "<eos>", "<unk>" and "<pad>").
export vsize=65536
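```

A minimal sketch of the vocabulary behaviour the `vsize` comment describes: the mapping keeps the `vsize` most frequent tokens plus the four special tags, so the real size is at most `vsize + 4`. The tag ordering and helper name here are assumptions, not the repository's code:

```python
# Hypothetical sketch: build a token-to-index mapping of at most vsize + 4 entries.
from collections import Counter

SPECIALS = ("<sos>", "<eos>", "<unk>", "<pad>")

def build_vocab(lines, vsize=65536):
    freq = Counter(tok for line in lines for tok in line.split())
    vocab = {tag: idx for idx, tag in enumerate(SPECIALS)}  # special tags first
    for tok, _ in freq.most_common(vsize):
        vocab.setdefault(tok, len(vocab))  # skip tokens that collide with a special tag
    return vocab  # len(vocab) <= vsize + 4
```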
@@ -86,7 +52,7 @@ All parameters for configuration are saved in `cnfg.py`:
run_id = "base"
# the ID of the dataset to use
data_id = "de-en"
data_id = "w14ende"
# training, validation and test sets, created by mktrain.sh and mktest.sh respectively.
train_data = "cache/"+data_id+"/train.h5"
@@ -146,6 +112,8 @@ report_eva = False
# run on GPU or not, and the GPU device(s) to use. Data Parallel based multi-gpu support can be enabled with values like: 'cuda:0, 1, 3'.
use_cuda = True
gpuid = 'cuda:0'
# [EXP] enable mixed precision (FP16) with "O1"
amp_opt = None
# use multi-gpu for translating or not. "predict.py" will take the last gpu rather than the first when multi_gpu_decoding is set to False, to avoid a potential out-of-memory crash, because the first gpu is the main device by default and takes on more work.
multi_gpu_decoding = False
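The new `amp_opt` flag is only documented here as experimental ("[EXP] enable mixed precision (FP16) with \"O1\""); the "O1" value suggests NVIDIA apex opt levels. A hedged sketch of how such a flag could be consumed — the wiring below is an assumption, not the repository's code:

```python
# Hypothetical wiring for amp_opt; assumes the NVIDIA apex library is installed.
def maybe_enable_amp(model, optimizer, amp_opt=None):
    if amp_opt is None:  # amp_opt = None keeps plain FP32 training
        return model, optimizer
    from apex import amp
    return amp.initialize(model, optimizer, opt_level=amp_opt)  # e.g. "O1"
```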
@@ -173,13 +141,13 @@ isize = 512
nlayer = 6
# hidden size for those feed-forward neural networks.
ff_hsize = 2048
ff_hsize = isize * 4
# dropout rate for hidden states.
drop = 0.1
# dropout rate applied to multi-head attention.
attn_drop = 0.1
attn_drop = drop
# label smoothing settings for the KL divergence.
label_smoothing = 0.1
@@ -194,10 +162,10 @@ length_penalty = 0.0
share_emb = False
# number of heads for multi-head attention.
nhead = 8
nhead = max(1, isize // 64)
# maximum steps cached for the positional embedding.
cache_len = 260
cache_len = 256
# warm up steps for the training.
warm_step = 8000
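The hyper-parameter edits above replace hard-coded constants with expressions derived from `isize`; a quick check that the derived defaults reproduce the former values at `isize = 512`:

```python
isize = 512
ff_hsize = isize * 4         # 2048, the former hard-coded feed-forward width
nhead = max(1, isize // 64)  # 8, the former hard-coded head count
attn_drop = drop = 0.1       # attention dropout now follows the shared dropout rate
assert (ff_hsize, nhead) == (2048, 8)
```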
@@ -223,7 +191,7 @@ where `runid` can be omitted. In that case, the `run_id` in `cnfg.py` will be ta

```
# "srcd" is the path of the source file you want to translate.
export srcd=un-cache
export srcd=w14src
# "srctf" is a plain text file to be translated which should be saved in "srcd" and processed with bpe like that with the training set.
export srctf=src-val.bpe
@@ -232,6 +200,8 @@ export modelf=expm/debug/checkpoint.t7
# result file.
export rsf=trans.txt
# the ID of the dataset assigned in mktrain.sh
export dataid=w14ende
```

## Exporting python files to C libraries
@@ -318,6 +288,10 @@ Hierarchical aggregation proposed in [Exploiting Deep Representations for Neural

Modules implementing the transparent attention proposed in [Training Deeper Neural Machine Translation Models with Transparent Attention](https://aclweb.org/anthology/D18-1338).

#### `SC/`

Modules implementing the sentential context approaches proposed in [Exploiting Sentential Context for Neural Machine Translation](https://www.aclweb.org/anthology/P19-1624/).

### `parallel/`

#### `base.py`
@@ -449,7 +423,7 @@ There is a difference between the Transformer in the original paper (residue con

## Acknowledgements

The project started when Hongfei XU (the developer) was a postgraduate student at [Zhengzhou University](http://www5.zzu.edu.cn/nlp/), and continues while he is a PhD candidate at [Saarland University](https://www.uni-saarland.de/nc/en/home.html) supervised by [Prof. Dr. Josef van Genabith](https://www.dfki.de/en/web/about-us/employee/person/jova02/) and a Junior Researcher at [DFKI, MLT (German Research Center for Artificial Intelligence, Multilinguality and Language Technology)](https://www.dfki.de/en/web/research/research-departments-and-groups/multilinguality-and-language-technology/). Hongfei XU enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.
The project started when Hongfei XU (the developer) was a postgraduate student at [Zhengzhou University](http://www5.zzu.edu.cn/nlp/), and continues while he is a PhD candidate at [Saarland University](https://www.uni-saarland.de/nc/en/home.html) supervised by [Prof. Dr. Josef van Genabith](https://www.dfki.de/en/web/about-us/employee/person/jova02/) and Prof. Dr. Deyi Xiong, and a Junior Researcher at [DFKI, MLT (German Research Center for Artificial Intelligence, Multilinguality and Language Technology)](https://www.dfki.de/en/web/research/research-departments-and-groups/multilinguality-and-language-technology/). Hongfei XU enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.

Details of this project can be found [here](https://arxiv.org/abs/1903.07402), and please cite it if you enjoy the implementation :)

12 changes: 7 additions & 5 deletions cnfg.py
@@ -2,7 +2,7 @@

run_id = "base"

data_id = "de-en"
data_id = "w14ende"

train_data = "cache/"+data_id+"/train.h5"
dev_data = "cache/"+data_id+"/dev.h5"
@@ -46,6 +48,8 @@
use_cuda = True
# enable Data Parallel multi-gpu support with values like: 'cuda:0, 1, 3'.
gpuid = 'cuda:0'
# [EXP] enable mixed precision (FP16) with "O1"
amp_opt = None

# use multi-gpu for translating or not. `predict.py` will take the last gpu rather than the first when multi_gpu_decoding is set to False, to avoid a potential out-of-memory crash, since the first gpu is the main device by default and takes on more work.
multi_gpu_decoding = False
@@ -67,11 +69,11 @@

nlayer = 6

ff_hsize = 2048
ff_hsize = isize * 4

drop = 0.1

attn_drop = 0.1
attn_drop = drop

label_smoothing = 0.1

@@ -81,9 +83,9 @@

share_emb = False

nhead = 8
nhead = max(1, isize // 64)

cache_len = 260
cache_len = 256

warm_step = 8000

8 changes: 4 additions & 4 deletions discriminator.py
@@ -15,7 +15,7 @@ def __init__(self, isize, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, nu

_fhsize = _ahsize if fhsize is None else fhsize

self.drop = nn.Dropout(dropout, inplace=True) if dropout > 0.0 else None
self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

self.pemb = PositionalEmb(isize, xseql, 0, 0) if use_pemb else None

@@ -73,8 +73,8 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a
self.layer_normer1 = nn.LayerNorm(isize, eps=1e-06)
self.layer_normer2 = nn.LayerNorm(isize, eps=1e-06)
if dropout > 0:
self.d1 = nn.Dropout(dropout, inplace=True)
self.d2 = nn.Dropout(dropout, inplace=True)
self.d1 = Dropout(dropout, inplace=True)
self.d2 = Dropout(dropout, inplace=True)
else:
self.d1 = None
self.d2 = None
@@ -118,7 +118,7 @@ def __init__(self, isize, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, nu

_fhsize = _ahsize if fhsize is None else fhsize

self.drop = nn.Dropout(dropout, inplace=True) if dropout > 0.0 else None
self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

self.pemb = PositionalEmb(isize, xseql, 0, 0) if use_pemb else None

2 changes: 1 addition & 1 deletion loss.py
@@ -67,7 +67,7 @@ def forward(self, output, target):
model_prob.scatter_(1, _target, self.conf)

if isinstance(self.ignore_index, (list, tuple)):
model_prob.masked_fill_(torch.gt(torch.stack([_target == _tmp for _tmp in self.ignore_index]).sum(0), 0), 0.0)
model_prob.masked_fill_(torch.stack([_target == _tmp for _tmp in self.ignore_index]).sum(0).gt(0), 0.0)
elif self.ignore_index >= 0:
model_prob.masked_fill_(_target == self.ignore_index, 0.0)
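The `loss.py` change swaps the functional `torch.gt(..., 0)` call for the method form `.gt(0)`; the two are equivalent, as a quick check shows:

```python
import torch

# toy targets and ignore indices, shaped like the label-smoothing code above
_target = torch.tensor([[1], [0], [3], [2]])
ignore_index = (0, 3)
stacked = torch.stack([_target == t for t in ignore_index]).sum(0)
assert torch.equal(torch.gt(stacked, 0), stacked.gt(0))  # identical masks
```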

8 changes: 1 addition & 7 deletions modules/TA.py
@@ -20,10 +20,4 @@ def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=False):

def forward(self, x):

out = x
for net in self.nets:
out = net(out)

out = self.normer(out + x)

return out
return self.normer(self.net(x) + x)
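The rewritten `forward` assumes the constructor now wraps the sublayers in a single `nn.Sequential` (stored as `self.net`), which applies them in the same order as the removed loop; a sketch of the equivalence under that assumption:

```python
import torch
from torch import nn

nets = [nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 4)]
net = nn.Sequential(*nets)  # hypothetical self.net built from the old self.nets
x = torch.randn(2, 4)

out = x
for m in nets:  # the removed loop formulation
    out = m(out)
assert torch.allclose(out, net(x))  # one Sequential call gives the same result
```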
74 changes: 0 additions & 74 deletions modules/__no_significance__/group.py

This file was deleted.

14 changes: 7 additions & 7 deletions modules/__no_significance__/ua_cattn.py
@@ -1,10 +1,10 @@
#encoding: utf-8

from math import sqrt
from math import sqrt, inf

from torch import nn

from modules.base import SparseNormer, MHSparseNormer
from modules.base import SparseNormer, MHSparseNormer, Linear, Dropout

class CrossAttn(nn.Module):

@@ -23,15 +23,15 @@ def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=Fal
self.hsize = self.attn_dim * num_head
self.num_head = num_head

self.query_adaptor = nn.Linear(isize, self.hsize, bias=enable_bias)
self.kv_adaptor = nn.Linear(isize, self.hsize * 2, bias=enable_bias)
self.query_adaptor = Linear(isize, self.hsize, bias=enable_bias)
self.kv_adaptor = Linear(isize, self.hsize * 2, bias=enable_bias)

self.outer = nn.Linear(self.hsize, osize, bias=enable_bias)
self.outer = Linear(self.hsize, osize, bias=enable_bias)

#self.normer = MHSparseNormer(num_head, dim=-1) if sparsenorm else nn.Softmax(dim=-1)
self.normer = SparseNormer(dim=-1) if sparsenorm else nn.Softmax(dim=-1)

self.drop = nn.Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None
self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

# iQ: query (bsize, num_query, vsize)
# iK: keys (bsize, seql, vsize)
@@ -58,7 +58,7 @@ def forward(self, iQ, iK, mask=None):
scores = real_iQ.matmul(real_iK) / sqrt(adim)

if mask is not None:
scores.masked_fill_(mask.unsqueeze(1).expand_as(scores), -1e32)
scores.masked_fill_(mask.unsqueeze(1).expand_as(scores), -inf)

_rscore = scores = self.normer(scores)
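Replacing the `-1e32` fill value with `-inf` makes masked positions exact zeros after the default softmax normalizer, rather than merely tiny; a minimal check:

```python
import torch
from math import inf

scores = torch.tensor([1.0, 2.0, 3.0])
mask = torch.tensor([False, True, False])
probs = torch.softmax(scores.masked_fill(mask, -inf), dim=-1)
assert probs[1].item() == 0.0  # exp(-inf) = 0, unlike a large negative constant
```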

