This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit

February 2021 update
liuqiuhui2015 committed Feb 22, 2021
1 parent 3ff6d2c commit 787268e
Showing 46 changed files with 550 additions and 365 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -1,5 +1,5 @@
# Neutron
Neutron: A pytorch based implementation of [Transformer](https://arxiv.org/abs/1706.03762) and its variants.
Neutron: A pytorch based implementation of the [Transformer](https://arxiv.org/abs/1706.03762) and its variants.

This project is developed with python 3.8.

@@ -96,11 +96,11 @@ Tokenized case-sensitive BLEU measured with [multi-bleu.perl](https://github.com
| | BLEU | Training Speed | Decoding Speed |
| :------| ------: | ------: | ------: |
| Attention is all you need | 27.3 | | |
| Neutron | 28.07 | 21562.98 | 68.25 |
| Neutron | 28.07 | 22424.63 | 150.15 |

## Acknowledgments

The project starts when Hongfei XU (the developer) was a postgraduate student at [Zhengzhou University](http://www5.zzu.edu.cn/nlp/), and continues when he is a PhD candidate at [Saarland University](https://www.uni-saarland.de/nc/en/home.html) supervised by [Prof. Dr. Josef van Genabith](https://www.dfki.de/en/web/about-us/employee/person/jova02/) and [Prof. Dr. Deyi Xiong](http://cic.tju.edu.cn/faculty/xiongdeyi/), and a Junior Researcher at [DFKI, MLT (German Research Center for Artificial Intelligence, Multilinguality and Language Technology)](https://www.dfki.de/en/web/research/research-departments-and-groups/multilinguality-and-language-technology/). Hongfei XU enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.
Hongfei Xu enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.

Details of this project can be found [here](https://arxiv.org/abs/1903.07402), and please cite it if you enjoy the implementation :)

7 changes: 4 additions & 3 deletions adv/predict/doc/para/predict_doc_para.py
@@ -57,7 +57,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
if multi_gpu:
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
@@ -75,9 +75,10 @@ def load_fixing(module):
with open(sys.argv[1], "wb") as f:
with torch.no_grad():
for nsent, i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
if use_cuda:
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_batch = seq_batch.long()
bsize, _nsent, seql = seq_batch.size()
_nsent_use = _nsent - 1
with autocast(enabled=use_amp):
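The same edit recurs in the prediction, ranking, and training scripts below: the batch is now moved to `cuda_device` first and cast to `long` only afterwards, and the guard tests `cuda_device` rather than `use_cuda`. A minimal sketch of the updated pattern, assuming the HDF5 arrays are stored in a narrower integer dtype than int64 (the shape, dtype, and device setup here are illustrative, not taken from the repository):

```python
import numpy as np
import torch

# illustrative stand-ins; in the toolkit these come from cnfg and the h5py data file
cuda_device = torch.device("cuda", 0) if torch.cuda.is_available() else None
src_ids = np.zeros((8, 32), dtype=np.int32)  # hypothetical int32 batch read from h5py

seq_batch = torch.from_numpy(src_ids)      # keep the narrow dtype for the host-to-device copy
if cuda_device:
    seq_batch = seq_batch.to(cuda_device)  # transfer first ...
seq_batch = seq_batch.long()               # ... then widen to int64 on the target device
```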
9 changes: 5 additions & 4 deletions adv/predict/predict_ape.py
@@ -57,7 +57,7 @@ def load_fixing(module):

set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
if multi_gpu:
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
@@ -73,11 +73,12 @@ def load_fixing(module):
with open(sys.argv[1], "wb") as f:
with torch.no_grad():
for i in tqdm(range(ntest)):
seq_batch = torch.from_numpy(src_grp[str(i)][:]).long()
seq_mt = torch.from_numpy(mt_grp[str(i)][:]).long()
if use_cuda:
seq_batch = torch.from_numpy(src_grp[str(i)][:])
seq_mt = torch.from_numpy(mt_grp[str(i)][:])
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_mt = seq_mt.to(cuda_device)
seq_batch, seq_mt = seq_batch.long(), seq_mt.long()
with autocast(enabled=use_amp):
output = mymodel.decode(seq_batch, seq_mt, beam_size, None, length_penalty)
if multi_gpu:
9 changes: 5 additions & 4 deletions adv/rank/doc/para/rank_loss_para.py
@@ -65,7 +65,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)
if multi_gpu:
@@ -81,12 +81,13 @@ def load_fixing(module):
with torch.no_grad():
for i in tqdm(range(ntest)):
_curid = str(i)
seq_batch = torch.from_numpy(src_grp[_curid][:]).long()
seq_o = torch.from_numpy(tgt_grp[_curid][:]).long()
seq_batch = torch.from_numpy(src_grp[_curid][:])
seq_o = torch.from_numpy(tgt_grp[_curid][:])
lo = seq_o.size(-1) - 1
if use_cuda:
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_o = seq_o.to(cuda_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
bsize, _nsent = seq_batch.size()[:2]
_nsent_use = _nsent - 1
seq_o = seq_o.narrow(1, 1, _nsent_use)
9 changes: 5 additions & 4 deletions adv/rank/doc/rank_loss_sent.py
@@ -65,7 +65,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)
if multi_gpu:
@@ -79,13 +79,14 @@ def load_fixing(module):
with torch.no_grad():
for i in tqdm(range(ntest)):
_curid = str(i)
seq_batch = torch.from_numpy(src_grp[_curid][:]).long()
seq_o = torch.from_numpy(tgt_grp[_curid][:]).long()
seq_batch = torch.from_numpy(src_grp[_curid][:])
seq_o = torch.from_numpy(tgt_grp[_curid][:])
bsize, nsent = seq_batch.size()[:2]
ebsize = bsize * nsent
if use_cuda:
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_o = seq_o.to(cuda_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
lo = seq_o.size(-1) - 1
ot = seq_o.narrow(-1, 1, lo).contiguous()
with autocast(enabled=use_amp):
12 changes: 7 additions & 5 deletions adv/train/doc/para/train_doc_para.py
@@ -42,12 +42,13 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok

src_grp, tgt_grp = td["src"], td["tgt"]
for nsent, i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:])
lo = seq_o.size(-1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

_nsent = seq_batch.size(1)
_nsent_use = _nsent - 1
@@ -145,12 +146,13 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
src_grp, tgt_grp = ed["src"], ed["tgt"]
with torch.no_grad():
for nsent, i_d in tqdm(nd):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:])
lo = seq_o.size(-1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

_nsent = seq_batch.size(1)
_nsent_use = _nsent - 1
@@ -261,7 +263,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

16 changes: 9 additions & 7 deletions adv/train/train_ape.py
@@ -40,14 +40,15 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
cur_b, _ls = 1, {} if save_loss else None
src_grp, mt_grp, tgt_grp = td["src"], td["mt"], td["tgt"]
for i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[i_d][:]).long()
seq_mt = torch.from_numpy(mt_grp[i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[i_d][:])
seq_mt = torch.from_numpy(mt_grp[i_d][:])
seq_o = torch.from_numpy(tgt_grp[i_d][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_mt = seq_mt.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_mt, seq_o = seq_batch.long(), seq_mt.long(), seq_o.long()

oi = seq_o.narrow(1, 0, lo)
ot = seq_o.narrow(1, 1, lo).contiguous()
@@ -142,14 +143,15 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
with torch.no_grad():
for i in tqdm(range(nd)):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][:]).long()
seq_mt = torch.from_numpy(mt_grp[bid][:]).long()
seq_o = torch.from_numpy(tgt_grp[bid][:]).long()
seq_batch = torch.from_numpy(src_grp[bid][:])
seq_mt = torch.from_numpy(mt_grp[bid][:])
seq_o = torch.from_numpy(tgt_grp[bid][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_mt = seq_mt.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_mt, seq_o = seq_batch.long(), seq_mt.long(), seq_o.long()
ot = seq_o.narrow(1, 1, lo).contiguous()
with autocast(enabled=use_amp):
output = model(seq_batch, seq_mt, seq_o.narrow(1, 0, lo))
@@ -251,7 +253,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

12 changes: 7 additions & 5 deletions adv/train/train_dynb.py
@@ -56,12 +56,13 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok

src_grp, tgt_grp = td["src"], td["tgt"]
for i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[i_d][:])
seq_o = torch.from_numpy(tgt_grp[i_d][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

oi = seq_o.narrow(1, 0, lo)
ot = seq_o.narrow(1, 1, lo).contiguous()
@@ -169,12 +170,13 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
with torch.no_grad():
for i in tqdm(range(nd)):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][:]).long()
seq_o = torch.from_numpy(tgt_grp[bid][:]).long()
seq_batch = torch.from_numpy(src_grp[bid][:])
seq_o = torch.from_numpy(tgt_grp[bid][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
ot = seq_o.narrow(1, 1, lo).contiguous()
with autocast(enabled=use_amp):
output = model(seq_batch, seq_o.narrow(1, 0, lo))
@@ -272,7 +274,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

14 changes: 10 additions & 4 deletions cnfg/README.md
@@ -152,10 +152,13 @@ cache_len_default = 256
use_k_relative_position = 0
disable_std_pemb = False
# using fast implementation of label smoothing loss, but it cannot exclude the negative impact of special tokens, like <pad>, on training. `forbidden_indexes` in `cnfg/base.py` shall be set to None to enable.
use_fast_loss = False
# configure maximum batch size w.r.t GPU memory
max_sentences_gpu = 768
max_tokens_gpu = 4608
max_pad_tokens_sentence = 16
max_sentences_gpu = 2048
max_tokens_gpu = 6144
max_pad_tokens_sentence = 32
normal_tokens_vs_pad_tokens = 4
# trade CPU for IO and disk space, see [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
@@ -168,11 +171,14 @@ hdf5_model_compression_level = 0
# For BPE (using full vocabulary), the special <unk> token will never appear and thus can be removed from the vocabulary. Otherwise, it should be set to True.
use_unk = True
# prune with length penalty in each beam decoding step
clip_beam_with_lp = True
```

## `ihyp.py`

To interpret configurations in hyp.py.
To interpret configurations in `hyp.py`.

## `dynb.py`

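The `use_fast_loss` comment in the `cnfg/README.md` hunk above ties it to `forbidden_indexes` in `cnfg/base.py`; a minimal sketch of the two settings used together, with file placement taken from that comment and values chosen for illustration:

```python
# cnfg/hyp.py
use_fast_loss = True

# cnfg/base.py
# the fast label smoothing loss cannot mask special tokens such as <pad>,
# so the comment above says to set the index filter to None when enabling it
forbidden_indexes = None
```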
8 changes: 4 additions & 4 deletions cnfg/hyp.py
@@ -13,17 +13,17 @@
# default cached sequence length (for positional embedding, etc.)
cache_len_default = 256

# window size (one side) of relative positional embeddings, 0 to disable. 16 and 8 are used in [Self-Attention with Relative Position Representations](https://www.aclweb.org/anthology/N18-2074/) for Transformer Base and Big respectively. disable_std_pemb to disable the standard positional embedding when use the relative position, or to disable only the decoder side with a tuple (False, True,), useful for AAN.
# window size (one side) of relative positional embeddings, 0 to disable. 8 and 16 are used in [Self-Attention with Relative Position Representations](https://www.aclweb.org/anthology/N18-2074/) for Transformer Base and Big respectively. disable_std_pemb to disable the standard positional embedding when use the relative position, or to disable only the decoder side with a tuple (False, True,), useful for AAN.
use_k_relative_position = 0
disable_std_pemb = False

# using fast implementation of label smoothing loss, but it cannot exclude the negative impact of special tokens, like <pad>, on training. `forbidden_indexes` in `cnfg/base.py` shall be set to None to enable.
use_fast_loss = False

# configure maximum batch size w.r.t GPU memory
max_sentences_gpu = 768
max_tokens_gpu = 4608
max_pad_tokens_sentence = 16
max_sentences_gpu = 2048
max_tokens_gpu = 6144
max_pad_tokens_sentence = 32
normal_tokens_vs_pad_tokens = 4

# trade CPU for IO and disk space, see [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
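Following the relative-position comment in the `cnfg/hyp.py` hunk above (one-side window of 8 for Transformer Base and 16 for Big, with `disable_std_pemb` optionally a per-side tuple), one possible configuration is sketched below; these are illustrative values, not the repository defaults:

```python
# cnfg/hyp.py -- illustrative values
use_k_relative_position = 8        # one-side window for Transformer Base (Shaw et al., 2018)
disable_std_pemb = (False, True,)  # keep the standard pemb on the encoder side, disable it on the decoder side
```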
1 change: 1 addition & 0 deletions cnfg/ihyp.py
@@ -45,6 +45,7 @@
use_k_relative_position_encoder, use_k_relative_position_decoder = parse_double_value_tuple(use_k_relative_position)
rel_pos_enabled = (max(use_k_relative_position_encoder, use_k_relative_position_decoder) > 0)
disable_std_pemb_encoder, disable_std_pemb_decoder = parse_double_value_tuple(disable_std_pemb)
relpos_reduction_with_zeros = True

h5datawargs = {} if hdf5_data_compression is None else {"compression": hdf5_data_compression, "compression_opts": hdf5_data_compression_level, "shuffle":True}
h5modelwargs = {} if hdf5_model_compression is None else {"compression": hdf5_model_compression, "compression_opts": hdf5_model_compression_level, "shuffle":True}
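The `h5datawargs`/`h5modelwargs` dictionaries in the `cnfg/ihyp.py` hunk above collect h5py dataset options; a minimal sketch of how such keyword arguments are typically passed to h5py (file name, dataset name, and data are illustrative assumptions):

```python
import h5py
import numpy as np

# assumed to mirror the dictionary built in cnfg/ihyp.py when compression is enabled
h5datawargs = {"compression": "gzip", "compression_opts": 9, "shuffle": True}

data = np.arange(12, dtype=np.int32).reshape(3, 4)
with h5py.File("example.h5", "w") as f:  # illustrative file name
	f.create_dataset("src/0", data=data, **h5datawargs)
```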