This repository has been archived by the owner on Aug 10, 2023. It is now read-only.

Commit

February 2021 update
liuqiuhui2015 committed Feb 22, 2021
1 parent 3ff6d2c commit 787268e
Showing 46 changed files with 550 additions and 365 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -1,5 +1,5 @@
# Neutron
Neutron: A pytorch based implementation of [Transformer](https://arxiv.org/abs/1706.03762) and its variants.
Neutron: A pytorch based implementation of the [Transformer](https://arxiv.org/abs/1706.03762) and its variants.

This project is developed with python 3.8.

@@ -96,11 +96,11 @@ Tokenized case-sensitive BLEU measured with [multi-bleu.perl](https://github.com
| | BLEU | Training Speed | Decoding Speed |
| :------| ------: | ------: | ------: |
| Attention is all you need | 27.3 | | |
| Neutron | 28.07 | 21562.98 | 68.25 |
| Neutron | 28.07 | 22424.63 | 150.15 |

## Acknowledgments

The project starts when Hongfei XU (the developer) was a postgraduate student at [Zhengzhou University](http://www5.zzu.edu.cn/nlp/), and continues when he is a PhD candidate at [Saarland University](https://www.uni-saarland.de/nc/en/home.html) supervised by [Prof. Dr. Josef van Genabith](https://www.dfki.de/en/web/about-us/employee/person/jova02/) and [Prof. Dr. Deyi Xiong](http://cic.tju.edu.cn/faculty/xiongdeyi/), and a Junior Researcher at [DFKI, MLT (German Research Center for Artificial Intelligence, Multilinguality and Language Technology)](https://www.dfki.de/en/web/research/research-departments-and-groups/multilinguality-and-language-technology/). Hongfei XU enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.
Hongfei Xu enjoys a doctoral grant from [China Scholarship Council](https://www.csc.edu.cn/) ([2018]3101, 201807040056) while maintaining this project.

Details of this project can be found [here](https://arxiv.org/abs/1903.07402), and please cite it if you enjoy the implementation :)

7 changes: 4 additions & 3 deletions adv/predict/doc/para/predict_doc_para.py
@@ -57,7 +57,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
if multi_gpu:
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
@@ -75,9 +75,10 @@ def load_fixing(module):
with open(sys.argv[1], "wb") as f:
with torch.no_grad():
for nsent, i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
if use_cuda:
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_batch = seq_batch.long()
bsize, _nsent, seql = seq_batch.size()
_nsent_use = _nsent - 1
with autocast(enabled=use_amp):
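The same edit recurs in the prediction, ranking, and training scripts below: the batch is now moved to `cuda_device` first and cast to `long` only afterwards, and the guard tests `cuda_device` rather than `use_cuda`. A minimal sketch of the updated pattern, assuming the HDF5 arrays are stored in a narrower integer dtype than int64 (the shape, dtype, and device setup here are illustrative, not taken from the repository):

```python
import numpy as np
import torch

# illustrative stand-ins; in the toolkit these come from cnfg and the h5py data file
cuda_device = torch.device("cuda", 0) if torch.cuda.is_available() else None
src_ids = np.zeros((8, 32), dtype=np.int32)  # hypothetical int32 batch read from h5py

seq_batch = torch.from_numpy(src_ids)      # keep the narrow dtype for the host-to-device copy
if cuda_device:
    seq_batch = seq_batch.to(cuda_device)  # transfer first ...
seq_batch = seq_batch.long()               # ... then widen to int64 on the target device
```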
9 changes: 5 additions & 4 deletions adv/predict/predict_ape.py
@@ -57,7 +57,7 @@ def load_fixing(module):

set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
if multi_gpu:
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
@@ -73,11 +73,12 @@ def load_fixing(module):
with open(sys.argv[1], "wb") as f:
with torch.no_grad():
for i in tqdm(range(ntest)):
seq_batch = torch.from_numpy(src_grp[str(i)][:]).long()
seq_mt = torch.from_numpy(mt_grp[str(i)][:]).long()
if use_cuda:
seq_batch = torch.from_numpy(src_grp[str(i)][:])
seq_mt = torch.from_numpy(mt_grp[str(i)][:])
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_mt = seq_mt.to(cuda_device)
seq_batch, seq_mt = seq_batch.long(), seq_mt.long()
with autocast(enabled=use_amp):
output = mymodel.decode(seq_batch, seq_mt, beam_size, None, length_penalty)
if multi_gpu:
9 changes: 5 additions & 4 deletions adv/rank/doc/para/rank_loss_para.py
@@ -65,7 +65,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)
if multi_gpu:
@@ -81,12 +81,13 @@ def load_fixing(module):
with torch.no_grad():
for i in tqdm(range(ntest)):
_curid = str(i)
seq_batch = torch.from_numpy(src_grp[_curid][:]).long()
seq_o = torch.from_numpy(tgt_grp[_curid][:]).long()
seq_batch = torch.from_numpy(src_grp[_curid][:])
seq_o = torch.from_numpy(tgt_grp[_curid][:])
lo = seq_o.size(-1) - 1
if use_cuda:
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_o = seq_o.to(cuda_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
bsize, _nsent = seq_batch.size()[:2]
_nsent_use = _nsent - 1
seq_o = seq_o.narrow(1, 1, _nsent_use)
9 changes: 5 additions & 4 deletions adv/rank/doc/rank_loss_sent.py
@@ -65,7 +65,7 @@ def load_fixing(module):
# Important to make cudnn methods deterministic
set_random_seed(cnfg.seed, use_cuda)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)
if multi_gpu:
@@ -79,13 +79,14 @@ def load_fixing(module):
with torch.no_grad():
for i in tqdm(range(ntest)):
_curid = str(i)
seq_batch = torch.from_numpy(src_grp[_curid][:]).long()
seq_o = torch.from_numpy(tgt_grp[_curid][:]).long()
seq_batch = torch.from_numpy(src_grp[_curid][:])
seq_o = torch.from_numpy(tgt_grp[_curid][:])
bsize, nsent = seq_batch.size()[:2]
ebsize = bsize * nsent
if use_cuda:
if cuda_device:
seq_batch = seq_batch.to(cuda_device)
seq_o = seq_o.to(cuda_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
lo = seq_o.size(-1) - 1
ot = seq_o.narrow(-1, 1, lo).contiguous()
with autocast(enabled=use_amp):
12 changes: 7 additions & 5 deletions adv/train/doc/para/train_doc_para.py
@@ -42,12 +42,13 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok

src_grp, tgt_grp = td["src"], td["tgt"]
for nsent, i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:])
lo = seq_o.size(-1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

_nsent = seq_batch.size(1)
_nsent_use = _nsent - 1
@@ -145,12 +146,13 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
src_grp, tgt_grp = ed["src"], ed["tgt"]
with torch.no_grad():
for nsent, i_d in tqdm(nd):
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[nsent][i_d][:])
seq_o = torch.from_numpy(tgt_grp[nsent][i_d][:])
lo = seq_o.size(-1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

_nsent = seq_batch.size(1)
_nsent_use = _nsent - 1
@@ -261,7 +263,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

16 changes: 9 additions & 7 deletions adv/train/train_ape.py
@@ -40,14 +40,15 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok
cur_b, _ls = 1, {} if save_loss else None
src_grp, mt_grp, tgt_grp = td["src"], td["mt"], td["tgt"]
for i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[i_d][:]).long()
seq_mt = torch.from_numpy(mt_grp[i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[i_d][:])
seq_mt = torch.from_numpy(mt_grp[i_d][:])
seq_o = torch.from_numpy(tgt_grp[i_d][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_mt = seq_mt.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_mt, seq_o = seq_batch.long(), seq_mt.long(), seq_o.long()

oi = seq_o.narrow(1, 0, lo)
ot = seq_o.narrow(1, 1, lo).contiguous()
@@ -142,14 +143,15 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
with torch.no_grad():
for i in tqdm(range(nd)):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][:]).long()
seq_mt = torch.from_numpy(mt_grp[bid][:]).long()
seq_o = torch.from_numpy(tgt_grp[bid][:]).long()
seq_batch = torch.from_numpy(src_grp[bid][:])
seq_mt = torch.from_numpy(mt_grp[bid][:])
seq_o = torch.from_numpy(tgt_grp[bid][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_mt = seq_mt.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_mt, seq_o = seq_batch.long(), seq_mt.long(), seq_o.long()
ot = seq_o.narrow(1, 1, lo).contiguous()
with autocast(enabled=use_amp):
output = model(seq_batch, seq_mt, seq_o.narrow(1, 0, lo))
@@ -251,7 +253,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

12 changes: 7 additions & 5 deletions adv/train/train_dynb.py
@@ -56,12 +56,13 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok

src_grp, tgt_grp = td["src"], td["tgt"]
for i_d in tqdm(tl):
seq_batch = torch.from_numpy(src_grp[i_d][:]).long()
seq_o = torch.from_numpy(tgt_grp[i_d][:]).long()
seq_batch = torch.from_numpy(src_grp[i_d][:])
seq_o = torch.from_numpy(tgt_grp[i_d][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()

oi = seq_o.narrow(1, 0, lo)
ot = seq_o.narrow(1, 1, lo).contiguous()
@@ -169,12 +170,13 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu, use_amp=False):
with torch.no_grad():
for i in tqdm(range(nd)):
bid = str(i)
seq_batch = torch.from_numpy(src_grp[bid][:]).long()
seq_o = torch.from_numpy(tgt_grp[bid][:]).long()
seq_batch = torch.from_numpy(src_grp[bid][:])
seq_o = torch.from_numpy(tgt_grp[bid][:])
lo = seq_o.size(1) - 1
if mv_device:
seq_batch = seq_batch.to(mv_device)
seq_o = seq_o.to(mv_device)
seq_batch, seq_o = seq_batch.long(), seq_o.long()
ot = seq_o.narrow(1, 1, lo).contiguous()
with autocast(enabled=use_amp):
output = model(seq_batch, seq_o.narrow(1, 0, lo))
@@ -272,7 +274,7 @@ def init_fixing(module):
logger.info("Load target embedding from: " + cnfg.tgt_emb)
load_emb(cnfg.tgt_emb, mymodel.dec.wemb.weight, nwordt, cnfg.scale_down_emb, cnfg.freeze_tgtemb)

if use_cuda:
if cuda_device:
mymodel.to(cuda_device)
lossf.to(cuda_device)

14 changes: 10 additions & 4 deletions cnfg/README.md
@@ -152,10 +152,13 @@ cache_len_default = 256
use_k_relative_position = 0
disable_std_pemb = False
# using fast implementation of label smoothing loss, but it cannot exclude the negative impact of special tokens, like <pad>, on training. `forbidden_indexes` in `cnfg/base.py` shall be set to None to enable.
use_fast_loss = False
# configure maximum batch size w.r.t GPU memory
max_sentences_gpu = 768
max_tokens_gpu = 4608
max_pad_tokens_sentence = 16
max_sentences_gpu = 2048
max_tokens_gpu = 6144
max_pad_tokens_sentence = 32
normal_tokens_vs_pad_tokens = 4
# trade CPU for IO and disk space, see [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
@@ -168,11 +171,14 @@ hdf5_model_compression_level = 0
# For BPE (using full vocabulary), the special <unk> token will never appear and thus can be removed from the vocabulary. Otherwise, it should be set to True.
use_unk = True
# prune with length penalty in each beam decoding step
clip_beam_with_lp = True
```

## `ihyp.py`

To interpret configurations in hyp.py.
To interpret configurations in `hyp.py`.

## `dynb.py`

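The `use_fast_loss` comment in the `cnfg/README.md` hunk above ties it to `forbidden_indexes` in `cnfg/base.py`; a minimal sketch of the two settings used together, with file placement taken from that comment and values chosen for illustration:

```python
# cnfg/hyp.py
use_fast_loss = True

# cnfg/base.py
# the fast label smoothing loss cannot mask special tokens such as <pad>,
# so the comment above says to set the index filter to None when enabling it
forbidden_indexes = None
```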
8 changes: 4 additions & 4 deletions cnfg/hyp.py
@@ -13,17 +13,17 @@
# default cached sequence length (for positional embedding, etc.)
cache_len_default = 256

# window size (one side) of relative positional embeddings, 0 to disable. 16 and 8 are used in [Self-Attention with Relative Position Representations](https://www.aclweb.org/anthology/N18-2074/) for Transformer Base and Big respectively. disable_std_pemb to disable the standard positional embedding when use the relative position, or to disable only the decoder side with a tuple (False, True,), useful for AAN.
# window size (one side) of relative positional embeddings, 0 to disable. 8 and 16 are used in [Self-Attention with Relative Position Representations](https://www.aclweb.org/anthology/N18-2074/) for Transformer Base and Big respectively. disable_std_pemb to disable the standard positional embedding when use the relative position, or to disable only the decoder side with a tuple (False, True,), useful for AAN.
use_k_relative_position = 0
disable_std_pemb = False

# using fast implementation of label smoothing loss, but it cannot exclude the negative impact of special tokens, like <pad>, on training. `forbidden_indexes` in `cnfg/base.py` shall be set to None to enable.
use_fast_loss = False

# configure maximum batch size w.r.t GPU memory
max_sentences_gpu = 768
max_tokens_gpu = 4608
max_pad_tokens_sentence = 16
max_sentences_gpu = 2048
max_tokens_gpu = 6144
max_pad_tokens_sentence = 32
normal_tokens_vs_pad_tokens = 4

# trade CPU for IO and disk space, see [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
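Following the relative-position comment in the `cnfg/hyp.py` hunk above (one-side window of 8 for Transformer Base and 16 for Big, with `disable_std_pemb` optionally a per-side tuple), one possible configuration is sketched below; these are illustrative values, not the repository defaults:

```python
# cnfg/hyp.py -- illustrative values
use_k_relative_position = 8        # one-side window for Transformer Base (Shaw et al., 2018)
disable_std_pemb = (False, True,)  # keep the standard pemb on the encoder side, disable it on the decoder side
```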
1 change: 1 addition & 0 deletions cnfg/ihyp.py
@@ -45,6 +45,7 @@
use_k_relative_position_encoder, use_k_relative_position_decoder = parse_double_value_tuple(use_k_relative_position)
rel_pos_enabled = (max(use_k_relative_position_encoder, use_k_relative_position_decoder) > 0)
disable_std_pemb_encoder, disable_std_pemb_decoder = parse_double_value_tuple(disable_std_pemb)
relpos_reduction_with_zeros = True

h5datawargs = {} if hdf5_data_compression is None else {"compression": hdf5_data_compression, "compression_opts": hdf5_data_compression_level, "shuffle":True}
h5modelwargs = {} if hdf5_model_compression is None else {"compression": hdf5_model_compression, "compression_opts": hdf5_model_compression_level, "shuffle":True}
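The `h5datawargs`/`h5modelwargs` dictionaries in the `cnfg/ihyp.py` hunk above collect h5py dataset options; a minimal sketch of how such keyword arguments are typically passed to h5py (file name, dataset name, and data are illustrative assumptions):

```python
import h5py
import numpy as np

# assumed to mirror the dictionary built in cnfg/ihyp.py when compression is enabled
h5datawargs = {"compression": "gzip", "compression_opts": 9, "shuffle": True}

data = np.arange(12, dtype=np.int32).reshape(3, 4)
with h5py.File("example.h5", "w") as f:  # illustrative file name
	f.create_dataset("src/0", data=data, **h5datawargs)
```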