From 07250af5f358839d0496fe284d4537beca5445f2 Mon Sep 17 00:00:00 2001
From: Qiuhui Liu
Date: Thu, 2 Apr 2020 13:02:04 +0800
Subject: [PATCH] Hello World :-)

---
README.md | 55 +------
adv/predict/doc/para/predict_doc_para.py | 6 +-
adv/rank/doc/para/rank_loss_doc_para.py | 10 +-
adv/rank/doc/rank_loss_sent.py | 8 +-
adv/train/doc/para/train_doc_para.py | 55 +++----
cnfg/base.py | 2 -
cnfg/hyp.py | 33 ++++
cnfg/ihyp.py | 50 ++++++
loss.py | 94 -----------
loss/__init__.py | 1 +
loss/base.py | 192 +++++++++++++++++++++++
modules/TA.py | 4 +-
modules/act.py | 11 +-
modules/base.py | 119 +++++++++-----
modules/noise.py | 8 +-
modules/rnncells.py | 6 +-
predict.py | 5 +-
rank_loss.py | 7 +-
scripts/doc/para/mktest.sh | 5 +-
scripts/mktest.sh | 9 +-
tools/average_model.py | 10 +-
tools/check/cnfg | 1 +
tools/check/ext_emb.py | 3 +-
tools/check/tspeed.py | 5 +-
tools/clean/cnfg | 1 +
tools/clean/compress_h5.py | 12 ++
tools/clean/doc/para/cnfg | 1 +
tools/cnfg | 1 +
tools/doc/para/cnfg | 1 +
tools/doc/para/mkiodata.py | 8 +-
tools/doc/para/mktest.py | 6 +-
tools/lsort/cnfg | 1 +
tools/mkiodata.py | 8 +-
tools/mktest.py | 7 +-
tools/share_vocab.py | 2 +-
tools/sorti.py | 17 +-
tools/vocab.py | 4 +-
train.py | 49 +++---
transformer/AGG/HierDecoder.py | 4 +-
transformer/AGG/HierEncoder.py | 4 +-
transformer/AGG/InceptDecoder.py | 4 +-
transformer/AGG/InceptEncoder.py | 4 +-
transformer/AvgDecoder.py | 30 ++--
transformer/Decoder.py | 70 ++++++---
transformer/Doc/Para/Base/Decoder.py | 34 ++--
transformer/Doc/Para/Base/Encoder.py | 16 +-
transformer/Doc/Para/Base/NMT.py | 11 +-
transformer/Encoder.py | 28 ++--
transformer/EnsembleAvgDecoder.py | 20 ++-
transformer/EnsembleDecoder.py | 16 +-
transformer/NMT.py | 11 +-
transformer/README.md | 53 +++++++
transformer/RNMTDecoder.py | 6 +-
transformer/SC/Decoder.py | 24 ++-
transformer/SC/Encoder.py | 12 +-
transformer/SC/NMT.py | 11 +-
transformer/TA/Decoder.py | 24 ++-
transformer/TA/Encoder.py | 8 +-
transformer/UniEncoder.py | 6 +-
translator.py | 14 +-
utils/base.py | 12 +-
utils/fmt/base.py | 31 ++--
utils/fmt/base4torch.py | 3 +-
utils/fmt/dual.py | 2 +-
utils/fmt/triple.py | 2 +-
utils/h5serial.py | 71 +++++++++
66 files changed, 924 insertions(+), 424 deletions(-)
create mode 100644 cnfg/hyp.py
create mode 100644 cnfg/ihyp.py
delete mode 100644 loss.py
create mode 100644 loss/__init__.py
create mode 100644 loss/base.py
create mode 120000 tools/check/cnfg
create mode 120000 tools/clean/cnfg
create mode 100644 tools/clean/compress_h5.py
create mode 120000 tools/clean/doc/para/cnfg
create mode 120000 tools/cnfg
create mode 120000 tools/doc/para/cnfg
create mode 120000 tools/lsort/cnfg
create mode 100644 transformer/README.md
create mode 100644 utils/h5serial.py

diff --git a/README.md b/README.md
index f3f4b45..82f6dc7 100644
--- a/README.md
+++ b/README.md
@@ -167,9 +167,6 @@ share_emb = False
# number of heads for multi-head attention.
nhead = max(1, isize // 64)

-# maximum steps cached for the positional embedding.
-cache_len = 256
-
# warm up steps for the training.
warm_step = 8000
# scalar of learning rate
@@ -182,6 +179,8 @@ attn_hsize = None
seed = 666666
```

+Configure advanced details with `cnfg/hyp.py`.
+
## Training

Just execute the following command to launch the training:
@@ -227,7 +226,7 @@ where `rsf` is the result file and `h5f` is the HDF5-formatted input file of your corpus

Fundamental models needed for the construction of the transformer.
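For instance, a minimal sketch of how these building blocks compose (the sizes here are illustrative, and defaults such as the activation and residual settings come from the new `cnfg/ihyp.py`):

```python
import torch
from modules.base import PositionwiseFF, SelfAttn

isize = 512  # model dimension (illustrative)
attn = SelfAttn(isize, isize, isize, num_head=8, dropout=0.1)
ff = PositionwiseFF(isize, dropout=0.1)  # layer norm and the residual connection are applied inside
x = torch.randn(2, 7, isize)  # (bsize, seql, isize)
out = ff(attn(x))  # self-attention followed by the position-wise feed-forward block
```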
-### `loss.py`
+### `loss/`

Implementation of the label smoothing loss function required for training the transformer.

@@ -249,53 +248,7 @@ An example that depends on Flask to provide a simple Web service and REST API about how
### `transformer/`

-#### `NMT.py`
-
-The transformer model encapsulates encoder and decoder. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/NMT.py#L9-L11) to make a choice between the standard decoder and the average decoder.
-
-#### `Encoder.py`
-
-The encoder of transformer.
-
-#### `Decoder.py`
-
-The standard decoder of transformer.
-
-#### `AvgDecoder.py`
-
-The average decoder of transformer proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/).
-
-#### `EnsembleNMT.py`
-
-A model encapsulates several NMT models to do ensemble decoding. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/EnsembleNMT.py#L9-L11) to make a choice between the standard decoder and the average decoder.
-
-#### `EnsembleEncoder.py`
-
-A model encapsulates several encoders for ensemble decoding.
-
-#### `EnsembleDecoder.py`
-
-A model encapsulates several standard decoders for ensemble decoding.
-
-#### `EnsembleAvgDecoder.py`
-
-A model encapsulates several average decoders proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/) for ensemble decoding.
-
-#### `AGG/`
-
-Implementation of aggregation models.
-
-##### `Hier*.py`
-
-Hierarchical aggregation proposed in [Exploiting Deep Representations for Neural Machine Translation](https://www.aclweb.org/anthology/D18-1457/).
-
-#### `TA/`
-
-Implementation of transparent attention proposed in [Training Deeper Neural Machine Translation Models with Transparent Attention](https://aclweb.org/anthology/D18-1338).
-
-#### `SC/`
-
-Implementation of sentential context proposed in [Exploiting Sentential Context for Neural Machine Translation](https://www.aclweb.org/anthology/P19-1624/).
+Implementations of seq2seq models.
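A model can be built the same way the updated `predict.py` does it; a minimal sketch (the vocabulary sizes are illustrative placeholders, and `cache_len_default` comes from `cnfg/ihyp.py`):

```python
import cnfg.base as cnfg
from cnfg.ihyp import *

from transformer.NMT import NMT

nwordi = nwordt = 32768  # source/target vocabulary sizes (illustrative)
mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
```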
### `parallel/` diff --git a/adv/predict/doc/para/predict_doc_para.py b/adv/predict/doc/para/predict_doc_para.py index 5787b57..d8b2244 100644 --- a/adv/predict/doc/para/predict_doc_para.py +++ b/adv/predict/doc/para/predict_doc_para.py @@ -9,6 +9,7 @@ import h5py import cnfg.docpara as cnfg +from cnfg.ihyp import * from transformer.Doc.Para.Base.NMT import NMT from transformer.EnsembleNMT import NMT as Ensemble @@ -31,7 +32,7 @@ def load_fixing(module): vcbt = reverse_dict(vcbt) if len(sys.argv) == 4: - mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) + mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) mymodel = load_model_cpu(sys.argv[3], mymodel) mymodel.apply(load_fixing) @@ -39,7 +40,7 @@ def load_fixing(module): else: models = [] for modelf in sys.argv[3:]: - tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) + tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) @@ -54,6 +55,7 @@ def load_fixing(module): use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda_decode(cnfg.use_cuda, cnfg.gpuid, cnfg.multi_gpu_decoding) +# Important to make cudnn methods deterministic set_random_seed(cnfg.seed, use_cuda) if use_cuda: diff --git a/adv/rank/doc/para/rank_loss_doc_para.py b/adv/rank/doc/para/rank_loss_doc_para.py index 7280075..5a135ae 100644 --- a/adv/rank/doc/para/rank_loss_doc_para.py +++ b/adv/rank/doc/para/rank_loss_doc_para.py @@ -13,13 +13,14 @@ import h5py import cnfg.docpara as cnfg +from cnfg.ihyp import * from transformer.Doc.Para.Base.NMT import NMT from transformer.EnsembleNMT import NMT as Ensemble from parallel.parallelMT import DataParallelMT from parallel.base import DataParallelCriterion -from loss import LabelSmoothingLoss +from loss.base import LabelSmoothingLoss from utils.base import * from utils.fmt.base4torch import parse_cuda @@ -38,7 +39,7 @@ def load_fixing(module): cuda_device = torch.device(cnfg.gpuid) if len(sys.argv) == 4: - mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) + mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) mymodel = load_model_cpu(sys.argv[3], mymodel) mymodel.apply(load_fixing) @@ -46,7 +47,7 @@ def load_fixing(module): else: models = [] for modelf in sys.argv[3:]: - tmp = NMT(cnfg.isize, nwordi, nwordt, 
cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) + tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) @@ -59,9 +60,8 @@ def load_fixing(module): lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=0, reduction='none', forbidden_index=cnfg.forbidden_indexes) use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda(cnfg.use_cuda, cnfg.gpuid) -# disable multi_gpu, not supported -multi_gpu, cuda_devices = False, None +# Important to make cudnn methods deterministic set_random_seed(cnfg.seed, use_cuda) if use_cuda: diff --git a/adv/rank/doc/rank_loss_sent.py b/adv/rank/doc/rank_loss_sent.py index e4cab4c..eb86dda 100644 --- a/adv/rank/doc/rank_loss_sent.py +++ b/adv/rank/doc/rank_loss_sent.py @@ -13,13 +13,14 @@ import h5py import cnfg.base as cnfg +from cnfg.ihyp import * from transformer.NMT import NMT from transformer.EnsembleNMT import NMT as Ensemble from parallel.parallelMT import DataParallelMT from parallel.base import DataParallelCriterion -from loss import LabelSmoothingLoss +from loss.base import LabelSmoothingLoss from utils.base import * from utils.fmt.base4torch import parse_cuda @@ -38,7 +39,7 @@ def load_fixing(module): cuda_device = torch.device(cnfg.gpuid) if len(sys.argv) == 4: - mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) mymodel = load_model_cpu(sys.argv[3], mymodel) mymodel.apply(load_fixing) @@ -46,7 +47,7 @@ def load_fixing(module): else: models = [] for modelf in sys.argv[3:]: - tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) @@ -60,6 +61,7 @@ def load_fixing(module): use_cuda, cuda_device, cuda_devices, multi_gpu = parse_cuda(cnfg.use_cuda, cnfg.gpuid) +# Important to make cudnn methods deterministic set_random_seed(cnfg.seed, use_cuda) if use_cuda: diff --git a/adv/train/doc/para/train_doc_para.py b/adv/train/doc/para/train_doc_para.py index 596f4a5..2baae26 100644 --- a/adv/train/doc/para/train_doc_para.py +++ b/adv/train/doc/para/train_doc_para.py @@ -7,17 +7,17 @@ from torch import optim from parallel.base import DataParallelCriterion -from parallel.parallelMT import DataParallelMT +from parallel.parallelMTFP import DataParallelMT from utils.base import * +from utils.h5serial import h5save, h5load from utils.fmt.base import tostr, save_states, load_states from 
utils.fmt.base4torch import parse_cuda, load_emb from lrsch import GoogleLR -from loss import LabelSmoothingLoss +from loss.base import LabelSmoothingLoss from random import shuffle -from math import inf from tqdm import tqdm @@ -27,6 +27,7 @@ import h5py import cnfg.docpara as cnfg +from cnfg.ihyp import * from transformer.Doc.Para.Base.NMT import NMT from transformer.NMT import NMT as BaseNMT @@ -91,7 +92,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok if _cur_rstep is not None: if save_checkp_epoch and (save_every is not None) and (_cur_rstep % save_every == 0) and (chkpf is not None) and (_cur_rstep > 0): if num_checkpoint > 1: - _fend = "_%d.t7" % (_cur_checkid) + _fend = "_%d.h5" % (_cur_checkid) _chkpf = chkpf[:-3] + _fend if chkpof is not None: _chkpof = chkpof[:-3] + _fend @@ -101,7 +102,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok _chkpof = chkpof save_model(model, _chkpf, multi_gpu, logger) if chkpof is not None: - torch.save(optm.state_dict(), _chkpof) + h5save(optm.state_dict(), _chkpof) if statesf is not None: save_states(statesf, tl[cur_b - 1:]) _cur_rstep -= 1 @@ -125,7 +126,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok if save_checkp_epoch and (_cur_rstep is None) and (save_every is not None) and (cur_b % save_every == 0) and (chkpf is not None) and (cur_b < ndata): if num_checkpoint > 1: - _fend = "_%d.t7" % (_cur_checkid) + _fend = "_%d.h5" % (_cur_checkid) _chkpf = chkpf[:-3] + _fend if chkpof is not None: _chkpof = chkpof[:-3] + _fend @@ -135,7 +136,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok _chkpof = chkpof save_model(model, _chkpf, multi_gpu, logger) if chkpof is not None: - torch.save(optm.state_dict(), _chkpof) + h5save(optm.state_dict(), _chkpof) if statesf is not None: save_states(statesf, tl[cur_b - 1:]) cur_b += 1 @@ -168,9 +169,9 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu): loss = lossf(output, ot) if multi_gpu: loss = loss.sum() - trans = torch.cat([torch.argmax(outu, -1).to(mv_device) for outu in output], 0) + trans = torch.cat([outu.argmax(-1).to(mv_device) for outu in output], 0) else: - trans = torch.argmax(output, -1) + trans = output.argmax(-1) sum_loss += loss.data.item() data_mask = ot.ne(0) correct = (trans.eq(ot) & data_mask).int() @@ -219,9 +220,9 @@ def init_fixing(module): chkpof = None statesf = None if save_every is not None: - chkpf = wkdir + "checkpoint.t7" + chkpf = wkdir + "checkpoint.h5" if save_optm_state: - chkpof = wkdir + "checkpoint.optm.t7" + chkpof = wkdir + "checkpoint.optm.h5" if cnfg.save_train_state: statesf = wkdir + "checkpoint.states" @@ -248,7 +249,7 @@ def init_fixing(module): nwordi, nwordt = nword[0], nword[-1] logger.info("Design models with seed: %d" % torch.initial_seed()) -mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) +mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes, cnfg.num_prev_sent, cnfg.num_layer_context) fine_tune_m = cnfg.fine_tune_m @@ -257,10 +258,9 @@ def init_fixing(module): vl = [(str(nsent), str(_curd),) for nsent, ndata in 
zip(vd["nsent"][:].tolist(), vd["ndata"][:].tolist()) for _curd in range(ndata)] mymodel = init_model_params(mymodel) -mymodel.apply(init_fixing) if fine_tune_m is not None: logger.info("Load pre-trained model from: " + fine_tune_m) - _tmpm = BaseNMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + _tmpm = BaseNMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) _tmpm = load_model_cpu(fine_tune_m, _tmpm) #with torch.no_grad(): #_tmpm.dec.classifier.bias[_tmpm.dec.classifier.bias.lt(-1e3)] = -1e3 @@ -270,6 +270,7 @@ def init_fixing(module): _tmpm.dec.classifier.bias.requires_grad_(True) mymodel.load_base(_tmpm) _tmpm = None +mymodel.apply(init_fixing) lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=0, reduction='sum', forbidden_index=cnfg.forbidden_indexes) if cnfg.src_emb is not None: @@ -283,7 +284,7 @@ def init_fixing(module): mymodel.to(cuda_device) lossf.to(cuda_device) -optimizer = optim.Adam(filter_para_grad(mymodel.parameters()), lr=1e-4, betas=(0.9, 0.98), eps=1e-9, weight_decay=cnfg.weight_decay, amsgrad=use_ams) +optimizer = optim.Adam(filter_para_grad(mymodel.parameters()), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams) optimizer.zero_grad() if use_amp: @@ -296,20 +297,20 @@ def init_fixing(module): fine_tune_state = cnfg.fine_tune_state if fine_tune_state is not None: logger.info("Load optimizer state from: " + fine_tune_state) - optimizer.load_state_dict(torch.load(fine_tune_state)) + optimizer.load_state_dict(h5load(fine_tune_state)) lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale) num_checkpoint = cnfg.num_checkpoint cur_checkid = 0 -tminerr = inf +tminerr = inf_default minloss, minerr = eva(vd, vl, mymodel, lossf, cuda_device, multi_gpu) logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr)))) if fine_tune_m is None: - save_model(mymodel, wkdir + "init.t7", multi_gpu, logger) + save_model(mymodel, wkdir + "init.h5", multi_gpu, logger) logger.info("Initial model saved") else: cnt_states = cnfg.train_statesf @@ -318,9 +319,9 @@ def init_fixing(module): tminerr, done_tokens, cur_checkid, remain_steps, _ = train(td, load_states(cnt_states), vd, vl, optimizer, lrsch, mymodel, lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm, batch_report, save_every, chkpf, chkpof, statesf, num_checkpoint, cur_checkid, report_eva, remain_steps, False, False, use_amp) vloss, vprec = eva(vd, vl, mymodel, lossf, cuda_device, multi_gpu) logger.info("Epoch: 0, train loss: %.3f, valid loss/error: %.3f %.2f" % (tminerr, vloss, vprec)) - save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.t7" % (tminerr, vloss, vprec), multi_gpu, logger) + save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.h5" % (tminerr, vloss, vprec), multi_gpu, logger) if save_optm_state: - torch.save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.t7" % (tminerr, vloss, vprec)) + h5save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.h5" % (tminerr, vloss, vprec)) logger.info("New best model saved") if cnfg.dss_ws is not None and cnfg.dss_ws > 0.0 and cnfg.dss_ws < 1.0: @@ -347,9 +348,9 @@ def 
init_fixing(module):
	logger.info("Epoch: %d, train loss: %.3f, valid loss/error: %.3f %.2f" % (i, terr, vloss, vprec))

	if (vprec <= minerr) or (vloss <= minloss):
-		save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger)
+		save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)
		if save_optm_state:
-			torch.save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec))
+			h5save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec))
		logger.info("New best model saved")

		namin = 0
@@ -362,11 +363,11 @@ def init_fixing(module):
		else:
			if terr < tminerr:
				tminerr = terr
-				save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger)
+				save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)
				if save_optm_state:
-					torch.save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec))
+					h5save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec))
			elif epoch_save:
-				save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger)
+				save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger)

			namin += 1
			if namin >= earlystop:
@@ -397,9 +398,9 @@ def init_fixing(module):
	mymodel.collect_gradients()
	optimizer.step()

-save_model(mymodel, wkdir + "last.t7", multi_gpu, logger)
+save_model(mymodel, wkdir + "last.h5", multi_gpu, logger)
if save_optm_state:
-	torch.save(optimizer.state_dict(), wkdir + "last.optm.t7")
+	h5save(optimizer.state_dict(), wkdir + "last.optm.h5")
logger.info("model saved")

td.close()

diff --git a/cnfg/base.py b/cnfg/base.py
index 66b3660..55b1f4b 100644
--- a/cnfg/base.py
+++ b/cnfg/base.py
@@ -87,8 +87,6 @@
nhead = max(1, isize // 64)

-cache_len = 256
-
warm_step = 8000

lr_scale = 1.0

diff --git a/cnfg/hyp.py b/cnfg/hyp.py
new file mode 100644
index 0000000..e8a580a
--- /dev/null
+++ b/cnfg/hyp.py
@@ -0,0 +1,33 @@
+#encoding: utf-8
+
+ease_optimization = True
+
+# choices: None, "GeLU", "Swish", "Sigmoid"
+advance_activation_function = None
+
+# choices: "v1", "v2"
+computation_order = "v1"
+
+# default cached sequence length (for positional embedding, etc.)
+cache_len_default = 256
+
+# window size (one side) of relative positional embeddings; 0 to disable. 16 and 8 are used in [Self-Attention with Relative Position Representations](https://www.aclweb.org/anthology/N18-2074/) for Transformer Base and Big respectively. Set disable_std_pemb to disable the standard positional embedding when using relative positions, or disable only the decoder side with the tuple (False, True,), which is useful for AAN.
+use_k_relative_position = 0
+disable_std_pemb = False
+
+# configure the maximum batch size w.r.t. GPU memory
+max_sentences_gpu = 768
+max_tokens_gpu = 4608
+max_pad_tokens_sentence = 16
+normal_tokens_vs_pad_tokens = 4
+
+# trade CPU for I/O and disk space; see [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
+# choices: None, "gzip", "lzf"
+hdf5_data_compression = "gzip"
+# choices: 0 to 9, default is 4. None for lzf.
+hdf5_data_compression_level = 9
+hdf5_model_compression = None
+hdf5_model_compression_level = 0
+
+# For BPE (using the full vocabulary), the special unknown token will never appear and can thus be removed from the vocabulary. Otherwise, use_unk should be set to True.
+use_unk = True

diff --git a/cnfg/ihyp.py b/cnfg/ihyp.py
new file mode 100644
index 0000000..8eac14c
--- /dev/null
+++ b/cnfg/ihyp.py
@@ -0,0 +1,50 @@
+#encoding: utf-8
+
+# this file interprets the hyper-parameters assigned in cnfg/hyp.py
+
+from cnfg.hyp import *
+
+from math import inf
+
+from utils.fmt.base import parse_none, parse_double_value_tuple
+
+if ease_optimization:
+	enable_residual_bias_default = False
+else:
+	enable_residual_bias_default = True
+
+use_adv_act_default = False
+override_GeLU_Swish = False
+override_GeLU_Sigmoid = False
+if advance_activation_function is not None:
+	use_adv_act_default = True
+	_adv_act = advance_activation_function.lower()
+	if _adv_act == "sigmoid":
+		override_GeLU_Sigmoid = True
+	elif _adv_act == "swish":
+		override_GeLU_Swish = True
+inplace_after_GeLU = use_adv_act_default and not override_GeLU_Sigmoid
+
+norm_residual_default = not (computation_order.lower() == "v2")
+
+# overridden by the GoogleLR schedule in most cases
+init_lr = 1e-4
+
+inf_default = inf
+
+ieps_default = 1e-9
+ieps_ln_default = 1e-6
+ieps_adam_default = 1e-9
+
+ieps_ln_default = parse_none(ieps_ln_default, ieps_default)
+ieps_adam_default = parse_none(ieps_adam_default, ieps_default)
+
+adam_betas_default = (0.9, 0.98,)
+
+use_k_relative_position_encoder, use_k_relative_position_decoder = parse_double_value_tuple(use_k_relative_position)
+disable_std_pemb_encoder, disable_std_pemb_decoder = parse_double_value_tuple(disable_std_pemb)
+
+h5datawargs = {} if hdf5_data_compression is None else {"compression": hdf5_data_compression, "compression_opts": hdf5_data_compression_level, "shuffle":True}
+h5modelwargs = {} if hdf5_model_compression is None else {"compression": hdf5_model_compression, "compression_opts": hdf5_model_compression_level, "shuffle":True}
+
+list_key_func = str

diff --git a/loss.py b/loss.py
deleted file mode 100644
index 5aa8c3a..0000000
--- a/loss.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#encoding: utf-8
-
-import torch
-from torch.nn.modules.loss import _Loss
-from torch.nn.modules.loss import NLLLoss as NLLLossBase
-
-import torch.nn.functional as F
-
-""" from: Rethinking the Inception Architecture for Computer Vision (https://arxiv.org/abs/1512.00567)
-	With label smoothing, KL-divergence between q_{smoothed ground truth prob.}(w) and p_{prob. computed by model}(w) is minimized.
-""" - -class LabelSmoothingLoss(_Loss): - - def __init__(self, nclass, label_smoothing=0.1, ignore_index=-1, reduction='mean', forbidden_index=-1): - - super(LabelSmoothingLoss, self).__init__() - - fbil = set() - if isinstance(forbidden_index, (list, tuple)): - for fi in forbidden_index: - if (fi >= 0) and (fi not in fbil): - fbil.add(fi) - else: - if forbidden_index is not None and forbidden_index >= 0: - fbil.add(forbidden_index) - - if isinstance(ignore_index, (list, tuple)): - tmp = [] - for _tmp in ignore_index: - if (_tmp >= 0) and (_tmp not in tmp): - tmp.append(_tmp) - if _tmp not in fbil: - fbil.add(_tmp) - _nid = len(tmp) - if _nid > 0: - if _nid > 1: - self.ignore_index = tuple(tmp) - else: - self.ignore_index = tmp[0] - else: - self.ignore_index = ignore_index[0] if len(ignore_index) > 0 else -1 - else: - self.ignore_index = ignore_index - if (ignore_index >= 0) and (ignore_index not in fbil): - fbil.add(ignore_index) - - smoothing_value = label_smoothing / (nclass - 1 - len(fbil)) - weight = torch.full((nclass,), smoothing_value) - weight.index_fill_(0, torch.tensor(tuple(fbil), dtype=torch.long, device=weight.device), 0.0) - self.register_buffer("weight", weight.unsqueeze(0)) - - self.reduction = reduction - self.conf = 1.0 - label_smoothing - - # output: (batch size, num_classes) - # target: (batch size) - # they will be flattened automatically if the dimension of output is larger than 2. - - def forward(self, output, target): - - _output = output.view(-1, output.size(-1)) if output.dim() > 2 else output - - _target = target.view(-1, 1) - - model_prob = self.weight.repeat(_target.size(0), 1) - model_prob.scatter_(1, _target, self.conf) - - if isinstance(self.ignore_index, (list, tuple)): - model_prob.masked_fill_(torch.stack([_target == _tmp for _tmp in self.ignore_index]).int().sum(0).gt(0), 0.0) - elif self.ignore_index >= 0: - model_prob.masked_fill_(_target == self.ignore_index, 0.0) - - return F.kl_div(_output, model_prob, reduction=self.reduction) - -class NLLLoss(NLLLossBase): - - def forward(self, input, target): - - isize = input.size() - - return F.nll_loss(input.view(-1, isize[-1]), target.view(-1), weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction).view(isize[:-1]) - -class RankingLoss(_Loss): - - # output: (batch size) - # target: (batch size) - def forward(self, output, target): - - loss = output * target - if self.reduction == 'mean': - loss = loss / loss.numel() - - return loss diff --git a/loss/__init__.py b/loss/__init__.py new file mode 100644 index 0000000..8fb0d7c --- /dev/null +++ b/loss/__init__.py @@ -0,0 +1 @@ +#encoding: utf-8 diff --git a/loss/base.py b/loss/base.py new file mode 100644 index 0000000..6bc7a08 --- /dev/null +++ b/loss/base.py @@ -0,0 +1,192 @@ +#encoding: utf-8 + +import torch +from torch.nn.modules.loss import _Loss +from torch.nn.modules.loss import NLLLoss as NLLLossBase + +import torch.nn.functional as F + +from utils.base import clear_pad_mask + +""" from: Rethinking the Inception Architecture for Computer Vision (https://arxiv.org/abs/1512.00567) + With label smoothing, KL-divergence between q_{smoothed ground truth prob.}(w) and p_{prob. computed by model}(w) is minimized. 
+""" + +class LabelSmoothingLoss(_Loss): + + def __init__(self, nclass, label_smoothing=0.1, ignore_index=-1, reduction='mean', forbidden_index=-1): + + super(LabelSmoothingLoss, self).__init__() + + fbil = set() + if isinstance(forbidden_index, (list, tuple)): + for fi in forbidden_index: + if (fi >= 0) and (fi not in fbil): + fbil.add(fi) + else: + if forbidden_index is not None and forbidden_index >= 0: + fbil.add(forbidden_index) + + if isinstance(ignore_index, (list, tuple)): + tmp = [] + for _tmp in ignore_index: + if (_tmp >= 0) and (_tmp not in tmp): + tmp.append(_tmp) + if _tmp not in fbil: + fbil.add(_tmp) + _nid = len(tmp) + if _nid > 0: + if _nid > 1: + self.ignore_index = tuple(tmp) + else: + self.ignore_index = tmp[0] + else: + self.ignore_index = ignore_index[0] if len(ignore_index) > 0 else -1 + else: + self.ignore_index = ignore_index + if (ignore_index >= 0) and (ignore_index not in fbil): + fbil.add(ignore_index) + + smoothing_value = label_smoothing / (nclass - 1 - len(fbil)) + weight = torch.full((nclass,), smoothing_value) + weight.index_fill_(0, torch.tensor(tuple(fbil), dtype=torch.long, device=weight.device), 0.0) + self.register_buffer("weight", weight.unsqueeze(0)) + + self.reduction = reduction + self.conf = 1.0 - label_smoothing + + # output: (batch size, num_classes) + # target: (batch size) + # they will be flattened automatically if the dimension of output is larger than 2. + + def forward(self, output, target): + + _output = output.view(-1, output.size(-1)) if output.dim() > 2 else output + + _target = target.view(-1, 1) + + model_prob = self.weight.repeat(_target.size(0), 1) + model_prob.scatter_(1, _target, self.conf) + + if isinstance(self.ignore_index, (list, tuple)): + model_prob.masked_fill_(torch.stack([_target == _tmp for _tmp in self.ignore_index]).int().sum(0).gt(0), 0.0) + elif self.ignore_index >= 0: + model_prob.masked_fill_(_target == self.ignore_index, 0.0) + + return F.kl_div(_output, model_prob, reduction=self.reduction) + +class NLLLoss(NLLLossBase): + + def forward(self, input, target): + + isize = input.size() + + return F.nll_loss(input.view(-1, isize[-1]), target.view(-1), weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction).view(isize[:-1]) + +class RankingLoss(_Loss): + + # output: (batch size) + # target: (batch size) + def forward(self, output, target): + + loss = output * target + if self.reduction == 'mean': + loss = loss / loss.numel() + + return loss + +class MultiLabelSmoothingLoss(_Loss): + + def __init__(self, nclass, label_smoothing=0.1, ignore_index=-1, reduction='mean', forbidden_index=-1): + + super(MultiLabelSmoothingLoss, self).__init__() + + fbil = [] + for fbilu in forbidden_index: + tmp = set() + if isinstance(fbilu, (list, tuple)): + for fi in fbilu: + if (fi >= 0) and (fi not in tmp): + tmp.add(fi) + else: + if fbilu is not None and fbilu >= 0: + tmp.add(forbidden_index) + fbil.append(tmp) + + if isinstance(ignore_index, (list, tuple)): + tmp = [] + for _tmp in ignore_index: + if (_tmp >= 0) and (_tmp not in tmp): + tmp.append(_tmp) + for fbilu in fbil: + if _tmp not in fbilu: + fbilu.add(_tmp) + _nid = len(tmp) + if _nid > 0: + if _nid > 1: + self.ignore_index = tuple(tmp) + else: + self.ignore_index = tmp[0] + else: + self.ignore_index = ignore_index[0] if len(ignore_index) > 0 else -1 + else: + self.ignore_index = ignore_index + if (ignore_index >= 0): + for fbilu in fbil: + if ignore_index not in fbilu: + fbilu.add(ignore_index) + + _weight = [] + for fbilu in fbil: + smoothing_value = 
label_smoothing / (nclass - 1 - len(fbilu)) + _tmp_w = torch.full((nclass,), smoothing_value) + _tmp_w.index_fill_(0, torch.tensor(tuple(fbilu), dtype=torch.long, device=_tmp_w.device), 0.0) + _weight.append(_tmp_w) + self.register_buffer("weight", torch.stack(_weight, 0).unsqueeze(1)) + + self.reduction = reduction + + self.conf = 1.0 - label_smoothing + + def forward(self, output, target, lang_id=0): + + _output = output.view(-1, output.size(-1)) if output.dim() > 2 else output + + _target = target.view(-1, 1) + + model_prob = self.weight[lang_id].repeat(_target.size(0), 1) + model_prob.scatter_(1, _target, self.conf) + + if isinstance(self.ignore_index, (list, tuple)): + model_prob.masked_fill_(torch.stack([_target == _tmp for _tmp in self.ignore_index]).int().sum(0).gt(0), 0.0) + elif self.ignore_index >= 0: + model_prob.masked_fill_(_target == self.ignore_index, 0.0) + + return F.kl_div(_output, model_prob, reduction=self.reduction) + +class ReducedLabelSmoothingLoss(LabelSmoothingLoss): + + def __init__(self, nclass, label_smoothing=0.1, ignore_index=-1, reduction='mean', forbidden_index=-1, reduce_dim=None): + + super(ReducedLabelSmoothingLoss, self).__init__(nclass, label_smoothing, ignore_index, reduction, forbidden_index) + + self.reduce_dim = reduce_dim + + def forward(self, output, target): + + if self.reduce_dim is not None: + output, target = clear_pad_mask([output, target], target.eq(0), [self.reduce_dim - 1, self.reduce_dim], mask_dim=self.reduce_dim, return_contiguous=True)[0] + + _output = output.view(-1, output.size(-1)) if output.dim() > 2 else output + + _target = target.view(-1, 1) + + model_prob = self.weight.repeat(_target.size(0), 1) + model_prob.scatter_(1, _target, self.conf) + + if isinstance(self.ignore_index, (list, tuple)): + model_prob.masked_fill_(torch.stack([_target == _tmp for _tmp in self.ignore_index]).int().sum(0).gt(0), 0.0) + elif self.ignore_index >= 0: + model_prob.masked_fill_(_target == self.ignore_index, 0.0) + + return F.kl_div(_output, model_prob, reduction=self.reduction) diff --git a/modules/TA.py b/modules/TA.py index 8abcfd3..df6641b 100644 --- a/modules/TA.py +++ b/modules/TA.py @@ -2,12 +2,14 @@ from modules.base import PositionwiseFF as PositionwiseFFBase +from cnfg.ihyp import * + class PositionwiseFF(PositionwiseFFBase): # isize: input dimension # hsize: hidden dimension - def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=False): + def __init__(self, isize, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default): super(PositionwiseFF, self).__init__(isize, hsize, dropout, False, use_GeLU) diff --git a/modules/act.py b/modules/act.py index 97c9801..c025a42 100644 --- a/modules/act.py +++ b/modules/act.py @@ -5,6 +5,8 @@ from math import sqrt +from cnfg.ihyp import * + # 2 kinds of GELU activation function implementation according to https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L53-L58 class GeLU_GPT(nn.Module): @@ -31,8 +33,6 @@ def forward(self, x): return 0.5 * x * (1.0 + (x / self.k).erf()) -GeLU = GeLU_BERT - # Swish approximates GeLU when beta=1.702 (https://mp.weixin.qq.com/s/LEPalstOc15CX6fuqMRJ8Q). # GELU is nonmonotonic function that has a shape similar to Swish with beta = 1.4 (https://arxiv.org/abs/1710.05941). 
class Swish(nn.Module): @@ -56,3 +56,10 @@ def fix_init(self): if self.reset_beta is not None: self.beta.fill_(self.reset_beta) + +if override_GeLU_Swish: + GeLU = Swish +elif override_GeLU_Sigmoid: + GeLU = nn.Sigmoid +else: + GeLU = GeLU_BERT diff --git a/modules/base.py b/modules/base.py index 4d0265d..15a4b23 100644 --- a/modules/base.py +++ b/modules/base.py @@ -1,6 +1,6 @@ #encoding: utf-8 -from math import sqrt, log, exp, pi, inf +from math import sqrt, log, exp, pi import torch from torch import nn from torch.nn import functional as nnFunc @@ -9,6 +9,8 @@ from utils.base import reduce_model_list from modules.act import GeLU_GPT, GeLU_BERT, GeLU, Swish +from cnfg.ihyp import * + Linear = nn.Linear Dropout = nn.Dropout @@ -17,17 +19,17 @@ class PositionwiseFF(nn.Module): # isize: input dimension # hsize: hidden dimension - def __init__(self, isize, hsize=None, dropout=0.0, norm_residue=False, use_GeLU=False, enable_bias=False): + def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=norm_residual_default, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default): super(PositionwiseFF, self).__init__() _hsize = isize * 4 if hsize is None else hsize - self.net = nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Dropout(dropout, inplace=use_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Linear(_hsize, isize, bias=enable_bias)) + self.net = nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Dropout(dropout, inplace=inplace_after_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize, _hsize), GeLU() if use_GeLU else nn.ReLU(inplace=True), Linear(_hsize, isize, bias=enable_bias)) - self.normer = nn.LayerNorm(isize, eps=1e-06) + self.normer = nn.LayerNorm(isize, eps=ieps_ln_default) - self.norm_residue = norm_residue + self.norm_residual = norm_residual def forward(self, x): @@ -35,7 +37,7 @@ def forward(self, x): out = self.net(_out) - out = out + (_out if self.norm_residue else x) + out = out + (_out if self.norm_residual else x) return out @@ -101,7 +103,7 @@ def get_ext(self, length, step_pick=False): def get_pos(self, step): - return self.w[step] if step < self.num_pos else self.get_ext(step, True).squeeze(0) + return self.w[step] if step <= self.num_pos else self.get_ext(step, True).squeeze(0) class MultiHeadAttn(nn.Module): @@ -113,7 +115,7 @@ class MultiHeadAttn(nn.Module): # sparsenorm: using sparse normer or standard softmax # bind_qk: query and key can share a same linear transformation for the Reformer: The Efficient Transformer(https://arxiv.org/abs/2001.04451) paper. 
- def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v_isize=None, enable_bias=False, sparsenorm=False, bind_qk=False): + def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v_isize=None, enable_bias=enable_residual_bias_default, k_rel_pos=0, sparsenorm=False, bind_qk=False, xseql=cache_len_default): super(MultiHeadAttn, self).__init__() @@ -133,6 +135,17 @@ def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, v self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None + if k_rel_pos > 0: + self.k_rel_pos = k_rel_pos + self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim) + _rpm = torch.arange(-xseql + 1, 1).unsqueeze(0) + self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos) + self.xseql = xseql + # the buffer can be shared inside the encoder or the decoder across layers for saving memory, by setting self.ref_rel_posm of self attns in deep layers to SelfAttn in layer 0, and sharing corresponding self.rel_pos + self.ref_rel_posm = None + else: + self.rel_pemb = None + # iQ: query (bsize, num_query, vsize) # iK: keys (bsize, seql, vsize) # iV: values (bsize, seql, vsize) @@ -153,10 +166,16 @@ def forward(self, iQ, iK, iV, mask=None): # scores (bsize, nheads, nquery, adim) * (bsize, nheads, adim, seql) => (bsize, nheads, nquery, seql) - scores = real_iQ.matmul(real_iK) / sqrt(adim) + scores = real_iQ.matmul(real_iK) + + if self.rel_pemb is not None: + self.rel_pos_cache = self.get_rel_pos(seql).narrow(0, seql - nquery, nquery).contiguous() if self.ref_rel_posm is None else self.ref_rel_posm.rel_pos_cache + scores += real_iQ.permute(2, 0, 1, 3).contiguous().view(nquery, bsize * nheads, adim).bmm(self.rel_pemb(self.get_rel_pos(seql).narrow(0, seql - nquery, nquery)).transpose(1, 2)).view(nquery, bsize, nheads, seql).permute(1, 2, 0, 3) + + scores = scores / sqrt(adim) if mask is not None: - scores.masked_fill_(mask.unsqueeze(1), -inf) + scores.masked_fill_(mask.unsqueeze(1), -inf_default) scores = self.normer(scores) @@ -171,6 +190,14 @@ def forward(self, iQ, iK, iV, mask=None): return self.outer(oMA.view(bsize, nquery, self.hsize)) + def get_rel_pos(self, length): + + if length <= self.xseql: + return self.rel_pos.narrow(0, 0, length).narrow(1, 0, length) + else: + _rpm = torch.arange(-length + 1, 1, dtype=self.rel_pos.dtype, device=self.rel_pos.device).unsqueeze(0) + return ((_rpm - _rpm.t()).clamp(min=-self.k_rel_pos, max=self.k_rel_pos) + self.k_rel_pos) + # Average Attention is proposed in Accelerating Neural Transformer via an Average Attention Network(https://arxiv.org/abs/1805.00631) class AverageAttn(nn.Module): @@ -179,7 +206,7 @@ class AverageAttn(nn.Module): # dropout: dropout rate for Feed-forward NN # num_pos: maximum length of sentence cached, extended length will be generated while needed and droped immediately after that - def __init__(self, isize, hsize=None, dropout=0.0, num_pos=512, use_GeLU=False): + def __init__(self, isize, hsize=None, dropout=0.0, num_pos=512, use_GeLU=use_adv_act_default): super(AverageAttn, self).__init__() @@ -206,20 +233,14 @@ def forward(self, iQ, iV, decoding=False): bsize, seql = iV.size()[:2] # attn: (seql, seql) - if seql > self.num_pos: - attn = self.get_ext(seql) - else: - attn = self.w.narrow(0, 0, seql).narrow(1, 0, seql) + attn = self.get_ext(seql) if seql > self.num_pos else self.w.narrow(0, 0, seql).narrow(1, 0, seql) # avg: (bsize, seql, vsize) avg = attn.unsqueeze(0).expand(bsize, 
seql, seql).matmul(iV)

		avg = self.ffn(avg)

-		ifg = self.gw(torch.cat((iQ, avg), -1)).sigmoid()
-		isize = avg.size(-1)
-		igate = ifg.narrow(-1, 0, isize)
-		fgate = ifg.narrow(-1, isize, isize)
+		igate, fgate = self.gw(torch.cat((iQ, avg), -1)).sigmoid().chunk(2, -1)

		return igate * iQ + fgate * avg
@@ -229,14 +250,12 @@ def reset_parameters(self):

	def get_ext(self, npos):

-		_tmp = (1.0 / torch.arange(1, npos + 1, dtype=self.w.dtype, device=self.w.device)).unsqueeze(1).repeat(1, npos)
-
-		return _tmp.tril(0)
+		return (1.0 / torch.arange(1, npos + 1, dtype=self.w.dtype, device=self.w.device)).unsqueeze(1).expand(-1, npos).tril(0)

# Accelerated MultiHeadAttn for self attention, use when Q == K == V
class SelfAttn(nn.Module):

-	def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=False, sparsenorm=False):
+	def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=enable_residual_bias_default, k_rel_pos=use_k_relative_position, sparsenorm=False, xseql=cache_len_default):

		super(SelfAttn, self).__init__()
@@ -253,6 +272,18 @@ def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, enable_bias=Fal

		self.drop = Dropout(dropout, inplace=sparsenorm) if dropout > 0.0 else None

+		if k_rel_pos > 0:
+			self.k_rel_pos = k_rel_pos
+			self.rel_pemb = nn.Embedding(k_rel_pos * 2 + 1, self.attn_dim)
+			_rpm = torch.arange(-xseql + 1, 1).unsqueeze(0)
+			self.register_buffer("rel_pos", (_rpm - _rpm.t()).clamp(min=-k_rel_pos, max=k_rel_pos) + k_rel_pos)
+			self.xseql = xseql
+			# the buffer can be shared across layers inside the encoder or the decoder to save memory, by setting self.ref_rel_posm of the self-attention in deeper layers to the layer-0 SelfAttn and sharing the corresponding self.rel_pos
+			self.ref_rel_posm = None
+			self.register_buffer("rel_pos_cache", None)
+		else:
+			self.rel_pemb = None
+
	def forward(self, iQ, mask=None, iK=None):

		bsize, nquery = iQ.size()[:2]
@@ -274,10 +305,20 @@ def forward(self, iQ, mask=None, iK=None):

		real_iQ, real_iK, real_iV = real_iQ.transpose(1, 2), real_iK.permute(0, 2, 3, 1), real_iV.transpose(1, 2)

-		scores = real_iQ.matmul(real_iK) / sqrt(adim)
+		scores = real_iQ.matmul(real_iK)
+
+		if self.rel_pemb is not None:
+			if iK is None:
+				self.rel_pos_cache = self.get_rel_pos(nquery).contiguous() if self.ref_rel_posm is None else self.ref_rel_posm.rel_pos_cache
+				scores += real_iQ.permute(2, 0, 1, 3).contiguous().view(nquery, bsize * nheads, adim).bmm(self.rel_pemb(self.rel_pos_cache).transpose(1, 2)).view(nquery, bsize, nheads, nquery).permute(1, 2, 0, 3)
+			else:
+				self.rel_pos_cache = self.get_rel_pos(seql).narrow(0, seql - nquery, nquery).contiguous() if self.ref_rel_posm is None else self.ref_rel_posm.rel_pos_cache
+				scores += real_iQ.permute(2, 0, 1, 3).contiguous().view(nquery, bsize * nheads, adim).bmm(self.rel_pemb(self.rel_pos_cache).transpose(1, 2)).view(nquery, bsize, nheads, seql).permute(1, 2, 0, 3)
+
+		scores = scores / sqrt(adim)

		if mask is not None:
-			scores.masked_fill_(mask.unsqueeze(1), -inf)
+			scores.masked_fill_(mask.unsqueeze(1), -inf_default)

		scores = self.normer(scores)
@@ -288,10 +329,18 @@ def forward(self, iQ, mask=None, iK=None):

		return self.outer(oMA.view(bsize, nquery, self.hsize))

+	def get_rel_pos(self, length):
+
+		if length <= self.xseql:
+			return self.rel_pos.narrow(0, 0, length).narrow(1, 0, length)
+		else:
+			_rpm = torch.arange(-length + 1, 1, dtype=self.rel_pos.dtype, device=self.rel_pos.device).unsqueeze(0)
+			return ((_rpm - _rpm.t()).clamp(min=-self.k_rel_pos, max=self.k_rel_pos) + self.k_rel_pos)
+
#
Accelerated MultiHeadAttn for cross attention, use when K == V class CrossAttn(nn.Module): - def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, enable_bias=False, sparsenorm=False): + def __init__(self, isize, hsize, osize, num_head=8, dropout=0.0, k_isize=None, enable_bias=enable_residual_bias_default, sparsenorm=False): super(CrossAttn, self).__init__() @@ -325,7 +374,7 @@ def forward(self, iQ, iK, mask=None): scores = real_iQ.matmul(real_iK) / sqrt(adim) if mask is not None: - scores.masked_fill_(mask.unsqueeze(1), -inf) + scores.masked_fill_(mask.unsqueeze(1), -inf_default) scores = self.normer(scores) @@ -341,16 +390,16 @@ class ResidueCombiner(nn.Module): # isize: input size of Feed-forward NN - def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0, use_GeLU=False, enable_bias=False): + def __init__(self, isize, ncomb=2, hsize=None, dropout=0.0, use_GeLU=use_adv_act_default, enable_bias=enable_residual_bias_default): super(ResidueCombiner, self).__init__() _hsize = isize * 2 * ncomb if hsize is None else hsize # should dropout be in front of sigmoid or not? - self.net = nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Dropout(dropout, inplace=use_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Linear(_hsize, isize, bias=enable_bias)) + self.net = nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Dropout(dropout, inplace=inplace_after_GeLU), Linear(_hsize, isize, bias=enable_bias), Dropout(dropout, inplace=True)) if dropout > 0.0 else nn.Sequential(Linear(isize * ncomb, _hsize), GeLU() if use_GeLU else nn.Sigmoid(), Linear(_hsize, isize, bias=enable_bias)) - self.out_normer = nn.LayerNorm(isize, eps=1e-06) + self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) def forward(self, *xl): @@ -617,7 +666,7 @@ def forward(self, x, mask=None): _weight = self.net(x) if mask is not None: - _weight.masked_fill_(mask, -inf) + _weight.masked_fill_(mask, -inf_default) # (bsize, seql, 1)' * (bsize, seql, isize) => (bsize, 1, isize) return self.normer(_weight).transpose(1, 2).bmm(x).squeeze(1) @@ -649,7 +698,7 @@ def forward(self, x, step, expand=True): bsize, seql = x.size()[:2] - if step < self.num_steps: + if step <= self.num_steps: rs = self.w[step][:seql] if seql <= self.num_pos else torch.cat((self.w[step], self.get_ext(seql, step, False)), 0) else: rs = self.get_ext(seql, step, False) @@ -682,14 +731,14 @@ def get_ext(self, length, step, step_pick=False): ed = self.w.new(1, self.num_dim) else: npos = self.num_pos - _pos = torch.arange(npos + poff if step < self.num_steps else poff, length + poff, dtype=self.w.dtype, device=self.w.device).unsqueeze(1) + _pos = torch.arange(npos + poff if step <= self.num_steps else poff, length + poff, dtype=self.w.dtype, device=self.w.device).unsqueeze(1) ed = self.w.new(length - npos, self.num_dim) rdiv_term = (torch.arange(self.doff, self.num_dim + self.doff, 2, dtype=self.w.dtype, device=self.w.device) * -(log(1e4) / self.num_dim)).exp() _tmp1, _tmp2 = _pos * rdiv_term, _step * rdiv_term if self.alpha != 1.0: _tmp1.mul_(self.alpha) _tmp2.mul_(self.alpha) - ed[:, 0::2], ed[:, 1::2] = _tmp1.sin() + _tmp2.sin(), ((_tmp1.cos() + _tmp2.cos()).narrow(-1, 0, _tmp1.size(-1) - 1) if self.num_dim % 2 == 1 else _tmp1.cos() + _tmp2.cos()) + ed[:, 0::2], ed[:, 1::2] = _tmp1.sin() + _tmp2.sin(), ((_tmp1.narrow(-1, 0, _tmp1.size(-1) 
- 1).cos() + _tmp2.narrow(-1, 0, _tmp1.size(-1) - 1).cos()) if self.num_dim % 2 == 1 else _tmp1.cos() + _tmp2.cos()) return ed @@ -697,7 +746,7 @@ def get_ext(self, length, step, step_pick=False): def get_pos(self, step, layer): - return self.w[layer][step] if step < self.num_pos and layer < self.num_steps else self.get_ext(step, layer, True).squeeze(0) + return self.w[layer][step] if step <= self.num_pos and layer <= self.num_steps else self.get_ext(step, layer, True).squeeze(0) class Temperature(nn.Module): diff --git a/modules/noise.py b/modules/noise.py index 0ad9dec..dfcd4e0 100644 --- a/modules/noise.py +++ b/modules/noise.py @@ -5,6 +5,8 @@ from modules.base import PositionwiseFF as PositionwiseFFBase +from cnfg.ihyp import * + class GausNoiser(nn.Module): def __init__(self, power): @@ -86,9 +88,9 @@ def forward(self, inpute, mask=None): class PositionwiseFF(PositionwiseFFBase): - def __init__(self, isize, hsize=None, dropout=0.0, norm_residue=False, use_GeLU=False, power=None): + def __init__(self, isize, hsize=None, dropout=0.0, norm_residual=norm_residual_default, use_GeLU=use_adv_act_default, power=None): - super(PositionwiseFF, self).__init__(isize, hsize, dropout, norm_residue, use_GeLU) + super(PositionwiseFF, self).__init__(isize, hsize, dropout, norm_residual, use_GeLU) self.noiser = None if power is None else Noiser(power) @@ -100,6 +102,6 @@ def forward(self, x, mask=None): out = self.net(_out) - out = out + (_out if self.norm_residue else x) + out = out + (_out if self.norm_residual else x) return out diff --git a/modules/rnncells.py b/modules/rnncells.py index 8ac839f..4b76e32 100644 --- a/modules/rnncells.py +++ b/modules/rnncells.py @@ -5,6 +5,8 @@ from modules.base import * from modules.act import GeLU +from cnfg.ihyp import * + def prepare_initState(hx, cx, bsize): return hx.expand(bsize, -1), cx.expand(bsize, -1) @@ -16,7 +18,7 @@ class LSTMCell4RNMT(nn.Module): # isize: input size of Feed-forward NN - def __init__(self, isize, osize, use_GeLU=False): + def __init__(self, isize, osize, use_GeLU=use_adv_act_default): super(LSTMCell4RNMT, self).__init__() @@ -47,7 +49,7 @@ class GRUCell4RNMT(nn.Module): # isize: input size of Feed-forward NN - def __init__(self, isize, osize, use_GeLU=False): + def __init__(self, isize, osize, use_GeLU=use_adv_act_default): super(GRUCell4RNMT, self).__init__() diff --git a/predict.py b/predict.py index ccd0f08..18fc6c5 100644 --- a/predict.py +++ b/predict.py @@ -9,6 +9,7 @@ import h5py import cnfg.base as cnfg +from cnfg.ihyp import * from transformer.NMT import NMT from transformer.EnsembleNMT import NMT as Ensemble @@ -31,7 +32,7 @@ def load_fixing(module): vcbt = reverse_dict(vcbt) if len(sys.argv) == 4: - mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) mymodel = load_model_cpu(sys.argv[3], mymodel) mymodel.apply(load_fixing) @@ -39,7 +40,7 @@ def load_fixing(module): else: models = [] for modelf in sys.argv[3:]: - tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + tmp = 
NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) diff --git a/rank_loss.py b/rank_loss.py index 2eedabd..8a36c11 100644 --- a/rank_loss.py +++ b/rank_loss.py @@ -13,13 +13,14 @@ import h5py import cnfg.base as cnfg +from cnfg.ihyp import * from transformer.NMT import NMT from transformer.EnsembleNMT import NMT as Ensemble from parallel.parallelMT import DataParallelMT from parallel.base import DataParallelCriterion -from loss import LabelSmoothingLoss +from loss.base import LabelSmoothingLoss from utils.base import * from utils.fmt.base4torch import parse_cuda @@ -38,7 +39,7 @@ def load_fixing(module): cuda_device = torch.device(cnfg.gpuid) if len(sys.argv) == 4: - mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) mymodel = load_model_cpu(sys.argv[3], mymodel) mymodel.apply(load_fixing) @@ -46,7 +47,7 @@ def load_fixing(module): else: models = [] for modelf in sys.argv[3:]: - tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) diff --git a/scripts/doc/para/mktest.sh b/scripts/doc/para/mktest.sh index a876eec..ec9375f 100644 --- a/scripts/doc/para/mktest.sh +++ b/scripts/doc/para/mktest.sh @@ -3,7 +3,8 @@ export srcd=w19edoc export srctf=test.en.w19edoc export modelf="expm/w19edoc/checkpoint.t7" -export rsf=w19edoctrs/trans.txt +export rsd=w19edoctrs +export rsf=$rsd/trans.txt export share_vcb=false export cachedir=cache @@ -23,6 +24,8 @@ else export tgt_vcb=$tgtd/tgt.vcb fi +mkdir -p $rsd + python tools/doc/mono/sort.py $srcd/$srctf $tgtd/$srctf.srt python tools/doc/para/mktest.py $tgtd/$srctf.srt $src_vcb $tgtd/test.h5 $ngpu python predict_doc_para.py $tgtd/$bpef.srt $tgt_vcb $modelf diff --git a/scripts/mktest.sh b/scripts/mktest.sh index 8b7bfed..6849dca 100644 --- a/scripts/mktest.sh +++ b/scripts/mktest.sh @@ -1,9 +1,10 @@ #!/bin/bash -export srcd=w14ende +export srcd=wmt14 export srctf=test.tc.en.w14ed32 -export modelf="expm/w14ende/checkpoint.t7" -export rsf=w14trs/trans.txt +export modelf="expm/w14ed32/checkpoint.t7" +export rsd=w14trs +export rsf=$rsd/trans.txt export share_vcb=false export cachedir=cache @@ -23,6 +24,8 @@ else export tgt_vcb=$tgtd/tgt.vcb fi +mkdir -p $rsd + python tools/sorti.py $srcd/$srctf $tgtd/$srctf.srt python tools/mktest.py $tgtd/$srctf.srt $src_vcb $tgtd/test.h5 $ngpu python predict.py $tgtd/$bpef.srt $tgt_vcb $modelf diff --git a/tools/average_model.py b/tools/average_model.py index c900cf8..6dcd2b0 100644 --- a/tools/average_model.py +++ b/tools/average_model.py @@ -1,19 +1,21 @@ #encoding: utf-8 ''' usage: - 
python tools/average_model.py $averaged_model_file.t7 $model1.t7 $ model2.t7 ...
+	python tools/average_model.py $averaged_model_file.h5 $model1.h5 $model2.h5 ...
'''

import sys

import torch

-rsm = torch.load(sys.argv[2], map_location='cpu')
+from utils.h5serial import h5save, h5load
+
+rsm = h5load(sys.argv[2])

nmodel = 1
for modelf in sys.argv[3:]:
-	for basep, mpload in zip(rsm, torch.load(modelf, map_location='cpu')):
+	for basep, mpload in zip(rsm, h5load(modelf)):
		basep.add_(mpload)
	nmodel += 1
@@ -22,4 +24,4 @@
for basep in rsm:
	basep.div_(nmodel)

-torch.save(rsm, sys.argv[1])
+h5save(rsm, sys.argv[1])

diff --git a/tools/check/cnfg b/tools/check/cnfg
new file mode 120000
index 0000000..bcd9a88
--- /dev/null
+++ b/tools/check/cnfg
@@ -0,0 +1 @@
+../../cnfg/
\ No newline at end of file

diff --git a/tools/check/ext_emb.py b/tools/check/ext_emb.py
index 72fed0b..b2a7f90 100644
--- a/tools/check/ext_emb.py
+++ b/tools/check/ext_emb.py
@@ -10,6 +10,7 @@
from utils.fmt.base import ldvocab, reverse_dict
from utils.fmt.base4torch import load_emb_txt
+from utils.h5serial import h5save, h5load

def handle(vcbf, embf, rsf):
@@ -20,7 +21,7 @@ def handle(vcbf, embf, rsf):
	rs = []
	for i in range(nwd):
		rs.append(emb.get(vcb[i], unkemb))
-	torch.save(torch.stack(rs, 0), rsf)
+	h5save(torch.stack(rs, 0), rsf)

if __name__ == "__main__":
	handle(sys.argv[1], sys.argv[2], sys.argv[3])

diff --git a/tools/check/tspeed.py b/tools/check/tspeed.py
index 01f98e6..88254b5 100644
--- a/tools/check/tspeed.py
+++ b/tools/check/tspeed.py
@@ -9,6 +9,7 @@
import h5py

import cnfg.base as cnfg
+from cnfg.ihyp import *

from transformer.NMT import NMT
from transformer.EnsembleNMT import NMT as Ensemble
@@ -30,7 +31,7 @@ def load_fixing(module):
cuda_device = torch.device(cnfg.gpuid)

if len(sys.argv) == 2:
-	mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
+	mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)

	mymodel = load_model_cpu(sys.argv[1], mymodel)
	mymodel.apply(load_fixing)
@@ -38,7 +39,7 @@ def load_fixing(module):
else:
	models = []
	for modelf in sys.argv[1:]:
-		tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)
+		tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)

		tmp = load_model_cpu(modelf, tmp)
		tmp.apply(load_fixing)

diff --git a/tools/clean/cnfg b/tools/clean/cnfg
new file mode 120000
index 0000000..bcd9a88
--- /dev/null
+++ b/tools/clean/cnfg
@@ -0,0 +1 @@
+../../cnfg/
\ No newline at end of file

diff --git a/tools/clean/compress_h5.py b/tools/clean/compress_h5.py
new file mode 100644
index 0000000..f5a2df7
--- /dev/null
+++ b/tools/clean/compress_h5.py
@@ -0,0 +1,12 @@
+#encoding: utf-8
+
+import sys
+
+from utils.h5serial import h5save, h5load
+
+def handle(srcf, rsf):
+
+	h5save(h5load(srcf, restore_list=False), rsf, h5args={"compression": "gzip", "compression_opts": 9, "shuffle":True})
+
+if __name__ == "__main__":
+	handle(sys.argv[1],
sys.argv[-1]) diff --git a/tools/clean/doc/para/cnfg b/tools/clean/doc/para/cnfg new file mode 120000 index 0000000..2f54778 --- /dev/null +++ b/tools/clean/doc/para/cnfg @@ -0,0 +1 @@ +../../../../cnfg/ \ No newline at end of file diff --git a/tools/cnfg b/tools/cnfg new file mode 120000 index 0000000..428c255 --- /dev/null +++ b/tools/cnfg @@ -0,0 +1 @@ +../cnfg/ \ No newline at end of file diff --git a/tools/doc/para/cnfg b/tools/doc/para/cnfg new file mode 120000 index 0000000..a958f86 --- /dev/null +++ b/tools/doc/para/cnfg @@ -0,0 +1 @@ +../../../cnfg/ \ No newline at end of file diff --git a/tools/doc/para/mkiodata.py b/tools/doc/para/mkiodata.py index 65c5c5e..f778aad 100644 --- a/tools/doc/para/mkiodata.py +++ b/tools/doc/para/mkiodata.py @@ -8,7 +8,9 @@ from utils.fmt.base import ldvocab, dict2pairs from utils.fmt.doc.para.dual import batch_padder -def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=512, maxpad=16, maxpart=4, maxtoken=5120, minfreq=False, vsize=False): +from cnfg.ihyp import * + +def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False): vcbi, nwordi = ldvocab(fvocab_i, minfreq, vsize) vcbt, nwordt = ldvocab(fvocab_t, minfreq, vsize) if expand_for_mulgpu: @@ -30,8 +32,8 @@ def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulg src_grp.create_group(_nsentgid) tgt_grp.create_group(_nsentgid) _curid = str(_curd) - src_grp[_nsentgid][_curid] = rid - tgt_grp[_nsentgid][_curid] = rtd + src_grp[_nsentgid].create_dataset(_curid, data=rid, **h5datawargs) + tgt_grp[_nsentgid].create_dataset(_curid, data=rtd, **h5datawargs) curd[nsent] = _curd + 1 sents, ndl = dict2pairs(curd) rsf["nsent"] = numpy.array(sents, dtype = numpy.int32) diff --git a/tools/doc/para/mktest.py b/tools/doc/para/mktest.py index 9876110..86c1fa0 100644 --- a/tools/doc/para/mktest.py +++ b/tools/doc/para/mktest.py @@ -8,7 +8,9 @@ from utils.fmt.base import ldvocab, dict2pairs from utils.fmt.doc.para.single import batch_padder -def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=128, maxpad=16, maxpart=4, maxtoken=2048, minfreq=False, vsize=False): +from cnfg.ihyp import * + +def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False): vcbi, nwordi = ldvocab(fvocab_i, minfreq, vsize) if expand_for_mulgpu: _bsize = bsize * minbsize @@ -25,7 +27,7 @@ def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=128, _curd = curd.get(nsent, 0) if _curd == 0: src_grp.create_group(_nsentgid) - src_grp[_nsentgid][str(_curd)] = rid + src_grp[_nsentgid].create_dataset(str(_curd), data=rid, **h5datawargs) curd[nsent] = _curd + 1 sents, ndl = dict2pairs(curd) rsf["nsent"] = numpy.array(sents, dtype = numpy.int32) diff --git a/tools/lsort/cnfg b/tools/lsort/cnfg new file mode 120000 index 0000000..bcd9a88 --- /dev/null +++ b/tools/lsort/cnfg @@ -0,0 +1 @@ +../../cnfg/ \ No newline at end of file diff --git a/tools/mkiodata.py b/tools/mkiodata.py index 1ee99fd..0d42a76 100644 --- a/tools/mkiodata.py +++ b/tools/mkiodata.py @@ -8,7 +8,9 @@ from utils.fmt.base import ldvocab from utils.fmt.dual import batch_padder -def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, 
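For reference, a hypothetical miniature of the HDF5 layout the `create_dataset(..., **h5datawargs)` calls in `tools/doc/para/mkiodata.py` produce — one group per number of sentences per document, one gzip-compressed dataset per batch (names and shapes are illustrative):

```python
import h5py
import numpy

# h5datawargs mirrors the compression kwargs used throughout this patch.
h5datawargs = {"compression": "gzip", "compression_opts": 9, "shuffle": True}

with h5py.File("cache/train.h5", "w") as rsf:
	src_grp = rsf.create_group("src")
	grp = src_grp.create_group("3")  # batches of 3-sentence documents
	batch = numpy.zeros((2, 3, 8), dtype=numpy.int32)  # illustrative token ids
	grp.create_dataset("0", data=batch, **h5datawargs)
	rsf["nsent"] = numpy.array([3], dtype=numpy.int32)
```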
expand_for_mulgpu=True, bsize=768, maxpad=16, maxpart=4, maxtoken=4352, minfreq=False, vsize=False): +from cnfg.ihyp import * + +def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False): vcbi, nwordi = ldvocab(fvocab_i, minfreq, vsize) vcbt, nwordt = ldvocab(fvocab_t, minfreq, vsize) if expand_for_mulgpu: @@ -26,8 +28,8 @@ def handle(finput, ftarget, fvocab_i, fvocab_t, frs, minbsize=1, expand_for_mulg rtd = numpy.array(td, dtype = numpy.int32) #rld = numpy.array(ld, dtype = numpy.int32) wid = str(curd) - src_grp[wid] = rid - tgt_grp[wid] = rtd + src_grp.create_dataset(wid, data=rid, **h5datawargs) + tgt_grp.create_dataset(wid, data=rtd, **h5datawargs) #rsf["l" + wid] = rld curd += 1 rsf["ndata"] = numpy.array([curd], dtype = numpy.int32) diff --git a/tools/mktest.py b/tools/mktest.py index 2206fa9..5f7d45b 100644 --- a/tools/mktest.py +++ b/tools/mktest.py @@ -8,8 +8,11 @@ from utils.fmt.base import ldvocab from utils.fmt.single import batch_padder +from cnfg.ihyp import * + # maxtoken should be the maxtoken in mkiodata.py / 2 / beam size roughly, similar for bsize -def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=768, maxpad=16, maxpart=4, maxtoken=4352, minfreq=False, vsize=False): + +def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=max_sentences_gpu, maxpad=max_pad_tokens_sentence, maxpart=normal_tokens_vs_pad_tokens, maxtoken=max_tokens_gpu, minfreq=False, vsize=False): vcbi, nwordi = ldvocab(fvocab_i, minfreq, vsize) if expand_for_mulgpu: _bsize = bsize * minbsize @@ -24,7 +27,7 @@ def handle(finput, fvocab_i, frs, minbsize=1, expand_for_mulgpu=True, bsize=768, rid = numpy.array(i_d, dtype = numpy.int32) #rld = numpy.array(ld, dtype = numpy.int32) wid = str(curd) - src_grp[wid] = rid + src_grp.create_dataset(wid, data=rid, **h5datawargs) #rsf["l" + wid] = rld curd += 1 rsf["ndata"] = numpy.array([curd], dtype = numpy.int32) diff --git a/tools/share_vocab.py b/tools/share_vocab.py index ca957a5..a1705ff 100644 --- a/tools/share_vocab.py +++ b/tools/share_vocab.py @@ -4,7 +4,7 @@ from utils.fmt.base import clean_list_iter -def handle(srcfl, rsf, vsize=32764): +def handle(srcfl, rsf, vsize=65532): vocab = {} diff --git a/tools/sorti.py b/tools/sorti.py index 640034f..85d8724 100644 --- a/tools/sorti.py +++ b/tools/sorti.py @@ -4,20 +4,23 @@ from utils.fmt.base import clean_liststr_lentok, iter_dict_sort -def handle(srcfs, tgtfs): +def handle(srcfs, tgtfs, max_len=1048576): data = {} + _max_len = max(1, max_len - 2) + with open(srcfs, "rb") as fs: for ls in fs: ls = ls.strip() if ls: ls, lgth = clean_liststr_lentok(ls.decode("utf-8").split()) - if lgth in data: - if ls not in data[lgth]: - data[lgth].add(ls) - else: - data[lgth] = set([ls]) + if lgth <= _max_len: + if lgth in data: + if ls not in data[lgth]: + data[lgth].add(ls) + else: + data[lgth] = set([ls]) ens = "\n".encode("utf-8") @@ -27,4 +30,4 @@ def handle(srcfs, tgtfs): fs.write(ens) if __name__ == "__main__": - handle(sys.argv[1], sys.argv[2]) + handle(sys.argv[1], sys.argv[2]) if len(sys.argv) == 3 else handle(sys.argv[1], sys.argv[2], int(sys.argv[-1])) diff --git a/tools/vocab.py b/tools/vocab.py index 942cd89..103ec3b 100644 --- a/tools/vocab.py +++ b/tools/vocab.py @@ -4,7 +4,7 @@ from utils.fmt.base import clean_list_iter -def handle(srcf, rsf, vsize=32764): +def handle(srcf, rsf, 
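The sorting tool above buckets unique sentences by token count and now drops anything longer than `max_len - 2` (two positions presumably reserved for start/end tokens). A simplified in-memory sketch of that logic — the real script works on bytes and reuses `utils.fmt.base` helpers:

```python
# Deduplicate lines and bucket them by token count, skipping over-long input;
# yields the surviving sentences in ascending length order.
def bucket_unique(lines, max_len=1048576):
	_max_len = max(1, max_len - 2)
	data = {}
	for ls in lines:
		tokens = ls.split()
		lgth = len(tokens)
		if 0 < lgth <= _max_len:
			data.setdefault(lgth, set()).add(" ".join(tokens))
	for lgth in sorted(data):
		yield from data[lgth]
```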
vsize=65532): vocab = {} @@ -41,4 +41,4 @@ def handle(srcf, rsf, vsize=32764): break if __name__ == "__main__": - handle(sys.argv[1], sys.argv[2], int(sys.argv[3])) + handle(sys.argv[1], sys.argv[2]) if len(sys.argv) == 3 else handle(sys.argv[1], sys.argv[2], int(sys.argv[-1])) diff --git a/train.py b/train.py index 84c5ac3..f74a310 100644 --- a/train.py +++ b/train.py @@ -11,14 +11,14 @@ from parallel.parallelMT import DataParallelMT from utils.base import * +from utils.h5serial import h5save, h5load from utils.fmt.base import tostr, save_states, load_states from utils.fmt.base4torch import parse_cuda, load_emb from lrsch import GoogleLR -from loss import LabelSmoothingLoss +from loss.base import LabelSmoothingLoss from random import shuffle -from math import inf from tqdm import tqdm @@ -28,6 +28,7 @@ import h5py import cnfg.base as cnfg +from cnfg.ihyp import * from transformer.NMT import NMT @@ -90,7 +91,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok if _cur_rstep is not None: if save_checkp_epoch and (save_every is not None) and (_cur_rstep % save_every == 0) and (chkpf is not None) and (_cur_rstep > 0): if num_checkpoint > 1: - _fend = "_%d.t7" % (_cur_checkid) + _fend = "_%d.h5" % (_cur_checkid) _chkpf = chkpf[:-3] + _fend if chkpof is not None: _chkpof = chkpof[:-3] + _fend @@ -100,7 +101,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok _chkpof = chkpof save_model(model, _chkpf, multi_gpu, logger) if chkpof is not None: - torch.save(optm.state_dict(), _chkpof) + h5save(optm.state_dict(), _chkpof) if statesf is not None: save_states(statesf, tl[cur_b - 1:]) _cur_rstep -= 1 @@ -124,7 +125,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok if save_checkp_epoch and (_cur_rstep is None) and (save_every is not None) and (cur_b % save_every == 0) and (chkpf is not None) and (cur_b < ndata): if num_checkpoint > 1: - _fend = "_%d.t7" % (_cur_checkid) + _fend = "_%d.h5" % (_cur_checkid) _chkpf = chkpf[:-3] + _fend if chkpof is not None: _chkpof = chkpof[:-3] + _fend @@ -135,7 +136,7 @@ def train(td, tl, ed, nd, optm, lrsch, model, lossf, mv_device, logger, done_tok #save_model(model, _chkpf, isinstance(model, nn.DataParallel), logger) save_model(model, _chkpf, multi_gpu, logger) if chkpof is not None: - torch.save(optm.state_dict(), _chkpof) + h5save(optm.state_dict(), _chkpof) if statesf is not None: save_states(statesf, tl[cur_b - 1:]) cur_b += 1 @@ -163,9 +164,9 @@ def eva(ed, nd, model, lossf, mv_device, multi_gpu): loss = lossf(output, ot) if multi_gpu: loss = loss.sum() - trans = torch.cat([torch.argmax(outu, -1).to(mv_device) for outu in output], 0) + trans = torch.cat([outu.argmax(-1).to(mv_device) for outu in output], 0) else: - trans = torch.argmax(output, -1) + trans = output.argmax(-1) sum_loss += loss.data.item() data_mask = ot.ne(0) correct = (trans.eq(ot) & data_mask).int() @@ -218,9 +219,9 @@ def init_fixing(module): chkpof = None statesf = None if save_every is not None: - chkpf = wkdir + "checkpoint.t7" + chkpf = wkdir + "checkpoint.h5" if save_optm_state: - chkpof = wkdir + "checkpoint.optm.t7" + chkpof = wkdir + "checkpoint.optm.h5" if cnfg.save_train_state: statesf = wkdir + "checkpoint.states" @@ -249,7 +250,7 @@ def init_fixing(module): nwordi, nwordt = nword[0], nword[-1] logger.info("Design models with seed: %d" % torch.initial_seed()) -mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, 
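The accuracy computed in `eva` ignores padding positions; isolated as a function, the counting step looks like this (a sketch — `masked_accuracy` is not a name from this patch):

```python
import torch

# Padding (token id 0) is masked out before comparing greedy predictions
# against the reference, exactly as in the data_mask/correct lines of eva().
def masked_accuracy(trans, ot, pad_id=0):
	data_mask = ot.ne(pad_id)
	correct = (trans.eq(ot) & data_mask).int().sum().item()
	total = data_mask.int().sum().item()
	return correct, total
```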
cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) +mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) fine_tune_m = cnfg.fine_tune_m @@ -278,7 +279,7 @@ def init_fixing(module): lossf.to(cuda_device) # lr will be over written by GoogleLR before used -optimizer = optim.Adam(mymodel.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9, weight_decay=cnfg.weight_decay, amsgrad=use_ams) +optimizer = optim.Adam(mymodel.parameters(), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams) optimizer.zero_grad() if use_amp: @@ -292,7 +293,7 @@ def init_fixing(module): fine_tune_state = cnfg.fine_tune_state if fine_tune_state is not None: logger.info("Load optimizer state from: " + fine_tune_state) - optimizer.load_state_dict(torch.load(fine_tune_state)) + optimizer.load_state_dict(h5load(fine_tune_state)) lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale) #lrsch.step() @@ -300,13 +301,13 @@ def init_fixing(module): num_checkpoint = cnfg.num_checkpoint cur_checkid = 0 -tminerr = inf +tminerr = inf_default minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu) logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr)))) if fine_tune_m is None: - save_model(mymodel, wkdir + "init.t7", multi_gpu, logger) + save_model(mymodel, wkdir + "init.h5", multi_gpu, logger) logger.info("Initial model saved") else: cnt_states = cnfg.train_statesf @@ -315,9 +316,9 @@ def init_fixing(module): tminerr, done_tokens, cur_checkid, remain_steps, _ = train(td, load_states(cnt_states), vd, nvalid, optimizer, lrsch, mymodel, lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm, batch_report, save_every, chkpf, chkpof, statesf, num_checkpoint, cur_checkid, report_eva, remain_steps, False, False, use_amp) vloss, vprec = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu) logger.info("Epoch: 0, train loss: %.3f, valid loss/error: %.3f %.2f" % (tminerr, vloss, vprec)) - save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.t7" % (tminerr, vloss, vprec), multi_gpu, logger) + save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.h5" % (tminerr, vloss, vprec), multi_gpu, logger) if save_optm_state: - torch.save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.t7" % (tminerr, vloss, vprec)) + h5save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.h5" % (tminerr, vloss, vprec)) logger.info("New best model saved") if cnfg.dss_ws is not None and cnfg.dss_ws > 0.0 and cnfg.dss_ws < 1.0: @@ -344,9 +345,9 @@ def init_fixing(module): logger.info("Epoch: %d, train loss: %.3f, valid loss/error: %.3f %.2f" % (i, terr, vloss, vprec)) if (vprec <= minerr) or (vloss <= minloss): - save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger) + save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger) if save_optm_state: - torch.save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec)) + h5save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec)) logger.info("New best model saved") namin = 0 @@ -359,11 +360,11 @@ def init_fixing(module): else: if terr < tminerr: tminerr = 
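The literal learning rate passed to Adam no longer matters because `GoogleLR` overwrites it before the first update; presumably it implements the inverse-square-root warm-up schedule from Attention Is All You Need:

```python
# The schedule GoogleLR presumably applies (step counting starts at 1):
def noam_lr(step, isize, warm_step, scale=1.0):
	return scale * isize ** -0.5 * min(step ** -0.5, step * warm_step ** -1.5)

# e.g. noam_lr(8000, 512, 8000) ~= 4.9e-4 at the warm-up peak, decaying afterwards
```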
terr - save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger) + save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger) if save_optm_state: - torch.save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec)) + h5save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.h5" % (i, terr, vloss, vprec)) elif epoch_save: - save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu, logger) + save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.h5" % (i, terr, vloss, vprec), multi_gpu, logger) namin += 1 if namin >= earlystop: @@ -405,9 +406,9 @@ def init_fixing(module): #done_tokens = 0 #optimizer.zero_grad() -save_model(mymodel, wkdir + "last.t7", multi_gpu, logger) +save_model(mymodel, wkdir + "last.h5", multi_gpu, logger) if save_optm_state: - torch.save(optimizer.state_dict(), wkdir + "last.optm.t7") + h5save(optimizer.state_dict(), wkdir + "last.optm.h5") logger.info("model saved") td.close() diff --git a/transformer/AGG/HierDecoder.py b/transformer/AGG/HierDecoder.py index 963ec70..7f6ce00 100644 --- a/transformer/AGG/HierDecoder.py +++ b/transformer/AGG/HierDecoder.py @@ -7,6 +7,8 @@ from transformer.Decoder import DecoderLayer as DecoderLayerBase from transformer.Decoder import Decoder as DecoderBase +from cnfg.ihyp import * + class DecoderLayer(nn.Module): # isize: input size @@ -79,7 +81,7 @@ class Decoder(DecoderBase): # ahsize: number of hidden units for MultiHeadAttention # bindemb: bind embedding and classifier weight - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=False, bindemb=False, forbidden_index=None, num_sub=1): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=False, bindemb=False, forbidden_index=None, num_sub=1): _ahsize = isize if ahsize is None else ahsize diff --git a/transformer/AGG/HierEncoder.py b/transformer/AGG/HierEncoder.py index f4bc172..caaeea1 100644 --- a/transformer/AGG/HierEncoder.py +++ b/transformer/AGG/HierEncoder.py @@ -14,6 +14,8 @@ # ... 
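Optimizer state now round-trips through `utils.h5serial` instead of `torch.save`/`torch.load`; a minimal self-contained example (model and paths are illustrative):

```python
import torch
from torch import optim

from utils.h5serial import h5save, h5load

# Same round trip train.py performs for checkpoint.optm.h5 / last.optm.h5.
model = torch.nn.Linear(4, 4)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
h5save(optimizer.state_dict(), "expm/debug/last.optm.h5")
optimizer.load_state_dict(h5load("expm/debug/last.optm.h5"))
```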
 # for the classifier of the decoder, is omitted

+from cnfg.ihyp import *
+
 class EncoderLayer(nn.Module):

	# isize: input size
@@ -59,7 +61,7 @@ class Encoder(EncoderBase):
	# xseql: maximum length of sequence
	# ahsize: number of hidden units for MultiHeadAttention

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=False, num_sub=1):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=False, num_sub=1):

		_ahsize = isize if ahsize is None else ahsize
diff --git a/transformer/AGG/InceptDecoder.py b/transformer/AGG/InceptDecoder.py
index 1c579d1..f4fad6b 100644
--- a/transformer/AGG/InceptDecoder.py
+++ b/transformer/AGG/InceptDecoder.py
@@ -7,6 +7,8 @@
 from transformer.Decoder import DecoderLayer as DecoderLayerBase
 from transformer.Decoder import Decoder as DecoderBase

+from cnfg.ihyp import *
+
 class DecoderLayer(nn.Module):

	# isize: input size
@@ -73,7 +75,7 @@ class Decoder(DecoderBase):
	# ahsize: number of hidden units for MultiHeadAttention
	# bindemb: bind embedding and classifier weight

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=False, bindemb=False, forbidden_index=None, num_sub=1):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=False, bindemb=False, forbidden_index=None, num_sub=1):

		_ahsize = isize if ahsize is None else ahsize
diff --git a/transformer/AGG/InceptEncoder.py b/transformer/AGG/InceptEncoder.py
index 6b2d407..11fcbeb 100644
--- a/transformer/AGG/InceptEncoder.py
+++ b/transformer/AGG/InceptEncoder.py
@@ -14,6 +14,8 @@
 # ...
 # for the classifier of the decoder, is omitted

+from cnfg.ihyp import *
+
 class EncoderLayer(nn.Module):

	# isize: input size
@@ -57,7 +59,7 @@ class Encoder(EncoderBase):
	# xseql: maximum length of sequence
	# ahsize: number of hidden units for MultiHeadAttention

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=False, num_sub=1):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=False, num_sub=1):

		_ahsize = isize if ahsize is None else ahsize
diff --git a/transformer/AvgDecoder.py b/transformer/AvgDecoder.py
index 85c0b99..a49f00a 100644
--- a/transformer/AvgDecoder.py
+++ b/transformer/AvgDecoder.py
@@ -11,6 +11,8 @@

 # Average Decoder is proposed in Accelerating Neural Transformer via an Average Attention Network (https://arxiv.org/abs/1805.00631)

+from cnfg.ihyp import *
+
 class DecoderLayer(DecoderLayerBase):

	# isize: input size
@@ -47,7 +49,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, query_unit=None, step=1):
			if self.drop is not None:
				context = self.drop(context)

-			context = context + (_inputo if self.norm_residue else inputo)
+			context = context + (_inputo if self.norm_residual else inputo)

		else:

			_query_unit = self.layer_normer1(query_unit)
@@ -63,7 +65,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, query_unit=None, step=1):
			if self.drop is not None:
				context = self.drop(context)

-			context = context + (_query_unit if self.norm_residue else query_unit)
+			context = context + (_query_unit if self.norm_residual else query_unit)

		_context = self.layer_normer2(context)
		_context_new = self.cross_attn(_context, inpute, mask=src_pad_mask)
@@ -71,7 +73,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, query_unit=None, step=1):
		if self.drop is not None:
			_context_new = self.drop(_context_new)

-		context = _context_new + (_context if self.norm_residue else context)
+		context = _context_new + (_context if self.norm_residual else context)

		context = self.ff(context)
@@ -95,7 +97,7 @@ class Decoder(DecoderBase):
	# ahsize: number of hidden units for MultiHeadAttention
	# bindemb: bind embedding and classifier weight

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None):

		_ahsize = isize if ahsize is None else ahsize
		_fhsize = _ahsize * 4 if fhsize is None else fhsize
@@ -115,7 +117,9 @@ def forward(self, inpute, inputo, src_pad_mask=None):

		out = self.wemb(inputo)

-		out = out * sqrt(out.size(-1)) + self.pemb(inputo, expand=False)
+		out = out * sqrt(out.size(-1))
+		if self.pemb is not None:
+			out = out + self.pemb(inputo, expand=False)

		if self.drop is not None:
			out = self.drop(out)
@@ -163,7 +167,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False):

		# out: input to the decoder for the first step (bsize, 1, isize)

-		out = sos_emb * sqrt_isize + self.pemb.get_pos(0)
+		out = sos_emb * sqrt_isize
+		if self.pemb is not None:
+			out = out + self.pemb.get_pos(0)

		if self.drop is not None:
			out = self.drop(out)
@@ -193,7 +199,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False):

		for i in range(2, max_len + 1):

-			out =
self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i - 1) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i - 1) if self.drop is not None: out = self.drop(out) @@ -240,7 +248,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -287,7 +297,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for step in range(2, max_len + 1): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step - 1) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step - 1) if self.drop is not None: out = self.drop(out) diff --git a/transformer/Decoder.py b/transformer/Decoder.py index de99e91..d380c76 100644 --- a/transformer/Decoder.py +++ b/transformer/Decoder.py @@ -4,10 +4,12 @@ from torch import nn from modules.base import * from utils.base import repeat_bsize_for_beam_tensor, mask_tensor_type -from math import sqrt, inf +from math import sqrt from utils.fmt.base import pad_id +from cnfg.ihyp import * + class DecoderLayer(nn.Module): # isize: input size @@ -15,26 +17,27 @@ class DecoderLayer(nn.Module): # attn_drop: dropout for MultiHeadAttention # num_head: number of heads in MultiHeadAttention # ahsize: hidden size of MultiHeadAttention - # norm_residue: residue with layer normalized representation + # norm_residual: residue with layer normalized representation + # k_rel_pos: window size (one side) of relative positional embeddings in self attention - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=True): + def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residual=norm_residual_default, k_rel_pos=use_k_relative_position_decoder): super(DecoderLayer, self).__init__() _ahsize = isize if ahsize is None else ahsize _fhsize = _ahsize * 4 if fhsize is None else fhsize - self.self_attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) + self.self_attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop, k_rel_pos=k_rel_pos) self.cross_attn = CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) - self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residue) + self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residual) - self.layer_normer1 = nn.LayerNorm(isize, eps=1e-06) - self.layer_normer2 = nn.LayerNorm(isize, eps=1e-06) + self.layer_normer1 = nn.LayerNorm(isize, eps=ieps_ln_default) + self.layer_normer2 = nn.LayerNorm(isize, eps=ieps_ln_default) self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None - self.norm_residue = norm_residue + self.norm_residual = norm_residual # inpute: encoded representation from encoder (bsize, seql, isize) # inputo: embedding of decoded translation (bsize, nquery, isize) @@ -55,7 +58,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, tgt_pad_mask=None, query_un if self.drop is not None: context = self.drop(context) - context = context + (_inputo if self.norm_residue else inputo) + context = context + (_inputo if self.norm_residual else inputo) else: _query_unit = self.layer_normer1(query_unit) @@ -69,7 +72,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, tgt_pad_mask=None, query_un if self.drop 
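The `norm_residue` → `norm_residual` rename concerns which tensor feeds the residual branch in these pre-norm layers; a minimal sketch of the switch (a hypothetical class, not part of the patch):

```python
import torch
from torch import nn

# With pre-norm sublayers, the residual can start from the layer-normalized
# input (norm_residual=True) or from the raw input (False) — the choice the
# EncoderLayer/DecoderLayer constructors now expose via a named default.
class PreNormResidual(nn.Module):

	def __init__(self, isize, sublayer, norm_residual=True):
		super().__init__()
		self.net = sublayer
		self.ln = nn.LayerNorm(isize, eps=1e-06)
		self.norm_residual = norm_residual

	def forward(self, x):
		_x = self.ln(x)
		return self.net(_x) + (_x if self.norm_residual else x)
```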
is not None:
				context = self.drop(context)

-			context = context + (_query_unit if self.norm_residue else query_unit)
+			context = context + (_query_unit if self.norm_residual else query_unit)

		_context = self.layer_normer2(context)
		_context_new = self.cross_attn(_context, inpute, mask=src_pad_mask)
@@ -77,7 +80,7 @@ def forward(self, inpute, inputo, src_pad_mask=None, tgt_pad_mask=None, query_un
		if self.drop is not None:
			_context_new = self.drop(_context_new)

-		context = _context_new + (_context if self.norm_residue else context)
+		context = _context_new + (_context if self.norm_residual else context)

		context = self.ff(context)
@@ -99,8 +102,9 @@ class Decoder(nn.Module):
	# ahsize: number of hidden units for MultiHeadAttention
	# bindemb: bind embedding and classifier weight
	# share_layer: using one shared decoder layer
+	# disable_pemb: disable the standard positional embedding; can be enabled when using relative positional embeddings in self attention or the AAN

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=True, forbidden_index=None, share_layer=False):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=True, forbidden_index=None, share_layer=False, disable_pemb=disable_std_pemb_decoder):

		super(Decoder, self).__init__()
@@ -116,7 +120,7 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.
		if emb_w is not None:
			self.wemb.weight = emb_w

-		self.pemb = PositionalEmb(isize, xseql, 0, 0)
+		self.pemb = None if disable_pemb else PositionalEmb(isize, xseql, 0, 0)
		if share_layer:
			_shared_layer = DecoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
			self.nets = nn.ModuleList([_shared_layer for i in range(num_layer)])
@@ -130,7 +134,7 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.
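The `disable_pemb` comment refers to this mechanism: once `SelfAttn` receives windowed relative positional embeddings (`k_rel_pos`), the absolute sinusoidal table becomes optional. A sketch of the usual clipped-offset indexing in the style of Shaw et al. (2018) — the actual `modules.base` implementation may differ in detail:

```python
import torch

# Clip every query-key offset to [-k, k] and map it to one of 2k+1 learned
# embedding slots; this is the window that k_rel_pos bounds.
def relative_position_index(qlen, klen, k):
	rel = torch.arange(klen).unsqueeze(0) - torch.arange(qlen).unsqueeze(1)
	return rel.clamp(min=-k, max=k) + k  # (qlen, klen), values in [0, 2k]
```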
self.lsm = nn.LogSoftmax(-1) - self.out_normer = nn.LayerNorm(isize, eps=1e-06) if norm_output else None + self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None self.fbl = None if forbidden_index is None else tuple(set(forbidden_index)) @@ -145,7 +149,9 @@ def forward(self, inpute, inputo, src_pad_mask=None): out = self.wemb(inputo) - out = out * sqrt(out.size(-1)) + self.pemb(inputo, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputo, expand=False) if self.drop is not None: out = self.drop(out) @@ -213,7 +219,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): # out: input to the decoder for the first step (bsize, 1, isize) - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -243,7 +251,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): for i in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i) if self.drop is not None: out = self.drop(out) @@ -290,7 +300,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -337,7 +349,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for step in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step) if self.drop is not None: out = self.drop(out) @@ -453,7 +467,7 @@ def fix_load(self): if self.fbl is not None: with torch.no_grad(): - self.classifier.bias.index_fill_(0, torch.tensor(self.fbl, dtype=torch.long, device=self.classifier.bias.device), -inf) + self.classifier.bias.index_fill_(0, torch.tensor(self.fbl, dtype=torch.long, device=self.classifier.bias.device), -inf_default) def unbind_classifier_weight(self): @@ -489,7 +503,9 @@ def greedy_decode_clip(self, inpute, src_pad_mask=None, max_len=512, return_mat= # out: input to the decoder for the first step (bsize, 1, isize) - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -518,7 +534,9 @@ def greedy_decode_clip(self, inpute, src_pad_mask=None, max_len=512, return_mat= for i in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i) if self.drop is not None: out = self.drop(out) @@ -590,7 +608,9 @@ def beam_decode_clip(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -642,7 +662,9 @@ def beam_decode_clip(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, for step in range(1, max_len): - out = 
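`fix_load` re-pins the classifier bias of forbidden vocabulary entries to `-inf` after loading, so softmax can never select them; the same operation in isolation (sizes and indexes are illustrative):

```python
import torch
from torch import nn

# Forbidden entries (e.g. <pad>) get a -inf bias, making their probability
# exactly zero under softmax regardless of the loaded weights.
classifier = nn.Linear(512, 32768)
fbl = (0, 3)  # illustrative forbidden indexes
with torch.no_grad():
	classifier.bias.index_fill_(0, torch.tensor(fbl, dtype=torch.long), -float("inf"))
```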
self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step) if self.drop is not None: out = self.drop(out) diff --git a/transformer/Doc/Para/Base/Decoder.py b/transformer/Doc/Para/Base/Decoder.py index e93fd87..a5cd865 100644 --- a/transformer/Doc/Para/Base/Decoder.py +++ b/transformer/Doc/Para/Base/Decoder.py @@ -10,6 +10,8 @@ from transformer.Decoder import DecoderLayer as DecoderLayerBase from transformer.Decoder import Decoder as DecoderBase +from cnfg.ihyp import * + class DecoderLayer(DecoderLayerBase): def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, ncross=2): @@ -19,7 +21,7 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a super(DecoderLayer, self).__init__(isize, fhsize, dropout, attn_drop, num_head, _ahsize) self.cattns = nn.ModuleList([CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) for i in range(ncross)]) - self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=1e-06) for i in range(ncross)]) + self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default) for i in range(ncross)]) self.grs = nn.ModuleList([GateResidual(isize) for i in range(ncross)]) def forward(self, inpute, inputo, inputc, src_pad_mask=None, tgt_pad_mask=None, context_mask=None, query_unit=None): @@ -34,7 +36,7 @@ def forward(self, inpute, inputo, inputc, src_pad_mask=None, tgt_pad_mask=None, if self.drop is not None: context = self.drop(context) - context = context + (_inputo if self.norm_residue else inputo) + context = context + (_inputo if self.norm_residual else inputo) else: _query_unit = self.layer_normer1(query_unit) @@ -48,7 +50,7 @@ def forward(self, inpute, inputo, inputc, src_pad_mask=None, tgt_pad_mask=None, if self.drop is not None: context = self.drop(context) - context = context + (_query_unit if self.norm_residue else query_unit) + context = context + (_query_unit if self.norm_residual else query_unit) _context = self.layer_normer2(context) _context_new = self.cross_attn(_context, inpute, mask=src_pad_mask) @@ -56,14 +58,14 @@ def forward(self, inpute, inputo, inputc, src_pad_mask=None, tgt_pad_mask=None, if self.drop is not None: _context_new = self.drop(_context_new) - context = _context_new + (_context if self.norm_residue else context) + context = _context_new + (_context if self.norm_residual else context) for _ln, _cattn, _gr, _inputc, _maskc in zip(self.cattn_ln, self.cattns, self.grs, inputc, [None for i in range(len(inputc))] if context_mask is None else context_mask): _inputs = _ln(context) _context = _cattn(_inputs, _inputc, mask=_maskc) if self.drop is not None: _context = self.drop(_context) - context = _gr(_context, (_inputs if self.norm_residue else context)) + context = _gr(_context, (_inputs if self.norm_residual else context)) context = self.ff(context) @@ -83,7 +85,7 @@ def load_base(self, base_decoder_layer): class Decoder(DecoderBase): - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=True, forbidden_index=None, nprev_context=2): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=True, forbidden_index=None, nprev_context=2): _ahsize = isize if ahsize is None else ahsize @@ -100,7 +102,9 @@ def forward(self, inpute, inputo, inputc, src_pad_mask=None, 
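`GateResidual` merges each cross-attended document context into the residual stream; below is a hypothetical stand-in that only illustrates the `_gr(_context, residual)` call pattern used in these layers — the shipped `modules.base.GateResidual` may be implemented differently:

```python
import torch
from torch import nn

# A learned sigmoid gate decides how much of the new context enters the
# residual stream versus how much of the previous state is kept.
class GateResidualSketch(nn.Module):

	def __init__(self, isize):
		super().__init__()
		self.gate = nn.Sequential(nn.Linear(isize * 2, isize), nn.Sigmoid())

	def forward(self, new, residual):
		g = self.gate(torch.cat((new, residual), dim=-1))
		return g * new + (1.0 - g) * residual
```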
context_mask=None): out = self.wemb(_inputo) isize = out.size(-1) - out = out * sqrt(isize) + self.pemb(_inputo, expand=False) + out = out * sqrt(isize) + if self.pemb is not None: + out = out + self.pemb(_inputo, expand=False) if self.drop is not None: out = self.drop(out) @@ -147,7 +151,9 @@ def greedy_decode(self, inpute, inputc, src_pad_mask=None, context_mask=None, ma sqrt_isize = sqrt(sos_emb.size(-1)) - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -171,7 +177,9 @@ def greedy_decode(self, inpute, inputc, src_pad_mask=None, context_mask=None, ma for i in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i) if self.drop is not None: out = self.drop(out) @@ -210,7 +218,9 @@ def beam_decode(self, inpute, inputc, src_pad_mask=None, context_mask=None, beam lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -248,7 +258,9 @@ def beam_decode(self, inpute, inputc, src_pad_mask=None, context_mask=None, beam for step in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step) if self.drop is not None: out = self.drop(out) diff --git a/transformer/Doc/Para/Base/Encoder.py b/transformer/Doc/Para/Base/Encoder.py index 12cf44a..938519a 100644 --- a/transformer/Doc/Para/Base/Encoder.py +++ b/transformer/Doc/Para/Base/Encoder.py @@ -11,6 +11,8 @@ from transformer.Encoder import EncoderLayer as EncoderLayerBase from transformer.Encoder import Encoder as EncoderBase +from cnfg.ihyp import * + class CrossEncoderLayer(EncoderLayerBase): def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, ncross=2): @@ -20,7 +22,7 @@ def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, a super(CrossEncoderLayer, self).__init__(isize, fhsize, dropout, attn_drop, num_head, _ahsize) self.cattns = nn.ModuleList([CrossAttn(isize, _ahsize, isize, num_head, dropout=attn_drop) for i in range(ncross)]) - self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=1e-06) for i in range(ncross)]) + self.cattn_ln = nn.ModuleList([nn.LayerNorm(isize, eps=ieps_ln_default) for i in range(ncross)]) self.grs = nn.ModuleList([GateResidual(isize) for i in range(ncross)]) def forward(self, inputs, inputc, mask=None, context_mask=None): @@ -31,14 +33,14 @@ def forward(self, inputs, inputc, mask=None, context_mask=None): if self.drop is not None: context = self.drop(context) - context = context + (_inputs if self.norm_residue else inputs) + context = context + (_inputs if self.norm_residual else inputs) for _ln, _cattn, _gr, _inputc, _maskc in zip(self.cattn_ln, self.cattns, self.grs, inputc, [None for i in range(len(inputc))] if context_mask is None else context_mask): _inputs = _ln(context) _context = _cattn(_inputs, _inputc, mask=_maskc) if self.drop is not None: _context = self.drop(_context) - context = _gr(_context, (_inputs if self.norm_residue else context)) + context = _gr(_context, (_inputs if self.norm_residual else context)) context = self.ff(context) @@ -53,7 +55,7 @@ def 
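`lpv_base = 6.0 ** length_penalty` presumably precomputes the denominator of the GNMT length penalty, lp(Y) = ((5 + |Y|) / 6) ** alpha, so the beam can update scores incrementally; written directly for checking:

```python
# GNMT-style length normalization: beam scores are divided by lp(ylen),
# favoring longer hypotheses when alpha > 0.
def gnmt_length_penalty(ylen, alpha=0.6):
	return ((5.0 + ylen) / 6.0) ** alpha
```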
load_base(self, base_encoder_layer): class CrossEncoder(EncoderBase): - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, nprev_context=2): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, nprev_context=2): _ahsize = isize if ahsize is None else ahsize @@ -66,7 +68,9 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0. def forward(self, inputs, inputc, mask=None, context_mask=None): out = self.wemb(inputs) - out = out * sqrt(out.size(-1)) + self.pemb(inputs, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputs, expand=False) if self.drop is not None: out = self.drop(out) @@ -91,7 +95,7 @@ def load_base(self, base_encoder): class Encoder(nn.Module): - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, nprev_context=2, num_layer_context=1): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, nprev_context=2, num_layer_context=1): super(Encoder, self).__init__() diff --git a/transformer/Doc/Para/Base/NMT.py b/transformer/Doc/Para/Base/NMT.py index ed30177..408277f 100644 --- a/transformer/Doc/Para/Base/NMT.py +++ b/transformer/Doc/Para/Base/NMT.py @@ -2,21 +2,20 @@ from torch import nn -from numbers import Integral +from utils.fmt.base import parse_double_value_tuple from transformer.Doc.Para.Base.Encoder import Encoder from transformer.Doc.Para.Base.Decoder import Decoder +from cnfg.ihyp import * + class NMT(nn.Module): - def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=512, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None, nprev_context=2, num_layer_context=1): + def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None, nprev_context=2, num_layer_context=1): super(NMT, self).__init__() - if isinstance(num_layer, Integral): - enc_layer = dec_layer = num_layer - else: - enc_layer, dec_layer = num_layer + enc_layer, dec_layer = parse_double_value_tuple(num_layer) self.enc = Encoder(isize, snwd, enc_layer, fhsize, dropout, attn_drop, num_head, xseql, ahsize, norm_output, nprev_context, num_layer_context) diff --git a/transformer/Encoder.py b/transformer/Encoder.py index 2aa8658..9c32956 100644 --- a/transformer/Encoder.py +++ b/transformer/Encoder.py @@ -7,6 +7,8 @@ from utils.fmt.base import pad_id +from cnfg.ihyp import * + # vocabulary: # :0 # :1 @@ -22,24 +24,25 @@ class EncoderLayer(nn.Module): # attn_drop: dropout for MultiHeadAttention # num_head: number of heads in MultiHeadAttention # ahsize: hidden size of MultiHeadAttention - # norm_residue: residue with layer normalized representation + # norm_residual: residue with layer normalized representation + # k_rel_pos: window size (one side) of relative positional embeddings in self attention - def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, norm_residue=True): + def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None, 
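`parse_double_value_tuple` replaces the repeated `Integral` check in the NMT constructors; a sketch of its plausible behavior, inferred from the branching it supersedes:

```python
from numbers import Integral

# One value configures both encoder and decoder depth; a pair configures
# them separately — mirroring the removed isinstance(num_layer, Integral) code.
def parse_double_value_tuple(v):
	return (v, v) if isinstance(v, Integral) else (v[0], v[-1])

enc_layer, dec_layer = parse_double_value_tuple(6)       # (6, 6)
enc_layer, dec_layer = parse_double_value_tuple((8, 4))  # (8, 4)
```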
norm_residual=norm_residual_default, k_rel_pos=use_k_relative_position_encoder):

		super(EncoderLayer, self).__init__()

		_ahsize = isize if ahsize is None else ahsize
		_fhsize = _ahsize * 4 if fhsize is None else fhsize

-		self.attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop)
+		self.attn = SelfAttn(isize, _ahsize, isize, num_head, dropout=attn_drop, k_rel_pos=k_rel_pos)

-		self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residue)
+		self.ff = PositionwiseFF(isize, _fhsize, dropout, norm_residual)

-		self.layer_normer = nn.LayerNorm(isize, eps=1e-06)
+		self.layer_normer = nn.LayerNorm(isize, eps=ieps_ln_default)

		self.drop = Dropout(dropout, inplace=True) if dropout > 0.0 else None

-		self.norm_residue = norm_residue
+		self.norm_residual = norm_residual

	# inputs: input of this layer (bsize, seql, isize)
@@ -51,7 +54,7 @@ def forward(self, inputs, mask=None):
		if self.drop is not None:
			context = self.drop(context)

-		context = context + (_inputs if self.norm_residue else inputs)
+		context = context + (_inputs if self.norm_residual else inputs)

		context = self.ff(context)
@@ -68,8 +71,9 @@ class Encoder(nn.Module):
	# xseql: maximum length of sequence
	# ahsize: number of hidden units for MultiHeadAttention
	# share_layer: using one shared encoder layer
+	# disable_pemb: disable the standard positional embedding; enable when using relative positional embeddings in self attention

-	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, share_layer=False):
+	def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, share_layer=False, disable_pemb=disable_std_pemb_encoder):

		super(Encoder, self).__init__()
@@ -80,14 +84,14 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.
		self.wemb = nn.Embedding(nwd, isize, padding_idx=0)

-		self.pemb = PositionalEmb(isize, xseql, 0, 0)
+		self.pemb = None if disable_pemb else PositionalEmb(isize, xseql, 0, 0)
		if share_layer:
			_shared_layer = EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize)
			self.nets = nn.ModuleList([_shared_layer for i in range(num_layer)])
		else:
			self.nets = nn.ModuleList([EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) for i in range(num_layer)])

-		self.out_normer = nn.LayerNorm(isize, eps=1e-06) if norm_output else None
+		self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None

	# inputs: (bsize, seql)
	# mask: (bsize, 1, seql), generated with:
	#	mask = inputs.eq(0).unsqueeze(1)
@@ -96,7 +100,9 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.
def forward(self, inputs, mask=None): out = self.wemb(inputs) - out = out * sqrt(out.size(-1)) + self.pemb(inputs, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputs, expand=False) if self.drop is not None: out = self.drop(out) diff --git a/transformer/EnsembleAvgDecoder.py b/transformer/EnsembleAvgDecoder.py index 36fc9df..974a54b 100644 --- a/transformer/EnsembleAvgDecoder.py +++ b/transformer/EnsembleAvgDecoder.py @@ -25,7 +25,9 @@ def forward(self, inpute, inputo, src_pad_mask=None): out = model.wemb(inputo) - out = out * sqrt(out.size(-1)) + model.pemb(inputo, expand=False) + out = out * sqrt(out.size(-1)) + if model.pemb is not None: + out = out + model.pemb(inputo, expand=False) if model.drop is not None: out = model.drop(out) @@ -59,7 +61,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): # out: input to the decoder for the first step (bsize, 1, isize) - out = sos_emb * sqrt_isize + model.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(0) if model.drop is not None: out = model.drop(out) @@ -95,7 +99,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): for model, inputu in zip(self.nets, inpute): - out = model.wemb(wds) * sqrt_isize + model.pemb.get_pos(step - 1) + out = model.wemb(wds) * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(step - 1) if model.drop is not None: out = model.drop(out) @@ -149,7 +155,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for _inum, (model, inputu) in enumerate(zip(self.nets, inpute)): - out = model.get_sos_emb(inputu) * sqrt_isize + model.pemb.get_pos(0) + out = model.get_sos_emb(inputu) * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(0) if model.drop is not None: out = model.drop(out) @@ -203,7 +211,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for _inum, (model, inputu) in enumerate(zip(self.nets, inpute)): - out = model.wemb(wds) * sqrt_isize + model.pemb.get_pos(step - 1) + out = model.wemb(wds) * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(step - 1) if model.drop is not None: out = model.drop(out) diff --git a/transformer/EnsembleDecoder.py b/transformer/EnsembleDecoder.py index 463137f..5532a44 100644 --- a/transformer/EnsembleDecoder.py +++ b/transformer/EnsembleDecoder.py @@ -36,7 +36,9 @@ def forward(self, inpute, inputo, src_pad_mask=None): out = model.wemb(inputo) - out = out * sqrt(out.size(-1)) + model.pemb(inputo, expand=False) + out = out * sqrt(out.size(-1)) + if model.pemb is not None: + out = out + model.pemb(inputo, expand=False) if model.drop is not None: out = model.drop(out) @@ -80,7 +82,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): # out: input to the decoder for the first step (bsize, 1, isize) - out = sos_emb * sqrt_isize + model.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(0) if model.drop is not None: out = model.drop(out) @@ -116,7 +120,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): for model, inputu in zip(self.nets, inpute): - out = model.wemb(wds) * sqrt_isize + model.pemb.get_pos(i) + out = model.wemb(wds) * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(i) if model.drop is not None: out = model.drop(out) @@ -224,7 
+230,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for _inum, (model, inputu) in enumerate(zip(self.nets, inpute)): - out = model.wemb(wds) * sqrt_isize + model.pemb.get_pos(step) + out = model.wemb(wds) * sqrt_isize + if model.pemb is not None: + out = out + model.pemb.get_pos(step) if model.drop is not None: out = model.drop(out) diff --git a/transformer/NMT.py b/transformer/NMT.py index 041d2b4..cbd1662 100644 --- a/transformer/NMT.py +++ b/transformer/NMT.py @@ -3,7 +3,7 @@ import torch from torch import nn -from numbers import Integral +from utils.fmt.base import parse_double_value_tuple # import Encoder and Decoder from transformer.AGG.InceptEncoder and transformer.AGG.InceptDecoder/transformer.AGG.InceptAvgDecoder to learn complex representation with incepted transformer, transformer.TA.Encoder for Transparent Encoder. from transformer.Encoder import Encoder @@ -12,6 +12,8 @@ from transformer.Decoder import Decoder #from transformer.AvgDecoder import Decoder +from cnfg.ihyp import * + class NMT(nn.Module): # isize: size of word embedding @@ -25,14 +27,11 @@ class NMT(nn.Module): # xseql: maxmimum length of sequence # ahsize: number of hidden units for MultiHeadAttention - def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=512, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None): + def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None): super(NMT, self).__init__() - if isinstance(num_layer, Integral): - enc_layer = dec_layer = num_layer - else: - enc_layer, dec_layer = num_layer + enc_layer, dec_layer = parse_double_value_tuple(num_layer) self.enc = Encoder(isize, snwd, enc_layer, fhsize, dropout, attn_drop, num_head, xseql, ahsize, norm_output) diff --git a/transformer/README.md b/transformer/README.md new file mode 100644 index 0000000..34d86af --- /dev/null +++ b/transformer/README.md @@ -0,0 +1,53 @@ +# Transformer + +## `NMT.py` + +The transformer model encapsulates encoder and decoder. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/NMT.py#L9-L11) to make a choice between the standard decoder and the average decoder. + +## `Encoder.py` + +The encoder of transformer. + +## `Decoder.py` + +The standard decoder of transformer. + +## `AvgDecoder.py` + +The average decoder of transformer proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/). + +## `EnsembleNMT.py` + +A model encapsulates several NMT models to do ensemble decoding. Switch [the comment line](https://github.com/anoidgit/transformer/blob/master/transformer/EnsembleNMT.py#L9-L11) to make a choice between the standard decoder and the average decoder. + +## `EnsembleEncoder.py` + +A model encapsulates several encoders for ensemble decoding. + +## `EnsembleDecoder.py` + +A model encapsulates several standard decoders for ensemble decoding. + +## `EnsembleAvgDecoder.py` + +A model encapsulates several average decoders proposed by [Accelerating Neural Transformer via an Average Attention Network](https://www.aclweb.org/anthology/P18-1166/) for ensemble decoding. + +## `AGG/` + +Implementation of aggregation models. 
+ +### `Hier*.py` + +Hierarchical aggregation proposed in [Exploiting Deep Representations for Neural Machine Translation](https://www.aclweb.org/anthology/D18-1457/). + +## `TA/` + +Implementation of transparent attention proposed in [Training Deeper Neural Machine Translation Models with Transparent Attention](https://aclweb.org/anthology/D18-1338). + +## `SC/` + +Implementation of sentential context proposed in [Exploiting Sentential Context for Neural Machine Translation](https://www.aclweb.org/anthology/P19-1624/). + +## `Doc/` + +Implementation of context-aware Transformer proposed in [Improving the Transformer Translation Model with Document-Level Context](https://www.aclweb.org/anthology/D18-1049/). diff --git a/transformer/RNMTDecoder.py b/transformer/RNMTDecoder.py index c122840..83b8514 100644 --- a/transformer/RNMTDecoder.py +++ b/transformer/RNMTDecoder.py @@ -8,6 +8,8 @@ from modules.base import * from modules.rnncells import * +from cnfg.ihyp import * + class FirstLayer(nn.Module): # isize: input size @@ -100,7 +102,7 @@ class Decoder(nn.Module): # ahsize: number of hidden units for MultiHeadAttention # bindemb: bind embedding and classifier weight - def __init__(self, isize, nwd, num_layer, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None, projector=False): + def __init__(self, isize, nwd, num_layer, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None, projector=False): super(Decoder, self).__init__() @@ -129,7 +131,7 @@ def __init__(self, isize, nwd, num_layer, dropout=0.0, attn_drop=0.0, emb_w=None self.lsm = nn.LogSoftmax(-1) - self.out_normer = nn.LayerNorm(isize, eps=1e-06) if norm_output else None + self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None self.fbl = None if forbidden_index is None else tuple(set(forbidden_index)) diff --git a/transformer/SC/Decoder.py b/transformer/SC/Decoder.py index 63507f0..af8c573 100644 --- a/transformer/SC/Decoder.py +++ b/transformer/SC/Decoder.py @@ -12,6 +12,8 @@ from transformer.Decoder import DecoderLayer as DecoderLayerBase from transformer.Decoder import Decoder as DecoderBase +from cnfg.ihyp import * + class DecoderLayer(DecoderLayerBase): def __init__(self, isize, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, ahsize=None): @@ -74,7 +76,7 @@ def forward(self, inpute, inputh, inputo, src_pad_mask=None, tgt_pad_mask=None, class Decoder(DecoderBase): - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None): _ahsize = isize if ahsize is None else ahsize @@ -90,7 +92,9 @@ def forward(self, inpute, inputh, inputo, src_pad_mask=None): out = self.wemb(inputo) - out = out * sqrt(out.size(-1)) + self.pemb(inputo, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputo, expand=False) if self.drop is not None: out = self.drop(out) @@ -114,7 +118,9 @@ def greedy_decode(self, inpute, inputh, src_pad_mask=None, max_len=512, fill_pad sqrt_isize = sqrt(sos_emb.size(-1)) - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if 
self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -137,7 +143,9 @@ def greedy_decode(self, inpute, inputh, src_pad_mask=None, max_len=512, fill_pad for i in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i) if self.drop is not None: out = self.drop(out) @@ -175,7 +183,9 @@ def beam_decode(self, inpute, inputh, src_pad_mask=None, beam_size=8, max_len=51 lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -208,7 +218,9 @@ def beam_decode(self, inpute, inputh, src_pad_mask=None, beam_size=8, max_len=51 for step in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step) if self.drop is not None: out = self.drop(out) diff --git a/transformer/SC/Encoder.py b/transformer/SC/Encoder.py index 292ec83..323c831 100644 --- a/transformer/SC/Encoder.py +++ b/transformer/SC/Encoder.py @@ -3,13 +3,15 @@ import torch from torch import nn from modules.base import * -from math import sqrt, inf +from math import sqrt from transformer.TA.Encoder import Encoder as EncoderBase +from cnfg.ihyp import * + class Encoder(EncoderBase): - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, num_layer_dec=6): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, num_layer_dec=6): _ahsize = isize if ahsize is None else ahsize @@ -40,7 +42,9 @@ def transform(lin, w, drop): bsize, seql = inputs.size() out = self.wemb(inputs) - out = out * sqrt(out.size(-1)) + self.pemb(inputs, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputs, expand=False) if self.drop is not None: out = self.drop(out) @@ -48,7 +52,7 @@ def transform(lin, w, drop): out = self.out_normer(out) outs = [out] - _h0 = out.max(dim=1, keepdim=True)[0] if mask is None else out.masked_fill(mask.squeeze(1).unsqueeze(-1), -inf).max(dim=1, keepdim=True)[0] + _h0 = out.max(dim=1, keepdim=True)[0] if mask is None else out.masked_fill(mask.squeeze(1).unsqueeze(-1), -inf_default).max(dim=1, keepdim=True)[0] hl = [_h0] for net, attn in zip(self.nets, self.attns): diff --git a/transformer/SC/NMT.py b/transformer/SC/NMT.py index cab6551..7d4f489 100644 --- a/transformer/SC/NMT.py +++ b/transformer/SC/NMT.py @@ -2,23 +2,22 @@ from torch import nn -from numbers import Integral +from utils.fmt.base import parse_double_value_tuple from transformer.SC.Encoder import Encoder from transformer.SC.Decoder import Decoder from math import sqrt +from cnfg.ihyp import * + class NMT(nn.Module): - def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=512, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None): + def __init__(self, isize, snwd, tnwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, global_emb=False, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindDecoderEmb=False, forbidden_index=None): super(NMT, 
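The sentence summary `_h0` in `SC/Encoder` is a masked max-pool over time, with padded positions filled by `-inf_default` so they can never win the max; as a standalone function:

```python
import torch

# out: (bsize, seql, isize); mask: (bsize, 1, seql), True at padding.
# Padded positions are filled with -inf before max-pooling over the
# time dimension, matching the _h0 computation in this patch.
def masked_max(out, mask):
	if mask is None:
		return out.max(dim=1, keepdim=True)[0]
	return out.masked_fill(mask.squeeze(1).unsqueeze(-1), -float("inf")).max(dim=1, keepdim=True)[0]
```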
self).__init__() - if isinstance(num_layer, Integral): - enc_layer = dec_layer = num_layer - else: - enc_layer, dec_layer = num_layer + enc_layer, dec_layer = parse_double_value_tuple(num_layer) self.enc = Encoder(isize, snwd, enc_layer, fhsize, dropout, attn_drop, num_head, xseql, ahsize, norm_output, num_layer) diff --git a/transformer/TA/Decoder.py b/transformer/TA/Decoder.py index 14233e8..5cec5a4 100644 --- a/transformer/TA/Decoder.py +++ b/transformer/TA/Decoder.py @@ -7,6 +7,8 @@ from transformer.Decoder import Decoder as DecoderBase +from cnfg.ihyp import * + class Decoder(DecoderBase): # isize: size of word embedding @@ -20,7 +22,7 @@ class Decoder(DecoderBase): # ahsize: number of hidden units for MultiHeadAttention # bindemb: bind embedding and classifier weight - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=512, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, emb_w=None, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, bindemb=False, forbidden_index=None): super(Decoder, self).__init__(isize, nwd, num_layer, fhsize, dropout, attn_drop, emb_w, num_head, xseql, ahsize, norm_output, bindemb, forbidden_index) @@ -35,7 +37,9 @@ def forward(self, inpute, inputo, src_pad_mask=None): out = self.wemb(inputo) - out = out * sqrt(out.size(-1)) + self.pemb(inputo, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputo, expand=False) if self.drop is not None: out = self.drop(out) @@ -71,7 +75,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): # out: input to the decoder for the first step (bsize, 1, isize) - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -101,7 +107,9 @@ def greedy_decode(self, inpute, src_pad_mask=None, max_len=512, fill_pad=False): for i in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(i) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(i) if self.drop is not None: out = self.drop(out) @@ -148,7 +156,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt lpv = sos_emb.new_ones(real_bsize, 1) lpv_base = 6.0 ** length_penalty - out = sos_emb * sqrt_isize + self.pemb.get_pos(0) + out = sos_emb * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(0) if self.drop is not None: out = self.drop(out) @@ -195,7 +205,9 @@ def beam_decode(self, inpute, src_pad_mask=None, beam_size=8, max_len=512, lengt for step in range(1, max_len): - out = self.wemb(wds) * sqrt_isize + self.pemb.get_pos(step) + out = self.wemb(wds) * sqrt_isize + if self.pemb is not None: + out = out + self.pemb.get_pos(step) if self.drop is not None: out = self.drop(out) diff --git a/transformer/TA/Encoder.py b/transformer/TA/Encoder.py index f1bfb91..323aee4 100644 --- a/transformer/TA/Encoder.py +++ b/transformer/TA/Encoder.py @@ -9,6 +9,8 @@ from transformer.Encoder import EncoderLayer as EncoderLayerBase from transformer.Encoder import Encoder as EncoderBase +from cnfg.ihyp import * + class EncoderLayer(EncoderLayerBase): # isize: input size @@ -52,7 +54,7 @@ class Encoder(EncoderBase): # xseql: maxmimum length of sequence # ahsize: number of hidden units for 
MultiHeadAttention - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True, num_layer_dec=6): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True, num_layer_dec=6): _ahsize = isize if ahsize is None else ahsize @@ -73,7 +75,9 @@ def forward(self, inputs, mask=None): bsize, seql = inputs.size() out = self.wemb(inputs) - out = out * sqrt(out.size(-1)) + self.pemb(inputs, expand=False) + out = out * sqrt(out.size(-1)) + if self.pemb is not None: + out = out + self.pemb(inputs, expand=False) if self.drop is not None: out = self.drop(out) diff --git a/transformer/UniEncoder.py b/transformer/UniEncoder.py index d732f2d..5cd5866 100644 --- a/transformer/UniEncoder.py +++ b/transformer/UniEncoder.py @@ -12,6 +12,8 @@ # ... # for the classier of the decoder, is omitted +from cnfg.ihyp import * + class Encoder(nn.Module): # isize: size of word embedding @@ -23,7 +25,7 @@ class Encoder(nn.Module): # xseql: maxmimum length of sequence # ahsize: number of hidden units for MultiHeadAttention - def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=512, ahsize=None, norm_output=True): + def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0.0, num_head=8, xseql=cache_len_default, ahsize=None, norm_output=True): super(Encoder, self).__init__() @@ -40,7 +42,7 @@ def __init__(self, isize, nwd, num_layer, fhsize=None, dropout=0.0, attn_drop=0. self.net = EncoderLayer(isize, _fhsize, dropout, attn_drop, num_head, _ahsize) self.halter = nn.Sequential(Scorer(isize), nn.Sigmoid()) - self.out_normer = nn.LayerNorm(isize, eps=1e-06) if norm_output else None + self.out_normer = nn.LayerNorm(isize, eps=ieps_ln_default) if norm_output else None self.act_loss = ACT_Loss() diff --git a/translator.py b/translator.py index c94cc6f..c2c2aef 100644 --- a/translator.py +++ b/translator.py @@ -12,6 +12,8 @@ from utils.fmt.single import batch_padder +from cnfg.ihyp import * + def data_loader(sentences_iter, vcbi, minbsize=1, bsize=768, maxpad=16, maxpart=4, maxtoken=3920): for i_d in batch_padder(sentences_iter, vcbi, bsize, maxpad, maxpart, maxtoken, minbsize): yield torch.tensor(i_d, dtype=torch.long) @@ -63,7 +65,7 @@ def __init__(self, modelfs, fvocab_i, fvocab_t, cnfg, minbsize=1, expand_for_mul if isinstance(modelfs, (list, tuple)): models = [] for modelf in modelfs: - tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + tmp = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) tmp = load_model_cpu(modelf, tmp) tmp.apply(load_fixing) @@ -72,7 +74,7 @@ def __init__(self, modelfs, fvocab_i, fvocab_t, cnfg, minbsize=1, expand_for_mul model = Ensemble(models) else: - model = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) + model = NMT(cnfg.isize, nwordi, nwordt, cnfg.nlayer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead, cache_len_default, cnfg.attn_hsize, 
cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes) model = load_model_cpu(modelfs, model) model.apply(load_fixing) @@ -167,15 +169,15 @@ def __call__(self, paragraphs): #from datautils.pymoses import Tokenizer, Detokenizer, Normalizepunctuation, Truecaser, Detruecaser #from datautils.bpe import BPEApplier, BPERemover #if __name__ == "__main__": - #tl = [] + #tl = ["28 @-@ jähriger Koch in San Francisco M@@ all tot a@@ u@@ f@@ gefunden", "ein 28 @-@ jähriger Koch , der vor kurzem nach San Francisco gezogen ist , wurde im T@@ r@@ e@@ p@@ p@@ e@@ n@@ haus eines örtlichen E@@ i@@ n@@ k@@ a@@ u@@ f@@ z@@ e@@ n@@ t@@ r@@ u@@ ms tot a@@ u@@ f@@ gefunden .", "der Bruder des O@@ p@@ f@@ e@@ r@@ s sagte aus , dass er sich niemanden vorstellen kann , der ihm schaden wollen würde , " E@@ n@@ d@@ lich ging es bei ihm wieder b@@ e@@ r@@ g@@ auf " .", "der am Mittwoch morgen in der W@@ e@@ s@@ t@@ field M@@ all g@@ e@@ f@@ u@@ n@@ d@@ e@@ n@@ e L@@ e@@ i@@ c@@ h@@ n@@ a@@ m wurde als der 28 Jahre alte Frank G@@ a@@ l@@ i@@ c@@ i@@ a aus San Francisco identifiziert , teilte die g@@ e@@ r@@ i@@ c@@ h@@ t@@ s@@ medizinische Abteilung in San Francisco mit .", "das San Francisco P@@ o@@ l@@ i@@ ce D@@ e@@ p@@ a@@ r@@ t@@ ment sagte , dass der Tod als Mord eingestuft wurde und die Ermittlungen am L@@ a@@ u@@ f@@ en sind .", "der Bruder des O@@ p@@ f@@ e@@ r@@ s , Louis G@@ a@@ l@@ i@@ c@@ i@@ a , teilte dem A@@ B@@ S Sender K@@ GO in San Francisco mit , dass Frank , der früher als Koch in B@@ o@@ s@@ t@@ on gearbeitet hat , vor sechs Monaten seinen T@@ r@@ a@@ u@@ m@@ j@@ ob als Koch im S@@ o@@ n@@ s & D@@ a@@ u@@ g@@ h@@ t@@ e@@ r@@ s Restaurant in San Francisco e@@ r@@ g@@ a@@ t@@ t@@ e@@ r@@ t hatte .", "ein Sprecher des S@@ o@@ n@@ s & D@@ a@@ u@@ g@@ h@@ t@@ e@@ r@@ s sagte , dass sie über seinen Tod " s@@ c@@ h@@ o@@ c@@ k@@ i@@ e@@ r@@ t und am Boden zerstört seien " .", "" wir sind ein kleines Team , das wie eine enge Familie arbeitet und wir werden ihn s@@ c@@ h@@ m@@ e@@ r@@ z@@ lich vermissen " , sagte der Sprecher weiter .", "unsere Gedanken und unser B@@ e@@ i@@ leid sind in dieser schweren Zeit bei F@@ r@@ a@@ n@@ k@@ s Familie und Freunden .", "Louis G@@ a@@ l@@ i@@ c@@ i@@ a gab an , dass Frank zunächst in Hostels lebte , aber dass , " die Dinge für ihn endlich b@@ e@@ r@@ g@@ auf gingen " ."] #spl = SentenceSplitter("de") #tok = Tokenizer("de") #detok = Detokenizer("en") #punc_norm = Normalizepunctuation("de") - #truecaser = Truecaser("truecase-model.de") + #truecaser = Truecaser("c1207\\truecase-model.de") #detruecaser = Detruecaser() - #tran_core = TranslatorCore("eva_20_1.384_1.088_26.74.t7", "src.vcb", "tgt.vcb", cnfg) - #bpe = BPEApplier("src.cds", "src.vcb.bpe", 8) + #tran_core = TranslatorCore("c1207\\eva_20_1.384_1.088_26.74.h5", "c1207\\src.vcb", "c1207\\tgt.vcb", cnfg) + #bpe = BPEApplier("c1207\\src.cds", "c1207\\src.vcb.bpe", 50) #debpe = BPERemover() #trans = Translator(tran_core, spl, tok, detok, bpe, debpe, punc_norm, truecaser, detruecaser) #rs = tran_core(tl) diff --git a/utils/base.py b/utils/base.py index c6cf37d..371b4df 100644 --- a/utils/base.py +++ b/utils/base.py @@ -13,6 +13,8 @@ import logging +from utils.h5serial import h5save, h5load + mask_tensor_type = torch.uint8 if torch.__version__ < "1.2.0" else torch.bool def pad_tensors(tensor_list, dim=-1): @@ -125,7 +127,7 @@ def dynamic_sample(incd, dss_ws, dss_rm): def load_model_cpu(modf, base_model): - mpg = torch.load(modf, map_location='cpu') + mpg = h5load(modf) for para, mp in 
zip(base_model.parameters(), mpg): para.data = mp.data @@ -134,7 +136,7 @@ def load_model_cpu(modf, base_model): def load_model_cpu_old(modf, base_model): - base_model.load_state_dict(torch.load(modf, map_location='cpu')) + base_model.load_state_dict(h5load(modf)) return base_model @@ -142,7 +144,7 @@ def save_model(model, fname, sub_module=False, logger=None): _msave = model.module if sub_module else model try: - torch.save([t.data for t in _msave.parameters()], fname) + h5save([t.data for t in _msave.parameters()], fname) except Exception as e: if logger is None: print(e) @@ -157,10 +159,10 @@ def _worker(model, fname, sub_module=False, logger=None, para_lock=None, log_suc _msave = model.module if sub_module else model try: if para_lock is None: - torch.save([t.data for t in _msave.parameters()], fname) + h5save([t.data for t in _msave.parameters()], fname) else: with para_lock: - torch.save([t.data for t in _msave.parameters()], fname) + h5save([t.data for t in _msave.parameters()], fname) except Exception as e: if logger is None: print(e) diff --git a/utils/fmt/base.py b/utils/fmt/base.py index e4f3735..0512f93 100644 --- a/utils/fmt/base.py +++ b/utils/fmt/base.py @@ -2,10 +2,10 @@ from random import shuffle -has_unk = True +from cnfg.hyp import use_unk pad_id, sos_id, eos_id = 0, 1, 2 -if has_unk: +if use_unk: unk_id = 3 init_vocab = {"":pad_id, "":sos_id, "":eos_id, "":unk_id} init_normal_token_id = 4 @@ -204,13 +204,15 @@ def no_unk_mapper(vcb, ltm, prompt=True): else: return [vcb[wd] for wd in ltm if wd in vcb] -def list2dict(lin): +def list2dict(lin, kfunc=None): - rsd = {} - for i, lu in enumerate(lin): - rsd[i] = lu + return {k: lu for k, lu in enumerate(lin)} if kfunc is None else {kfunc(k): lu for k, lu in enumerate(lin)} - return rsd +def dict_is_list(sdin, kfunc=None): + + _lset = set(range(len(sdin))) if kfunc is None else set(kfunc(i) for i in range(len(sdin))) + + return False if (_lset - sdin) else True def dict2pairs(dict_in): @@ -304,13 +306,13 @@ def get_bi_ratio(ls, lt): def map_batch(i_d, vocabi): - global has_unk, sos_id, eos_id, unk_id + global use_unk, sos_id, eos_id, unk_id if isinstance(i_d[0], (tuple, list,)): return [map_batch(idu, vocabi)[0] for idu in i_d], 2 else: rsi = [sos_id] - rsi.extend([vocabi.get(wd, unk_id) for wd in i_d] if has_unk else no_unk_mapper(vocabi, i_d))#[vocabi[wd] for wd in i_d if wd in vocabi] + rsi.extend([vocabi.get(wd, unk_id) for wd in i_d] if use_unk else no_unk_mapper(vocabi, i_d))#[vocabi[wd] for wd in i_d if wd in vocabi] rsi.append(eos_id) return rsi, 2 @@ -325,3 +327,14 @@ def pad_batch(i_d, mlen_i): if curlen < mlen_i: i_d.extend([pad_id for i in range(mlen_i - curlen)]) return i_d + +def parse_none(vin, value): + + return value if vin is None else vin + +def parse_double_value_tuple(vin): + + if isinstance(vin, (list, tuple,)): + return vin[0], vin[-1] + else: + return vin, vin diff --git a/utils/fmt/base4torch.py b/utils/fmt/base4torch.py index 88c0be2..6bd6aef 100644 --- a/utils/fmt/base4torch.py +++ b/utils/fmt/base4torch.py @@ -4,6 +4,7 @@ from math import sqrt from utils.fmt.base import list_reader +from utils.h5serial import h5save, h5load def parse_cuda(use_cuda_arg, gpuid): @@ -66,7 +67,7 @@ def load_emb_txt(vcb, embf): def load_emb(embf, embt, nword, scale_down_emb, freeze_emb): - _emb = torch.load(embf, map_location='cpu') + _emb = h5load(embf) if nword < _emb.size(0): _emb = _emb.narrow(0, 0, nword).contiguous() if scale_down_emb: diff --git a/utils/fmt/dual.py b/utils/fmt/dual.py index 53982f7..566bed3 100644 
--- a/utils/fmt/dual.py +++ b/utils/fmt/dual.py @@ -36,7 +36,7 @@ def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize): def batch_mapper(finput, ftarget, vocabi, vocabt, bsize, maxpad, maxpart, maxtoken, minbsize): - global has_unk + global use_unk for i_d, td, mlen_i, mlen_t in batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize): rsi, extok_i = map_batch(i_d, vocabi) diff --git a/utils/fmt/triple.py b/utils/fmt/triple.py index 25139f3..ef4b76c 100644 --- a/utils/fmt/triple.py +++ b/utils/fmt/triple.py @@ -40,7 +40,7 @@ def batch_loader(finput, fref, ftarget, bsize, maxpad, maxpart, maxtoken, minbsi def batch_mapper(finput, fref, ftarget, vocabi, bsize, maxpad, maxpart, maxtoken, minbsize): - global has_unk + global use_unk for i_d, rd, td, mlen_i, mlen_t in batch_loader(finput, fref, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize): rsi, extok_i = map_batch(i_d, vocabi) diff --git a/utils/h5serial.py b/utils/h5serial.py new file mode 100644 index 0000000..9630e94 --- /dev/null +++ b/utils/h5serial.py @@ -0,0 +1,71 @@ +#encoding: utf-8 + +import torch, h5py + +from collections.abc import Iterator + +from utils.fmt.base import list2dict, dict_is_list + +from cnfg.ihyp import * + +def h5write_dict(gwrt, dtw, h5args=h5modelwargs): + + for k, v in dtw.items(): + _v = tuple(v) if isinstance(v, Iterator) else v + if isinstance(_v, dict): + gwrt.create_group(k) + h5write_dict(gwrt[k], _v, h5args=h5args) + elif isinstance(_v, (list, tuple,)): + gwrt.create_group(k) + h5write_list(gwrt[k], _v, h5args=h5args) + else: + if _v.device.type == 'cpu': + gwrt.create_dataset(k, data=_v.numpy(), **h5args) + else: + gwrt.create_dataset(k, data=_v.cpu().numpy(), **h5args) + +def h5write_list(gwrt, ltw, h5args=h5modelwargs): + + h5write_dict(gwrt, list2dict(ltw, kfunc=list_key_func), h5args=h5args) + +def h5save(obj_save, fname, h5args=h5modelwargs): + + h5f = h5py.File(fname, 'w') + _obj_save = tuple(obj_save) if isinstance(obj_save, Iterator) else obj_save + if isinstance(_obj_save, dict): + h5write_dict(h5f, _obj_save, h5args=h5args) + elif isinstance(_obj_save, (list, tuple,)): + h5write_list(h5f, _obj_save, h5args=h5args) + else: + h5write_list(h5f, [_obj_save], h5args=h5args) + h5f.close() + +def restore_list_in_dict(din): + + if isinstance(din, dict): + _key_set = set(din.keys()) + if dict_is_list(_key_set, kfunc=list_key_func): + return [restore_list_in_dict(din[list_key_func(i)]) for i in range(len(_key_set))] + else: + return {k: restore_list_in_dict(v) for k, v in din.items()} + else: + return din + +def h5load_group(grd): + + rsd = {} + for k, v in grd.items(): + if isinstance(v, h5py.Dataset): + rsd[k] = torch.from_numpy(v[:]) + else: + rsd[k] = h5load_group(v) + return rsd + +def h5load(fname, restore_list=True): + + f = h5py.File(fname, "r") + rsd = h5load_group(f) + f.close() + if restore_list: + rsd = restore_list_in_dict(rsd) + return rsd
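
A minimal round-trip sketch for the new `utils/h5serial.py` helpers introduced by this patch. It assumes `h5py` is installed, the repository root is on `PYTHONPATH`, and `cnfg/ihyp.py` defines `h5modelwargs` and a string-producing `list_key_func` as the module implies; the file name `toy.h5` is only for illustration:

```python
import torch

from utils.h5serial import h5save, h5load

# A nested dict/list/tensor structure, shaped like the
# [t.data for t in model.parameters()] lists saved in utils/base.py.
state = {"emb": torch.randn(8, 4), "layers": [torch.zeros(4), torch.ones(4)]}

h5save(state, "toy.h5")

# Lists are written as integer-keyed HDF5 groups; with restore_list=True,
# h5load rebuilds them as Python lists on the way back in.
restored = h5load("toy.h5", restore_list=True)

assert torch.equal(restored["emb"], state["emb"])
assert torch.equal(restored["layers"][1], state["layers"][1])
```

This is the same pattern `save_model` and `load_model_cpu` in `utils/base.py` now apply to model parameters in place of `torch.save`/`torch.load`.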