diff --git a/CHANGELOG.md b/CHANGELOG.md
index a21d4b31f..29a3519a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [1.18.67]
+### Added
+- Added the training option `--keep-initializations` to keep the model's initial parameters. When set, the trainer never deletes the params file for the first checkpoint, regardless of the value of `--keep-last-params`.
+
 ## [1.18.66]
 ### Fixed
 - Fix to argument names that are allowed to differ for resuming training.
 
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index 7c2ade635..50884b1bf 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '1.18.66'
+__version__ = '1.18.67'
diff --git a/sockeye/arguments.py b/sockeye/arguments.py
index a99e63b11..44a8fda9b 100644
--- a/sockeye/arguments.py
+++ b/sockeye/arguments.py
@@ -1063,6 +1063,10 @@ def add_training_args(params):
                              default=-1,
                              help='Keep only the last n params files, use -1 to keep all files. Default: %(default)s')
 
+    train_params.add_argument('--keep-initializations',
+                              action="store_true",
+                              help='In addition to keeping the last n params files, also keep params from checkpoint 0.')
+
     train_params.add_argument('--dry-run',
                               action='store_true',
                               help="Do not perform any actual training, but print statistics about the model"
diff --git a/sockeye/image_captioning/train.py b/sockeye/image_captioning/train.py
index 3238fa9c8..1e904f0c7 100644
--- a/sockeye/image_captioning/train.py
+++ b/sockeye/image_captioning/train.py
@@ -371,6 +371,7 @@ def train(args: argparse.Namespace):
                                             optimizer_config=create_optimizer_config(args, [1.0],
                                                                                      extra_initializers),
                                             max_params_files_to_keep=args.keep_last_params,
+                                            keep_initializations=args.keep_initializations,
                                             source_vocabs=[None],
                                             target_vocab=target_vocab)
diff --git a/sockeye/train.py b/sockeye/train.py
index 0e434af09..f4cbcf4ea 100644
--- a/sockeye/train.py
+++ b/sockeye/train.py
@@ -869,6 +869,7 @@ def train(args: argparse.Namespace) -> training.TrainState:
     trainer = training.EarlyStoppingTrainer(model=training_model,
                                             optimizer_config=create_optimizer_config(args, source_vocab_sizes),
                                             max_params_files_to_keep=args.keep_last_params,
+                                            keep_initializations=args.keep_initializations,
                                             source_vocabs=source_vocabs,
                                             target_vocab=target_vocab)
 
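Both entry points above simply forward `args.keep_initializations` into the trainer. For context on the flag itself: it is a plain argparse store_true option, so it defaults to False and existing training commands are unaffected. A minimal sketch with a toy parser (a simplified stand-in, not Sockeye's actual argument setup):

    import argparse

    # Toy stand-in for the relevant part of add_training_args.
    parser = argparse.ArgumentParser()
    parser.add_argument('--keep-last-params', type=int, default=-1)
    parser.add_argument('--keep-initializations', action='store_true')

    # Passing the flag turns it on; omitting it leaves the default of False.
    assert parser.parse_args(['--keep-initializations']).keep_initializations is True
    assert parser.parse_args([]).keep_initializations is False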
""" @@ -428,11 +429,13 @@ def __init__(self, model: TrainingModel, optimizer_config: OptimizerConfig, max_params_files_to_keep: int, + keep_initializations: bool, source_vocabs: List[vocab.Vocab], target_vocab: vocab.Vocab) -> None: self.model = model self.optimizer_config = optimizer_config self.max_params_files_to_keep = max_params_files_to_keep + self.keep_initializations = keep_initializations self.tflogger = TensorboardLogger(logdir=os.path.join(model.output_dir, C.TENSORBOARD_NAME), source_vocab=source_vocabs[0], target_vocab=target_vocab) @@ -758,7 +761,7 @@ def _cleanup(self, lr_decay_opt_states_reset: str, process_manager: Optional['De Cleans parameter files, training state directory and waits for remaining decoding processes. """ utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep, - self.state.checkpoint, self.state.best_checkpoint) + self.state.checkpoint, self.state.best_checkpoint, self.keep_initializations) if process_manager is not None: result = process_manager.collect_results() if result is not None: @@ -922,7 +925,7 @@ def _save_params(self): """ self.model.save_params_to_file(self.current_params_fname) utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep, self.state.checkpoint, - self.state.best_checkpoint) + self.state.best_checkpoint, self.keep_initializations) def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter): """ diff --git a/sockeye/utils.py b/sockeye/utils.py index ca2008e45..97ff3ff21 100644 --- a/sockeye/utils.py +++ b/sockeye/utils.py @@ -898,7 +898,7 @@ def metric_value_is_better(new: float, old: float, metric: str) -> bool: return new < old -def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, best_checkpoint: int): +def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, best_checkpoint: int, keep_first: bool): """ Deletes oldest parameter files from a model folder. @@ -906,12 +906,13 @@ def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, :param max_to_keep: Maximum number of files to keep, negative to keep all. :param checkpoint: Current checkpoint (i.e. index of last params file created). :param best_checkpoint: Best checkpoint. The parameter file corresponding to this checkpoint will not be deleted. + :param keep_first: Don't delete the first checkpoint. 
""" if max_to_keep <= 0: return existing_files = glob.glob(os.path.join(output_folder, C.PARAMS_PREFIX + "*")) params_name_with_dir = os.path.join(output_folder, C.PARAMS_NAME) - for n in range(0, max(1, checkpoint - max_to_keep + 1)): + for n in range(1 if keep_first else 0, max(1, checkpoint - max_to_keep + 1)): if n != best_checkpoint: param_fname_n = params_name_with_dir % n if param_fname_n in existing_files: diff --git a/test/unit/test_arguments.py b/test/unit/test_arguments.py index 13c80606a..6c3f0fa37 100644 --- a/test/unit/test_arguments.py +++ b/test/unit/test_arguments.py @@ -186,6 +186,7 @@ def test_model_parameters(test_params, expected_params): decode_and_evaluate_device_id=None, seed=13, keep_last_params=-1, + keep_initializations=False, rnn_enc_last_hidden_concat_to_embedding=False, dry_run=False)), ]) diff --git a/test/unit/test_params.py b/test/unit/test_params.py index e212d858a..cec1e907d 100644 --- a/test/unit/test_params.py +++ b/test/unit/test_params.py @@ -22,13 +22,26 @@ def test_cleanup_param_files(): - with tempfile.TemporaryDirectory() as tmpDir: + with tempfile.TemporaryDirectory() as tmp_dir: for n in itertools.chain(range(1, 20, 2), range(21, 41)): # Create empty files - open(os.path.join(tmpDir, C.PARAMS_NAME % n), "w").close() - sockeye.utils.cleanup_params_files(tmpDir, 5, 40, 17) + open(os.path.join(tmp_dir, C.PARAMS_NAME % n), "w").close() + sockeye.utils.cleanup_params_files(tmp_dir, 5, 40, 17, False) - expectedSurviving = set([os.path.join(tmpDir, C.PARAMS_NAME % n) + expectedSurviving = set([os.path.join(tmp_dir, C.PARAMS_NAME % n) for n in [17, 36, 37, 38, 39, 40]]) # 17 must survive because it is the best one - assert set(glob.glob(os.path.join(tmpDir, C.PARAMS_PREFIX + "*"))) == expectedSurviving + assert set(glob.glob(os.path.join(tmp_dir, C.PARAMS_PREFIX + "*"))) == expectedSurviving + +def test_cleanup_param_files_keep_first(): + with tempfile.TemporaryDirectory() as tmp_dir: + for n in itertools.chain(range(0, 20, 2), range(21, 41)): + # Create empty files + open(os.path.join(tmp_dir, C.PARAMS_NAME % n), "w").close() + sockeye.utils.cleanup_params_files(tmp_dir, 5, 40, 16, True) + + expectedSurviving = set([os.path.join(tmp_dir, C.PARAMS_NAME % n) + for n in [0, 16, 36, 37, 38, 39, 40]]) + # 16 must survive because it is the best one + # 0 should also survive because we set keep_first to True + assert set(glob.glob(os.path.join(tmp_dir, C.PARAMS_PREFIX + "*"))) == expectedSurviving