diff --git a/CHANGELOG.md b/CHANGELOG.md
index a21d4b31f..29a3519a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [1.18.67]
+### Added
+- Added the training option `--keep-initializations` to keep the model's initial parameters. When set, the trainer never deletes the params file for the first checkpoint, regardless of the value of `--keep-last-params`.
+
 ## [1.18.66]
 ### Fixed
 - Fix to argument names that are allowed to differ for resuming training.
 
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index 7c2ade635..50884b1bf 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '1.18.66'
+__version__ = '1.18.67'
diff --git a/sockeye/arguments.py b/sockeye/arguments.py
index a99e63b11..44a8fda9b 100644
--- a/sockeye/arguments.py
+++ b/sockeye/arguments.py
@@ -1063,6 +1063,10 @@ def add_training_args(params):
                              default=-1,
                              help='Keep only the last n params files, use -1 to keep all files. Default: %(default)s')
 
+    train_params.add_argument('--keep-initializations',
+                              action="store_true",
+                              help='In addition to keeping the last n params files, also keep params from checkpoint 0.')
+
     train_params.add_argument('--dry-run',
                               action='store_true',
                               help="Do not perform any actual training, but print statistics about the model"
diff --git a/sockeye/image_captioning/train.py b/sockeye/image_captioning/train.py
index 3238fa9c8..1e904f0c7 100644
--- a/sockeye/image_captioning/train.py
+++ b/sockeye/image_captioning/train.py
@@ -371,6 +371,7 @@ def train(args: argparse.Namespace):
                                             optimizer_config=create_optimizer_config(args, [1.0],
                                                                                      extra_initializers),
                                             max_params_files_to_keep=args.keep_last_params,
+                                            keep_initializations=args.keep_initializations,
                                             source_vocabs=[None],
                                             target_vocab=target_vocab)
diff --git a/sockeye/train.py b/sockeye/train.py
index 0e434af09..f4cbcf4ea 100644
--- a/sockeye/train.py
+++ b/sockeye/train.py
@@ -869,6 +869,7 @@ def train(args: argparse.Namespace) -> training.TrainState:
     trainer = training.EarlyStoppingTrainer(model=training_model,
                                             optimizer_config=create_optimizer_config(args, source_vocab_sizes),
                                             max_params_files_to_keep=args.keep_last_params,
+                                            keep_initializations=args.keep_initializations,
                                             source_vocabs=source_vocabs,
                                             target_vocab=target_vocab)
 
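Both entry points above simply forward `args.keep_initializations` into the trainer. For context on the flag itself: it is a plain argparse store_true option, so it defaults to False and existing training commands are unaffected. A minimal sketch with a toy parser (a simplified stand-in, not Sockeye's actual argument setup):

    import argparse

    # Toy stand-in for the relevant part of add_training_args.
    parser = argparse.ArgumentParser()
    parser.add_argument('--keep-last-params', type=int, default=-1)
    parser.add_argument('--keep-initializations', action='store_true')

    # Passing the flag turns it on; omitting it leaves the default of False.
    assert parser.parse_args(['--keep-initializations']).keep_initializations is True
    assert parser.parse_args([]).keep_initializations is False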
""" @@ -428,11 +429,13 @@ def __init__(self, model: TrainingModel, optimizer_config: OptimizerConfig, max_params_files_to_keep: int, + keep_initializations: bool, source_vocabs: List[vocab.Vocab], target_vocab: vocab.Vocab) -> None: self.model = model self.optimizer_config = optimizer_config self.max_params_files_to_keep = max_params_files_to_keep + self.keep_initializations = keep_initializations self.tflogger = TensorboardLogger(logdir=os.path.join(model.output_dir, C.TENSORBOARD_NAME), source_vocab=source_vocabs[0], target_vocab=target_vocab) @@ -758,7 +761,7 @@ def _cleanup(self, lr_decay_opt_states_reset: str, process_manager: Optional['De Cleans parameter files, training state directory and waits for remaining decoding processes. """ utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep, - self.state.checkpoint, self.state.best_checkpoint) + self.state.checkpoint, self.state.best_checkpoint, self.keep_initializations) if process_manager is not None: result = process_manager.collect_results() if result is not None: @@ -922,7 +925,7 @@ def _save_params(self): """ self.model.save_params_to_file(self.current_params_fname) utils.cleanup_params_files(self.model.output_dir, self.max_params_files_to_keep, self.state.checkpoint, - self.state.best_checkpoint) + self.state.best_checkpoint, self.keep_initializations) def _save_training_state(self, train_iter: data_io.BaseParallelSampleIter): """ diff --git a/sockeye/utils.py b/sockeye/utils.py index ca2008e45..97ff3ff21 100644 --- a/sockeye/utils.py +++ b/sockeye/utils.py @@ -898,7 +898,7 @@ def metric_value_is_better(new: float, old: float, metric: str) -> bool: return new < old -def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, best_checkpoint: int): +def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, best_checkpoint: int, keep_first: bool): """ Deletes oldest parameter files from a model folder. @@ -906,12 +906,13 @@ def cleanup_params_files(output_folder: str, max_to_keep: int, checkpoint: int, :param max_to_keep: Maximum number of files to keep, negative to keep all. :param checkpoint: Current checkpoint (i.e. index of last params file created). :param best_checkpoint: Best checkpoint. The parameter file corresponding to this checkpoint will not be deleted. + :param keep_first: Don't delete the first checkpoint. 
""" if max_to_keep <= 0: return existing_files = glob.glob(os.path.join(output_folder, C.PARAMS_PREFIX + "*")) params_name_with_dir = os.path.join(output_folder, C.PARAMS_NAME) - for n in range(0, max(1, checkpoint - max_to_keep + 1)): + for n in range(1 if keep_first else 0, max(1, checkpoint - max_to_keep + 1)): if n != best_checkpoint: param_fname_n = params_name_with_dir % n if param_fname_n in existing_files: diff --git a/test/unit/test_arguments.py b/test/unit/test_arguments.py index 13c80606a..6c3f0fa37 100644 --- a/test/unit/test_arguments.py +++ b/test/unit/test_arguments.py @@ -186,6 +186,7 @@ def test_model_parameters(test_params, expected_params): decode_and_evaluate_device_id=None, seed=13, keep_last_params=-1, + keep_initializations=False, rnn_enc_last_hidden_concat_to_embedding=False, dry_run=False)), ]) diff --git a/test/unit/test_params.py b/test/unit/test_params.py index e212d858a..cec1e907d 100644 --- a/test/unit/test_params.py +++ b/test/unit/test_params.py @@ -22,13 +22,26 @@ def test_cleanup_param_files(): - with tempfile.TemporaryDirectory() as tmpDir: + with tempfile.TemporaryDirectory() as tmp_dir: for n in itertools.chain(range(1, 20, 2), range(21, 41)): # Create empty files - open(os.path.join(tmpDir, C.PARAMS_NAME % n), "w").close() - sockeye.utils.cleanup_params_files(tmpDir, 5, 40, 17) + open(os.path.join(tmp_dir, C.PARAMS_NAME % n), "w").close() + sockeye.utils.cleanup_params_files(tmp_dir, 5, 40, 17, False) - expectedSurviving = set([os.path.join(tmpDir, C.PARAMS_NAME % n) + expectedSurviving = set([os.path.join(tmp_dir, C.PARAMS_NAME % n) for n in [17, 36, 37, 38, 39, 40]]) # 17 must survive because it is the best one - assert set(glob.glob(os.path.join(tmpDir, C.PARAMS_PREFIX + "*"))) == expectedSurviving + assert set(glob.glob(os.path.join(tmp_dir, C.PARAMS_PREFIX + "*"))) == expectedSurviving + +def test_cleanup_param_files_keep_first(): + with tempfile.TemporaryDirectory() as tmp_dir: + for n in itertools.chain(range(0, 20, 2), range(21, 41)): + # Create empty files + open(os.path.join(tmp_dir, C.PARAMS_NAME % n), "w").close() + sockeye.utils.cleanup_params_files(tmp_dir, 5, 40, 16, True) + + expectedSurviving = set([os.path.join(tmp_dir, C.PARAMS_NAME % n) + for n in [0, 16, 36, 37, 38, 39, 40]]) + # 16 must survive because it is the best one + # 0 should also survive because we set keep_first to True + assert set(glob.glob(os.path.join(tmp_dir, C.PARAMS_PREFIX + "*"))) == expectedSurviving