diff --git a/CHANGELOG.md b/CHANGELOG.md index cf3ff29d9..0c20c9312 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa Each version section may have have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_. +## [1.15.8] +### Fixed + - Taking the BOS and EOS tag into account when calculating the maximum input length at inference. + ## [1.15.7] ### Fixed -- fixed a problem with `--num-samples-per-shard` flag not being parsed as int. + - fixed a problem with `--num-samples-per-shard` flag not being parsed as int. ## [1.15.6] ### Added diff --git a/sockeye/__init__.py b/sockeye/__init__.py index 20a9f9bff..f0692b979 100644 --- a/sockeye/__init__.py +++ b/sockeye/__init__.py @@ -11,4 +11,4 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. -__version__ = '1.15.7' +__version__ = '1.15.8' diff --git a/sockeye/data_io.py b/sockeye/data_io.py index f49151ee6..f195da9a8 100644 --- a/sockeye/data_io.py +++ b/sockeye/data_io.py @@ -267,13 +267,13 @@ def sequence_pair(self, source: List[int], target: List[int], bucket_idx: Optional[int]): - source_len = len(source) - target_len = len(target) - if bucket_idx is None: self.num_discarded += 1 return + source_len = len(source) + target_len = len(target) + self._mean_len_target_per_bucket[bucket_idx].update(target_len) self.num_sents += 1 diff --git a/sockeye/inference.py b/sockeye/inference.py index 7d7161118..edee93223 100644 --- a/sockeye/inference.py +++ b/sockeye/inference.py @@ -80,8 +80,8 @@ def __init__(self, self._build_model_components() - self.max_input_length, self.get_max_output_length = get_max_input_output_length([self], - max_output_length_num_stds) + self.max_input_length, self.get_max_output_length = models_max_input_output_length([self], + max_output_length_num_stds) self.encoder_module = None # type: 
Optional[mx.mod.BucketingModule] self.encoder_default_bucket_key = None # type: Optional[int] @@ -401,17 +401,18 @@ def load_models(context: mx.context.Context, utils.check_condition(vocab.are_identical(*target_vocabs), "Target vocabulary ids do not match") # set a common max_output length for all models. - max_input_len, get_max_output_length = get_max_input_output_length(models, - max_output_length_num_stds, - max_input_len) + max_input_len, get_max_output_length = models_max_input_output_length(models, + max_output_length_num_stds, + max_input_len) for model in models: model.initialize(max_input_len, get_max_output_length) return models, source_vocabs[0], target_vocabs[0] -def get_max_input_output_length(models: List[InferenceModel], num_stds: int, - max_input_len: Optional[int] = None) -> Tuple[int, Callable]: +def models_max_input_output_length(models: List[InferenceModel], + num_stds: int, + forced_max_input_len: Optional[int] = None) -> Tuple[int, Callable]: """ Returns a function to compute maximum output length given a fixed number of standard deviations as a safety margin, and the current input length. @@ -421,46 +422,83 @@ def get_max_input_output_length(models: List[InferenceModel], num_stds: int, :param models: List of models. :param num_stds: Number of standard deviations to add as a safety margin. If -1, returned maximum output lengths will always be 2 * input_length. - :param max_input_len: An optional overwrite of the maximum input length. + :param forced_max_input_len: An optional overwrite of the maximum input length. :return: The maximum input length and a function to get the output length given the input length. 
""" max_mean = max(model.length_ratio_mean for model in models) max_std = max(model.length_ratio_std for model in models) - if num_stds < 0: - factor = C.TARGET_MAX_LENGTH_FACTOR # type: float - else: - factor = max_mean + (max_std * num_stds) - supported_max_seq_len_source = min((model.max_supported_seq_len_source for model in models if model.max_supported_seq_len_source is not None), default=None) supported_max_seq_len_target = min((model.max_supported_seq_len_target for model in models if model.max_supported_seq_len_target is not None), default=None) - training_max_seq_len_source = min(model.training_max_seq_len_source for model in models) - if max_input_len is None: + return get_max_input_output_length(supported_max_seq_len_source, + supported_max_seq_len_target, + training_max_seq_len_source, + forced_max_input_len=forced_max_input_len, + length_ratio_mean=max_mean, + length_ratio_std=max_std, + num_stds=num_stds) + + +def get_max_input_output_length(supported_max_seq_len_source: Optional[int], + supported_max_seq_len_target: Optional[int], + training_max_seq_len_source: Optional[int], + forced_max_input_len: Optional[int], + length_ratio_mean: float, + length_ratio_std: float, + num_stds: int) -> Tuple[int, Callable]: + """ + Returns a function to compute maximum output length given a fixed number of standard deviations as a + safety margin, and the current input length. It takes into account optional maximum source and target lengths. + + :param supported_max_seq_len_source: The maximum source length supported by the models. + :param supported_max_seq_len_target: The maximum target length supported by the models. + :param training_max_seq_len_source: The maximum source length observed during training. + :param forced_max_input_len: An optional overwrite of the maximum input length. + :param length_ratio_mean: The mean of the length ratio that was calculated on the raw sequences without special symbols such as EOS or BOS. 
+ :param length_ratio_std: The standard deviation of the length ratio. + :param num_stds: The number of standard deviations the target length may exceed the mean target length (as long as + the supported maximum length allows for this). + :return: The maximum input length and a function to get the output length given the input length. + """ + space_for_bos = 1 + space_for_eos = 1 + + if num_stds < 0: + factor = C.TARGET_MAX_LENGTH_FACTOR # type: float + else: + factor = length_ratio_mean + (length_ratio_std * num_stds) + + if forced_max_input_len is None: # Make sure that if there is a hard constraint on the maximum source or target length we never exceed this # constraint. This is for example the case for learned positional embeddings, which are only defined for the # maximum source and target sequence length observed during training. if supported_max_seq_len_source is not None and supported_max_seq_len_target is None: max_input_len = supported_max_seq_len_source elif supported_max_seq_len_source is None and supported_max_seq_len_target is not None: - if np.ceil(factor * training_max_seq_len_source) > supported_max_seq_len_target: - max_input_len = int(np.floor(supported_max_seq_len_target / factor)) + max_output_len = supported_max_seq_len_target - space_for_bos - space_for_eos + if np.ceil(factor * training_max_seq_len_source) > max_output_len: + max_input_len = int(np.floor(max_output_len / factor)) else: max_input_len = training_max_seq_len_source elif supported_max_seq_len_source is not None or supported_max_seq_len_target is not None: - if np.ceil(factor * supported_max_seq_len_source) > supported_max_seq_len_target: - max_input_len = int(np.floor(supported_max_seq_len_target / factor)) + max_output_len = supported_max_seq_len_target - space_for_bos - space_for_eos + if np.ceil(factor * supported_max_seq_len_source) > max_output_len: + max_input_len = int(np.floor(max_output_len / factor)) else: max_input_len = supported_max_seq_len_source else: # Any 
source/target length is supported and max_input_len was not manually set, therefore we use the # maximum length from training. max_input_len = training_max_seq_len_source + else: + max_input_len = forced_max_input_len def get_max_output_length(input_length: int): """ @@ -469,8 +507,7 @@ def get_max_output_length(input_length: int): that the mean length ratio computed on the training data do not include these special symbols. (see data_io.analyze_sequence_lengths) """ - space_for_bos = 1 - space_for_eos = 1 + return int(np.ceil(factor * input_length)) + space_for_bos + space_for_eos return max_input_len, get_max_output_length @@ -737,7 +774,7 @@ def translate(self, trans_inputs: List[TranslatorInput]) -> List[TranslatorOutpu translated_chunks = [] # split into chunks - input_chunks = [] # type: List[InputChunk] + input_chunks = [] # type: List[InputChunk] for input_idx, trans_input in enumerate(trans_inputs): if len(trans_input.tokens) == 0: empty_translation = Translation(target_ids=[], @@ -1043,7 +1080,7 @@ def _beam_search(self, sliced_scores = scores if t == 1 and self.batch_size == 1 else scores[rows] # TODO we could save some tiny amount of time here by not running smallest_k for a finished sent (best_hyp_indices_np[rows], best_word_indices_np[rows]), \ - scores_accumulated_np[rows] = utils.smallest_k(sliced_scores, self.beam_size, t == 1) + scores_accumulated_np[rows] = utils.smallest_k(sliced_scores, self.beam_size, t == 1) # offsetting since the returned smallest_k() indices were slice-relative best_hyp_indices_np[rows] += rows.start diff --git a/test/common.py b/test/common.py index 4cd5d8e96..2d238b07c 100644 --- a/test/common.py +++ b/test/common.py @@ -168,7 +168,8 @@ def tmp_digits_dataset(prefix: str, " --output {output} {quiet}" _TRAIN_PARAMS_PREPARED_DATA_COMMON = "--use-cpu --max-seq-len {max_len} --prepared-data {prepared_data}" \ - " --validation-source {dev_source} --validation-target {dev_target} --output {model} {quiet}" + " 
--validation-source {dev_source} --validation-target {dev_target} " \ + "--output {model} {quiet}" _TRANSLATE_PARAMS_COMMON = "--use-cpu --models {model} --input {input} --output {output} {quiet}" diff --git a/test/unit/test_inference.py b/test/unit/test_inference.py index fa9432435..a881103c6 100644 --- a/test/unit/test_inference.py +++ b/test/unit/test_inference.py @@ -13,10 +13,10 @@ import mxnet as mx import numpy as np +import pytest import sockeye.inference - _BOS = 0 _EOS = -1 @@ -26,7 +26,7 @@ def test_concat_translations(): NUM_SRC = 7 def length_penalty(length): - return 1./length + return 1. / length expected_score = (1 + 2 + 3) / length_penalty(len(expected_target_ids)) @@ -53,7 +53,7 @@ def test_length_penalty_default(): def test_length_penalty(): lengths = mx.nd.array([[1], [2], [3]]) length_penalty = sockeye.inference.LengthPenalty(.2, 5.0) - expected_lp = np.array([[6**0.2/6**0.2], [7**0.2/6**0.2], [8**0.2/6**0.2]]) + expected_lp = np.array([[6 ** 0.2 / 6 ** 0.2], [7 ** 0.2 / 6 ** 0.2], [8 ** 0.2 / 6 ** 0.2]]) assert np.isclose(length_penalty(lengths).asnumpy(), expected_lp).all() @@ -61,8 +61,55 @@ def test_length_penalty(): def test_length_penalty_int_input(): length = 1 length_penalty = sockeye.inference.LengthPenalty(.2, 5.0) - expected_lp = [6**0.2/6**0.2] + expected_lp = [6 ** 0.2 / 6 ** 0.2] assert np.isclose(np.asarray([length_penalty(length)]), np.asarray(expected_lp)).all() + +@pytest.mark.parametrize("supported_max_seq_len_source, supported_max_seq_len_target, training_max_seq_len_source, " + "forced_max_input_len, length_ratio_mean, length_ratio_std, " + "expected_max_input_len, expected_max_output_len", + [ + (100, 100, 100, None, 0.9, 0.2, 89, 100), + (100, 100, 100, None, 1.1, 0.2, 75, 100), + # No source length constraints. + (None, 100, 100, None, 0.9, 0.1, 98, 100), + # No target length constraints. + (80, None, 100, None, 1.1, 0.4, 80, 122), + # No source/target length constraints. 
Source is max observed during training and target + # based on length ratios. + (None, None, 100, None, 1.0, 0.1, 100, 113), + # Force a maximum input length. + (100, 100, 100, 50, 1.1, 0.2, 50, 67), + ]) +def test_get_max_input_output_length( + supported_max_seq_len_source, + supported_max_seq_len_target, + training_max_seq_len_source, + forced_max_input_len, + length_ratio_mean, + length_ratio_std, + expected_max_input_len, + expected_max_output_len): + + max_input_len, get_max_output_len = sockeye.inference.get_max_input_output_length( + supported_max_seq_len_source=supported_max_seq_len_source, + supported_max_seq_len_target=supported_max_seq_len_target, + training_max_seq_len_source=training_max_seq_len_source, + forced_max_input_len=forced_max_input_len, + length_ratio_mean=length_ratio_mean, + length_ratio_std=length_ratio_std, + num_stds=1) + max_output_len = get_max_output_len(max_input_len) + + if supported_max_seq_len_source is not None: + assert max_input_len <= supported_max_seq_len_source + if supported_max_seq_len_target is not None: + assert max_output_len <= supported_max_seq_len_target + if expected_max_input_len is not None: + assert max_input_len == expected_max_input_len + if expected_max_output_len is not None: + assert max_output_len == expected_max_output_len + +