From 11ca2b5386d6a2b7a3a4de4c91aa75e9c15a6c9c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:33:17 -0400 Subject: [PATCH 01/62] Add input_tokes as optional output --- .../transformers/pipelines/text_generation.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 0b09fe44d7..b6a2348eef 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -60,6 +60,11 @@ class Config: "the logits for the input text sequence and the " "generated text sequence. ", ) + return_input_tokens: bool = Field( + default=False, + description="A flag that indicates whether to return " + "the input_tokens. ", + ) session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " @@ -95,6 +100,13 @@ class TextGenerationOutput(BaseModel): "The logits have dimensions " "[batch_size, sequence_length, vocab_size]", ) + input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays + default=None, + description="The output of the tokenizer." + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", + ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." ) @@ -353,7 +365,10 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.multitoken_engine.session_id = inputs.session_id postprocessing_kwargs = dict( - return_logits=inputs.return_logits, streamer=inputs.streamer + return_logits=inputs.return_logits, + return_input_tokens=inputs.return_input_tokens, + input_tokens=input_tokens, + streamer=inputs.streamer, ) return engine_input, postprocessing_kwargs @@ -371,8 +386,9 @@ def process_engine_outputs( generated_tokens, skip_special_tokens=True ) logits = generated_logits if kwargs.get("return_logits") else None + input_tokens = kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None - return TextGenerationOutput(sequences=sequences, logits=logits) + return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) def engine_forward( self, engine_inputs: List[numpy.ndarray], context: Dict From 530d625022e025b68d237d0096328946e3ae8612 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:33:57 -0400 Subject: [PATCH 02/62] Refactor Perplexity class to only compute perplexity. 
All other task-specific processing is handled elsewhere --- src/deepsparse/transformers/metrics.py | 149 ++++++------------------- 1 file changed, 35 insertions(+), 114 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 6b002e26f6..db6ddc4692 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -16,17 +16,11 @@ Utilities for evaluation metric computation """ - -from itertools import compress -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional import numpy -from tqdm import tqdm import torch -from deepsparse import Pipeline -from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline -from deepsparse.transformers.utils.helpers import pad_to_fixed_length from sklearn.metrics import precision_recall_fscore_support @@ -37,7 +31,7 @@ class Perplexity: - def __init__(self, pipeline: Pipeline, batch_size: int = 16): + def __init__(self): """ Given the pipeline, compute the perplexity of the model on the given text input. @@ -45,126 +39,53 @@ def __init__(self, pipeline: Pipeline, batch_size: int = 16): Code adapted from: https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 - :param pipeline: The pipeline to use for text generation - :param batch_size: The batch size to split the input text into non-overlapping batches """ - if not isinstance(pipeline, TextGenerationPipeline): - raise ValueError( - "Perplexity can only be computed for text generation pipelines" - ) - self._pipeline = pipeline - self._batch_size = batch_size - self._sequence_length = pipeline.sequence_length + self._predictions = None + self._targets = None self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - self.perplexities = [] - - def add_batch(self, predictions: List[str]): + def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ - Run the model on the given input sequences and compute the perplexity. - The resulting perplexity is appended to the list of perplexities. + adds a batch of prediction results to track, should be of shape + (batch_size, num_labels) - :param predictions: The predictions to compute perplexity on + :param predictions: predicted scores from pipeline + :param targets: target values - label column should be 1 if a label is positive + 0 otherwise """ - # tokenize the input text - encodings = self._pipeline.tokenizer( - predictions, - return_attention_mask=True, - max_length=self._sequence_length, - truncation=True, - padding="max_length", - ) - - encoded_texts = encodings["input_ids"] - attention_masks = encodings["attention_mask"] - - for start_index in tqdm(range(0, len(encoded_texts), self._batch_size)): - end_index = min(start_index + self._batch_size, len(encoded_texts)) - encoded_batch = encoded_texts[start_index:end_index] - attention_mask = attention_masks[start_index:end_index] - - # Computing the ground truth labels - - # `encoded_batch` contains sequences of tokens padded - # with tokens from the left side. 
We need to remove - # them and zero-pad from the right side up to the length - # of the longest sequence in the batch - - encoded_batch = [ - list(compress(sequence, attn_mask)) - for (sequence, attn_mask) in zip(encoded_batch, attention_mask) - ] - max_sequence_len = max([len(sequence) for sequence in encoded_batch]) - - encoded_batch = [ - pad_to_fixed_length(numpy.array(sequence), max_sequence_len) - for sequence in encoded_batch - ] - encoded_batch = numpy.stack(encoded_batch) - - # We need to apply the analogous transformation to the attention mask - attention_mask = numpy.array(attention_mask) - attention_mask = [ - list(filter(lambda num: num != 0, mask)) for mask in attention_mask - ] - attention_mask = [ - pad_to_fixed_length(numpy.array(mask), max_sequence_len) - for mask in attention_mask - ] - attention_mask = numpy.stack(attention_mask) - - labels = encoded_batch - - out = self._pipeline( - sequences=predictions, return_logits=True, fixed_sequences_length=True - ) + if predictions.ndim == 1: + predictions = predictions.reshape(1, predictions.shape[0]) + if targets.ndim == 1: + targets = targets.reshape(1, targets.shape[0]) - logits = out.logits - - if not self._pipeline.cache_support_enabled: - # when running inference without cache, we need to apply - # analogous transformations to the logits as we did to the labels - # and attention mask - - # remove "nonsensical" logits for tokens - logits = [ - logit[-attn_mask.sum() :, :] - for (logit, attn_mask) in zip(logits, attention_mask) - ] - # pad logits to max length - logits = [ - pad_to_fixed_length(logit, max_sequence_len) for logit in logits - ] - logits = numpy.stack(logits) - - # shift logits and labels create the input and target for the loss function - shift_logits = logits[:, :-1, :] - shift_labels = labels[:, 1:] - shift_attention_mask_batch = attention_mask[:, 1:] - - # compute perplexity for this batch - perplexity_batch = torch.exp( - ( - self._loss_fct( - torch.tensor(shift_logits.transpose(0, 2, 1)), - torch.tensor(shift_labels), - ) - * torch.tensor(shift_attention_mask_batch) - ).sum(1) - / torch.tensor(shift_attention_mask_batch).sum(1) - ) - self.perplexities.extend(perplexity_batch.numpy().tolist()) + if self._predictions is None: + self._predictions = [predictions] + self._targets = [targets] + else: + self._predictions.append(predictions) + self._targets.append(targets) def compute(self) -> Dict[str, Any]: """ :return: A dictionary containing the mean perplexity and the list of perplexities """ - return { - "mean_perplexity": numpy.mean(self.perplexities), - "perplexities": self.perplexities, - } + # compile results into required str -> float dict + results = {"perplexities": []} + for prediction, target in zip(self._predictions, self._targets): + sample_perplexity = torch.exp( + self._loss_fct( + torch.tensor(prediction.transpose(0, 2, 1)), + torch.tensor(target), + ).mean() + ) + + results["perplexities"].append(sample_perplexity.item()) + + results["mean_perplexity"] = numpy.mean(results["perplexities"]) + + return results class PrecisionRecallF1: From c81692213139a6ac9918d23ad2e2d3c6c9f9ee3c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:45:22 -0400 Subject: [PATCH 03/62] Simplify perplexity evaluation. Evaluation takes place as batch size 1 only, so no need to consider batched execution. 
In addition, use input_tokens from generation pipeline --- .../transformers/eval_downstream.py | 57 ++++++++++++------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index ffe83aa5d0..e39240710d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -75,12 +75,15 @@ from datasets import load_dataset, load_metric # isort: skip -def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): - if args.max_samples: - batch_size = min(batch_size, args.max_samples) - - dataset = load_dataset(dataset_name)["test"] +def perplexity_eval(args, dataset_name="openai_humaneval"): + if dataset_name == "wikitext": + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + dataset = dataset["text"] + else: + dataset = load_dataset(dataset_name, split="test") + # We'll use the text generation pipeline to generate a single token. + # Along with the token, it returns the logits for input sequence text_generation = Pipeline.create( task="text-generation", model_path=args.model_path, @@ -90,22 +93,37 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): prompt_processing_sequence_length=args.max_sequence_length, max_generated_tokens=1, ) - perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) - active_engines = [ - engine - for engine in [text_generation.engine, text_generation.multitoken_engine] - if engine - ] - print("Engine info: ") - [print(f"{engine}\n") for engine in active_engines] - predictions = [] + + # Instantiate perplexity metric + perplexity_metrics = Perplexity() + + # Loop through samples for idx, sample in _enumerate_progress(dataset, args.max_samples): - predictions.append(sample["prompt"] + sample["canonical_solution"]) - if len(predictions) == batch_size: - perplexity_metrics.add_batch(predictions) - predictions = [] + # Collect input sequence + if dataset_name == "openai_humaneval": + sample = sample["prompt"] + sample["canonical_solution"] + + # Perform single token generation + prediction = text_generation( + sequences=sample, + return_logits=True, + return_input_tokens=True, + fixed_sequences_length=True, + ) + + # Need to remove tokens that were masked + input_ids = prediction.input_tokens["input_ids"] + attention_mask = prediction.input_tokens["attention_mask"].flatten() + + logits = numpy.compress(attention_mask, prediction.logits, axis=1)[:, :-1, :] + input_ids = numpy.compress(attention_mask, input_ids, axis=1)[:, 1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity_metrics.add_batch(logits, input_ids) + if args.max_samples and idx >= args.max_samples: break + return perplexity_metrics @@ -474,7 +492,8 @@ def _split_train_val(train_dataset, val_ratio, seed=42): "imdb": imdb_eval, "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, - "openai_humaneval": perplexity_eval, + "openai_humaneval": lambda args: perplexity_eval(args, dataset_name="openai_humaneval"), + "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), } From 5c89d89a4cc812136abdbb641b979221bee0bcb6 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:01:44 -0400 Subject: [PATCH 04/62] Splits wikitext at regular intervals of the same length as the sequence length --- src/deepsparse/transformers/eval_downstream.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index e39240710d..159d5033b5 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,7 +78,11 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = dataset["text"] + dataset = "\n\n".join(dataset["text"]) + dataset = [ + dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] + for i in range(len(dataset) // args.max_sequence_length) + ] else: dataset = load_dataset(dataset_name, split="test") From 5767ca0ea4c390a397c06c4909109fa0571b9054 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:26:14 -0400 Subject: [PATCH 05/62] Add argument for accumulation of negative log likelihood --- src/deepsparse/transformers/metrics.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index db6ddc4692..4bac8bbfd6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -31,7 +31,7 @@ class Perplexity: - def __init__(self): + def __init__(self, accumulate_likelihood: bool = False): """ Given the pipeline, compute the perplexity of the model on the given text input. @@ -44,6 +44,7 @@ def __init__(self): self._predictions = None self._targets = None self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + self._accumulate_likelihood = accumulate_likelihood def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ @@ -72,20 +73,24 @@ def compute(self) -> Dict[str, Any]: and the list of perplexities """ # compile results into required str -> float dict - results = {"perplexities": []} + neg_log_likelihoods = [] for prediction, target in zip(self._predictions, self._targets): - sample_perplexity = torch.exp( + neg_log_likelihoods.append( self._loss_fct( torch.tensor(prediction.transpose(0, 2, 1)), torch.tensor(target), - ).mean() + ).mean().item() ) - results["perplexities"].append(sample_perplexity.item()) - - results["mean_perplexity"] = numpy.mean(results["perplexities"]) - - return results + if self._accumulate_likelihood: + neg_log_likelihood = numpy.mean(neg_log_likelihoods) + return {"perplexity": numpy.exp(neg_log_likelihood)} + else: + perplexities = [numpy.exp(nll) for nll in neg_log_likelihoods] + return { + "perplexities": perplexities, + "mean_perplexity": numpy.mean(perplexities), + } class PrecisionRecallF1: From ec2162e1ffc5de62c598924f86aeb16accc7fcf4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:27:18 -0400 Subject: [PATCH 06/62] Accumulate likelihood for wikitext --- src/deepsparse/transformers/eval_downstream.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 159d5033b5..b8dac3b7fc 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -83,7 +83,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] for i in range(len(dataset) // args.max_sequence_length) ] + accumulate_likelihood = True else: + accumulate_likelihood = False dataset = load_dataset(dataset_name, split="test") # We'll use the text generation 
pipeline to generate a single token. @@ -99,7 +101,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ) # Instantiate perplexity metric - perplexity_metrics = Perplexity() + perplexity_metrics = Perplexity(accumulate_likelihood=accumulate_likelihood) # Loop through samples for idx, sample in _enumerate_progress(dataset, args.max_samples): From a7941ef99bb274add060c1bf7a5f6eb4129092d4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:43:33 -0400 Subject: [PATCH 07/62] Simplification --- src/deepsparse/transformers/eval_downstream.py | 2 +- src/deepsparse/transformers/metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index b8dac3b7fc..311076f49b 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -85,8 +85,8 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ] accumulate_likelihood = True else: - accumulate_likelihood = False dataset = load_dataset(dataset_name, split="test") + accumulate_likelihood = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 4bac8bbfd6..e656ea8ef8 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -43,7 +43,7 @@ def __init__(self, accumulate_likelihood: bool = False): """ self._predictions = None self._targets = None - self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + self._loss_fct = torch.nn.CrossEntropyLoss() self._accumulate_likelihood = accumulate_likelihood def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): @@ -79,7 +79,7 @@ def compute(self) -> Dict[str, Any]: self._loss_fct( torch.tensor(prediction.transpose(0, 2, 1)), torch.tensor(target), - ).mean().item() + ).item() ) if self._accumulate_likelihood: From 3ddd45cc383e504d11bab48fe6ba543d2ae938e3 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 13:38:15 -0400 Subject: [PATCH 08/62] Add support for wikitext-style ppl evaluation --- .../transformers/eval_downstream.py | 120 +++++++++++++----- 1 file changed, 88 insertions(+), 32 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 311076f49b..9570579b2d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -71,22 +71,53 @@ from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 - +from transformers import AutoTokenizer from datasets import load_dataset, load_metric # isort: skip def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = "\n\n".join(dataset["text"]) - dataset = [ - dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] - for i in range(len(dataset) // args.max_sequence_length) - ] - accumulate_likelihood = True + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + # Dataset is split into sections that contain "max_sequence_length" tokens. 
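# (Slicing by token count rather than by raw characters keeps every section at
# exactly "max_sequence_length" tokens, so each pipeline call scores a full
# context window before the negative log-likelihoods are accumulated.)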
+ # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + raw_text = "\n\n".join(raw_dataset["text"]) + input_tokens = tokenizer( + raw_text, + return_tensors="np", + )["input_ids"][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + dataset = [] + for i in range(len(input_tokens) // args.max_sequence_length): + start = i * args.max_sequence_length + end = (i+1) * args.max_sequence_length + dataset.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i+1) * args.max_sequence_length < len(input_tokens): + start = (i+1) * args.max_sequence_length + end = len(input_tokens) + dataset.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Set perplexity computation to accumulate negative log-likelihood across + # sections + accumulate = True else: dataset = load_dataset(dataset_name, split="test") - accumulate_likelihood = False + accumulate = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence @@ -101,33 +132,54 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ) # Instantiate perplexity metric - perplexity_metrics = Perplexity(accumulate_likelihood=accumulate_likelihood) + perplexity_metrics = Perplexity(accumulate=accumulate) # Loop through samples + batch_samples = [] + run_inference = False + end_evaluation = False + dataset_length = len(dataset) for idx, sample in _enumerate_progress(dataset, args.max_samples): + # Collect input sequence if dataset_name == "openai_humaneval": sample = sample["prompt"] + sample["canonical_solution"] - - # Perform single token generation - prediction = text_generation( - sequences=sample, - return_logits=True, - return_input_tokens=True, - fixed_sequences_length=True, - ) - - # Need to remove tokens that were masked - input_ids = prediction.input_tokens["input_ids"] - attention_mask = prediction.input_tokens["attention_mask"].flatten() - - logits = numpy.compress(attention_mask, prediction.logits, axis=1)[:, :-1, :] - input_ids = numpy.compress(attention_mask, input_ids, axis=1)[:, 1:] - - # Add predictions (logits) and targets (input_ids) to metric - perplexity_metrics.add_batch(logits, input_ids) - - if args.max_samples and idx >= args.max_samples: + batch_samples.append(sample) + + if args.max_samples and idx == args.max_samples - 1: + run_inference = True + end_evaluation = True + + if (idx + 1) % args.batch_size == 0 or idx == dataset_length - 1: + run_inference = True + + if run_inference: + # Perform single token generation + prediction = text_generation( + sequences=batch_samples, + return_logits=True, + return_input_tokens=True, + fixed_sequences_length=True, + ) + + # Handle one sample at a time to make it simpler for masking + for s in range(len(batch_samples)): + # Need to remove tokens that were masked + input_ids = prediction.input_tokens["input_ids"][s].flatten() + logits = prediction.logits[s] + attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity_metrics.add_batch(logits, input_ids) + + # Reset batch + batch_samples.clear() + run_inference = False + + 
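# A minimal standalone sketch of the mask-and-shift step used a few lines above:
# the logit at position i is scored against the token at position i + 1, so the
# last logit row and the first token id are dropped. Shapes and values here are
# illustrative only, not taken from the pipeline.
import numpy

logits = numpy.random.rand(5, 4)               # [sequence_length, vocab_size]
input_ids = numpy.array([3, 1, 0, 2, 0])       # token ids from the tokenizer
attention_mask = numpy.array([1, 1, 1, 1, 0])  # 0 marks padded positions

# keep only the non-padded positions
logits = numpy.compress(attention_mask, logits, axis=0)
input_ids = numpy.compress(attention_mask, input_ids)

# logits[i] predicts input_ids[i + 1]
predictions = logits[:-1, :]  # [num_tokens - 1, vocab_size]
targets = input_ids[1:]       # [num_tokens - 1]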
if end_evaluation: break return perplexity_metrics @@ -502,7 +554,6 @@ def _split_train_val(train_dataset, val_ratio, seed=42): "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), } - def parse_args(): parser = argparse.ArgumentParser( description="Evaluate a Hugging Face Transformers " @@ -630,7 +681,12 @@ def parse_args(): type=bool, default=False, ) - + parser.add_argument( + "--batch-size", + help="Batch size to evaluate model. Default is 1", + type=int, + default=1, + ) return parser.parse_args() From 756169c0eb15ac9179ddabb084b47c11b39722be Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 13:39:10 -0400 Subject: [PATCH 09/62] Compute batch instead of storing until compute method. This drastically reduced memory requirements --- src/deepsparse/transformers/metrics.py | 116 ++++++++++++++++--------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index e656ea8ef8..394db12813 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -31,65 +31,95 @@ class Perplexity: - def __init__(self, accumulate_likelihood: bool = False): + def __init__(self, accumulate: bool = False): """ - Given the pipeline, compute the perplexity of the model - on the given text input. - - Code adapted from: - https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 - - non-overlapping batches + Class for computing perplexity. """ self._predictions = None self._targets = None - self._loss_fct = torch.nn.CrossEntropyLoss() - self._accumulate_likelihood = accumulate_likelihood + self._accumulate = accumulate + if accumulate: + self._neg_log_likelihood = 0. + self._number_tokens = 0 + else: + self._perplexities = None def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ - adds a batch of prediction results to track, should be of shape - (batch_size, num_labels) - - :param predictions: predicted scores from pipeline - :param targets: target values - label column should be 1 if a label is positive - 0 otherwise + Computes perplexity or negative log-likelihood for each batch + (depending on accumulate argument) + and track results. + + Tracks perplexity or negative log-likelihood since storing + predictions may require a lot of memory. + + :param predictions: predicted scores. + Accepted shapes: + - [batch_size, sequence_length, vocab_size] + - [sequence_length, vocab_size] (batch size = 1) + Note: sequence length has to be uniform within a batch, but not all + batches require the same sequence length + :param targets: target values - index of correct vocabulary entry """ - if predictions.ndim == 1: - predictions = predictions.reshape(1, predictions.shape[0]) - if targets.ndim == 1: - targets = targets.reshape(1, targets.shape[0]) - if self._predictions is None: - self._predictions = [predictions] - self._targets = [targets] + if self._accumulate: + # If accumulate is True, every token from the batch contributes equally to the + # negative log-likelihood. 
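# (Accumulating yields a single corpus-level perplexity,
# exp(total_negative_log_likelihood / total_tokens), in which every token
# carries the same weight; averaging per-sample perplexities would instead
# weight a short sample as heavily as a long one.)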
+ # Thus, merge batch and sequence length dimensions and compute negative + # log-likelihood for all tokens, and accumulate to total + predictions = numpy.reshape(predictions, (-1, predictions.shape[-1])) + targets = targets.flatten() + + # Compute negative log-likelihood and accumulate + self._neg_log_likelihood += torch.nn.functional.cross_entropy( + torch.tensor(predictions), + torch.tensor(targets), + reduction="sum", + ).item() + + # Track number of tokens processed + self._number_tokens += predictions.shape[0] else: - self._predictions.append(predictions) - self._targets.append(targets) + # If accumulate is False, compute perplexity for each sample individually. + # We assume that sequence length is uniform within a batch, but may vary from batch + # to batch. + + # Create batch dimension if it doesn't exist + if targets.ndim == 1: + predictions = numpy.expand_dims(predictions, axis=0) + targets = numpy.expand_dims(targets, axis=0) + + # Compute negative log-likelihoods for batch + neg_log_likelihoods = torch.nn.functional.cross_entropy( + torch.tensor(predictions.transpose(0, 2, 1)), + torch.tensor(targets), + reduction="none", + ).numpy().mean(-1) + + # Compute perplexities for batch + perplexities = numpy.exp(neg_log_likelihoods) + + # Store perplexities + if self._perplexities is None: + self._perplexities = perplexities + else: + self._perplexities = numpy.concatenate((self._perplexities, perplexities)) def compute(self) -> Dict[str, Any]: """ - :return: A dictionary containing the mean perplexity - and the list of perplexities + :return: A dictionary containing the final results. + If accumulate is True, return single perplexity. + Else, return a list of perplexities (one for each sample) + and mean perplexity. """ - # compile results into required str -> float dict - neg_log_likelihoods = [] - for prediction, target in zip(self._predictions, self._targets): - neg_log_likelihoods.append( - self._loss_fct( - torch.tensor(prediction.transpose(0, 2, 1)), - torch.tensor(target), - ).item() - ) - - if self._accumulate_likelihood: - neg_log_likelihood = numpy.mean(neg_log_likelihoods) - return {"perplexity": numpy.exp(neg_log_likelihood)} + + if self._accumulate: + perplexity = numpy.exp(self._neg_log_likelihood / self._number_tokens) + return {"perplexity": perplexity} else: - perplexities = [numpy.exp(nll) for nll in neg_log_likelihoods] return { - "perplexities": perplexities, - "mean_perplexity": numpy.mean(perplexities), + "perplexities": self._perplexities, + "mean_perplexity": numpy.mean(self._perplexities), } From 97b5f1ada20460d4036567912697164c95cd18e5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 16:36:38 -0400 Subject: [PATCH 10/62] Remove torch dependency --- src/deepsparse/transformers/metrics.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 394db12813..418f137e17 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,8 +20,8 @@ import numpy -import torch from sklearn.metrics import precision_recall_fscore_support +from scipy.special import log_softmax __all__ = [ @@ -71,11 +71,7 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = targets.flatten() # Compute negative log-likelihood and accumulate - self._neg_log_likelihood += torch.nn.functional.cross_entropy( - torch.tensor(predictions), - torch.tensor(targets), - 
reduction="sum", - ).item() + self._neg_log_likelihood += _cross_entropy(predictions, targets, reduction="sum").sum() # Track number of tokens processed self._number_tokens += predictions.shape[0] @@ -90,11 +86,7 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = numpy.expand_dims(targets, axis=0) # Compute negative log-likelihoods for batch - neg_log_likelihoods = torch.nn.functional.cross_entropy( - torch.tensor(predictions.transpose(0, 2, 1)), - torch.tensor(targets), - reduction="none", - ).numpy().mean(-1) + neg_log_likelihoods = _cross_entropy(predictions, targets) # Compute perplexities for batch perplexities = numpy.exp(neg_log_likelihoods) @@ -181,3 +173,15 @@ def compute(self) -> Dict[str, float]: results["f1_std"] = f1.std() return results + + +def _cross_entropy(predictions, targets, reduction="mean"): + logp = log_softmax(predictions, axis=-1) + neg_log_likelihoods = -1. * numpy.take_along_axis(logp, numpy.expand_dims(targets, axis=-1), axis=-1) + neg_log_likelihoods = numpy.squeeze(neg_log_likelihoods, axis=-1) + if reduction == "mean": + neg_log_likelihoods = neg_log_likelihoods.mean(axis=-1) + elif reduction == "sum": + neg_log_likelihoods = neg_log_likelihoods.sum(axis=-1) + + return neg_log_likelihoods \ No newline at end of file From 91b592141a1203cf090c20a4e76fff4ed45d9609 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 16:50:16 -0400 Subject: [PATCH 11/62] Move split of dataset into helper function --- .../transformers/eval_downstream.py | 63 ++++++++++--------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 9570579b2d..3ffe310c97 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -81,36 +81,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Dataset is split into sections that contain "max_sequence_length" tokens. 
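# A small self-contained check of the perplexity math behind the torch-free
# _cross_entropy helper above: perplexity is the exponential of the mean
# per-token negative log-likelihood. The logits and targets below are made up.
import numpy
from scipy.special import log_softmax

logits = numpy.random.rand(7, 50)              # [num_tokens, vocab_size]
targets = numpy.random.randint(0, 50, size=7)  # target token ids

# negative log-likelihood of each target token under the softmax of its logits
log_probs = log_softmax(logits, axis=-1)
nll = -log_probs[numpy.arange(len(targets)), targets]

# accumulate=True path: sum token NLLs, then exponentiate the per-token mean
perplexity = numpy.exp(nll.sum() / len(targets))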
# To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(args.model_path) raw_text = "\n\n".join(raw_dataset["text"]) - input_tokens = tokenizer( - raw_text, - return_tensors="np", - )["input_ids"][0] - - # Then split the tokenized text into sections of size "max_sequence_length" and - # decode each section back into text format - dataset = [] - for i in range(len(input_tokens) // args.max_sequence_length): - start = i * args.max_sequence_length - end = (i+1) * args.max_sequence_length - dataset.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - # Handle any leftover tokens - if (i+1) * args.max_sequence_length < len(input_tokens): - start = (i+1) * args.max_sequence_length - end = len(input_tokens) - dataset.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length) # Set perplexity computation to accumulate negative log-likelihood across # sections @@ -539,6 +512,38 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds +def _split_text_by_tokens(text, tokenizer, sequence_length): + input_tokens = tokenizer( + text, + return_tensors="np", + )["input_ids"][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + split_text = [] + for i in range(len(input_tokens) // sequence_length): + start = i * sequence_length + end = (i + 1) * sequence_length + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i + 1) * sequence_length < len(input_tokens): + start = (i + 1) * sequence_length + end = len(input_tokens) + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + return split_text + # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), From 8ef20e793f40c4c9f52fd01c6c204767e9d6753d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 17:57:48 -0400 Subject: [PATCH 12/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 839fefb032..890fe4b4c8 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -141,6 +141,12 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + sequence_length = logits.shape[0] + attention_mask = attention_mask[:sequence_length] + input_ids = input_ids[:sequence_length] + + print(attention_mask.shape) + print(logits.shape) logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] @@ -554,8 +560,12 @@ def _split_text_by_tokens(text, tokenizer, sequence_length): "imdb": imdb_eval, "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, - "openai_humaneval": lambda args: perplexity_eval(args, dataset_name="openai_humaneval"), - "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), + 
"openai_humaneval": lambda args: perplexity_eval( + args, dataset_name="openai_humaneval", + ), + "wikitext": lambda args: perplexity_eval( + args, dataset_name="wikitext", + ), } def parse_args(): From 5a602289c383628270ffd8d739eda16f917ec78a Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 11:02:21 -0400 Subject: [PATCH 13/62] Remove debugging prints --- src/deepsparse/transformers/eval_downstream.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 890fe4b4c8..7861c16ee8 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -145,8 +145,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): attention_mask = attention_mask[:sequence_length] input_ids = input_ids[:sequence_length] - print(attention_mask.shape) - print(logits.shape) logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] From 2559e419d55130d2eaa8fe8c2fa723baf590a6e3 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 11:02:27 -0400 Subject: [PATCH 14/62] Remove debugging prints --- .../transformers/pipelines/text_generation.py | 116 +++++++++--------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 3870f9a873..19de36c182 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -33,7 +33,6 @@ ) from deepsparse.utils.onnx import default_cached_outputs - _LOGGER = logging.getLogger(__name__) __all__ = ["TextGenerationPipeline"] @@ -57,36 +56,36 @@ class Config: return_logits: bool = Field( default=False, description="A flag that indicates whether to return " - "the logits for the input text sequence and the " - "generated text sequence. ", + "the logits for the input text sequence and the " + "generated text sequence. ", ) return_input_tokens: bool = Field( default=False, description="A flag that indicates whether to return " - "the input_tokens. ", + "the input_tokens. ", ) session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " - "for the kv cache session. If None, " - "and the model is using kv cache, it " - "will be set to a random uuid.", + "for the kv cache session. If None, " + "and the model is using kv cache, it " + "will be set to a random uuid.", ) fixed_sequences_length: bool = Field( default=False, description="A flag that indicates whether to modify " - "(pad or truncate) each input text sequence, so that " - "its tokenized length is equal to `sequence_length` " - "of tokens. Useful, when a batch of predictions needs " - "to have consistent length so one " - "can compute metric in a batched fashion. ", + "(pad or truncate) each input text sequence, so that " + "its tokenized length is equal to `sequence_length` " + "of tokens. Useful, when a batch of predictions needs " + "to have consistent length so one " + "can compute metric in a batched fashion. ", ) streamer: Optional[TextStreamer] = Field( default=None, description="Streamer object that will be used to stream the " - "generated sequences. Generated tokens are passed through " - "`streamer.put(token_ids)` and the streamer is responsible " - "for any further processing.", + "generated sequences. 
Generated tokens are passed through " + "`streamer.put(token_ids)` and the streamer is responsible " + "for any further processing.", ) @@ -97,15 +96,15 @@ class TextGenerationOutput(BaseModel): logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility default=None, description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + "The logits have dimensions " + "[batch_size, sequence_length, vocab_size]", ) input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays default=None, description="The output of the tokenizer." - "Dictionary containing token_ids and attention_mask, " - "both mapping to arrays of size " - "[batch_size, sequence_length]", + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." @@ -147,14 +146,14 @@ class TextGenerationPipeline(TransformersPipeline): """ def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, - prompt_processing_sequence_length: int = 64, - force_max_tokens: bool = False, - use_deepsparse_cache: bool = True, - **kwargs, + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_processing_sequence_length: int = 64, + force_max_tokens: bool = False, + use_deepsparse_cache: bool = True, + **kwargs, ): kwargs_engine_type = kwargs.get("engine_type", DEEPSPARSE_ENGINE) @@ -202,7 +201,7 @@ def __init__( self.engine, self.multitoken_engine = self.initialize_engines() def initialize_engines( - self, + self, ) -> Tuple[Optional[NLDecoderEngine], Optional[NLDecoderEngine]]: """ Inititalizes a pair of engines for the pipeline. @@ -227,9 +226,9 @@ def initialize_engines( if self.cache_support_enabled: if ( - self.engine_type == DEEPSPARSE_ENGINE - and self.sequence_length <= self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + self.engine_type == DEEPSPARSE_ENGINE + and self.sequence_length <= self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): raise ValueError( "Attempting to initialize auxiliary DeepSparse engine to " @@ -257,9 +256,8 @@ def initialize_engines( ) if ( - self.cache_support_enabled and self.enable_multitoken_prefill + self.cache_support_enabled and self.enable_multitoken_prefill ) or not self.cache_support_enabled: - multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, engine_type=self.engine_type, @@ -268,7 +266,9 @@ def initialize_engines( sampling_temperature=self.sampling_temperature, deterministic=self.deterministic, sequence_length=self.sequence_length, - input_ids_length=self.prompt_processing_sequence_length, + input_ids_length=self.prompt_processing_sequence_length + if self.cache_support_enabled + else self.sequence_length, tokenizer=self.tokenizer, use_deepsparse_cache=self.use_deepsparse_cache, ) @@ -288,13 +288,13 @@ def initialize_engines( ) assert (engine is not None) or ( - multitoken_engine is not None + multitoken_engine is not None ), "At least one of the engines must be initialized for the pipeline!" 
return engine, multitoken_engine @staticmethod def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs ) -> Pipeline: """ This method is used to route the input to the correct pipeline. @@ -386,7 +386,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: return engine_input, postprocessing_kwargs def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[numpy.ndarray], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. @@ -404,7 +404,7 @@ def process_engine_outputs( return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict + self, engine_inputs: List[numpy.ndarray], context: Dict ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. @@ -456,8 +456,8 @@ def engine_forward( streamer.put(numpy.array([token])) if ( - token == self.tokenizer.eos_token_id - and not self.force_max_tokens + token == self.tokenizer.eos_token_id + and not self.force_max_tokens ): break @@ -469,7 +469,7 @@ def engine_forward( ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, engine_inputs: List[numpy.ndarray] ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -495,8 +495,8 @@ def prompt_inference( self._reset_engines_cache() if ( - len(tokens) > self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + len(tokens) > self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): for engine_inputs in self.engine_inputs_for_prefill(tokens): new_token, new_logits = self.multitoken_engine(engine_inputs) @@ -513,7 +513,7 @@ def prompt_inference( for token in tokens[num_tokens_processed:]: run_tokens.append(token) with self.timer_manager.current.time( - _TextGenerationTimings.PROMPT_PREFILL_SINGLE + _TextGenerationTimings.PROMPT_PREFILL_SINGLE ): new_token, new_logits = self.autoregressive_inference(run_tokens) @@ -524,8 +524,8 @@ def prompt_inference( return tokens, prompt_logits def autoregressive_inference( - self, - tokens: List[int], + self, + tokens: List[int], ) -> Tuple[int, numpy.ndarray]: """ An inference run that processes the last token to generate @@ -563,7 +563,7 @@ def autoregressive_inference( return generated_token, generated_logits def engine_inputs_for_prefill( - self, tokens: List[int] + self, tokens: List[int] ) -> Generator[List[numpy.ndarray], None, None]: """ Takes a list of tokens and creates a generator @@ -601,9 +601,9 @@ def engine_inputs_for_prefill( token_batches = [ tokens[ - i - * self.prompt_processing_sequence_length : (i + 1) - * self.prompt_processing_sequence_length + i + * self.prompt_processing_sequence_length: (i + 1) + * self.prompt_processing_sequence_length ] for i in range(0, num_batches) ] @@ -623,13 +623,13 @@ def engine_inputs_for_prefill( # fill it out with 1s (from the right), so that the number # of unmasked entries is equal to the sum of: engine_input[ - :, - -( - # ...the number of current input tokens... + :, + -( + # ...the number of current input tokens... 
self.prompt_processing_sequence_length # ...and the number of the previous cache entries + num_cached_entries - ) :, + ):, ] = 1 elif name == "causal_mask": # delay creation of the causal mask @@ -646,8 +646,8 @@ def engine_inputs_for_prefill( num_cached_entries + self.prompt_processing_sequence_length, ) - .reshape(1, -1) - .astype(numpy.int64) + .reshape(1, -1) + .astype(numpy.int64) ) engine_inputs.append(engine_input) @@ -670,7 +670,7 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int + self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int ) -> List[numpy.ndarray]: """ Takes a list of outputs (batches) from the engine From 3b7e14ba60c5c5c75f399043deba3346dd2673fb Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 14:16:29 -0400 Subject: [PATCH 15/62] Incorporate fixes for kv-cache --- .../transformers/pipelines/text_generation.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 19de36c182..95a12ef75b 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -489,20 +489,17 @@ def prompt_inference( new_token = None num_tokens_processed = 0 - # clean the state of engines' cache - # in the future, this will be paired with the session ids - # to refrain from resetting if session id is being passed - self._reset_engines_cache() - if ( len(tokens) > self.prompt_processing_sequence_length and self.enable_multitoken_prefill ): + self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): new_token, new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed += self.prompt_processing_sequence_length prompt_logits.append(new_logits) + self.engine.reset_kv_cache() if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) @@ -730,7 +727,3 @@ def causal_mask_input_present(model_path: str) -> bool: inp.name == "causal_mask" for inp in onnx.load(model_path, load_external_data=False).graph.input ) - - def _reset_engines_cache(self): - self.engine.reset_kv_cache() - self.multitoken_engine.reset_kv_cache() if self.multitoken_engine else None From b5f845b2b6f328d24d48e03a85b3ff3d37309f42 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 15:50:42 -0400 Subject: [PATCH 16/62] Include doc string for accumulate --- src/deepsparse/transformers/metrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 4e195c537c..d6e7d78c3f 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -34,6 +34,10 @@ class Perplexity: def __init__(self, accumulate: bool = False): """ Class for computing perplexity. + + :param accumulate: If True, accumulate negative log-likelihood + over samples. If False, perplexity is computed separately + for each sampled and then averaged in the end. 
""" self._predictions = None self._targets = None From 6f3b2461f9d26a2b146517a7e3563d471bcddcbb Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 15:51:02 -0400 Subject: [PATCH 17/62] Add support to trust-remote-code arguments --- src/deepsparse/transformers/eval_downstream.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 7861c16ee8..e17d635652 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -101,6 +101,8 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, + trust_remote_code=args.trust_remote_code, + batch_size=args.batch_size, ) # Instantiate perplexity metric @@ -695,10 +697,16 @@ def parse_args(): ) parser.add_argument( "--batch-size", - help="Batch size to evaluate model. Default is 1", + help="Batch size with which to evaluate model. Default is 1", type=int, default=1, ) + parser.add_argument( + "--trust-remote-code", + help="Whether to allow for remote code execution in transformers.", + type=bool, + default=False, + ) return parser.parse_args() From 2056ec50e527d95c350549c76250c719b1af3905 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 18:18:53 -0400 Subject: [PATCH 18/62] Add support to c4 --- .../transformers/eval_downstream.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index e17d635652..eec697ca48 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,19 +78,29 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = "\n\n".join(raw_dataset["text"]) + max_token_length = None + elif dataset_name == "c4": + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + ) + raw_text = " ".join(raw_dataset[:1100]["text"]) + max_token_length = 256 * args.max_sequence_length + else: + dataset = load_dataset(dataset_name, split="test") + if dataset_name in ["wikitext", "c4"]: # Dataset is split into sections that contain "max_sequence_length" tokens. # To split the dataset, first tokenize text - raw_text = "\n\n".join(raw_dataset["text"]) tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length) + dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length, max_token_length) # Set perplexity computation to accumulate negative log-likelihood across # sections accumulate = True - else: - dataset = load_dataset(dataset_name, split="test") - accumulate = False # We'll use the text generation pipeline to generate a single token. 
# Along with the token, it returns the logits for input sequence @@ -517,12 +527,15 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _split_text_by_tokens(text, tokenizer, sequence_length): +def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): input_tokens = tokenizer( text, return_tensors="np", )["input_ids"][0] + if max_token_length is not None: + input_tokens = input_tokens[:max_token_length] + # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format split_text = [] @@ -566,6 +579,9 @@ def _split_text_by_tokens(text, tokenizer, sequence_length): "wikitext": lambda args: perplexity_eval( args, dataset_name="wikitext", ), + "c4": lambda args: perplexity_eval( + args, dataset_name="c4", + ), } def parse_args(): From 858bee67ab5e52627df4a3869518379a12b17922 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 28 Aug 2023 09:10:42 +0000 Subject: [PATCH 19/62] add a missing include_prompt_logits param --- .../transformers/eval_downstream.py | 28 ++-- src/deepsparse/transformers/metrics.py | 17 ++- .../transformers/pipelines/text_generation.py | 124 +++++++++--------- 3 files changed, 95 insertions(+), 74 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index eec697ca48..31a25b0197 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,15 +67,18 @@ import numpy from tqdm.auto import tqdm +from transformers import AutoTokenizer from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 -from transformers import AutoTokenizer + from datasets import load_dataset, load_metric # isort: skip def perplexity_eval(args, dataset_name="openai_humaneval"): + accumulate = False + if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") raw_text = "\n\n".join(raw_dataset["text"]) @@ -96,7 +99,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Dataset is split into sections that contain "max_sequence_length" tokens. 
# To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length, max_token_length) + dataset = _split_text_by_tokens( + raw_text, tokenizer, args.max_sequence_length, max_token_length + ) # Set perplexity computation to accumulate negative log-likelihood across # sections @@ -144,6 +149,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_logits=True, return_input_tokens=True, fixed_sequences_length=True, + include_prompt_logits=True, ) # Handle one sample at a time to make it simpler for masking @@ -528,10 +534,9 @@ def _split_train_val(train_dataset, val_ratio, seed=42): def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): - input_tokens = tokenizer( - text, - return_tensors="np", - )["input_ids"][0] + input_tokens = tokenizer(text, return_tensors="np",)[ + "input_ids" + ][0] if max_token_length is not None: input_tokens = input_tokens[:max_token_length] @@ -562,6 +567,7 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): return split_text + # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), @@ -574,16 +580,20 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, "openai_humaneval": lambda args: perplexity_eval( - args, dataset_name="openai_humaneval", + args, + dataset_name="openai_humaneval", ), "wikitext": lambda args: perplexity_eval( - args, dataset_name="wikitext", + args, + dataset_name="wikitext", ), "c4": lambda args: perplexity_eval( - args, dataset_name="c4", + args, + dataset_name="c4", ), } + def parse_args(): parser = argparse.ArgumentParser( description="Evaluate a Hugging Face Transformers " diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index d6e7d78c3f..71683d6116 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,8 +20,8 @@ import numpy -from sklearn.metrics import precision_recall_fscore_support from scipy.special import log_softmax +from sklearn.metrics import precision_recall_fscore_support __all__ = [ @@ -43,7 +43,7 @@ def __init__(self, accumulate: bool = False): self._targets = None self._accumulate = accumulate if accumulate: - self._neg_log_likelihood = 0. 
+ self._neg_log_likelihood = 0.0 self._number_tokens = 0 else: self._perplexities = None @@ -75,7 +75,9 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = targets.flatten() # Compute negative log-likelihood and accumulate - self._neg_log_likelihood += _cross_entropy(predictions, targets, reduction="sum").sum() + self._neg_log_likelihood += _cross_entropy( + predictions, targets, reduction="sum" + ).sum() # Track number of tokens processed self._number_tokens += predictions.shape[0] @@ -99,7 +101,9 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): if self._perplexities is None: self._perplexities = perplexities else: - self._perplexities = numpy.concatenate((self._perplexities, perplexities)) + self._perplexities = numpy.concatenate( + (self._perplexities, perplexities) + ) def compute(self) -> Dict[str, Any]: """ @@ -181,7 +185,9 @@ def compute(self) -> Dict[str, float]: def _cross_entropy(predictions, targets, reduction="mean"): logp = log_softmax(predictions, axis=-1) - neg_log_likelihoods = -1. * numpy.take_along_axis(logp, numpy.expand_dims(targets, axis=-1), axis=-1) + neg_log_likelihoods = -1.0 * numpy.take_along_axis( + logp, numpy.expand_dims(targets, axis=-1), axis=-1 + ) neg_log_likelihoods = numpy.squeeze(neg_log_likelihoods, axis=-1) if reduction == "mean": neg_log_likelihoods = neg_log_likelihoods.mean(axis=-1) @@ -189,4 +195,3 @@ def _cross_entropy(predictions, targets, reduction="mean"): neg_log_likelihoods = neg_log_likelihoods.sum(axis=-1) return neg_log_likelihoods - \ No newline at end of file diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index d3864dcd85..f4a0638c3d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -44,6 +44,7 @@ ) from deepsparse.utils.onnx import default_cached_outputs + _LOGGER = logging.getLogger(__name__) __all__ = ["TextGenerationPipeline"] @@ -67,13 +68,12 @@ class Config: return_logits: bool = Field( default=False, description="A flag that indicates whether to return " - "the logits for the input text sequence and the " - "generated text sequence. ", + "the logits for the input text sequence and the " + "generated text sequence. ", ) return_input_tokens: bool = Field( default=False, - description="A flag that indicates whether to return " - "the input_tokens. ", + description="A flag that indicates whether to return " "the input_tokens. ", ) include_prompt_logits: bool = Field( default=False, @@ -86,25 +86,25 @@ class Config: session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " - "for the kv cache session. If None, " - "and the model is using kv cache, it " - "will be set to a random uuid.", + "for the kv cache session. If None, " + "and the model is using kv cache, it " + "will be set to a random uuid.", ) fixed_sequences_length: bool = Field( default=False, description="A flag that indicates whether to modify " - "(pad or truncate) each input text sequence, so that " - "its tokenized length is equal to `sequence_length` " - "of tokens. Useful, when a batch of predictions needs " - "to have consistent length so one " - "can compute metric in a batched fashion. ", + "(pad or truncate) each input text sequence, so that " + "its tokenized length is equal to `sequence_length` " + "of tokens. 
Useful, when a batch of predictions needs " + "to have consistent length so one " + "can compute metric in a batched fashion. ", ) streamer: Optional[TextStreamer] = Field( default=None, description="Streamer object that will be used to stream the " - "generated sequences. Generated tokens are passed through " - "`streamer.put(token_ids)` and the streamer is responsible " - "for any further processing.", + "generated sequences. Generated tokens are passed through " + "`streamer.put(token_ids)` and the streamer is responsible " + "for any further processing.", ) callback: Optional[Callable[[Any], Union[bool, Any]]] = Field( default=None, @@ -128,15 +128,17 @@ class TextGenerationOutput(BaseModel): logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility default=None, description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + "The logits have dimensions " + "[batch_size, sequence_length, vocab_size]", ) - input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays + input_tokens: Optional[ + Any + ] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays default=None, description="The output of the tokenizer." - "Dictionary containing token_ids and attention_mask, " - "both mapping to arrays of size " - "[batch_size, sequence_length]", + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." @@ -178,14 +180,14 @@ class TextGenerationPipeline(TransformersPipeline): """ def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, - prompt_processing_sequence_length: int = 64, - force_max_tokens: bool = False, - use_deepsparse_cache: bool = True, - **kwargs, + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_processing_sequence_length: int = 64, + force_max_tokens: bool = False, + use_deepsparse_cache: bool = True, + **kwargs, ): kwargs_engine_type = kwargs.get("engine_type", DEEPSPARSE_ENGINE) @@ -233,7 +235,7 @@ def __init__( self.engine, self.multitoken_engine = self.initialize_engines() def initialize_engines( - self, + self, ) -> Tuple[Optional[NLDecoderEngine], Optional[NLDecoderEngine]]: """ Inititalizes a pair of engines for the pipeline. @@ -258,9 +260,9 @@ def initialize_engines( if self.cache_support_enabled: if ( - self.engine_type == DEEPSPARSE_ENGINE - and self.sequence_length <= self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + self.engine_type == DEEPSPARSE_ENGINE + and self.sequence_length <= self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): raise ValueError( "Attempting to initialize auxiliary DeepSparse engine to " @@ -288,7 +290,7 @@ def initialize_engines( ) if ( - self.cache_support_enabled and self.enable_multitoken_prefill + self.cache_support_enabled and self.enable_multitoken_prefill ) or not self.cache_support_enabled: multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, @@ -320,13 +322,13 @@ def initialize_engines( ) assert (engine is not None) or ( - multitoken_engine is not None + multitoken_engine is not None ), "At least one of the engines must be initialized for the pipeline!" 
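To make the engine pairing returned by initialize_engines easier to follow, here is a rough, hedged sketch of how the prompt is later divided between the two engines in prompt_inference (shown further down in this file): full chunks of prompt_processing_sequence_length tokens go through the multitoken prefill engine, and any remainder runs token by token through the single-token autoregressive engine. Engine objects are stubbed as plain callables and kv-cache handling is elided.

def run_prompt(tokens, chunk_len, multitoken_engine, engine):
    # hedged sketch only; the real method also transfers kv-cache state
    prompt_logits = []
    processed = 0
    if len(tokens) > chunk_len:
        # full chunks go through the multitoken (prefill) engine
        while processed + chunk_len <= len(tokens):
            prompt_logits.append(multitoken_engine(tokens[processed:processed + chunk_len]))
            processed += chunk_len
    # the remainder (or a short prompt) is fed to the single-token engine,
    # with the running token list growing one token at a time
    run_tokens = list(tokens[:processed])
    for token in tokens[processed:]:
        run_tokens.append(token)
        prompt_logits.append(engine(run_tokens))
    return prompt_logits


# toy "engines" that just report what they were given
out = run_prompt(list(range(10)), 4, lambda chunk: ("prefill", len(chunk)), lambda ts: ("decode", len(ts)))
print(out)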
return engine, multitoken_engine @staticmethod def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs ) -> Pipeline: """ This method is used to route the input to the correct pipeline. @@ -421,7 +423,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: return engine_input, postprocessing_kwargs def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[numpy.ndarray], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. @@ -434,12 +436,16 @@ def process_engine_outputs( generated_tokens, skip_special_tokens=True ) logits = generated_logits if kwargs.get("return_logits") else None - input_tokens = kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None + input_tokens = ( + kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None + ) - return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) + return TextGenerationOutput( + sequences=sequences, logits=logits, input_tokens=input_tokens + ) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict + self, engine_inputs: List[numpy.ndarray], context: Dict ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. @@ -499,8 +505,8 @@ def engine_forward( streamer.put(numpy.array([token])) if ( - token == self.tokenizer.eos_token_id - and not self.force_max_tokens + token == self.tokenizer.eos_token_id + and not self.force_max_tokens ): break @@ -526,7 +532,7 @@ def engine_forward( ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, engine_inputs: List[numpy.ndarray] ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -547,8 +553,8 @@ def prompt_inference( num_tokens_processed = 0 if ( - len(tokens) > self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + len(tokens) > self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): @@ -567,7 +573,7 @@ def prompt_inference( for token in tokens[num_tokens_processed:]: run_tokens.append(token) with self.timer_manager.current.time( - _TextGenerationTimings.PROMPT_PREFILL_SINGLE + _TextGenerationTimings.PROMPT_PREFILL_SINGLE ): new_token, new_logits = self.autoregressive_inference(run_tokens) @@ -578,8 +584,8 @@ def prompt_inference( return tokens, prompt_logits def autoregressive_inference( - self, - tokens: List[int], + self, + tokens: List[int], ) -> Tuple[int, numpy.ndarray]: """ An inference run that processes the last token to generate @@ -617,7 +623,7 @@ def autoregressive_inference( return generated_token, generated_logits def engine_inputs_for_prefill( - self, tokens: List[int] + self, tokens: List[int] ) -> Generator[List[numpy.ndarray], None, None]: """ Takes a list of tokens and creates a generator @@ -655,9 +661,9 @@ def engine_inputs_for_prefill( token_batches = [ tokens[ - i - * self.prompt_processing_sequence_length: (i + 1) - * self.prompt_processing_sequence_length + i + * self.prompt_processing_sequence_length : (i + 1) + * self.prompt_processing_sequence_length ] for i in range(0, num_batches) ] @@ -677,13 +683,13 @@ def engine_inputs_for_prefill( # fill it out with 1s (from the right), so that the number # of unmasked 
entries is equal to the sum of: engine_input[ - :, - -( - # ...the number of current input tokens... + :, + -( + # ...the number of current input tokens... self.prompt_processing_sequence_length # ...and the number of the previous cache entries + num_cached_entries - ):, + ) :, ] = 1 elif name == "causal_mask": # delay creation of the causal mask @@ -700,8 +706,8 @@ def engine_inputs_for_prefill( num_cached_entries + self.prompt_processing_sequence_length, ) - .reshape(1, -1) - .astype(numpy.int64) + .reshape(1, -1) + .astype(numpy.int64) ) engine_inputs.append(engine_input) @@ -724,7 +730,7 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int + self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int ) -> List[numpy.ndarray]: """ Takes a list of outputs (batches) from the engine From 4f6eb6b0c148ab7860a618598fbfb936f6273450 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 28 Aug 2023 10:51:25 -0400 Subject: [PATCH 20/62] Remove unnecessary capping at sequence length (it's incorrect for cached models) --- src/deepsparse/transformers/eval_downstream.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 31a25b0197..6ad46ba26e 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -82,7 +82,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") raw_text = "\n\n".join(raw_dataset["text"]) - max_token_length = None elif dataset_name == "c4": raw_dataset = load_dataset( "allenai/c4", @@ -91,7 +90,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): split="validation", ) raw_text = " ".join(raw_dataset[:1100]["text"]) - max_token_length = 256 * args.max_sequence_length else: dataset = load_dataset(dataset_name, split="test") @@ -100,7 +98,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(args.model_path) dataset = _split_text_by_tokens( - raw_text, tokenizer, args.max_sequence_length, max_token_length + raw_text, tokenizer, args.max_sequence_length, ) # Set perplexity computation to accumulate negative log-likelihood across @@ -159,10 +157,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() - sequence_length = logits.shape[0] - attention_mask = attention_mask[:sequence_length] - input_ids = input_ids[:sequence_length] - logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] @@ -538,9 +532,6 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): "input_ids" ][0] - if max_token_length is not None: - input_tokens = input_tokens[:max_token_length] - # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format split_text = [] From ab757d0c3b52dccde3ed982e395faf9c4f74ba27 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 29 Aug 2023 09:43:18 -0400 Subject: [PATCH 21/62] Simplify processing for concatenated datasets --- 
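The masking and shifting done in perplexity_eval above (PATCH 20) is the usual next-token alignment: padded positions are dropped via the attention mask, the last logit row has no following token to score, and the first input id has no preceding prediction. A small self-contained illustration with toy shapes, not data from an actual run:

import numpy

vocab_size = 4
attention_mask = numpy.array([0, 0, 1, 1, 1])    # two left-padded positions
input_ids = numpy.array([0, 0, 2, 3, 1])
logits = numpy.random.rand(5, vocab_size)        # [sequence_length, vocab_size]

# keep only un-padded rows; the last logit row has no next token to score
logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :]
# keep only un-padded ids; the first id has no preceding prediction
input_ids = numpy.compress(attention_mask, input_ids)[1:]

# logits[i] is now scored against input_ids[i]
print(logits.shape, input_ids.shape)             # (2, 4) (2,)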
.../transformers/eval_downstream.py | 89 ++++++++++++++----- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 6ad46ba26e..f5c2c19b69 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -77,33 +77,24 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - accumulate = False - - if dataset_name == "wikitext": - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - raw_text = "\n\n".join(raw_dataset["text"]) - elif dataset_name == "c4": - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, - split="validation", - ) - raw_text = " ".join(raw_dataset[:1100]["text"]) - else: - dataset = load_dataset(dataset_name, split="test") if dataset_name in ["wikitext", "c4"]: - # Dataset is split into sections that contain "max_sequence_length" tokens. - # To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens( - raw_text, tokenizer, args.max_sequence_length, + if args.kwargs is None: + kwargs = {} + else: + kwargs = json.loads(args.kwargs) + dataset = _process_concatenated_datasets( + dataset_name, + args.model_path, + args.max_sequence_length, + kwargs, ) - # Set perplexity computation to accumulate negative log-likelihood across # sections accumulate = True + else: + dataset = load_dataset(dataset_name, split="test") + accumulate = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence @@ -157,8 +148,17 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + effective_sequence_length = logits.shape[0] + + input_ids = input_ids[-effective_sequence_length:] + attention_mask = attention_mask[-effective_sequence_length:] + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] + #print(logits[:,0], flush=True) + #print(attention_mask) + #if idx == 1: + # exit() # Add predictions (logits) and targets (input_ids) to metric perplexity_metrics.add_batch(logits, input_ids) @@ -527,7 +527,46 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): +def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): + if dataset_name == "wikitext": + eos = kwargs.get("eos", "\n\n") + bos = kwargs.get("bos", "") + + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = raw_dataset["text"] + elif dataset_name == "c4": + eos = kwargs.get("eos", "<|endoftext|>") + bos = kwargs.get("bos", "") + raw_samples = kwargs.get("raw_samples", None) + data_file = kwargs.get("data_file", 0) + if data_file is not None: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + split="validation", + ) + else: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + split="validation", + ) + if raw_samples is not None: + raw_dataset = raw_dataset[:raw_samples] + raw_text = raw_dataset["text"] + + # Dataset is split into sections that contain 
"max_sequence_length" tokens. + # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(model_path) + return _split_text_by_tokens( + raw_text, eos, bos, tokenizer, max_sequence_length, + ) + + +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): + text = "".join([bos + sample + eos for sample in text]) + input_tokens = tokenizer(text, return_tensors="np",)[ "input_ids" ][0] @@ -724,6 +763,12 @@ def parse_args(): type=bool, default=False, ) + parser.add_argument( + "--kwargs", + help="Additional arguments specific to each dataset", + type=str, + default=None, + ) return parser.parse_args() From f21eaf3a92b13195a63f318a9a5e00b7a0f95601 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 09:28:50 -0400 Subject: [PATCH 22/62] Fix kv cache update --- src/deepsparse/transformers/pipelines/text_generation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index b30940bac8..36c85d217f 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -577,6 +577,8 @@ def prompt_inference( if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) + else: + self.engine.reset_kv_cache() # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed] From 2a18c457c235d732687e28459143d038891c69e1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 10:00:09 -0400 Subject: [PATCH 23/62] Fix kv cache update --- src/deepsparse/transformers/pipelines/text_generation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 36c85d217f..e34fbe5d5e 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -573,7 +573,6 @@ def prompt_inference( num_tokens_processed += self.prompt_processing_sequence_length prompt_logits.append(new_logits) - self.engine.reset_kv_cache() if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) From 7e8da1c24abfd652d906773b9101742e177fd87e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 17:11:39 -0400 Subject: [PATCH 24/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index f5c2c19b69..9ab7dd8b7d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -107,6 +107,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): max_generated_tokens=1, trust_remote_code=args.trust_remote_code, batch_size=args.batch_size, + use_deepsparse_cache=True, ) # Instantiate perplexity metric @@ -155,10 +156,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] - #print(logits[:,0], flush=True) - #print(attention_mask) - 
#if idx == 1: - # exit() # Add predictions (logits) and targets (input_ids) to metric perplexity_metrics.add_batch(logits, input_ids) From 1f9c35881710aef06c323b7e5679264656e53b01 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 10:03:23 -0400 Subject: [PATCH 25/62] remove batch size from pipeline instantiation --- src/deepsparse/transformers/eval_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 9ab7dd8b7d..2a6a6c43e9 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -106,7 +106,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): sequence_length=args.max_sequence_length, max_generated_tokens=1, trust_remote_code=args.trust_remote_code, - batch_size=args.batch_size, use_deepsparse_cache=True, ) From 099b3660b3d564d509cd122edc64110deb40b6cd Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 14:22:37 -0400 Subject: [PATCH 26/62] Rename to wikitext2 --- src/deepsparse/transformers/eval_downstream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 2a6a6c43e9..0cbc5db5f1 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,7 +78,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - if dataset_name in ["wikitext", "c4"]: + if dataset_name in ["wikitext2", "c4"]: if args.kwargs is None: kwargs = {} else: @@ -524,7 +524,7 @@ def _split_train_val(train_dataset, val_ratio, seed=42): def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): - if dataset_name == "wikitext": + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") @@ -609,9 +609,9 @@ def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): args, dataset_name="openai_humaneval", ), - "wikitext": lambda args: perplexity_eval( + "wikitext2": lambda args: perplexity_eval( args, - dataset_name="wikitext", + dataset_name="wikitext2", ), "c4": lambda args: perplexity_eval( args, From 5455c7c333e05ae64214b86b20abc3ff81c0aa66 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 14:27:28 -0400 Subject: [PATCH 27/62] Remove trust_remote_code argument --- src/deepsparse/transformers/eval_downstream.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 0cbc5db5f1..81c2e4bf23 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,7 +105,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, - trust_remote_code=args.trust_remote_code, use_deepsparse_cache=True, ) @@ -753,12 +752,6 @@ def parse_args(): type=int, default=1, ) - parser.add_argument( - "--trust-remote-code", - help="Whether to allow for remote code execution in transformers.", - type=bool, - default=False, - ) parser.add_argument( "--kwargs", help="Additional arguments specific to each dataset", From 6a330d4780dfc17502c9c6e9aac4f888c639b566 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 16:46:08 -0400 Subject: [PATCH 28/62] Remove use_deepsparse_cache 
argument --- src/deepsparse/transformers/eval_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 81c2e4bf23..5153a28acc 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,7 +105,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, - use_deepsparse_cache=True, ) # Instantiate perplexity metric From a448667c545d36603d1f41e4e49a26619a811667 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 10:27:47 -0400 Subject: [PATCH 29/62] Change padding of output to left in order to match padding of input ids and attention mask --- src/deepsparse/transformers/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 7b465acc37..19e5a9ea8d 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -41,7 +41,7 @@ def pad_to_fixed_length( ) -> numpy.ndarray: """ Pads the array to a fixed length along the given axis. - The padding is done on the right side of the array. + The padding is done on the left side of the array. :param array: array to pad :param max_len: maximum length to pad to @@ -53,7 +53,7 @@ def pad_to_fixed_length( padding = [(0, 0)] * len(array.shape) # for the specified axis, pad to the max length # (from the right side of the array) - padding[axis] = (0, max_len - array.shape[axis]) + padding[axis] = (max_len - array.shape[axis], 0) return numpy.pad(array, padding, mode="constant", constant_values=value) From 54b560c0e5dd81e4169592d2839f302ef9931c6f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 12:16:58 -0400 Subject: [PATCH 30/62] Allow trust_remote_code to be passed as argument (in some cases tokenizer can be defined by custom code) --- src/deepsparse/transformers/eval_downstream.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 5153a28acc..f9d4d4bf21 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,6 +105,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, + trust_remote_code=args.trust_remote_code, ) # Instantiate perplexity metric @@ -751,6 +752,12 @@ def parse_args(): type=int, default=1, ) + parser.add_argument( + "--trust-remote-code", + help="Whether to allow for remote code execution in transformers.", + type=bool, + default=False, + ) parser.add_argument( "--kwargs", help="Additional arguments specific to each dataset", From ad35340ebebb319861a31992c328f8c3f1762544 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 12:40:18 -0400 Subject: [PATCH 31/62] Move process_concatenated_datasets to helpers file --- .../transformers/eval_downstream.py | 75 +------------------ .../transformers/utils/eval_helpers.py | 73 ++++++++++++++++++ 2 files changed, 75 insertions(+), 73 deletions(-) create mode 100644 src/deepsparse/transformers/utils/eval_helpers.py diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 
f9d4d4bf21..abffbd1770 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,10 +67,10 @@ import numpy from tqdm.auto import tqdm -from transformers import AutoTokenizer from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 +from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets from datasets import load_dataset, load_metric # isort: skip @@ -83,7 +83,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): kwargs = {} else: kwargs = json.loads(args.kwargs) - dataset = _process_concatenated_datasets( + dataset = process_concatenated_datasets( dataset_name, args.model_path, args.max_sequence_length, @@ -522,77 +522,6 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): - if dataset_name == "wikitext2": - eos = kwargs.get("eos", "\n\n") - bos = kwargs.get("bos", "") - - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - raw_text = raw_dataset["text"] - elif dataset_name == "c4": - eos = kwargs.get("eos", "<|endoftext|>") - bos = kwargs.get("bos", "") - raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) - if data_file is not None: - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, - split="validation", - ) - else: - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - split="validation", - ) - if raw_samples is not None: - raw_dataset = raw_dataset[:raw_samples] - raw_text = raw_dataset["text"] - - # Dataset is split into sections that contain "max_sequence_length" tokens. 
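The pad_to_fixed_length change in PATCH 29 above flips the padding from the right to the left side, so padded outputs line up with left-padded input ids and attention masks. A small numpy illustration of the difference on a toy array (not the pipeline's own data):

import numpy

array = numpy.array([[1, 2, 3]])
max_len = 5

# left padding, as in the patched helper: pad widths (max_len - n, 0)
left = numpy.pad(array, [(0, 0), (max_len - array.shape[1], 0)], mode="constant", constant_values=0)
# right padding, the previous behaviour: pad widths (0, max_len - n)
right = numpy.pad(array, [(0, 0), (0, max_len - array.shape[1])], mode="constant", constant_values=0)

print(left)    # [[0 0 1 2 3]]
print(right)   # [[1 2 3 0 0]]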
- # To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(model_path) - return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, - ) - - -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): - text = "".join([bos + sample + eos for sample in text]) - - input_tokens = tokenizer(text, return_tensors="np",)[ - "input_ids" - ][0] - - # Then split the tokenized text into sections of size "max_sequence_length" and - # decode each section back into text format - split_text = [] - for i in range(len(input_tokens) // sequence_length): - start = i * sequence_length - end = (i + 1) * sequence_length - split_text.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - # Handle any leftover tokens - if (i + 1) * sequence_length < len(input_tokens): - start = (i + 1) * sequence_length - end = len(input_tokens) - split_text.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - return split_text - - # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py new file mode 100644 index 0000000000..916e787eb5 --- /dev/null +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -0,0 +1,73 @@ +from transformers import AutoTokenizer +from datasets import load_dataset + + +def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): + if dataset_name == "wikitext2": + eos = kwargs.get("eos", "\n\n") + bos = kwargs.get("bos", "") + + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = raw_dataset["text"] + elif dataset_name == "c4": + eos = kwargs.get("eos", "<|endoftext|>") + bos = kwargs.get("bos", "") + raw_samples = kwargs.get("raw_samples", None) + data_file = kwargs.get("data_file", 0) + if data_file is not None: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + split="validation", + ) + else: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + split="validation", + ) + if raw_samples is not None: + raw_dataset = raw_dataset[:raw_samples] + raw_text = raw_dataset["text"] + + # Dataset is split into sections that contain "max_sequence_length" tokens. 
+ # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(model_path) + return _split_text_by_tokens( + raw_text, eos, bos, tokenizer, max_sequence_length, + ) + + +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): + text = "".join([bos + sample + eos for sample in text]) + + input_tokens = tokenizer(text, return_tensors="np",)[ + "input_ids" + ][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + split_text = [] + for i in range(len(input_tokens) // sequence_length): + start = i * sequence_length + end = (i + 1) * sequence_length + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i + 1) * sequence_length < len(input_tokens): + start = (i + 1) * sequence_length + end = len(input_tokens) + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + return split_text \ No newline at end of file From b16a5f6eed7f19737dae101065851f31ae07d3cf Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 13 Sep 2023 15:08:27 -0400 Subject: [PATCH 32/62] Added support for max_text_length to speed up processing of long datasets --- .../transformers/utils/eval_helpers.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 916e787eb5..1fc277340e 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,5 +1,6 @@ from transformers import AutoTokenizer from datasets import load_dataset +import numpy def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): @@ -35,16 +36,36 @@ def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(model_path) return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, + raw_text, eos, bos, tokenizer, max_sequence_length, kwargs.get("max_text_length", None) ) -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): - text = "".join([bos + sample + eos for sample in text]) +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length, max_text_length): + text = [bos + sample + eos for sample in text] - input_tokens = tokenizer(text, return_tensors="np",)[ - "input_ids" - ][0] + if max_text_length is None: + text = "".join(text) + input_tokens = tokenizer(text, return_tensors="np")[ + "input_ids" + ][0] + elif max_text_length == -1: #per sample tokenization + input_tokens = [] + for slice in text: + input_tokens.append(tokenizer(slice, return_tensors="np")[ + "input_ids" + ][0]) + input_tokens = numpy.concatenate(input_tokens) + else: + text = "".join(text) + text_slices = len(text) // max_text_length + sliced_text = [text[i*max_text_length:(i+1)*max_text_length] for i in range(text_slices)] + sliced_text.append(text[text_slices*max_text_length:]) + input_tokens = [] + for slice in sliced_text: + input_tokens.append(tokenizer(slice, return_tensors="np")[ + "input_ids" + ][0]) + input_tokens = numpy.concatenate(input_tokens) # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format From 065864a37dd1234ae55d299c1fcc33b2b0da6d3a 
Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 13:52:49 -0400 Subject: [PATCH 33/62] Rebase w/ main --- .../transformers/eval_downstream.py | 2 +- .../transformers/pipelines/text_generation.py | 272 ++++++++++++++---- 2 files changed, 216 insertions(+), 58 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index abffbd1770..b9bacd8b88 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -144,7 +144,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): for s in range(len(batch_samples)): # Need to remove tokens that were masked input_ids = prediction.input_tokens["input_ids"][s].flatten() - logits = prediction.logits[s] + logits = prediction.generations.score[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() effective_sequence_length = logits.shape[0] diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index c0eb9ee06a..821b76f620 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import logging import os import warnings +from enum import Enum from typing import ( Any, Callable, @@ -40,8 +42,10 @@ from deepsparse.transformers.utils.helpers import ( create_causal_mask, pad_to_fixed_length, + repeat_inputs, ) from deepsparse.transformers.utils.timings import TextGenerationTimings +from deepsparse.transformers.utils.token_generator import TokenGenerator from deepsparse.utils.onnx import default_cached_outputs @@ -50,6 +54,12 @@ __all__ = ["TextGenerationPipeline"] +class FinishReason(Enum): + STOP = "stop" + LENGTH = "length" + TIME = "time" + + class TextGenerationInput(BaseModel): class Config: arbitrary_types_allowed = True @@ -57,6 +67,18 @@ class Config: sequences: Union[str, List[str]] = Field( description="The input sequences to generate the text from.", ) + num_generated_predictions: int = Field( + default=1, + description="The number of text generations to create from a single prompt. If " + "the same sequence is given as an input multiple times, the number of generated" + "the number of generated predictins is equivalent to the number of times the " + "the sequence is repeated.", + ) + max_tokens: int = Field( + default=1024, + description="Maximum number of tokens to generate per output sequence. If no " + "value is provided, will default to 1024.", + ) return_logits: bool = Field( default=False, description="A flag that indicates whether to return " @@ -111,17 +133,59 @@ class Config: " tokens is generated). Set to `None` to ignore this parameter." " Default is `None`.", ) + top_p: Optional[float] = Field( + default=0.0, + description="Used for filtering generated tokens. Keep the" + " tokens where its cumulative probability is >= top_p" + " Default set to 0.0", + ) + top_k: Optional[int] = Field( + default=0, + description="Used for filtering generated tokens. Keep" + " top_k generated tokens. Default set to 0", + ) + presence_penalty: Optional[float] = Field( + default=0.0, + description="Penalty applied for generating new token. Any existing" + " token results in the subtraction of its corresponding logit value." 
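For reference, the max_text_length option introduced in PATCH 32 above selects between three tokenization strategies before the text is re-split into fixed-size sections. A condensed, hedged sketch of the three branches, with a toy character-level tokenizer standing in for the Hugging Face tokenizer:

import numpy

def toy_tokenize(text):
    # stand-in for tokenizer(text, return_tensors="np")["input_ids"][0]
    return numpy.array([ord(c) for c in text])

samples = ["first sample", "second sample text"]
max_text_length = -1    # None | -1 | positive int, mirroring the helper above

if max_text_length is None:
    # tokenize the fully joined text in one pass
    tokens = toy_tokenize("".join(samples))
elif max_text_length == -1:
    # per-sample tokenization, then concatenate the token arrays
    tokens = numpy.concatenate([toy_tokenize(s) for s in samples])
else:
    # slice the joined text into character chunks before tokenizing each chunk
    text = "".join(samples)
    chunks = [text[i:i + max_text_length] for i in range(0, len(text), max_text_length)]
    tokens = numpy.concatenate([toy_tokenize(c) for c in chunks])

print(tokens.shape)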
+ " Default set to 0.0", + ) + frequency_penalty: Optional[float] = Field( + default=0.0, + description="Penalty applied for generating new token. Existing" + " token frequencies summed to subtraction the logit of its" + " corresponding logit value. Default set to 0.0.", + ) + + +class GeneratedText(BaseModel): + text: str = Field( + description="The generated sequence for a given prompt. If " + "streaming is enabled, this will be the next generated token." + ) + score: Optional[Any] = Field( + description="The score for the generated token or sequence. " + "The scores have the shape [sequence_length, vocab_size]" + ) + finished: bool = Field(description="Whether generation has stopped.") + finished_reason: str = Field( + description="The reason for generation to stop. " + "Defined by FinishReason. One of stop, length, or time." + ) +# TODO: Pydantic aliases allow assignment but not reference. Still need to update. class TextGenerationOutput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The generated text sequences.", + created: datetime.datetime = Field(description="Time of inference creation.") + prompts: Union[str, List[str]] = Field( + description="Prompts used for the sequence generation. For multiple input " + "prompts, a list of prompts is returned" ) - logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility - default=None, - description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + generations: Union[List[GeneratedText], List[List[GeneratedText]]] = Field( + description="For a single prompt, a single list of GeneratedText is returned. " + "If multiple prompts are given, a list of GeneratedText is returned for each " + "prompt provided. If streamng is enabled, the next generated token is returned." + "Otherwise, the full generated sequence is returned." ) input_tokens: Optional[ Any @@ -156,11 +220,6 @@ class TextGenerationPipeline(TransformersPipeline): from the probability distribution computed from the logits. Higher values will result in more random samples. Should be greater than 0.0. - :param max_generated_tokens: the maximum number of tokens to generate - given the input sequence. If None, the model will generate - tokens until the end of the sequence is reached. - Otherwise, it will generate up to the maximum number of tokens or end of - sequence is reached. :param sequence_length: sequence length to compile model and tokenizer for. This controls the maximum context length of the pipeline. Default is 512 :param prompt_sequence_length: For large prompts, the prompt is @@ -177,7 +236,6 @@ def __init__( self, deterministic: bool = True, sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, prompt_sequence_length: int = 64, sequence_length: int = 512, force_max_tokens: bool = False, @@ -216,16 +274,8 @@ def __init__( if "WAND_OPT_FLAGS" not in os.environ: os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" - if not self.cache_support_enabled and max_generated_tokens > 1: - raise ValueError( - "The model used for inference does not support kv cache. It is " - "assumed that it maps from the token sequence to predicted logits." - "Set `max_generated_tokens` to 1 to support that scenario." 
- ) - self.deterministic = deterministic self.sampling_temperature = sampling_temperature - self.max_generated_tokens = max_generated_tokens self.prompt_sequence_length = prompt_sequence_length self.force_max_tokens = force_max_tokens self.internal_kv_cache = internal_kv_cache @@ -280,8 +330,7 @@ def initialize_engines( # instantiation the multitoken engine or not if not self.enable_multitoken_prefill: warnings.warn( - "This ONNX graph does not support processing the prompt in " - "with processing length > 1. Creation of an auxiliary engine for " + "Creation of an auxiliary engine for " "processing the prompt at a larger processing length is disabled. " "The prompt will be processed in with processing length 1." ) @@ -382,6 +431,27 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: :param inputs: the input schema for the pipeline :return: the inputs for the engine """ + if not self.cache_support_enabled and inputs.max_tokens > 1: + raise ValueError( + "The model used for inference does not support kv cache. It is " + "assumed that it maps from the token sequence to predicted logits." + "Set `max_tokens` to 1 to support that scenario." + ) + + # If the num_generated_predictions > 1, repeat the prompt + # num_generated_predictions times. Also, update the engine so that deterministic + # is set to False. + original_inputs = inputs.sequences + if inputs.num_generated_predictions > 1: + if isinstance(inputs.sequences, str): + inputs.sequences = [inputs.sequences] + inputs.sequences = repeat_inputs( + inputs.sequences, inputs.num_generated_predictions + ) + if self.engine: + self.engine.deterministic = False + if self.multitoken_engine: + self.multitoken_engine.deterministic = False if inputs.fixed_sequences_length or not self.cache_support_enabled: # to enforce a fixed sequence length, we need to @@ -427,7 +497,9 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.engine.session_id = inputs.session_id self.multitoken_engine.session_id = inputs.session_id - postprocessing_kwargs = dict( + context = dict( + prompts=original_inputs, + num_generated_predictions=inputs.num_generated_predictions, return_logits=inputs.return_logits, return_input_tokens=inputs.return_input_tokens, input_tokens=input_tokens, @@ -435,11 +507,17 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: include_prompt_logits=inputs.include_prompt_logits, callback=inputs.callback, stop=inputs.stop, + top_p=inputs.top_p, + top_k=inputs.top_k, + presence_penalty=inputs.presence_penalty, + frequency_penalty=inputs.frequency_penalty, + max_tokens=inputs.max_tokens, ) - return engine_input, postprocessing_kwargs + + return engine_input, context def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[Union[numpy.ndarray, FinishReason]], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. 
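Regarding the prompt repetition just above: each prompt is repeated num_generated_predictions times before inference, and process_engine_outputs later regroups the flat list of generations back into one list per original prompt. A toy illustration of that round trip, with plain strings standing in for GeneratedText objects:

prompts = ["a", "b"]
num_preds = 3

# repetition, analogous to repeat_inputs(sequences, num_generated_predictions)
repeated = [p for p in prompts for _ in range(num_preds)]   # ['a', 'a', 'a', 'b', 'b', 'b']

# pretend each repeated prompt produced one generation
generations = ["gen-%s-%d" % (p, i % num_preds) for i, p in enumerate(repeated)]

# regrouping as done later in process_engine_outputs
grouped = [generations[n:n + num_preds] for n in range(0, len(generations), num_preds)]
print(grouped)   # [['gen-a-0', 'gen-a-1', 'gen-a-2'], ['gen-b-0', 'gen-b-1', 'gen-b-2']]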
@@ -447,22 +525,70 @@ def process_engine_outputs( :param engine_outputs: the outputs from the engine :return: the output schema for the pipeline """ - generated_tokens, generated_logits = engine_outputs + generated_tokens, generated_logits, finished_reason = engine_outputs + finished_reason = [f[0] for f in finished_reason] + sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) + num_preds = kwargs.get("num_generated_predictions", 1) + prompts = kwargs.get("prompts") + + def _create_generated_text_output( + sequence: str, + finish_reason: FinishReason, + logits: Optional[numpy.array] = None, + ): + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + logits = generated_logits if kwargs.get("return_logits") else None + + if logits is not None: + generations = list( + self.executor.map( + _create_generated_text_output, + sequences, + finished_reason, + logits, + ) + ) + else: + generations = list( + self.executor.map( + _create_generated_text_output, sequences, finished_reason + ) + ) + + # If the num_generated_predictions > 1, group the generations and return + # them as a list of lists where each list consists of the generated + # predictions for a given prompt, and all the lists are in the order matching + # the order that the prompts were given as inputs. + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + generations = grouped_generations + input_tokens = ( kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None ) return TextGenerationOutput( - sequences=sequences, logits=logits, input_tokens=input_tokens + created=datetime.datetime.now(), + prompts=prompts, + generations=generations, + input_tokens=input_tokens, ) def engine_forward( self, engine_inputs: List[numpy.ndarray], context: Dict - ) -> Tuple[numpy.ndarray, numpy.ndarray]: + ) -> Tuple[numpy.ndarray, numpy.ndarray, List[FinishReason]]: """ Run the forward pass on the engine. @@ -476,31 +602,46 @@ def engine_forward( # as such, a new context needs to be created since we are no longer in the # main thread. 
That is why `engine_` is prepended to each of the timer phase # names in this context + with self.timer_manager.new_timer_context(total_inference=False) as timer: streamer = context.get("streamer") + finished_reason = [] if not self.cache_support_enabled: - tokens, prompt_logits = self.multitoken_engine(engine_inputs) - return numpy.array([tokens]), prompt_logits + prompt_logits = self.multitoken_engine(engine_inputs) + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + deterministic=self.deterministic, + **context, + ) + for prompt_logit in prompt_logits: + token_generator.generate(prompt_logit) + return numpy.array([self.tokens]), prompt_logits else: # run the prompt through with timer.time(TextGenerationTimings.PROMPT_PREFILL): - tokens, prompt_logits = self.prompt_inference(engine_inputs) + prompt_logits = self.prompt_inference(engine_inputs) + + tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + tokens=tokens, + deterministic=self.deterministic, + **context, + ) + token_generator.generate(prompt_logits[-1][0, -1, :]) if streamer is not None: - streamer.put(numpy.array(tokens)) + streamer.put(numpy.array(token_generator.tokens)) # create the generated output - max_tokens = ( - self.max_generated_tokens - if self.max_generated_tokens and self.max_generated_tokens > 0 - else 100 * self.sequence_length - ) # set safety for absolute max generation + max_tokens = context.get("max_tokens", 0) + max_tokens = max_tokens if max_tokens > 0 else (100 * self.sequence_length) # last prompt token is the first generated token # add it to generated tokens, and the logits - generated_tokens = [tokens[-1]] + generated_tokens = [token_generator.tokens[-1]] generated_logits = ( prompt_logits if context.get("include_prompt_logits") @@ -512,8 +653,10 @@ def engine_forward( with timer.time(TextGenerationTimings.TOKEN_GENERATION): while len(generated_tokens) < max_tokens: with timer.time(TextGenerationTimings.TOKEN_GENERATION_SINGLE): - token, logits = self.autoregressive_inference(tokens) - tokens.append(token) + logits = self.autoregressive_inference( + tokens=token_generator.tokens + ) + token = token_generator.generate(logits=logits[0, -1, :]) generated_tokens.append(token) generated_logits.append(logits) @@ -524,6 +667,7 @@ def engine_forward( token == self.tokenizer.eos_token_id and not self.force_max_tokens ): + finished_reason.append(FinishReason.STOP) break if self._stop_token_generated(token, stop_tokens=stop): @@ -531,8 +675,10 @@ def engine_forward( "Stop token %s generated. Stopping generation." % self.tokenizer.decode(token) ) + finished_reason.append(FinishReason.STOP) break + # TODO: Add any generic callback reason? if callback is not None and callback(token) is False: _LOGGER.debug( "callback %s returned False, stopping generation." 
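A condensed sketch of the stopping logic in the token generation loop above: STOP is recorded when the eos token or a user-supplied stop token is generated, and, just below, LENGTH is recorded when the generation budget runs out. Engine calls and sampling are elided; this is an illustration, not the method itself.

from enum import Enum

class FinishReason(Enum):
    STOP = "stop"
    LENGTH = "length"
    TIME = "time"

def finish_reason_for(generated_tokens, max_tokens, eos_token_id, stop_token_ids, force_max_tokens=False):
    last = generated_tokens[-1]
    if last == eos_token_id and not force_max_tokens:
        return FinishReason.STOP      # natural end of sequence
    if last in stop_token_ids:
        return FinishReason.STOP      # user-supplied stop token
    if len(generated_tokens) >= max_tokens:
        return FinishReason.LENGTH    # generation budget exhausted
    return None                       # keep generating

print(finish_reason_for([5, 7, 2], max_tokens=16, eos_token_id=2, stop_token_ids=set()))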
@@ -540,15 +686,21 @@ def engine_forward( ) break + if len(generated_tokens) == max_tokens: + finished_reason.append(FinishReason.LENGTH) + if streamer is not None: streamer.end() - return numpy.array([generated_tokens]), numpy.concatenate( - generated_logits, axis=1 + return ( + numpy.array([generated_tokens]), + numpy.concatenate(generated_logits, axis=1), + finished_reason, ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, + engine_inputs: List[numpy.ndarray], ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -565,13 +717,12 @@ def prompt_inference( tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() prompt_logits = [] - new_token = None num_tokens_processed = 0 if len(tokens) > self.prompt_sequence_length and self.enable_multitoken_prefill: self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): - new_token, new_logits = self.multitoken_engine(engine_inputs) + new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed += self.prompt_sequence_length prompt_logits.append(new_logits) @@ -589,13 +740,11 @@ def prompt_inference( with self.timer_manager.current.time( TextGenerationTimings.PROMPT_PREFILL_SINGLE ): - new_token, new_logits = self.autoregressive_inference(run_tokens) + new_logits = self.autoregressive_inference(run_tokens) prompt_logits.append(new_logits) - tokens.append(new_token) - - return tokens, prompt_logits + return prompt_logits def autoregressive_inference( self, @@ -632,9 +781,9 @@ def autoregressive_inference( engine_inputs_map[name] for name in self.engine.onnx_input_names_no_cache ] - generated_token, generated_logits = self.engine(engine_inputs) + generated_logits = self.engine(engine_inputs) - return generated_token, generated_logits + return generated_logits def engine_inputs_for_prefill( self, tokens: List[int] @@ -736,8 +885,10 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int - ) -> List[numpy.ndarray]: + self, + batch_outputs: List[List[Union[numpy.ndarray, FinishReason]]], + orig_batch_size: int, + ) -> List[Union[numpy.ndarray, FinishReason]]: """ Takes a list of outputs (batches) from the engine and joins them into a single output. 
Asserts that @@ -748,7 +899,7 @@ def join_engine_outputs( :param orig_batch_size: The original batch size :return: A list of joined outputs """ - tokens, logits = zip(*batch_outputs) + tokens, logits, finish_reason = zip(*batch_outputs) if self.cache_support_enabled: # if the model has kv cache, we need to account for # the fact that the predicted outputs may have @@ -780,7 +931,7 @@ def join_engine_outputs( tokens = numpy.concatenate(tokens, axis=0) logits = numpy.concatenate(logits, axis=0) - return [tokens, logits] + return [tokens, logits, finish_reason] @staticmethod def causal_mask_input_present(model_path: str) -> bool: @@ -792,10 +943,17 @@ def causal_mask_input_present(model_path: str) -> bool: :param model_path: path to the model :return: True if causal_mask input is present, False otherwise """ - return any( + is_causal_mask_input = any( inp.name == "causal_mask" for inp in onnx.load(model_path, load_external_data=False).graph.input ) + if not is_causal_mask_input: + _LOGGER.warning( + "This ONNX graph does not support processing the prompt" + "with processing length > 1" + ) + + return is_causal_mask_input def _stop_token_generated( self, token, stop_tokens: Union[None, str, Sequence[str]] From 59b93c565e6043808d39f91e9a5710629b9d05df Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 14:15:33 -0400 Subject: [PATCH 34/62] Rebase w/ main --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index b9bacd8b88..bd3e668fc2 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -104,7 +104,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): engine_type=args.engine, num_cores=args.num_cores, sequence_length=args.max_sequence_length, - max_generated_tokens=1, trust_remote_code=args.trust_remote_code, ) @@ -138,6 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, + max_tokes=1, ) # Handle one sample at a time to make it simpler for masking From f4554b17e88331a13b8009f0cb2b93e5cbe9d3d8 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 15:00:20 -0400 Subject: [PATCH 35/62] Fix typo --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index bd3e668fc2..71bbe70ff4 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -137,7 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_tokes=1, + max_tokens=1, ) # Handle one sample at a time to make it simpler for masking From c5bd3836f7fa195d180cd8f48685cc3863d652f1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 26 Sep 2023 13:45:24 -0400 Subject: [PATCH 36/62] Rebase --- .../transformers/eval_downstream.py | 6 ++-- .../transformers/pipelines/text_generation.py | 34 ++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 71bbe70ff4..def78d61fa 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ 
b/src/deepsparse/transformers/eval_downstream.py @@ -133,18 +133,18 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Perform single token generation prediction = text_generation( sequences=batch_samples, - return_logits=True, + output_scores=True, return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_tokens=1, + max_new_tokens=1, ) # Handle one sample at a time to make it simpler for masking for s in range(len(batch_samples)): # Need to remove tokens that were masked input_ids = prediction.input_tokens["input_ids"][s].flatten() - logits = prediction.generations.score[s] + logits = prediction.generations[s].score attention_mask = prediction.input_tokens["attention_mask"][s].flatten() effective_sequence_length = logits.shape[0] diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 882d171e2f..78e877fa2b 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -142,14 +142,15 @@ class Config: description="GenerationConfig file consisting of parameters used to control " "sequences generated for each prompt. The current supported parameters are: " "max_length, max_new_tokens, num_return_sequences, output_scores, top_p, " - "top_k, repetition_penalty, do_sample, temperature", + "top_k, repetition_penalty, do_sample, temperature. If None is provided, " + "deepsparse defaults will be used. For all other input types, HuggingFace " + "defaults for GenerationConfig will be used. ", ) - kwargs: Optional[Dict] = Field( + generation_kwargs: Optional[Dict] = Field( default=None, description="Any arguments to override generation_config arguments. Refer to " - "the generation_config argument for a full list of supported variables. Only " - "valid when generation_config is not None.", + "the generation_config argument for a full list of supported variables.", ) @@ -217,6 +218,12 @@ class TextGenerationPipeline(TransformersPipeline): of tokens supplied even if the stop token is reached. :param internal_kv_cache: if True, the pipeline will use the deepsparse kv cache for caching the model outputs. + :param generation_config: config file consisting of parameters used to control + sequences generated for each prompt. The current supported parameters are: + max_length, max_new_tokens, num_return_sequences, output_scores, top_p, + top_k, repetition_penalty, do_sample, temperature. If None is provided, + deepsparse defaults will be used. For all other input types, HuggingFace + defaults for GenerationConfig will be used. 
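To clarify the kwargs folding done in parse_inputs above: any loose keyword argument that matches an attribute on GenerationDefaults is moved into generation_kwargs, unless the caller already set that key there explicitly. A stripped-down illustration with a stand-in defaults class; GenerationDefaults itself lives elsewhere in deepsparse and is only mimicked here:

class ToyGenerationDefaults:
    # stand-in for GenerationDefaults; the real attribute set may differ
    max_new_tokens = 100
    output_scores = False
    top_p = 0.0

def fold_generation_kwargs(kwargs):
    generation_kwargs = kwargs.get("generation_kwargs", {})
    for k, v in kwargs.items():
        if not generation_kwargs.get(k) and hasattr(ToyGenerationDefaults, k):
            generation_kwargs[k] = v
    kwargs["generation_kwargs"] = generation_kwargs
    return kwargs

print(fold_generation_kwargs({"prompt": "hi", "output_scores": True}))
# {'prompt': 'hi', 'output_scores': True, 'generation_kwargs': {'output_scores': True}}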
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -425,6 +432,7 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput: if "sequences" in kwargs and "prompt" not in kwargs: # support prompt and sequences interchangeably kwargs["prompt"] = kwargs["sequences"] + if ( args and not isinstance(args[0], TextGenerationInput) @@ -435,6 +443,14 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput: kwargs["prompt"] = args[0] args = args[1:] + if kwargs: + generation_kwargs = kwargs.get("generation_kwargs", {}) + for k, v in kwargs.items(): + if not generation_kwargs.get(k) and hasattr(GenerationDefaults, k): + generation_kwargs[k] = v + + kwargs["generation_kwargs"] = generation_kwargs + return super().parse_inputs(*args, **kwargs) def process_inputs( @@ -450,7 +466,7 @@ def process_inputs( self.generation_config, inputs.generation_config, GenerationDefaults() ) - generation_config = override_config(inputs.kwargs, generation_config) + generation_config = override_config(inputs.generation_kwargs, generation_config) self.streaming = inputs.streaming if not self.cache_support_enabled and generation_config.max_length > 1: @@ -545,10 +561,10 @@ def _create_generated_text_output( finished=False, ) - def _stream_engine_outputs(self, engine_outputs, prompts, kwargs): + def _stream_engine_outputs(self, engine_outputs, prompts, generation_config): for output in engine_outputs: generated_tokens, generated_logits, finished_reason = output - logits = generated_logits if kwargs.get("return_logits") else None + logits = generated_logits if generation_config.output_scores else None generation = self._create_generated_text_output( self.tokenizer.batch_decode(generated_tokens)[0], finished_reason[0], @@ -575,7 +591,9 @@ def process_engine_outputs( streaming = kwargs.get("streaming") if streaming: - return self._stream_engine_outputs(engine_outputs, prompts, kwargs) + return self._stream_engine_outputs( + engine_outputs, prompts, generation_config + ) if self._debug: ( From 091aeca4ae6e3fc130ed0d4c770e4ae696434d4e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 26 Sep 2023 16:38:43 -0400 Subject: [PATCH 37/62] Use max_length instead of max_new_tokens --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index def78d61fa..8f07c0b242 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -137,7 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_new_tokens=1, + max_length=1, ) # Handle one sample at a time to make it simpler for masking From 6bc08bc996a0a8a244f16f63aa4dc357a3c0237f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:08:35 -0400 Subject: [PATCH 38/62] Rebase --- src/deepsparse/transformers/pipelines/text_generation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 43b7edf4ae..fdbd0b5213 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -192,9 +192,6 @@ class TextGenerationOutput(BaseModel): "both mapping to arrays of size " "[batch_size, sequence_length]", ) - session_id: Optional[str] = Field( - 
default=None, description="A string identifier for the kv cache session." - ) class Config: arbitrary_types_allowed = True From dc943d76bbaa72deba3764ef95a1e48c493917a1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:22:36 -0400 Subject: [PATCH 39/62] Added typing and docstring --- .../transformers/utils/eval_helpers.py | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 1fc277340e..8b0d8dfe43 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,9 +1,37 @@ -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizerFast from datasets import load_dataset import numpy +from typing import Mapping, List, Union -def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): +def process_concatenated_datasets( + dataset_name: str, + model_path: str, + max_sequence_length: int, + kwargs: Mapping, +) -> list: + """ + Concatenate text datasets and split them into chunks text that, after + tokenization, have size "max_sequence_length" tokens. + + Args: + dataset_name (str): The name of the dataset to process. Options: "wikitext2" or "c4". + model_path (str): The path to a pretrained transformer model for tokenization. + max_sequence_length (int): The maximum number of tokens in each sequence. + kwargs (mapping): Additional keyword arguments. + - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. + - bos (str, optional): The beginning-of-sentence token. Default is "". + - raw_samples (int, optional): The number of raw samples to use. Default is None. + - data_file (int, optional): The index of the data file to use for dataset. + Not used in wikitext2. Default is 0 for c4. + - max_text_length (int, optional): The maximum length of text to consider. + Returns: + list: A list of text sequences. + + Raises: + ValueError: If an invalid dataset_name is provided. + """ + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") @@ -40,7 +68,32 @@ def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, ) -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length, max_text_length): +def _split_text_by_tokens( + text: List[str], + eos: str, + bos: str, + tokenizer: PreTrainedTokenizerFast, + sequence_length: int, + max_text_length: Union[None, int], +) -> List[str]: + """ + Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. + + Args: + text (List[str]): List of concatenated text samples to be tokenized and split. + eos (str): The end-of-sentence token. + bos (str): The beginning-of-sentence token. + tokenizer (PreTrainedTokenizerFast): Tokenizer for tokenizing the text. + sequence_length (int): The maximum number of tokens in each section. + max_text_length (Union[None, int]): The maximum length of text to consider. + - If None, the entire text is tokenized and split. + - If -1, each sample is tokenized separately. + - If a positive integer, the text is split into sections of this length before tokenization. + + Returns: + List[str]: A list of sections where each section contains a maximum of "sequence_length" tokens. 
+ """ + text = [bos + sample + eos for sample in text] if max_text_length is None: From 8f3743afb8a84c1f4e6ba32f3b489a1dd1681a74 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:28:09 -0400 Subject: [PATCH 40/62] Added typing and docstring --- src/deepsparse/transformers/metrics.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 71683d6116..3b3b043258 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -17,7 +17,6 @@ """ from typing import Any, Dict, Optional - import numpy from scipy.special import log_softmax @@ -183,7 +182,25 @@ def compute(self) -> Dict[str, float]: return results -def _cross_entropy(predictions, targets, reduction="mean"): +def _cross_entropy( + predictions: numpy.ndarray, + targets: numpy.ndarray, + reduction: str = "mean", +) -> float: + """ + Calculate the cross-entropy loss between predicted probabilities and target labels. + + Args: + predictions (numpy.ndarray): Predicted logits. + targets (nnumpy.ndarray): Target class labels. + reduction (str, optional): Specifies the reduction method for the loss. + - "mean" (default): Computes the mean loss over all samples. + - "sum": Computes the sum of losses over all samples. + + Returns: + float: The computed cross-entropy loss. + """ + logp = log_softmax(predictions, axis=-1) neg_log_likelihoods = -1.0 * numpy.take_along_axis( logp, numpy.expand_dims(targets, axis=-1), axis=-1 From 5e1d8088d0401dd94a567bd622cdf131fb133b5f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:37:56 -0400 Subject: [PATCH 41/62] Define concantenated datasets --- src/deepsparse/transformers/utils/eval_helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 8b0d8dfe43..3965d208ef 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -3,6 +3,7 @@ import numpy from typing import Mapping, List, Union +CONCATENATED_DATSETS = ["wikitext2", "c4"] def process_concatenated_datasets( dataset_name: str, @@ -32,6 +33,12 @@ def process_concatenated_datasets( ValueError: If an invalid dataset_name is provided. 
""" + if dataset_name not in CONCATENATED_DATSETS: + raise KeyError( + f"dataset {dataset_name} not supported for concatenated processing, " + f"available datasets are {list(CONCATENATED_DATSETS.keys())}" + ) + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") From 0785321f2091e9f5e6881d49a092fe270f910bec Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:45:20 -0400 Subject: [PATCH 42/62] Add warning about batch-size not being a supported argument for some datasets --- src/deepsparse/transformers/eval_downstream.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 8f07c0b242..d0d17d9bea 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,6 +67,7 @@ import numpy from tqdm.auto import tqdm +import logging from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 @@ -76,6 +77,12 @@ from datasets import load_dataset, load_metric # isort: skip +_LOGGER = logging.getLogger(__name__) + + +PPL_DATASETS = ["wikitext2", "c4", "openai_humaneval"] + + def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name in ["wikitext2", "c4"]: @@ -707,6 +714,12 @@ def _main(args): f"available datasets are {list(SUPPORTED_DATASETS.keys())}" ) + if dataset not in PPL_DATASETS: + _LOGGER.warning( + "Batch-size argument is not supported for this dataset." + "Will use default value of 1." + ) + if dataset == "mnli": mnli_metrics_matched, mnli_metrics_mismatched = mnli_eval(args) mnli_metrics_matched = mnli_metrics_matched.compute() From d8914f0b72203e5560e15ff1193ac66e1ea2c4c7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 17:01:23 -0400 Subject: [PATCH 43/62] Add unit test for pipeline and generation in ppl eval --- .../pipelines/test_text_generation.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 5298c2f1dd..9acb8f32f5 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -666,6 +666,28 @@ def _test_kv_cache_state( x[:, :, -start_index:-end_index, :], y, atol=_PRECISION ) + def test_pipeline_for_ppl_eval(self, ): + pipeline = self.get_pipeline( + task="text-generation", + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + ) + inputs = dict( + prompt=self.prompt, + output_scores=True, + return_input_tokens=True, + fixed_sequences_length=True, + include_prompt_logits=True, + max_length=1, + ) + predictions = pipeline(**inputs) + assert hasattr(predictions, "generations") + assert hasattr(predictions.generations[0], "score") + assert hasattr(predictions.generations[0], "input_tokens") + assert "input_ids" in predictions.generations[0].input_tokens + assert "attention_mask" in predictions.generations[0].input_tokens + def test_streaming_mode_returns_generator(self, setup): pipeline = self.get_pipeline( task=self.pipeline_type, From 5bf076b917caca05b25db8109dffc27ee1ebe953 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 17:26:31 -0400 Subject: [PATCH 44/62] Add lifecycle in docstring --- src/deepsparse/transformers/metrics.py | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 3b3b043258..a1ba162144 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -34,6 +34,15 @@ def __init__(self, accumulate: bool = False): """ Class for computing perplexity. + Each batch is processed via the "add_batches" method. + At the end the data is reduced to a single perplexity metric via the "compute" method. + + Example: + metric = Perplexity() + for prediction, target in samples: + metric.add_batch(prediction, target) + perplexity_value = metric.compute() + :param accumulate: If True, accumulate negative log-likelihood over samples. If False, perplexity is computed separately for each sampled and then averaged in the end. From ecf3b7775471eac92e8149b3d8d91e2c593dbf1d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 10:59:18 -0400 Subject: [PATCH 45/62] Add copyright --- src/deepsparse/transformers/utils/eval_helpers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 3965d208ef..b48308dbbb 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from transformers import AutoTokenizer, PreTrainedTokenizerFast from datasets import load_dataset import numpy From fe37c32c035e98af3f113de48209d5bca3fcce42 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:49:25 -0400 Subject: [PATCH 46/62] Style fixes --- src/deepsparse/transformers/metrics.py | 6 +- .../transformers/pipelines/text_generation.py | 4 +- .../transformers/utils/eval_helpers.py | 55 ++++++++++--------- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index a1ba162144..cab1cbb5db 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -192,9 +192,9 @@ def compute(self) -> Dict[str, float]: def _cross_entropy( - predictions: numpy.ndarray, - targets: numpy.ndarray, - reduction: str = "mean", + predictions: numpy.ndarray, + targets: numpy.ndarray, + reduction: str = "mean", ) -> float: """ Calculate the cross-entropy loss between predicted probabilities and target labels. 
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 0d0bacc717..5b8500e52f 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -661,8 +661,8 @@ def process_engine_outputs( ) outputs = dict( - created=datetime.datetime.now(), - prompts=prompts, + created=datetime.datetime.now(), + prompts=prompts, generations=generations, input_tokens=input_tokens, ) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index b48308dbbb..5090a6c42f 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -19,11 +19,12 @@ CONCATENATED_DATSETS = ["wikitext2", "c4"] + def process_concatenated_datasets( - dataset_name: str, - model_path: str, - max_sequence_length: int, - kwargs: Mapping, + dataset_name: str, + model_path: str, + max_sequence_length: int, + kwargs: Mapping, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -68,7 +69,9 @@ def process_concatenated_datasets( raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + data_files={ + "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" + }, split="validation", ) else: @@ -85,17 +88,22 @@ def process_concatenated_datasets( # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(model_path) return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, kwargs.get("max_text_length", None) + raw_text, + eos, + bos, + tokenizer, + max_sequence_length, + kwargs.get("max_text_length", None), ) def _split_text_by_tokens( - text: List[str], - eos: str, - bos: str, - tokenizer: PreTrainedTokenizerFast, - sequence_length: int, - max_text_length: Union[None, int], + text: List[str], + eos: str, + bos: str, + tokenizer: PreTrainedTokenizerFast, + sequence_length: int, + max_text_length: Union[None, int], ) -> List[str]: """ Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. 
@@ -119,26 +127,23 @@ def _split_text_by_tokens( if max_text_length is None: text = "".join(text) - input_tokens = tokenizer(text, return_tensors="np")[ - "input_ids" - ][0] - elif max_text_length == -1: #per sample tokenization + input_tokens = tokenizer(text, return_tensors="np")["input_ids"][0] + elif max_text_length == -1: # per sample tokenization input_tokens = [] for slice in text: - input_tokens.append(tokenizer(slice, return_tensors="np")[ - "input_ids" - ][0]) + input_tokens.append(tokenizer(slice, return_tensors="np")["input_ids"][0]) input_tokens = numpy.concatenate(input_tokens) else: text = "".join(text) text_slices = len(text) // max_text_length - sliced_text = [text[i*max_text_length:(i+1)*max_text_length] for i in range(text_slices)] - sliced_text.append(text[text_slices*max_text_length:]) + sliced_text = [ + text[i * max_text_length : (i + 1) * max_text_length] + for i in range(text_slices) + ] + sliced_text.append(text[text_slices * max_text_length :]) input_tokens = [] for slice in sliced_text: - input_tokens.append(tokenizer(slice, return_tensors="np")[ - "input_ids" - ][0]) + input_tokens.append(tokenizer(slice, return_tensors="np")["input_ids"][0]) input_tokens = numpy.concatenate(input_tokens) # Then split the tokenized text into sections of size "max_sequence_length" and @@ -165,4 +170,4 @@ def _split_text_by_tokens( ) ) - return split_text \ No newline at end of file + return split_text From ddd0325240eaa9bf58153e18e37b02cec8789b30 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:53:28 -0400 Subject: [PATCH 47/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index d0d17d9bea..2075ea61c3 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -84,7 +84,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - if dataset_name in ["wikitext2", "c4"]: if args.kwargs is None: kwargs = {} @@ -123,7 +122,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): end_evaluation = False dataset_length = len(dataset) for idx, sample in _enumerate_progress(dataset, args.max_samples): - # Collect input sequence if dataset_name == "openai_humaneval": sample = sample["prompt"] + sample["canonical_solution"] From 24a91a32afb3a1bd46130319990f63a74bde4d95 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:56:21 -0400 Subject: [PATCH 48/62] Quality fixes --- .../transformers/pipelines/test_text_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 9acb8f32f5..f7715984d9 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -554,7 +554,6 @@ def _test_composition_same_session_ids( session_id_1, session_id_2, ): - tokenizer = pipeline.tokenizer config = GenerationConfig( output_scores=True, max_length=num_generated_tokens, top_k=0, top_p=0.0 @@ -607,7 +606,6 @@ def _test_output( max_logits_difference_threshold: Optional[float] = None, run_cache_validation: bool = True, ): - ( generated_logits, prompt_logits, @@ -666,7 +664,9 @@ def _test_kv_cache_state( x[:, :, -start_index:-end_index, :], y, atol=_PRECISION ) - def 
test_pipeline_for_ppl_eval(self, ): + def test_pipeline_for_ppl_eval( + self, + ): pipeline = self.get_pipeline( task="text-generation", model_path=self.model_stub, From 301115c142846dab878af6f6fe49ae5419e34fe9 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:00:19 -0400 Subject: [PATCH 49/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 5cd1cf0dfa..610e41a232 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -168,7 +168,11 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( + ( + truncated_onnx_path, + output_names, + _, + ) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From e402da9ddac12b21d2db57fac8014e658c7635a9 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:06:02 -0400 Subject: [PATCH 50/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 4 ++-- src/deepsparse/transformers/metrics.py | 1 + src/deepsparse/transformers/utils/eval_helpers.py | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 2075ea61c3..6bd1cc4175 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -62,19 +62,19 @@ import argparse import json +import logging from cProfile import Profile from pstats import Stats import numpy from tqdm.auto import tqdm -import logging from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets -from datasets import load_dataset, load_metric # isort: skip +from datasets import load_dataset, load_metric _LOGGER = logging.getLogger(__name__) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index cab1cbb5db..c6e2a583f6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -17,6 +17,7 @@ """ from typing import Any, Dict, Optional + import numpy from scipy.special import log_softmax diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 5090a6c42f..3df866bd20 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Mapping, Union + +import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast + from datasets import load_dataset -import numpy -from typing import Mapping, List, Union + CONCATENATED_DATSETS = ["wikitext2", "c4"] From b48e05f9a06c202c4fe97138853aafd8ebf50592 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:16:34 -0400 Subject: [PATCH 51/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 610e41a232..00077ab78e 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -62,8 +62,8 @@ def get_model_onnx_path(model_stubs): onnx_path = model.onnx_model.path model_onnx_paths[model_name] = onnx_path - def _get_model_onnx_path(model_name): - return model_onnx_paths[model_name] + def _get_model_onnx_path(_model_name): + return model_onnx_paths[_model_name] return _get_model_onnx_path From 61b9c5c424220a71ce896d720a985c38c979da0e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:01:34 -0400 Subject: [PATCH 52/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 6bd1cc4175..f9835aa58e 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -69,14 +69,12 @@ import numpy from tqdm.auto import tqdm +from datasets import load_dataset, load_metric from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets -from datasets import load_dataset, load_metric - - _LOGGER = logging.getLogger(__name__) From 34ee8f615c5907c3cdaa0b7afbde41a22ea593aa Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:05:58 -0400 Subject: [PATCH 53/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 00077ab78e..30309ff1be 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import onnx import pytest from deepsparse.transformers.helpers import ( - get_deployment_path, + get_hugging_face_configs, + get_onnx_path, get_transformer_layer_init_names, truncate_transformer_onnx_model, ) @@ -32,8 +35,20 @@ ), ], ) -def test_get_deployment_path(stub): - assert get_deployment_path(stub) +def test_get_onnx_path_and_configs_from_stub(stub): + onnx_path = get_onnx_path(stub) + config_dir, tokenizer_dir = get_hugging_face_configs(stub) + + assert onnx_path.endswith("model.onnx") + assert os.path.exists(onnx_path) + + config_dir_files = os.listdir(config_dir) + assert "config.json" in config_dir_files + + tokenizer_dir_files = os.listdir(tokenizer_dir) + assert "tokenizer.json" in tokenizer_dir_files + # make assert optional if stubs added for models with no known tokenizer_config + assert "tokenizer_config.json" in tokenizer_dir_files @pytest.fixture(scope="session") @@ -62,8 +77,8 @@ def get_model_onnx_path(model_stubs): onnx_path = model.onnx_model.path model_onnx_paths[model_name] = onnx_path - def _get_model_onnx_path(_model_name): - return model_onnx_paths[_model_name] + def _get_model_onnx_path(model_name): + return model_onnx_paths[model_name] return _get_model_onnx_path @@ -168,11 +183,7 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - ( - truncated_onnx_path, - output_names, - _, - ) = truncate_transformer_onnx_model( + (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From b032101b6c5e5a39e47d4be196cfa23feb9fdb25 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:09:49 -0400 Subject: [PATCH 54/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 30309ff1be..c2bf27985a 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -183,7 +183,11 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( + ( + truncated_onnx_path, + output_names, + _, + ) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From f3cbf3dacc969ac821f91a9195826ab122b96eb5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:15:29 -0400 Subject: [PATCH 55/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index c2bf27985a..30309ff1be 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -183,11 +183,7 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - ( - truncated_onnx_path, - output_names, - _, - ) = truncate_transformer_onnx_model( + (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From 483449eb91c5c810a3af3e151a00bdc3bcc35f52 Mon Sep 17 00:00:00 2001 From: 
Alexandre Marques Date: Fri, 20 Oct 2023 14:22:11 -0400 Subject: [PATCH 56/62] Quality fixes --- src/deepsparse/transformers/metrics.py | 14 +++++++------ .../transformers/utils/eval_helpers.py | 21 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index c6e2a583f6..1952ec2155 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -36,7 +36,8 @@ def __init__(self, accumulate: bool = False): Class for computing perplexity. Each batch is processed via the "add_batches" method. - At the end the data is reduced to a single perplexity metric via the "compute" method. + At the end the data is reduced to a single perplexity + metric via the "compute" method. Example: metric = Perplexity() @@ -76,8 +77,8 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ if self._accumulate: - # If accumulate is True, every token from the batch contributes equally to the - # negative log-likelihood. + # If accumulate is True, every token from the batch contributes + # equally to the negative log-likelihood. # Thus, merge batch and sequence length dimensions and compute negative # log-likelihood for all tokens, and accumulate to total predictions = numpy.reshape(predictions, (-1, predictions.shape[-1])) @@ -91,9 +92,10 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): # Track number of tokens processed self._number_tokens += predictions.shape[0] else: - # If accumulate is False, compute perplexity for each sample individually. - # We assume that sequence length is uniform within a batch, but may vary from batch - # to batch. + # If accumulate is False, compute perplexity for + # each sample individually. + # We assume that sequence length is uniform within a batch, + # but may vary from batch to batch. # Create batch dimension if it doesn't exist if targets.ndim == 1: diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 3df866bd20..4c0e68b9de 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -34,13 +34,17 @@ def process_concatenated_datasets( tokenization, have size "max_sequence_length" tokens. Args: - dataset_name (str): The name of the dataset to process. Options: "wikitext2" or "c4". + dataset_name (str): The name of the dataset to process. + Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. kwargs (mapping): Additional keyword arguments. - - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. - - bos (str, optional): The beginning-of-sentence token. Default is "". - - raw_samples (int, optional): The number of raw samples to use. Default is None. + - eos (str, optional): The end-of-sentence token. + Default is "\n\n" for wikitext2 and "" for c4. + - bos (str, optional): The beginning-of-sentence token. + Default is "". + - raw_samples (int, optional): The number of raw samples to use. + Default is None. - data_file (int, optional): The index of the data file to use for dataset. Not used in wikitext2. Default is 0 for c4. - max_text_length (int, optional): The maximum length of text to consider. 
@@ -109,7 +113,8 @@ def _split_text_by_tokens( max_text_length: Union[None, int], ) -> List[str]: """ - Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. + Tokenizes and splits a list of concatenated text samples into + sections of specified maximum token length. Args: text (List[str]): List of concatenated text samples to be tokenized and split. @@ -120,10 +125,12 @@ def _split_text_by_tokens( max_text_length (Union[None, int]): The maximum length of text to consider. - If None, the entire text is tokenized and split. - If -1, each sample is tokenized separately. - - If a positive integer, the text is split into sections of this length before tokenization. + - If a positive integer, the text is split into sections of this + length before tokenization. Returns: - List[str]: A list of sections where each section contains a maximum of "sequence_length" tokens. + List[str]: A list of sections where each section contains a + maximum of "sequence_length" tokens. """ text = [bos + sample + eos for sample in text] From e6e78286ccbbe17e2629fcd6070831ca749e0cde Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 30 Oct 2023 17:42:39 -0400 Subject: [PATCH 57/62] Rebase --- .../pipelines/test_text_generation.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 5df51f9b59..c70c50a5ef 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -19,3 +19,98 @@ import pytest from deepsparse import Pipeline from deepsparse.transformers.utils.helpers import prepends_bos_token + + +@pytest.fixture +def pipeline(): + return Pipeline.create( + task="text_generation", + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ) + + +@pytest.fixture +def prompt(): + return "Never gonna give you up, never gonna let you down" + + +def test_freeze_first_position(pipeline): + # Test whether we should be "freezing" the first token after + # the kv cache is full + assert not prepends_bos_token(pipeline.tokenizer) + + +def test_run_same_prompt_multiple_times(pipeline, prompt): + # Test the scenario, where the same prompt is run multiple times + # Every run should produce the same output + output_1 = pipeline(prompt, output_scores=True) + output_2 = pipeline(prompt, output_scores=True) + + assert output_1.generations[0].text == output_2.generations[0].text + assert numpy.allclose( + output_1.generations[0].score, + output_2.generations[0].score, + atol=1e-3, + ) + + +def test_run_multiple_prompts_in_parallel(pipeline, prompt): + # Test the scenario, where multiple prompts are run in parallel + # Same two prompts should produce the same output + + output = pipeline([prompt, prompt], output_scores=True) + + logits_0 = output.generations[0].score + sequence_0 = output.generations[0].text + + logits_1 = output.generations[1].score + sequence_1 = output.generations[1].text + + assert numpy.allclose(logits_0, logits_1, atol=1e-3) + assert sequence_0 == sequence_1 + + +def test_num_generated_predictions(pipeline, prompt): + # Test the scenario, where multiple predictions are generated + # from the same prompt + + output_sequences = pipeline(prompt, num_return_sequences=2) + + assert len(output_sequences.generations) == 1 + assert len(output_sequences.generations[0]) == 2 + + output_sequences = 
pipeline([prompt, prompt], num_return_sequences=2) + assert len(output_sequences.generations) == 2 + + for generation in output_sequences.generations: + assert len(generation) == 2 + + +def test_token_generation_deterministic(pipeline, prompt): + inference = pipeline(prompt, num_return_sequences=3, do_sample=False) + generations = inference.generations + # Output should be the same from one another + text_outputs = [x.text for x in generations[0]] + assert len(set(text_outputs)) == 1 + + +def test_token_generation_non_deterministic(pipeline, prompt): + + inference = pipeline(prompt, num_return_sequences=3, do_sample=True) + generations = inference.generations + # Output should be different from one another + text_outputs = [x.text for x in generations[0]] + assert len(set(text_outputs)) == 3 + + +def test_streaming_mode_returns_generator(pipeline, prompt): + response_generator = pipeline(prompt, streaming=True) + assert inspect.isgenerator( + response_generator + ), "Pipeline should return a generator in streaming mode" + + assert all( + isinstance(response, pipeline.output_schema) for response in response_generator + ), "Pipeline should return a generator of output_schema \ + objects in streaming mode" From d7c6e5ad79bde07ba2bc34fd0aef3b138332f14b Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 30 Oct 2023 17:43:26 -0400 Subject: [PATCH 58/62] Rebase --- tests/deepsparse/transformers/test_helpers.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 30309ff1be..5cd1cf0dfa 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import onnx import pytest from deepsparse.transformers.helpers import ( - get_hugging_face_configs, - get_onnx_path, + get_deployment_path, get_transformer_layer_init_names, truncate_transformer_onnx_model, ) @@ -35,20 +32,8 @@ ), ], ) -def test_get_onnx_path_and_configs_from_stub(stub): - onnx_path = get_onnx_path(stub) - config_dir, tokenizer_dir = get_hugging_face_configs(stub) - - assert onnx_path.endswith("model.onnx") - assert os.path.exists(onnx_path) - - config_dir_files = os.listdir(config_dir) - assert "config.json" in config_dir_files - - tokenizer_dir_files = os.listdir(tokenizer_dir) - assert "tokenizer.json" in tokenizer_dir_files - # make assert optional if stubs added for models with no known tokenizer_config - assert "tokenizer_config.json" in tokenizer_dir_files +def test_get_deployment_path(stub): + assert get_deployment_path(stub) @pytest.fixture(scope="session") From 21c6f0ddf5e349fd6a2891544d47848f57bd5772 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 10:26:48 -0500 Subject: [PATCH 59/62] Re-add unit test --- .../pipelines/test_text_generation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index c70c50a5ef..1a408fb92b 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,6 +104,29 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 +def test_pipeline_for_ppl_eval(self, ): + pipeline = self.get_pipeline( + task="text-generation", + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + ) + inputs = dict( + prompt=self.prompt, + output_scores=True, + return_input_tokens=True, + fixed_sequences_length=True, + include_prompt_logits=True, + max_length=1, + ) + predictions = pipeline(**inputs) + assert hasattr(predictions, "generations") + assert hasattr(predictions.generations[0], "score") + assert hasattr(predictions.generations[0], "input_tokens") + assert "input_ids" in predictions.generations[0].input_tokens + assert "attention_mask" in predictions.generations[0].input_tokens + + def test_streaming_mode_returns_generator(pipeline, prompt): response_generator = pipeline(prompt, streaming=True) assert inspect.isgenerator( From fa0cb4b416aa360af3984a8b66e224cf027f03a8 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 10:30:07 -0500 Subject: [PATCH 60/62] Style fix --- tests/deepsparse/transformers/pipelines/test_text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 1a408fb92b..74bdf26896 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,7 +104,7 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 -def test_pipeline_for_ppl_eval(self, ): +def test_pipeline_for_ppl_eval(self): pipeline = self.get_pipeline( task="text-generation", model_path=self.model_stub, From bf1b0cf3ac54957427f9fedc523f005f3168a21d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 15:19:51 -0500 Subject: [PATCH 61/62] Update unit test --- 
.../transformers/pipelines/test_text_generation.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 74bdf26896..b304e4a7f2 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,22 +104,15 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 -def test_pipeline_for_ppl_eval(self): - pipeline = self.get_pipeline( - task="text-generation", - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=1, - ) - inputs = dict( - prompt=self.prompt, +def test_pipeline_for_ppl_eval(pipeline, prompt): + predictions = pipeline( + prompt, output_scores=True, return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, max_length=1, ) - predictions = pipeline(**inputs) assert hasattr(predictions, "generations") assert hasattr(predictions.generations[0], "score") assert hasattr(predictions.generations[0], "input_tokens") From 0c618a698d77738f2b41c323cfc95de9833d2448 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 16:06:38 -0500 Subject: [PATCH 62/62] Update unit test --- .../transformers/pipelines/test_text_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index b304e4a7f2..fb25a33883 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -114,10 +114,10 @@ def test_pipeline_for_ppl_eval(pipeline, prompt): max_length=1, ) assert hasattr(predictions, "generations") + assert hasattr(predictions, "input_tokens") assert hasattr(predictions.generations[0], "score") - assert hasattr(predictions.generations[0], "input_tokens") - assert "input_ids" in predictions.generations[0].input_tokens - assert "attention_mask" in predictions.generations[0].input_tokens + assert "input_ids" in predictions.input_tokens + assert "attention_mask" in predictions.input_tokens def test_streaming_mode_returns_generator(pipeline, prompt):
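
With the pieces above in place, perplexity can be computed entirely outside the pipeline: the text generation pipeline hands back the tokenizer output and the prompt logits, and the Perplexity metric only ever sees numpy logits and target token ids. The sketch below walks through that flow for a single prompt. It is illustrative only; the choice of accumulate=True and the padding/shift handling are assumptions, and perplexity_eval in eval_downstream.py remains the reference implementation. The model stub and the prompt are the ones used in the unit tests.

# Minimal sketch of the evaluation flow enabled by this patch series.
# Assumptions (not taken from the patches): accumulate=True and the
# padding/shift handling below; see perplexity_eval for the real masking.
from deepsparse import Pipeline
from deepsparse.transformers.metrics import Perplexity

pipeline = Pipeline.create(
    task="text_generation",
    model_path="hf:mgoin/TinyStories-1M-deepsparse",  # same stub as the tests
    engine_type="onnxruntime",
)

prediction = pipeline(
    "Never gonna give you up, never gonna let you down",
    output_scores=True,            # logits come back as generations[i].score
    return_input_tokens=True,      # tokenizer output: input_ids + attention_mask
    fixed_sequences_length=True,
    include_prompt_logits=True,    # score covers the prompt tokens as well
    max_length=1,                  # single-token generation; only prompt logits matter
)

input_ids = prediction.input_tokens["input_ids"][0].flatten()
attention_mask = prediction.input_tokens["attention_mask"][0].flatten()
logits = prediction.generations[0].score

# Drop padded positions (left padding is assumed here), then shift so the
# logits at position i are scored against the token at position i + 1.
num_tokens = int(attention_mask.sum())
logits = logits[-num_tokens:, :]
input_ids = input_ids[-num_tokens:]

metric = Perplexity(accumulate=True)
metric.add_batch(predictions=logits[:-1, :], targets=input_ids[1:])
print(metric.compute())

Because the metric no longer knows anything about pipelines or tokenizers, the same add_batch and compute loop serves the concatenated wikitext2 and c4 datasets as well as openai_humaneval in eval_downstream.py.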