From 11ca2b5386d6a2b7a3a4de4c91aa75e9c15a6c9c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:33:17 -0400 Subject: [PATCH 01/62] Add input_tokes as optional output --- .../transformers/pipelines/text_generation.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 0b09fe44d7..b6a2348eef 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -60,6 +60,11 @@ class Config: "the logits for the input text sequence and the " "generated text sequence. ", ) + return_input_tokens: bool = Field( + default=False, + description="A flag that indicates whether to return " + "the input_tokens. ", + ) session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " @@ -95,6 +100,13 @@ class TextGenerationOutput(BaseModel): "The logits have dimensions " "[batch_size, sequence_length, vocab_size]", ) + input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays + default=None, + description="The output of the tokenizer." + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", + ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." ) @@ -353,7 +365,10 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.multitoken_engine.session_id = inputs.session_id postprocessing_kwargs = dict( - return_logits=inputs.return_logits, streamer=inputs.streamer + return_logits=inputs.return_logits, + return_input_tokens=inputs.return_input_tokens, + input_tokens=input_tokens, + streamer=inputs.streamer, ) return engine_input, postprocessing_kwargs @@ -371,8 +386,9 @@ def process_engine_outputs( generated_tokens, skip_special_tokens=True ) logits = generated_logits if kwargs.get("return_logits") else None + input_tokens = kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None - return TextGenerationOutput(sequences=sequences, logits=logits) + return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) def engine_forward( self, engine_inputs: List[numpy.ndarray], context: Dict From 530d625022e025b68d237d0096328946e3ae8612 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:33:57 -0400 Subject: [PATCH 02/62] Refactor Perplexity class to only compute perplexity. 
All other task-specific processing is handled elsewhere --- src/deepsparse/transformers/metrics.py | 149 ++++++------------------- 1 file changed, 35 insertions(+), 114 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 6b002e26f6..db6ddc4692 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -16,17 +16,11 @@ Utilities for evaluation metric computation """ - -from itertools import compress -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional import numpy -from tqdm import tqdm import torch -from deepsparse import Pipeline -from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline -from deepsparse.transformers.utils.helpers import pad_to_fixed_length from sklearn.metrics import precision_recall_fscore_support @@ -37,7 +31,7 @@ class Perplexity: - def __init__(self, pipeline: Pipeline, batch_size: int = 16): + def __init__(self): """ Given the pipeline, compute the perplexity of the model on the given text input. @@ -45,126 +39,53 @@ def __init__(self, pipeline: Pipeline, batch_size: int = 16): Code adapted from: https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 - :param pipeline: The pipeline to use for text generation - :param batch_size: The batch size to split the input text into non-overlapping batches """ - if not isinstance(pipeline, TextGenerationPipeline): - raise ValueError( - "Perplexity can only be computed for text generation pipelines" - ) - self._pipeline = pipeline - self._batch_size = batch_size - self._sequence_length = pipeline.sequence_length + self._predictions = None + self._targets = None self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - self.perplexities = [] - - def add_batch(self, predictions: List[str]): + def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ - Run the model on the given input sequences and compute the perplexity. - The resulting perplexity is appended to the list of perplexities. + adds a batch of prediction results to track, should be of shape + (batch_size, num_labels) - :param predictions: The predictions to compute perplexity on + :param predictions: predicted scores from pipeline + :param targets: target values - label column should be 1 if a label is positive + 0 otherwise """ - # tokenize the input text - encodings = self._pipeline.tokenizer( - predictions, - return_attention_mask=True, - max_length=self._sequence_length, - truncation=True, - padding="max_length", - ) - - encoded_texts = encodings["input_ids"] - attention_masks = encodings["attention_mask"] - - for start_index in tqdm(range(0, len(encoded_texts), self._batch_size)): - end_index = min(start_index + self._batch_size, len(encoded_texts)) - encoded_batch = encoded_texts[start_index:end_index] - attention_mask = attention_masks[start_index:end_index] - - # Computing the ground truth labels - - # `encoded_batch` contains sequences of tokens padded - # with tokens from the left side. 
We need to remove - # them and zero-pad from the right side up to the length - # of the longest sequence in the batch - - encoded_batch = [ - list(compress(sequence, attn_mask)) - for (sequence, attn_mask) in zip(encoded_batch, attention_mask) - ] - max_sequence_len = max([len(sequence) for sequence in encoded_batch]) - - encoded_batch = [ - pad_to_fixed_length(numpy.array(sequence), max_sequence_len) - for sequence in encoded_batch - ] - encoded_batch = numpy.stack(encoded_batch) - - # We need to apply the analogous transformation to the attention mask - attention_mask = numpy.array(attention_mask) - attention_mask = [ - list(filter(lambda num: num != 0, mask)) for mask in attention_mask - ] - attention_mask = [ - pad_to_fixed_length(numpy.array(mask), max_sequence_len) - for mask in attention_mask - ] - attention_mask = numpy.stack(attention_mask) - - labels = encoded_batch - - out = self._pipeline( - sequences=predictions, return_logits=True, fixed_sequences_length=True - ) + if predictions.ndim == 1: + predictions = predictions.reshape(1, predictions.shape[0]) + if targets.ndim == 1: + targets = targets.reshape(1, targets.shape[0]) - logits = out.logits - - if not self._pipeline.cache_support_enabled: - # when running inference without cache, we need to apply - # analogous transformations to the logits as we did to the labels - # and attention mask - - # remove "nonsensical" logits for tokens - logits = [ - logit[-attn_mask.sum() :, :] - for (logit, attn_mask) in zip(logits, attention_mask) - ] - # pad logits to max length - logits = [ - pad_to_fixed_length(logit, max_sequence_len) for logit in logits - ] - logits = numpy.stack(logits) - - # shift logits and labels create the input and target for the loss function - shift_logits = logits[:, :-1, :] - shift_labels = labels[:, 1:] - shift_attention_mask_batch = attention_mask[:, 1:] - - # compute perplexity for this batch - perplexity_batch = torch.exp( - ( - self._loss_fct( - torch.tensor(shift_logits.transpose(0, 2, 1)), - torch.tensor(shift_labels), - ) - * torch.tensor(shift_attention_mask_batch) - ).sum(1) - / torch.tensor(shift_attention_mask_batch).sum(1) - ) - self.perplexities.extend(perplexity_batch.numpy().tolist()) + if self._predictions is None: + self._predictions = [predictions] + self._targets = [targets] + else: + self._predictions.append(predictions) + self._targets.append(targets) def compute(self) -> Dict[str, Any]: """ :return: A dictionary containing the mean perplexity and the list of perplexities """ - return { - "mean_perplexity": numpy.mean(self.perplexities), - "perplexities": self.perplexities, - } + # compile results into required str -> float dict + results = {"perplexities": []} + for prediction, target in zip(self._predictions, self._targets): + sample_perplexity = torch.exp( + self._loss_fct( + torch.tensor(prediction.transpose(0, 2, 1)), + torch.tensor(target), + ).mean() + ) + + results["perplexities"].append(sample_perplexity.item()) + + results["mean_perplexity"] = numpy.mean(results["perplexities"]) + + return results class PrecisionRecallF1: From c81692213139a6ac9918d23ad2e2d3c6c9f9ee3c Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 00:45:22 -0400 Subject: [PATCH 03/62] Simplify perplexity evaluation. Evaluation takes place as batch size 1 only, so no need to consider batched execution. 
In addition, use input_tokens from generation pipeline --- .../transformers/eval_downstream.py | 57 ++++++++++++------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index ffe83aa5d0..e39240710d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -75,12 +75,15 @@ from datasets import load_dataset, load_metric # isort: skip -def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): - if args.max_samples: - batch_size = min(batch_size, args.max_samples) - - dataset = load_dataset(dataset_name)["test"] +def perplexity_eval(args, dataset_name="openai_humaneval"): + if dataset_name == "wikitext": + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + dataset = dataset["text"] + else: + dataset = load_dataset(dataset_name, split="test") + # We'll use the text generation pipeline to generate a single token. + # Along with the token, it returns the logits for input sequence text_generation = Pipeline.create( task="text-generation", model_path=args.model_path, @@ -90,22 +93,37 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): prompt_processing_sequence_length=args.max_sequence_length, max_generated_tokens=1, ) - perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) - active_engines = [ - engine - for engine in [text_generation.engine, text_generation.multitoken_engine] - if engine - ] - print("Engine info: ") - [print(f"{engine}\n") for engine in active_engines] - predictions = [] + + # Instantiate perplexity metric + perplexity_metrics = Perplexity() + + # Loop through samples for idx, sample in _enumerate_progress(dataset, args.max_samples): - predictions.append(sample["prompt"] + sample["canonical_solution"]) - if len(predictions) == batch_size: - perplexity_metrics.add_batch(predictions) - predictions = [] + # Collect input sequence + if dataset_name == "openai_humaneval": + sample = sample["prompt"] + sample["canonical_solution"] + + # Perform single token generation + prediction = text_generation( + sequences=sample, + return_logits=True, + return_input_tokens=True, + fixed_sequences_length=True, + ) + + # Need to remove tokens that were masked + input_ids = prediction.input_tokens["input_ids"] + attention_mask = prediction.input_tokens["attention_mask"].flatten() + + logits = numpy.compress(attention_mask, prediction.logits, axis=1)[:, :-1, :] + input_ids = numpy.compress(attention_mask, input_ids, axis=1)[:, 1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity_metrics.add_batch(logits, input_ids) + if args.max_samples and idx >= args.max_samples: break + return perplexity_metrics @@ -474,7 +492,8 @@ def _split_train_val(train_dataset, val_ratio, seed=42): "imdb": imdb_eval, "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, - "openai_humaneval": perplexity_eval, + "openai_humaneval": lambda args: perplexity_eval(args, dataset_name="openai_humaneval"), + "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), } From 5c89d89a4cc812136abdbb641b979221bee0bcb6 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:01:44 -0400 Subject: [PATCH 04/62] Splits wikitext at regular intervals of the same length as the sequence length --- src/deepsparse/transformers/eval_downstream.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index e39240710d..159d5033b5 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,7 +78,11 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = dataset["text"] + dataset = "\n\n".join(dataset["text"]) + dataset = [ + dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] + for i in range(len(dataset) // args.max_sequence_length) + ] else: dataset = load_dataset(dataset_name, split="test") From 5767ca0ea4c390a397c06c4909109fa0571b9054 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:26:14 -0400 Subject: [PATCH 05/62] Add argument for accumulation of negative log likelihood --- src/deepsparse/transformers/metrics.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index db6ddc4692..4bac8bbfd6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -31,7 +31,7 @@ class Perplexity: - def __init__(self): + def __init__(self, accumulate_likelihood: bool = False): """ Given the pipeline, compute the perplexity of the model on the given text input. @@ -44,6 +44,7 @@ def __init__(self): self._predictions = None self._targets = None self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + self._accumulate_likelihood = accumulate_likelihood def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ @@ -72,20 +73,24 @@ def compute(self) -> Dict[str, Any]: and the list of perplexities """ # compile results into required str -> float dict - results = {"perplexities": []} + neg_log_likelihoods = [] for prediction, target in zip(self._predictions, self._targets): - sample_perplexity = torch.exp( + neg_log_likelihoods.append( self._loss_fct( torch.tensor(prediction.transpose(0, 2, 1)), torch.tensor(target), - ).mean() + ).mean().item() ) - results["perplexities"].append(sample_perplexity.item()) - - results["mean_perplexity"] = numpy.mean(results["perplexities"]) - - return results + if self._accumulate_likelihood: + neg_log_likelihood = numpy.mean(neg_log_likelihoods) + return {"perplexity": numpy.exp(neg_log_likelihood)} + else: + perplexities = [numpy.exp(nll) for nll in neg_log_likelihoods] + return { + "perplexities": perplexities, + "mean_perplexity": numpy.mean(perplexities), + } class PrecisionRecallF1: From ec2162e1ffc5de62c598924f86aeb16accc7fcf4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:27:18 -0400 Subject: [PATCH 06/62] Accumulate likelihood for wikitext --- src/deepsparse/transformers/eval_downstream.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 159d5033b5..b8dac3b7fc 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -83,7 +83,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] for i in range(len(dataset) // args.max_sequence_length) ] + accumulate_likelihood = True else: + accumulate_likelihood = False dataset = load_dataset(dataset_name, split="test") # We'll use the text generation 
pipeline to generate a single token. @@ -99,7 +101,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ) # Instantiate perplexity metric - perplexity_metrics = Perplexity() + perplexity_metrics = Perplexity(accumulate_likelihood=accumulate_likelihood) # Loop through samples for idx, sample in _enumerate_progress(dataset, args.max_samples): From a7941ef99bb274add060c1bf7a5f6eb4129092d4 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 22 Aug 2023 01:43:33 -0400 Subject: [PATCH 07/62] Simplification --- src/deepsparse/transformers/eval_downstream.py | 2 +- src/deepsparse/transformers/metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index b8dac3b7fc..311076f49b 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -85,8 +85,8 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ] accumulate_likelihood = True else: - accumulate_likelihood = False dataset = load_dataset(dataset_name, split="test") + accumulate_likelihood = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 4bac8bbfd6..e656ea8ef8 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -43,7 +43,7 @@ def __init__(self, accumulate_likelihood: bool = False): """ self._predictions = None self._targets = None - self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + self._loss_fct = torch.nn.CrossEntropyLoss() self._accumulate_likelihood = accumulate_likelihood def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): @@ -79,7 +79,7 @@ def compute(self) -> Dict[str, Any]: self._loss_fct( torch.tensor(prediction.transpose(0, 2, 1)), torch.tensor(target), - ).mean().item() + ).item() ) if self._accumulate_likelihood: From 3ddd45cc383e504d11bab48fe6ba543d2ae938e3 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 13:38:15 -0400 Subject: [PATCH 08/62] Add support for wikitext-style ppl evaluation --- .../transformers/eval_downstream.py | 120 +++++++++++++----- 1 file changed, 88 insertions(+), 32 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 311076f49b..9570579b2d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -71,22 +71,53 @@ from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 - +from transformers import AutoTokenizer from datasets import load_dataset, load_metric # isort: skip def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": - dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - dataset = "\n\n".join(dataset["text"]) - dataset = [ - dataset[i*args.max_sequence_length:(i+1)*args.max_sequence_length] - for i in range(len(dataset) // args.max_sequence_length) - ] - accumulate_likelihood = True + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + # Dataset is split into sections that contain "max_sequence_length" tokens. 
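# (Slicing by token count rather than by raw characters keeps every section at
# exactly "max_sequence_length" tokens, so each pipeline call scores a full
# context window before the negative log-likelihoods are accumulated.)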
+ # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + raw_text = "\n\n".join(raw_dataset["text"]) + input_tokens = tokenizer( + raw_text, + return_tensors="np", + )["input_ids"][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + dataset = [] + for i in range(len(input_tokens) // args.max_sequence_length): + start = i * args.max_sequence_length + end = (i+1) * args.max_sequence_length + dataset.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i+1) * args.max_sequence_length < len(input_tokens): + start = (i+1) * args.max_sequence_length + end = len(input_tokens) + dataset.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Set perplexity computation to accumulate negative log-likelihood across + # sections + accumulate = True else: dataset = load_dataset(dataset_name, split="test") - accumulate_likelihood = False + accumulate = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence @@ -101,33 +132,54 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): ) # Instantiate perplexity metric - perplexity_metrics = Perplexity(accumulate_likelihood=accumulate_likelihood) + perplexity_metrics = Perplexity(accumulate=accumulate) # Loop through samples + batch_samples = [] + run_inference = False + end_evaluation = False + dataset_length = len(dataset) for idx, sample in _enumerate_progress(dataset, args.max_samples): + # Collect input sequence if dataset_name == "openai_humaneval": sample = sample["prompt"] + sample["canonical_solution"] - - # Perform single token generation - prediction = text_generation( - sequences=sample, - return_logits=True, - return_input_tokens=True, - fixed_sequences_length=True, - ) - - # Need to remove tokens that were masked - input_ids = prediction.input_tokens["input_ids"] - attention_mask = prediction.input_tokens["attention_mask"].flatten() - - logits = numpy.compress(attention_mask, prediction.logits, axis=1)[:, :-1, :] - input_ids = numpy.compress(attention_mask, input_ids, axis=1)[:, 1:] - - # Add predictions (logits) and targets (input_ids) to metric - perplexity_metrics.add_batch(logits, input_ids) - - if args.max_samples and idx >= args.max_samples: + batch_samples.append(sample) + + if args.max_samples and idx == args.max_samples - 1: + run_inference = True + end_evaluation = True + + if (idx + 1) % args.batch_size == 0 or idx == dataset_length - 1: + run_inference = True + + if run_inference: + # Perform single token generation + prediction = text_generation( + sequences=batch_samples, + return_logits=True, + return_input_tokens=True, + fixed_sequences_length=True, + ) + + # Handle one sample at a time to make it simpler for masking + for s in range(len(batch_samples)): + # Need to remove tokens that were masked + input_ids = prediction.input_tokens["input_ids"][s].flatten() + logits = prediction.logits[s] + attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity_metrics.add_batch(logits, input_ids) + + # Reset batch + batch_samples.clear() + run_inference = False + + 
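# A minimal standalone sketch of the mask-and-shift step used a few lines above:
# the logit at position i is scored against the token at position i + 1, so the
# last logit row and the first token id are dropped. Shapes and values here are
# illustrative only, not taken from the pipeline.
import numpy

logits = numpy.random.rand(5, 4)               # [sequence_length, vocab_size]
input_ids = numpy.array([3, 1, 0, 2, 0])       # token ids from the tokenizer
attention_mask = numpy.array([1, 1, 1, 1, 0])  # 0 marks padded positions

# keep only the non-padded positions
logits = numpy.compress(attention_mask, logits, axis=0)
input_ids = numpy.compress(attention_mask, input_ids)

# logits[i] predicts input_ids[i + 1]
predictions = logits[:-1, :]  # [num_tokens - 1, vocab_size]
targets = input_ids[1:]       # [num_tokens - 1]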
if end_evaluation: break return perplexity_metrics @@ -502,7 +554,6 @@ def _split_train_val(train_dataset, val_ratio, seed=42): "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), } - def parse_args(): parser = argparse.ArgumentParser( description="Evaluate a Hugging Face Transformers " @@ -630,7 +681,12 @@ def parse_args(): type=bool, default=False, ) - + parser.add_argument( + "--batch-size", + help="Batch size to evaluate model. Default is 1", + type=int, + default=1, + ) return parser.parse_args() From 756169c0eb15ac9179ddabb084b47c11b39722be Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 13:39:10 -0400 Subject: [PATCH 09/62] Compute batch instead of storing until compute method. This drastically reduced memory requirements --- src/deepsparse/transformers/metrics.py | 116 ++++++++++++++++--------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index e656ea8ef8..394db12813 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -31,65 +31,95 @@ class Perplexity: - def __init__(self, accumulate_likelihood: bool = False): + def __init__(self, accumulate: bool = False): """ - Given the pipeline, compute the perplexity of the model - on the given text input. - - Code adapted from: - https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 - - non-overlapping batches + Class for computing perplexity. """ self._predictions = None self._targets = None - self._loss_fct = torch.nn.CrossEntropyLoss() - self._accumulate_likelihood = accumulate_likelihood + self._accumulate = accumulate + if accumulate: + self._neg_log_likelihood = 0. + self._number_tokens = 0 + else: + self._perplexities = None def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ - adds a batch of prediction results to track, should be of shape - (batch_size, num_labels) - - :param predictions: predicted scores from pipeline - :param targets: target values - label column should be 1 if a label is positive - 0 otherwise + Computes perplexity or negative log-likelihood for each batch + (depending on accumulate argument) + and track results. + + Tracks perplexity or negative log-likelihood since storing + predictions may require a lot of memory. + + :param predictions: predicted scores. + Accepted shapes: + - [batch_size, sequence_length, vocab_size] + - [sequence_length, vocab_size] (batch size = 1) + Note: sequence length has to be uniform within a batch, but not all + batches require the same sequence length + :param targets: target values - index of correct vocabulary entry """ - if predictions.ndim == 1: - predictions = predictions.reshape(1, predictions.shape[0]) - if targets.ndim == 1: - targets = targets.reshape(1, targets.shape[0]) - if self._predictions is None: - self._predictions = [predictions] - self._targets = [targets] + if self._accumulate: + # If accumulate is True, every token from the batch contributes equally to the + # negative log-likelihood. 
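# (Accumulating yields a single corpus-level perplexity,
# exp(total_negative_log_likelihood / total_tokens), in which every token
# carries the same weight; averaging per-sample perplexities would instead
# weight a short sample as heavily as a long one.)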
+ # Thus, merge batch and sequence length dimensions and compute negative + # log-likelihood for all tokens, and accumulate to total + predictions = numpy.reshape(predictions, (-1, predictions.shape[-1])) + targets = targets.flatten() + + # Compute negative log-likelihood and accumulate + self._neg_log_likelihood += torch.nn.functional.cross_entropy( + torch.tensor(predictions), + torch.tensor(targets), + reduction="sum", + ).item() + + # Track number of tokens processed + self._number_tokens += predictions.shape[0] else: - self._predictions.append(predictions) - self._targets.append(targets) + # If accumulate is False, compute perplexity for each sample individually. + # We assume that sequence length is uniform within a batch, but may vary from batch + # to batch. + + # Create batch dimension if it doesn't exist + if targets.ndim == 1: + predictions = numpy.expand_dims(predictions, axis=0) + targets = numpy.expand_dims(targets, axis=0) + + # Compute negative log-likelihoods for batch + neg_log_likelihoods = torch.nn.functional.cross_entropy( + torch.tensor(predictions.transpose(0, 2, 1)), + torch.tensor(targets), + reduction="none", + ).numpy().mean(-1) + + # Compute perplexities for batch + perplexities = numpy.exp(neg_log_likelihoods) + + # Store perplexities + if self._perplexities is None: + self._perplexities = perplexities + else: + self._perplexities = numpy.concatenate((self._perplexities, perplexities)) def compute(self) -> Dict[str, Any]: """ - :return: A dictionary containing the mean perplexity - and the list of perplexities + :return: A dictionary containing the final results. + If accumulate is True, return single perplexity. + Else, return a list of perplexities (one for each sample) + and mean perplexity. """ - # compile results into required str -> float dict - neg_log_likelihoods = [] - for prediction, target in zip(self._predictions, self._targets): - neg_log_likelihoods.append( - self._loss_fct( - torch.tensor(prediction.transpose(0, 2, 1)), - torch.tensor(target), - ).item() - ) - - if self._accumulate_likelihood: - neg_log_likelihood = numpy.mean(neg_log_likelihoods) - return {"perplexity": numpy.exp(neg_log_likelihood)} + + if self._accumulate: + perplexity = numpy.exp(self._neg_log_likelihood / self._number_tokens) + return {"perplexity": perplexity} else: - perplexities = [numpy.exp(nll) for nll in neg_log_likelihoods] return { - "perplexities": perplexities, - "mean_perplexity": numpy.mean(perplexities), + "perplexities": self._perplexities, + "mean_perplexity": numpy.mean(self._perplexities), } From 97b5f1ada20460d4036567912697164c95cd18e5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 16:36:38 -0400 Subject: [PATCH 10/62] Remove torch dependency --- src/deepsparse/transformers/metrics.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 394db12813..418f137e17 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,8 +20,8 @@ import numpy -import torch from sklearn.metrics import precision_recall_fscore_support +from scipy.special import log_softmax __all__ = [ @@ -71,11 +71,7 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = targets.flatten() # Compute negative log-likelihood and accumulate - self._neg_log_likelihood += torch.nn.functional.cross_entropy( - torch.tensor(predictions), - torch.tensor(targets), - 
reduction="sum", - ).item() + self._neg_log_likelihood += _cross_entropy(predictions, targets, reduction="sum").sum() # Track number of tokens processed self._number_tokens += predictions.shape[0] @@ -90,11 +86,7 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = numpy.expand_dims(targets, axis=0) # Compute negative log-likelihoods for batch - neg_log_likelihoods = torch.nn.functional.cross_entropy( - torch.tensor(predictions.transpose(0, 2, 1)), - torch.tensor(targets), - reduction="none", - ).numpy().mean(-1) + neg_log_likelihoods = _cross_entropy(predictions, targets) # Compute perplexities for batch perplexities = numpy.exp(neg_log_likelihoods) @@ -181,3 +173,15 @@ def compute(self) -> Dict[str, float]: results["f1_std"] = f1.std() return results + + +def _cross_entropy(predictions, targets, reduction="mean"): + logp = log_softmax(predictions, axis=-1) + neg_log_likelihoods = -1. * numpy.take_along_axis(logp, numpy.expand_dims(targets, axis=-1), axis=-1) + neg_log_likelihoods = numpy.squeeze(neg_log_likelihoods, axis=-1) + if reduction == "mean": + neg_log_likelihoods = neg_log_likelihoods.mean(axis=-1) + elif reduction == "sum": + neg_log_likelihoods = neg_log_likelihoods.sum(axis=-1) + + return neg_log_likelihoods \ No newline at end of file From 91b592141a1203cf090c20a4e76fff4ed45d9609 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 16:50:16 -0400 Subject: [PATCH 11/62] Move split of dataset into helper function --- .../transformers/eval_downstream.py | 63 ++++++++++--------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 9570579b2d..3ffe310c97 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -81,36 +81,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Dataset is split into sections that contain "max_sequence_length" tokens. 
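# A small self-contained check of the perplexity math behind the torch-free
# _cross_entropy helper above: perplexity is the exponential of the mean
# per-token negative log-likelihood. The logits and targets below are made up.
import numpy
from scipy.special import log_softmax

logits = numpy.random.rand(7, 50)              # [num_tokens, vocab_size]
targets = numpy.random.randint(0, 50, size=7)  # target token ids

# negative log-likelihood of each target token under the softmax of its logits
log_probs = log_softmax(logits, axis=-1)
nll = -log_probs[numpy.arange(len(targets)), targets]

# accumulate=True path: sum token NLLs, then exponentiate the per-token mean
perplexity = numpy.exp(nll.sum() / len(targets))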
# To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(args.model_path) raw_text = "\n\n".join(raw_dataset["text"]) - input_tokens = tokenizer( - raw_text, - return_tensors="np", - )["input_ids"][0] - - # Then split the tokenized text into sections of size "max_sequence_length" and - # decode each section back into text format - dataset = [] - for i in range(len(input_tokens) // args.max_sequence_length): - start = i * args.max_sequence_length - end = (i+1) * args.max_sequence_length - dataset.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - # Handle any leftover tokens - if (i+1) * args.max_sequence_length < len(input_tokens): - start = (i+1) * args.max_sequence_length - end = len(input_tokens) - dataset.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length) # Set perplexity computation to accumulate negative log-likelihood across # sections @@ -539,6 +512,38 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds +def _split_text_by_tokens(text, tokenizer, sequence_length): + input_tokens = tokenizer( + text, + return_tensors="np", + )["input_ids"][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + split_text = [] + for i in range(len(input_tokens) // sequence_length): + start = i * sequence_length + end = (i + 1) * sequence_length + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i + 1) * sequence_length < len(input_tokens): + start = (i + 1) * sequence_length + end = len(input_tokens) + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + return split_text + # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), From 8ef20e793f40c4c9f52fd01c6c204767e9d6753d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 23 Aug 2023 17:57:48 -0400 Subject: [PATCH 12/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 839fefb032..890fe4b4c8 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -141,6 +141,12 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + sequence_length = logits.shape[0] + attention_mask = attention_mask[:sequence_length] + input_ids = input_ids[:sequence_length] + + print(attention_mask.shape) + print(logits.shape) logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] @@ -554,8 +560,12 @@ def _split_text_by_tokens(text, tokenizer, sequence_length): "imdb": imdb_eval, "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, - "openai_humaneval": lambda args: perplexity_eval(args, dataset_name="openai_humaneval"), - "wikitext": lambda args: perplexity_eval(args, dataset_name="wikitext"), + 
"openai_humaneval": lambda args: perplexity_eval( + args, dataset_name="openai_humaneval", + ), + "wikitext": lambda args: perplexity_eval( + args, dataset_name="wikitext", + ), } def parse_args(): From 5a602289c383628270ffd8d739eda16f917ec78a Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 11:02:21 -0400 Subject: [PATCH 13/62] Remove debugging prints --- src/deepsparse/transformers/eval_downstream.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 890fe4b4c8..7861c16ee8 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -145,8 +145,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): attention_mask = attention_mask[:sequence_length] input_ids = input_ids[:sequence_length] - print(attention_mask.shape) - print(logits.shape) logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] From 2559e419d55130d2eaa8fe8c2fa723baf590a6e3 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 11:02:27 -0400 Subject: [PATCH 14/62] Remove debugging prints --- .../transformers/pipelines/text_generation.py | 116 +++++++++--------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 3870f9a873..19de36c182 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -33,7 +33,6 @@ ) from deepsparse.utils.onnx import default_cached_outputs - _LOGGER = logging.getLogger(__name__) __all__ = ["TextGenerationPipeline"] @@ -57,36 +56,36 @@ class Config: return_logits: bool = Field( default=False, description="A flag that indicates whether to return " - "the logits for the input text sequence and the " - "generated text sequence. ", + "the logits for the input text sequence and the " + "generated text sequence. ", ) return_input_tokens: bool = Field( default=False, description="A flag that indicates whether to return " - "the input_tokens. ", + "the input_tokens. ", ) session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " - "for the kv cache session. If None, " - "and the model is using kv cache, it " - "will be set to a random uuid.", + "for the kv cache session. If None, " + "and the model is using kv cache, it " + "will be set to a random uuid.", ) fixed_sequences_length: bool = Field( default=False, description="A flag that indicates whether to modify " - "(pad or truncate) each input text sequence, so that " - "its tokenized length is equal to `sequence_length` " - "of tokens. Useful, when a batch of predictions needs " - "to have consistent length so one " - "can compute metric in a batched fashion. ", + "(pad or truncate) each input text sequence, so that " + "its tokenized length is equal to `sequence_length` " + "of tokens. Useful, when a batch of predictions needs " + "to have consistent length so one " + "can compute metric in a batched fashion. ", ) streamer: Optional[TextStreamer] = Field( default=None, description="Streamer object that will be used to stream the " - "generated sequences. Generated tokens are passed through " - "`streamer.put(token_ids)` and the streamer is responsible " - "for any further processing.", + "generated sequences. 
Generated tokens are passed through " + "`streamer.put(token_ids)` and the streamer is responsible " + "for any further processing.", ) @@ -97,15 +96,15 @@ class TextGenerationOutput(BaseModel): logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility default=None, description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + "The logits have dimensions " + "[batch_size, sequence_length, vocab_size]", ) input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays default=None, description="The output of the tokenizer." - "Dictionary containing token_ids and attention_mask, " - "both mapping to arrays of size " - "[batch_size, sequence_length]", + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." @@ -147,14 +146,14 @@ class TextGenerationPipeline(TransformersPipeline): """ def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, - prompt_processing_sequence_length: int = 64, - force_max_tokens: bool = False, - use_deepsparse_cache: bool = True, - **kwargs, + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_processing_sequence_length: int = 64, + force_max_tokens: bool = False, + use_deepsparse_cache: bool = True, + **kwargs, ): kwargs_engine_type = kwargs.get("engine_type", DEEPSPARSE_ENGINE) @@ -202,7 +201,7 @@ def __init__( self.engine, self.multitoken_engine = self.initialize_engines() def initialize_engines( - self, + self, ) -> Tuple[Optional[NLDecoderEngine], Optional[NLDecoderEngine]]: """ Inititalizes a pair of engines for the pipeline. @@ -227,9 +226,9 @@ def initialize_engines( if self.cache_support_enabled: if ( - self.engine_type == DEEPSPARSE_ENGINE - and self.sequence_length <= self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + self.engine_type == DEEPSPARSE_ENGINE + and self.sequence_length <= self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): raise ValueError( "Attempting to initialize auxiliary DeepSparse engine to " @@ -257,9 +256,8 @@ def initialize_engines( ) if ( - self.cache_support_enabled and self.enable_multitoken_prefill + self.cache_support_enabled and self.enable_multitoken_prefill ) or not self.cache_support_enabled: - multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, engine_type=self.engine_type, @@ -268,7 +266,9 @@ def initialize_engines( sampling_temperature=self.sampling_temperature, deterministic=self.deterministic, sequence_length=self.sequence_length, - input_ids_length=self.prompt_processing_sequence_length, + input_ids_length=self.prompt_processing_sequence_length + if self.cache_support_enabled + else self.sequence_length, tokenizer=self.tokenizer, use_deepsparse_cache=self.use_deepsparse_cache, ) @@ -288,13 +288,13 @@ def initialize_engines( ) assert (engine is not None) or ( - multitoken_engine is not None + multitoken_engine is not None ), "At least one of the engines must be initialized for the pipeline!" 
return engine, multitoken_engine @staticmethod def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs ) -> Pipeline: """ This method is used to route the input to the correct pipeline. @@ -386,7 +386,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: return engine_input, postprocessing_kwargs def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[numpy.ndarray], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. @@ -404,7 +404,7 @@ def process_engine_outputs( return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict + self, engine_inputs: List[numpy.ndarray], context: Dict ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. @@ -456,8 +456,8 @@ def engine_forward( streamer.put(numpy.array([token])) if ( - token == self.tokenizer.eos_token_id - and not self.force_max_tokens + token == self.tokenizer.eos_token_id + and not self.force_max_tokens ): break @@ -469,7 +469,7 @@ def engine_forward( ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, engine_inputs: List[numpy.ndarray] ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -495,8 +495,8 @@ def prompt_inference( self._reset_engines_cache() if ( - len(tokens) > self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + len(tokens) > self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): for engine_inputs in self.engine_inputs_for_prefill(tokens): new_token, new_logits = self.multitoken_engine(engine_inputs) @@ -513,7 +513,7 @@ def prompt_inference( for token in tokens[num_tokens_processed:]: run_tokens.append(token) with self.timer_manager.current.time( - _TextGenerationTimings.PROMPT_PREFILL_SINGLE + _TextGenerationTimings.PROMPT_PREFILL_SINGLE ): new_token, new_logits = self.autoregressive_inference(run_tokens) @@ -524,8 +524,8 @@ def prompt_inference( return tokens, prompt_logits def autoregressive_inference( - self, - tokens: List[int], + self, + tokens: List[int], ) -> Tuple[int, numpy.ndarray]: """ An inference run that processes the last token to generate @@ -563,7 +563,7 @@ def autoregressive_inference( return generated_token, generated_logits def engine_inputs_for_prefill( - self, tokens: List[int] + self, tokens: List[int] ) -> Generator[List[numpy.ndarray], None, None]: """ Takes a list of tokens and creates a generator @@ -601,9 +601,9 @@ def engine_inputs_for_prefill( token_batches = [ tokens[ - i - * self.prompt_processing_sequence_length : (i + 1) - * self.prompt_processing_sequence_length + i + * self.prompt_processing_sequence_length: (i + 1) + * self.prompt_processing_sequence_length ] for i in range(0, num_batches) ] @@ -623,13 +623,13 @@ def engine_inputs_for_prefill( # fill it out with 1s (from the right), so that the number # of unmasked entries is equal to the sum of: engine_input[ - :, - -( - # ...the number of current input tokens... + :, + -( + # ...the number of current input tokens... 
self.prompt_processing_sequence_length # ...and the number of the previous cache entries + num_cached_entries - ) :, + ):, ] = 1 elif name == "causal_mask": # delay creation of the causal mask @@ -646,8 +646,8 @@ def engine_inputs_for_prefill( num_cached_entries + self.prompt_processing_sequence_length, ) - .reshape(1, -1) - .astype(numpy.int64) + .reshape(1, -1) + .astype(numpy.int64) ) engine_inputs.append(engine_input) @@ -670,7 +670,7 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int + self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int ) -> List[numpy.ndarray]: """ Takes a list of outputs (batches) from the engine From 3b7e14ba60c5c5c75f399043deba3346dd2673fb Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Thu, 24 Aug 2023 14:16:29 -0400 Subject: [PATCH 15/62] Incorporate fixes for kv-cache --- .../transformers/pipelines/text_generation.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 19de36c182..95a12ef75b 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -489,20 +489,17 @@ def prompt_inference( new_token = None num_tokens_processed = 0 - # clean the state of engines' cache - # in the future, this will be paired with the session ids - # to refrain from resetting if session id is being passed - self._reset_engines_cache() - if ( len(tokens) > self.prompt_processing_sequence_length and self.enable_multitoken_prefill ): + self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): new_token, new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed += self.prompt_processing_sequence_length prompt_logits.append(new_logits) + self.engine.reset_kv_cache() if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) @@ -730,7 +727,3 @@ def causal_mask_input_present(model_path: str) -> bool: inp.name == "causal_mask" for inp in onnx.load(model_path, load_external_data=False).graph.input ) - - def _reset_engines_cache(self): - self.engine.reset_kv_cache() - self.multitoken_engine.reset_kv_cache() if self.multitoken_engine else None From b5f845b2b6f328d24d48e03a85b3ff3d37309f42 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 15:50:42 -0400 Subject: [PATCH 16/62] Include doc string for accumulate --- src/deepsparse/transformers/metrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 4e195c537c..d6e7d78c3f 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -34,6 +34,10 @@ class Perplexity: def __init__(self, accumulate: bool = False): """ Class for computing perplexity. + + :param accumulate: If True, accumulate negative log-likelihood + over samples. If False, perplexity is computed separately + for each sampled and then averaged in the end. 
""" self._predictions = None self._targets = None From 6f3b2461f9d26a2b146517a7e3563d471bcddcbb Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 15:51:02 -0400 Subject: [PATCH 17/62] Add support to trust-remote-code arguments --- src/deepsparse/transformers/eval_downstream.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 7861c16ee8..e17d635652 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -101,6 +101,8 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, + trust_remote_code=args.trust_remote_code, + batch_size=args.batch_size, ) # Instantiate perplexity metric @@ -695,10 +697,16 @@ def parse_args(): ) parser.add_argument( "--batch-size", - help="Batch size to evaluate model. Default is 1", + help="Batch size with which to evaluate model. Default is 1", type=int, default=1, ) + parser.add_argument( + "--trust-remote-code", + help="Whether to allow for remote code execution in transformers.", + type=bool, + default=False, + ) return parser.parse_args() From 2056ec50e527d95c350549c76250c719b1af3905 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 25 Aug 2023 18:18:53 -0400 Subject: [PATCH 18/62] Add support to c4 --- .../transformers/eval_downstream.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index e17d635652..eec697ca48 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,19 +78,29 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = "\n\n".join(raw_dataset["text"]) + max_token_length = None + elif dataset_name == "c4": + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + ) + raw_text = " ".join(raw_dataset[:1100]["text"]) + max_token_length = 256 * args.max_sequence_length + else: + dataset = load_dataset(dataset_name, split="test") + if dataset_name in ["wikitext", "c4"]: # Dataset is split into sections that contain "max_sequence_length" tokens. # To split the dataset, first tokenize text - raw_text = "\n\n".join(raw_dataset["text"]) tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length) + dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length, max_token_length) # Set perplexity computation to accumulate negative log-likelihood across # sections accumulate = True - else: - dataset = load_dataset(dataset_name, split="test") - accumulate = False # We'll use the text generation pipeline to generate a single token. 
# Along with the token, it returns the logits for input sequence @@ -517,12 +527,15 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _split_text_by_tokens(text, tokenizer, sequence_length): +def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): input_tokens = tokenizer( text, return_tensors="np", )["input_ids"][0] + if max_token_length is not None: + input_tokens = input_tokens[:max_token_length] + # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format split_text = [] @@ -566,6 +579,9 @@ def _split_text_by_tokens(text, tokenizer, sequence_length): "wikitext": lambda args: perplexity_eval( args, dataset_name="wikitext", ), + "c4": lambda args: perplexity_eval( + args, dataset_name="c4", + ), } def parse_args(): From 858bee67ab5e52627df4a3869518379a12b17922 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 28 Aug 2023 09:10:42 +0000 Subject: [PATCH 19/62] add a missing include_prompt_logits param --- .../transformers/eval_downstream.py | 28 ++-- src/deepsparse/transformers/metrics.py | 17 ++- .../transformers/pipelines/text_generation.py | 124 +++++++++--------- 3 files changed, 95 insertions(+), 74 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index eec697ca48..31a25b0197 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,15 +67,18 @@ import numpy from tqdm.auto import tqdm +from transformers import AutoTokenizer from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 -from transformers import AutoTokenizer + from datasets import load_dataset, load_metric # isort: skip def perplexity_eval(args, dataset_name="openai_humaneval"): + accumulate = False + if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") raw_text = "\n\n".join(raw_dataset["text"]) @@ -96,7 +99,9 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Dataset is split into sections that contain "max_sequence_length" tokens. 
# To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens(raw_text, tokenizer, args.max_sequence_length, max_token_length) + dataset = _split_text_by_tokens( + raw_text, tokenizer, args.max_sequence_length, max_token_length + ) # Set perplexity computation to accumulate negative log-likelihood across # sections @@ -144,6 +149,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_logits=True, return_input_tokens=True, fixed_sequences_length=True, + include_prompt_logits=True, ) # Handle one sample at a time to make it simpler for masking @@ -528,10 +534,9 @@ def _split_train_val(train_dataset, val_ratio, seed=42): def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): - input_tokens = tokenizer( - text, - return_tensors="np", - )["input_ids"][0] + input_tokens = tokenizer(text, return_tensors="np",)[ + "input_ids" + ][0] if max_token_length is not None: input_tokens = input_tokens[:max_token_length] @@ -562,6 +567,7 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): return split_text + # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), @@ -574,16 +580,20 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, "openai_humaneval": lambda args: perplexity_eval( - args, dataset_name="openai_humaneval", + args, + dataset_name="openai_humaneval", ), "wikitext": lambda args: perplexity_eval( - args, dataset_name="wikitext", + args, + dataset_name="wikitext", ), "c4": lambda args: perplexity_eval( - args, dataset_name="c4", + args, + dataset_name="c4", ), } + def parse_args(): parser = argparse.ArgumentParser( description="Evaluate a Hugging Face Transformers " diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index d6e7d78c3f..71683d6116 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,8 +20,8 @@ import numpy -from sklearn.metrics import precision_recall_fscore_support from scipy.special import log_softmax +from sklearn.metrics import precision_recall_fscore_support __all__ = [ @@ -43,7 +43,7 @@ def __init__(self, accumulate: bool = False): self._targets = None self._accumulate = accumulate if accumulate: - self._neg_log_likelihood = 0. 
+ self._neg_log_likelihood = 0.0 self._number_tokens = 0 else: self._perplexities = None @@ -75,7 +75,9 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): targets = targets.flatten() # Compute negative log-likelihood and accumulate - self._neg_log_likelihood += _cross_entropy(predictions, targets, reduction="sum").sum() + self._neg_log_likelihood += _cross_entropy( + predictions, targets, reduction="sum" + ).sum() # Track number of tokens processed self._number_tokens += predictions.shape[0] @@ -99,7 +101,9 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): if self._perplexities is None: self._perplexities = perplexities else: - self._perplexities = numpy.concatenate((self._perplexities, perplexities)) + self._perplexities = numpy.concatenate( + (self._perplexities, perplexities) + ) def compute(self) -> Dict[str, Any]: """ @@ -181,7 +185,9 @@ def compute(self) -> Dict[str, float]: def _cross_entropy(predictions, targets, reduction="mean"): logp = log_softmax(predictions, axis=-1) - neg_log_likelihoods = -1. * numpy.take_along_axis(logp, numpy.expand_dims(targets, axis=-1), axis=-1) + neg_log_likelihoods = -1.0 * numpy.take_along_axis( + logp, numpy.expand_dims(targets, axis=-1), axis=-1 + ) neg_log_likelihoods = numpy.squeeze(neg_log_likelihoods, axis=-1) if reduction == "mean": neg_log_likelihoods = neg_log_likelihoods.mean(axis=-1) @@ -189,4 +195,3 @@ def _cross_entropy(predictions, targets, reduction="mean"): neg_log_likelihoods = neg_log_likelihoods.sum(axis=-1) return neg_log_likelihoods - \ No newline at end of file diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index d3864dcd85..f4a0638c3d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -44,6 +44,7 @@ ) from deepsparse.utils.onnx import default_cached_outputs + _LOGGER = logging.getLogger(__name__) __all__ = ["TextGenerationPipeline"] @@ -67,13 +68,12 @@ class Config: return_logits: bool = Field( default=False, description="A flag that indicates whether to return " - "the logits for the input text sequence and the " - "generated text sequence. ", + "the logits for the input text sequence and the " + "generated text sequence. ", ) return_input_tokens: bool = Field( default=False, - description="A flag that indicates whether to return " - "the input_tokens. ", + description="A flag that indicates whether to return " "the input_tokens. ", ) include_prompt_logits: bool = Field( default=False, @@ -86,25 +86,25 @@ class Config: session_id: Optional[str] = Field( default=None, description="A user may set a string identifier " - "for the kv cache session. If None, " - "and the model is using kv cache, it " - "will be set to a random uuid.", + "for the kv cache session. If None, " + "and the model is using kv cache, it " + "will be set to a random uuid.", ) fixed_sequences_length: bool = Field( default=False, description="A flag that indicates whether to modify " - "(pad or truncate) each input text sequence, so that " - "its tokenized length is equal to `sequence_length` " - "of tokens. Useful, when a batch of predictions needs " - "to have consistent length so one " - "can compute metric in a batched fashion. ", + "(pad or truncate) each input text sequence, so that " + "its tokenized length is equal to `sequence_length` " + "of tokens. 
Useful, when a batch of predictions needs " + "to have consistent length so one " + "can compute metric in a batched fashion. ", ) streamer: Optional[TextStreamer] = Field( default=None, description="Streamer object that will be used to stream the " - "generated sequences. Generated tokens are passed through " - "`streamer.put(token_ids)` and the streamer is responsible " - "for any further processing.", + "generated sequences. Generated tokens are passed through " + "`streamer.put(token_ids)` and the streamer is responsible " + "for any further processing.", ) callback: Optional[Callable[[Any], Union[bool, Any]]] = Field( default=None, @@ -128,15 +128,17 @@ class TextGenerationOutput(BaseModel): logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility default=None, description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + "The logits have dimensions " + "[batch_size, sequence_length, vocab_size]", ) - input_tokens: Optional[Any] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays + input_tokens: Optional[ + Any + ] = Field( # dictionary mapping "token_ids" and "attention_mask" to numpy arrays default=None, description="The output of the tokenizer." - "Dictionary containing token_ids and attention_mask, " - "both mapping to arrays of size " - "[batch_size, sequence_length]", + "Dictionary containing token_ids and attention_mask, " + "both mapping to arrays of size " + "[batch_size, sequence_length]", ) session_id: Optional[str] = Field( default=None, description="A string identifier for the kv cache session." @@ -178,14 +180,14 @@ class TextGenerationPipeline(TransformersPipeline): """ def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, - prompt_processing_sequence_length: int = 64, - force_max_tokens: bool = False, - use_deepsparse_cache: bool = True, - **kwargs, + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_processing_sequence_length: int = 64, + force_max_tokens: bool = False, + use_deepsparse_cache: bool = True, + **kwargs, ): kwargs_engine_type = kwargs.get("engine_type", DEEPSPARSE_ENGINE) @@ -233,7 +235,7 @@ def __init__( self.engine, self.multitoken_engine = self.initialize_engines() def initialize_engines( - self, + self, ) -> Tuple[Optional[NLDecoderEngine], Optional[NLDecoderEngine]]: """ Inititalizes a pair of engines for the pipeline. @@ -258,9 +260,9 @@ def initialize_engines( if self.cache_support_enabled: if ( - self.engine_type == DEEPSPARSE_ENGINE - and self.sequence_length <= self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + self.engine_type == DEEPSPARSE_ENGINE + and self.sequence_length <= self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): raise ValueError( "Attempting to initialize auxiliary DeepSparse engine to " @@ -288,7 +290,7 @@ def initialize_engines( ) if ( - self.cache_support_enabled and self.enable_multitoken_prefill + self.cache_support_enabled and self.enable_multitoken_prefill ) or not self.cache_support_enabled: multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, @@ -320,13 +322,13 @@ def initialize_engines( ) assert (engine is not None) or ( - multitoken_engine is not None + multitoken_engine is not None ), "At least one of the engines must be initialized for the pipeline!" 
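To make the engine pairing returned by initialize_engines easier to follow, here is a rough, hedged sketch of how the prompt is later divided between the two engines in prompt_inference (shown further down in this file): full chunks of prompt_processing_sequence_length tokens go through the multitoken prefill engine, and any remainder runs token by token through the single-token autoregressive engine. Engine objects are stubbed as plain callables and kv-cache handling is elided.

def run_prompt(tokens, chunk_len, multitoken_engine, engine):
    # hedged sketch only; the real method also transfers kv-cache state
    prompt_logits = []
    processed = 0
    if len(tokens) > chunk_len:
        # full chunks go through the multitoken (prefill) engine
        while processed + chunk_len <= len(tokens):
            prompt_logits.append(multitoken_engine(tokens[processed:processed + chunk_len]))
            processed += chunk_len
    # the remainder (or a short prompt) is fed to the single-token engine,
    # with the running token list growing one token at a time
    run_tokens = list(tokens[:processed])
    for token in tokens[processed:]:
        run_tokens.append(token)
        prompt_logits.append(engine(run_tokens))
    return prompt_logits


# toy "engines" that just report what they were given
out = run_prompt(list(range(10)), 4, lambda chunk: ("prefill", len(chunk)), lambda ts: ("decode", len(ts)))
print(out)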
return engine, multitoken_engine @staticmethod def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs ) -> Pipeline: """ This method is used to route the input to the correct pipeline. @@ -421,7 +423,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: return engine_input, postprocessing_kwargs def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[numpy.ndarray], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. @@ -434,12 +436,16 @@ def process_engine_outputs( generated_tokens, skip_special_tokens=True ) logits = generated_logits if kwargs.get("return_logits") else None - input_tokens = kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None + input_tokens = ( + kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None + ) - return TextGenerationOutput(sequences=sequences, logits=logits, input_tokens=input_tokens) + return TextGenerationOutput( + sequences=sequences, logits=logits, input_tokens=input_tokens + ) def engine_forward( - self, engine_inputs: List[numpy.ndarray], context: Dict + self, engine_inputs: List[numpy.ndarray], context: Dict ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. @@ -499,8 +505,8 @@ def engine_forward( streamer.put(numpy.array([token])) if ( - token == self.tokenizer.eos_token_id - and not self.force_max_tokens + token == self.tokenizer.eos_token_id + and not self.force_max_tokens ): break @@ -526,7 +532,7 @@ def engine_forward( ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, engine_inputs: List[numpy.ndarray] ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -547,8 +553,8 @@ def prompt_inference( num_tokens_processed = 0 if ( - len(tokens) > self.prompt_processing_sequence_length - and self.enable_multitoken_prefill + len(tokens) > self.prompt_processing_sequence_length + and self.enable_multitoken_prefill ): self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): @@ -567,7 +573,7 @@ def prompt_inference( for token in tokens[num_tokens_processed:]: run_tokens.append(token) with self.timer_manager.current.time( - _TextGenerationTimings.PROMPT_PREFILL_SINGLE + _TextGenerationTimings.PROMPT_PREFILL_SINGLE ): new_token, new_logits = self.autoregressive_inference(run_tokens) @@ -578,8 +584,8 @@ def prompt_inference( return tokens, prompt_logits def autoregressive_inference( - self, - tokens: List[int], + self, + tokens: List[int], ) -> Tuple[int, numpy.ndarray]: """ An inference run that processes the last token to generate @@ -617,7 +623,7 @@ def autoregressive_inference( return generated_token, generated_logits def engine_inputs_for_prefill( - self, tokens: List[int] + self, tokens: List[int] ) -> Generator[List[numpy.ndarray], None, None]: """ Takes a list of tokens and creates a generator @@ -655,9 +661,9 @@ def engine_inputs_for_prefill( token_batches = [ tokens[ - i - * self.prompt_processing_sequence_length: (i + 1) - * self.prompt_processing_sequence_length + i + * self.prompt_processing_sequence_length : (i + 1) + * self.prompt_processing_sequence_length ] for i in range(0, num_batches) ] @@ -677,13 +683,13 @@ def engine_inputs_for_prefill( # fill it out with 1s (from the right), so that the number # of unmasked 
entries is equal to the sum of: engine_input[ - :, - -( - # ...the number of current input tokens... + :, + -( + # ...the number of current input tokens... self.prompt_processing_sequence_length # ...and the number of the previous cache entries + num_cached_entries - ):, + ) :, ] = 1 elif name == "causal_mask": # delay creation of the causal mask @@ -700,8 +706,8 @@ def engine_inputs_for_prefill( num_cached_entries + self.prompt_processing_sequence_length, ) - .reshape(1, -1) - .astype(numpy.int64) + .reshape(1, -1) + .astype(numpy.int64) ) engine_inputs.append(engine_input) @@ -724,7 +730,7 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int + self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int ) -> List[numpy.ndarray]: """ Takes a list of outputs (batches) from the engine From 4f6eb6b0c148ab7860a618598fbfb936f6273450 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 28 Aug 2023 10:51:25 -0400 Subject: [PATCH 20/62] Remove unnecessary capping at sequence length (it's incorrect for cached models) --- src/deepsparse/transformers/eval_downstream.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 31a25b0197..6ad46ba26e 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -82,7 +82,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name == "wikitext": raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") raw_text = "\n\n".join(raw_dataset["text"]) - max_token_length = None elif dataset_name == "c4": raw_dataset = load_dataset( "allenai/c4", @@ -91,7 +90,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): split="validation", ) raw_text = " ".join(raw_dataset[:1100]["text"]) - max_token_length = 256 * args.max_sequence_length else: dataset = load_dataset(dataset_name, split="test") @@ -100,7 +98,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(args.model_path) dataset = _split_text_by_tokens( - raw_text, tokenizer, args.max_sequence_length, max_token_length + raw_text, tokenizer, args.max_sequence_length, ) # Set perplexity computation to accumulate negative log-likelihood across @@ -159,10 +157,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() - sequence_length = logits.shape[0] - attention_mask = attention_mask[:sequence_length] - input_ids = input_ids[:sequence_length] - logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] @@ -538,9 +532,6 @@ def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): "input_ids" ][0] - if max_token_length is not None: - input_tokens = input_tokens[:max_token_length] - # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format split_text = [] From ab757d0c3b52dccde3ed982e395faf9c4f74ba27 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 29 Aug 2023 09:43:18 -0400 Subject: [PATCH 21/62] Simplify processing for concatenated datasets --- 
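The masking and shifting done in perplexity_eval above (PATCH 20) is the usual next-token alignment: padded positions are dropped via the attention mask, the last logit row has no following token to score, and the first input id has no preceding prediction. A small self-contained illustration with toy shapes, not data from an actual run:

import numpy

vocab_size = 4
attention_mask = numpy.array([0, 0, 1, 1, 1])    # two left-padded positions
input_ids = numpy.array([0, 0, 2, 3, 1])
logits = numpy.random.rand(5, vocab_size)        # [sequence_length, vocab_size]

# keep only un-padded rows; the last logit row has no next token to score
logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :]
# keep only un-padded ids; the first id has no preceding prediction
input_ids = numpy.compress(attention_mask, input_ids)[1:]

# logits[i] is now scored against input_ids[i]
print(logits.shape, input_ids.shape)             # (2, 4) (2,)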
.../transformers/eval_downstream.py | 89 ++++++++++++++----- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 6ad46ba26e..f5c2c19b69 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -77,33 +77,24 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - accumulate = False - - if dataset_name == "wikitext": - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - raw_text = "\n\n".join(raw_dataset["text"]) - elif dataset_name == "c4": - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, - split="validation", - ) - raw_text = " ".join(raw_dataset[:1100]["text"]) - else: - dataset = load_dataset(dataset_name, split="test") if dataset_name in ["wikitext", "c4"]: - # Dataset is split into sections that contain "max_sequence_length" tokens. - # To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(args.model_path) - dataset = _split_text_by_tokens( - raw_text, tokenizer, args.max_sequence_length, + if args.kwargs is None: + kwargs = {} + else: + kwargs = json.loads(args.kwargs) + dataset = _process_concatenated_datasets( + dataset_name, + args.model_path, + args.max_sequence_length, + kwargs, ) - # Set perplexity computation to accumulate negative log-likelihood across # sections accumulate = True + else: + dataset = load_dataset(dataset_name, split="test") + accumulate = False # We'll use the text generation pipeline to generate a single token. # Along with the token, it returns the logits for input sequence @@ -157,8 +148,17 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = prediction.logits[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() + effective_sequence_length = logits.shape[0] + + input_ids = input_ids[-effective_sequence_length:] + attention_mask = attention_mask[-effective_sequence_length:] + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] + #print(logits[:,0], flush=True) + #print(attention_mask) + #if idx == 1: + # exit() # Add predictions (logits) and targets (input_ids) to metric perplexity_metrics.add_batch(logits, input_ids) @@ -527,7 +527,46 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _split_text_by_tokens(text, tokenizer, sequence_length, max_token_length): +def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): + if dataset_name == "wikitext": + eos = kwargs.get("eos", "\n\n") + bos = kwargs.get("bos", "") + + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = raw_dataset["text"] + elif dataset_name == "c4": + eos = kwargs.get("eos", "<|endoftext|>") + bos = kwargs.get("bos", "") + raw_samples = kwargs.get("raw_samples", None) + data_file = kwargs.get("data_file", 0) + if data_file is not None: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + split="validation", + ) + else: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + split="validation", + ) + if raw_samples is not None: + raw_dataset = raw_dataset[:raw_samples] + raw_text = raw_dataset["text"] + + # Dataset is split into sections that contain 
"max_sequence_length" tokens. + # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(model_path) + return _split_text_by_tokens( + raw_text, eos, bos, tokenizer, max_sequence_length, + ) + + +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): + text = "".join([bos + sample + eos for sample in text]) + input_tokens = tokenizer(text, return_tensors="np",)[ "input_ids" ][0] @@ -724,6 +763,12 @@ def parse_args(): type=bool, default=False, ) + parser.add_argument( + "--kwargs", + help="Additional arguments specific to each dataset", + type=str, + default=None, + ) return parser.parse_args() From f21eaf3a92b13195a63f318a9a5e00b7a0f95601 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 09:28:50 -0400 Subject: [PATCH 22/62] Fix kv cache update --- src/deepsparse/transformers/pipelines/text_generation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index b30940bac8..36c85d217f 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -577,6 +577,8 @@ def prompt_inference( if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) + else: + self.engine.reset_kv_cache() # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed] From 2a18c457c235d732687e28459143d038891c69e1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 10:00:09 -0400 Subject: [PATCH 23/62] Fix kv cache update --- src/deepsparse/transformers/pipelines/text_generation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 36c85d217f..e34fbe5d5e 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -573,7 +573,6 @@ def prompt_inference( num_tokens_processed += self.prompt_processing_sequence_length prompt_logits.append(new_logits) - self.engine.reset_kv_cache() if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine self.engine.transfer_cache_state(cache=self.multitoken_engine.kv_cache) From 7e8da1c24abfd652d906773b9101742e177fd87e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 1 Sep 2023 17:11:39 -0400 Subject: [PATCH 24/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index f5c2c19b69..9ab7dd8b7d 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -107,6 +107,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): max_generated_tokens=1, trust_remote_code=args.trust_remote_code, batch_size=args.batch_size, + use_deepsparse_cache=True, ) # Instantiate perplexity metric @@ -155,10 +156,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] input_ids = numpy.compress(attention_mask, input_ids)[1:] - #print(logits[:,0], flush=True) - #print(attention_mask) - 
#if idx == 1: - # exit() # Add predictions (logits) and targets (input_ids) to metric perplexity_metrics.add_batch(logits, input_ids) From 1f9c35881710aef06c323b7e5679264656e53b01 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 10:03:23 -0400 Subject: [PATCH 25/62] remove batch size from pipeline instantiation --- src/deepsparse/transformers/eval_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 9ab7dd8b7d..2a6a6c43e9 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -106,7 +106,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): sequence_length=args.max_sequence_length, max_generated_tokens=1, trust_remote_code=args.trust_remote_code, - batch_size=args.batch_size, use_deepsparse_cache=True, ) From 099b3660b3d564d509cd122edc64110deb40b6cd Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 14:22:37 -0400 Subject: [PATCH 26/62] Rename to wikitext2 --- src/deepsparse/transformers/eval_downstream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 2a6a6c43e9..0cbc5db5f1 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,7 +78,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - if dataset_name in ["wikitext", "c4"]: + if dataset_name in ["wikitext2", "c4"]: if args.kwargs is None: kwargs = {} else: @@ -524,7 +524,7 @@ def _split_train_val(train_dataset, val_ratio, seed=42): def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): - if dataset_name == "wikitext": + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") @@ -609,9 +609,9 @@ def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): args, dataset_name="openai_humaneval", ), - "wikitext": lambda args: perplexity_eval( + "wikitext2": lambda args: perplexity_eval( args, - dataset_name="wikitext", + dataset_name="wikitext2", ), "c4": lambda args: perplexity_eval( args, From 5455c7c333e05ae64214b86b20abc3ff81c0aa66 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 14:27:28 -0400 Subject: [PATCH 27/62] Remove trust_remote_code argument --- src/deepsparse/transformers/eval_downstream.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 0cbc5db5f1..81c2e4bf23 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,7 +105,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, - trust_remote_code=args.trust_remote_code, use_deepsparse_cache=True, ) @@ -753,12 +752,6 @@ def parse_args(): type=int, default=1, ) - parser.add_argument( - "--trust-remote-code", - help="Whether to allow for remote code execution in transformers.", - type=bool, - default=False, - ) parser.add_argument( "--kwargs", help="Additional arguments specific to each dataset", From 6a330d4780dfc17502c9c6e9aac4f888c639b566 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 8 Sep 2023 16:46:08 -0400 Subject: [PATCH 28/62] Remove use_deepsparse_cache 
argument --- src/deepsparse/transformers/eval_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 81c2e4bf23..5153a28acc 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,7 +105,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, - use_deepsparse_cache=True, ) # Instantiate perplexity metric From a448667c545d36603d1f41e4e49a26619a811667 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 10:27:47 -0400 Subject: [PATCH 29/62] Change padding of output to left in order to match padding of input ids and attention mask --- src/deepsparse/transformers/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 7b465acc37..19e5a9ea8d 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -41,7 +41,7 @@ def pad_to_fixed_length( ) -> numpy.ndarray: """ Pads the array to a fixed length along the given axis. - The padding is done on the right side of the array. + The padding is done on the left side of the array. :param array: array to pad :param max_len: maximum length to pad to @@ -53,7 +53,7 @@ def pad_to_fixed_length( padding = [(0, 0)] * len(array.shape) # for the specified axis, pad to the max length # (from the right side of the array) - padding[axis] = (0, max_len - array.shape[axis]) + padding[axis] = (max_len - array.shape[axis], 0) return numpy.pad(array, padding, mode="constant", constant_values=value) From 54b560c0e5dd81e4169592d2839f302ef9931c6f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 12:16:58 -0400 Subject: [PATCH 30/62] Allow trust_remote_code to be passed as argument (in some cases tokenizer can be defined by custom code) --- src/deepsparse/transformers/eval_downstream.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 5153a28acc..f9d4d4bf21 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -105,6 +105,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): num_cores=args.num_cores, sequence_length=args.max_sequence_length, max_generated_tokens=1, + trust_remote_code=args.trust_remote_code, ) # Instantiate perplexity metric @@ -751,6 +752,12 @@ def parse_args(): type=int, default=1, ) + parser.add_argument( + "--trust-remote-code", + help="Whether to allow for remote code execution in transformers.", + type=bool, + default=False, + ) parser.add_argument( "--kwargs", help="Additional arguments specific to each dataset", From ad35340ebebb319861a31992c328f8c3f1762544 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 11 Sep 2023 12:40:18 -0400 Subject: [PATCH 31/62] Move process_concatenated_datasets to helpers file --- .../transformers/eval_downstream.py | 75 +------------------ .../transformers/utils/eval_helpers.py | 73 ++++++++++++++++++ 2 files changed, 75 insertions(+), 73 deletions(-) create mode 100644 src/deepsparse/transformers/utils/eval_helpers.py diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 
f9d4d4bf21..abffbd1770 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,10 +67,10 @@ import numpy from tqdm.auto import tqdm -from transformers import AutoTokenizer from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 +from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets from datasets import load_dataset, load_metric # isort: skip @@ -83,7 +83,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): kwargs = {} else: kwargs = json.loads(args.kwargs) - dataset = _process_concatenated_datasets( + dataset = process_concatenated_datasets( dataset_name, args.model_path, args.max_sequence_length, @@ -522,77 +522,6 @@ def _split_train_val(train_dataset, val_ratio, seed=42): return train_ds, val_ds -def _process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): - if dataset_name == "wikitext2": - eos = kwargs.get("eos", "\n\n") - bos = kwargs.get("bos", "") - - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") - raw_text = raw_dataset["text"] - elif dataset_name == "c4": - eos = kwargs.get("eos", "<|endoftext|>") - bos = kwargs.get("bos", "") - raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) - if data_file is not None: - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, - split="validation", - ) - else: - raw_dataset = load_dataset( - "allenai/c4", - "allenai--c4", - split="validation", - ) - if raw_samples is not None: - raw_dataset = raw_dataset[:raw_samples] - raw_text = raw_dataset["text"] - - # Dataset is split into sections that contain "max_sequence_length" tokens. 
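The pad_to_fixed_length change in PATCH 29 above flips the padding from the right to the left side, so padded outputs line up with left-padded input ids and attention masks. A small numpy illustration of the difference on a toy array (not the pipeline's own data):

import numpy

array = numpy.array([[1, 2, 3]])
max_len = 5

# left padding, as in the patched helper: pad widths (max_len - n, 0)
left = numpy.pad(array, [(0, 0), (max_len - array.shape[1], 0)], mode="constant", constant_values=0)
# right padding, the previous behaviour: pad widths (0, max_len - n)
right = numpy.pad(array, [(0, 0), (0, max_len - array.shape[1])], mode="constant", constant_values=0)

print(left)    # [[0 0 1 2 3]]
print(right)   # [[1 2 3 0 0]]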
- # To split the dataset, first tokenize text - tokenizer = AutoTokenizer.from_pretrained(model_path) - return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, - ) - - -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): - text = "".join([bos + sample + eos for sample in text]) - - input_tokens = tokenizer(text, return_tensors="np",)[ - "input_ids" - ][0] - - # Then split the tokenized text into sections of size "max_sequence_length" and - # decode each section back into text format - split_text = [] - for i in range(len(input_tokens) // sequence_length): - start = i * sequence_length - end = (i + 1) * sequence_length - split_text.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - # Handle any leftover tokens - if (i + 1) * sequence_length < len(input_tokens): - start = (i + 1) * sequence_length - end = len(input_tokens) - split_text.append( - tokenizer.decode( - input_tokens[start:end], - clean_up_tokenization_spaces=False, - ) - ) - - return split_text - - # Register all the supported downstream datasets here SUPPORTED_DATASETS = { "squad": lambda args: qa_eval(args, dataset_name="squad"), diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py new file mode 100644 index 0000000000..916e787eb5 --- /dev/null +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -0,0 +1,73 @@ +from transformers import AutoTokenizer +from datasets import load_dataset + + +def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): + if dataset_name == "wikitext2": + eos = kwargs.get("eos", "\n\n") + bos = kwargs.get("bos", "") + + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_text = raw_dataset["text"] + elif dataset_name == "c4": + eos = kwargs.get("eos", "<|endoftext|>") + bos = kwargs.get("bos", "") + raw_samples = kwargs.get("raw_samples", None) + data_file = kwargs.get("data_file", 0) + if data_file is not None: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + split="validation", + ) + else: + raw_dataset = load_dataset( + "allenai/c4", + "allenai--c4", + split="validation", + ) + if raw_samples is not None: + raw_dataset = raw_dataset[:raw_samples] + raw_text = raw_dataset["text"] + + # Dataset is split into sections that contain "max_sequence_length" tokens. 
+ # To split the dataset, first tokenize text + tokenizer = AutoTokenizer.from_pretrained(model_path) + return _split_text_by_tokens( + raw_text, eos, bos, tokenizer, max_sequence_length, + ) + + +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): + text = "".join([bos + sample + eos for sample in text]) + + input_tokens = tokenizer(text, return_tensors="np",)[ + "input_ids" + ][0] + + # Then split the tokenized text into sections of size "max_sequence_length" and + # decode each section back into text format + split_text = [] + for i in range(len(input_tokens) // sequence_length): + start = i * sequence_length + end = (i + 1) * sequence_length + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + # Handle any leftover tokens + if (i + 1) * sequence_length < len(input_tokens): + start = (i + 1) * sequence_length + end = len(input_tokens) + split_text.append( + tokenizer.decode( + input_tokens[start:end], + clean_up_tokenization_spaces=False, + ) + ) + + return split_text \ No newline at end of file From b16a5f6eed7f19737dae101065851f31ae07d3cf Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 13 Sep 2023 15:08:27 -0400 Subject: [PATCH 32/62] Added support for max_text_length to speed up processing of long datasets --- .../transformers/utils/eval_helpers.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 916e787eb5..1fc277340e 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,5 +1,6 @@ from transformers import AutoTokenizer from datasets import load_dataset +import numpy def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): @@ -35,16 +36,36 @@ def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(model_path) return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, + raw_text, eos, bos, tokenizer, max_sequence_length, kwargs.get("max_text_length", None) ) -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length): - text = "".join([bos + sample + eos for sample in text]) +def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length, max_text_length): + text = [bos + sample + eos for sample in text] - input_tokens = tokenizer(text, return_tensors="np",)[ - "input_ids" - ][0] + if max_text_length is None: + text = "".join(text) + input_tokens = tokenizer(text, return_tensors="np")[ + "input_ids" + ][0] + elif max_text_length == -1: #per sample tokenization + input_tokens = [] + for slice in text: + input_tokens.append(tokenizer(slice, return_tensors="np")[ + "input_ids" + ][0]) + input_tokens = numpy.concatenate(input_tokens) + else: + text = "".join(text) + text_slices = len(text) // max_text_length + sliced_text = [text[i*max_text_length:(i+1)*max_text_length] for i in range(text_slices)] + sliced_text.append(text[text_slices*max_text_length:]) + input_tokens = [] + for slice in sliced_text: + input_tokens.append(tokenizer(slice, return_tensors="np")[ + "input_ids" + ][0]) + input_tokens = numpy.concatenate(input_tokens) # Then split the tokenized text into sections of size "max_sequence_length" and # decode each section back into text format From 065864a37dd1234ae55d299c1fcc33b2b0da6d3a 
Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 13:52:49 -0400 Subject: [PATCH 33/62] Rebase w/ main --- .../transformers/eval_downstream.py | 2 +- .../transformers/pipelines/text_generation.py | 272 ++++++++++++++---- 2 files changed, 216 insertions(+), 58 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index abffbd1770..b9bacd8b88 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -144,7 +144,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): for s in range(len(batch_samples)): # Need to remove tokens that were masked input_ids = prediction.input_tokens["input_ids"][s].flatten() - logits = prediction.logits[s] + logits = prediction.generations.score[s] attention_mask = prediction.input_tokens["attention_mask"][s].flatten() effective_sequence_length = logits.shape[0] diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index c0eb9ee06a..821b76f620 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import logging import os import warnings +from enum import Enum from typing import ( Any, Callable, @@ -40,8 +42,10 @@ from deepsparse.transformers.utils.helpers import ( create_causal_mask, pad_to_fixed_length, + repeat_inputs, ) from deepsparse.transformers.utils.timings import TextGenerationTimings +from deepsparse.transformers.utils.token_generator import TokenGenerator from deepsparse.utils.onnx import default_cached_outputs @@ -50,6 +54,12 @@ __all__ = ["TextGenerationPipeline"] +class FinishReason(Enum): + STOP = "stop" + LENGTH = "length" + TIME = "time" + + class TextGenerationInput(BaseModel): class Config: arbitrary_types_allowed = True @@ -57,6 +67,18 @@ class Config: sequences: Union[str, List[str]] = Field( description="The input sequences to generate the text from.", ) + num_generated_predictions: int = Field( + default=1, + description="The number of text generations to create from a single prompt. If " + "the same sequence is given as an input multiple times, the number of generated" + "the number of generated predictins is equivalent to the number of times the " + "the sequence is repeated.", + ) + max_tokens: int = Field( + default=1024, + description="Maximum number of tokens to generate per output sequence. If no " + "value is provided, will default to 1024.", + ) return_logits: bool = Field( default=False, description="A flag that indicates whether to return " @@ -111,17 +133,59 @@ class Config: " tokens is generated). Set to `None` to ignore this parameter." " Default is `None`.", ) + top_p: Optional[float] = Field( + default=0.0, + description="Used for filtering generated tokens. Keep the" + " tokens where its cumulative probability is >= top_p" + " Default set to 0.0", + ) + top_k: Optional[int] = Field( + default=0, + description="Used for filtering generated tokens. Keep" + " top_k generated tokens. Default set to 0", + ) + presence_penalty: Optional[float] = Field( + default=0.0, + description="Penalty applied for generating new token. Any existing" + " token results in the subtraction of its corresponding logit value." 
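For reference, the max_text_length option introduced in PATCH 32 above selects between three tokenization strategies before the text is re-split into fixed-size sections. A condensed, hedged sketch of the three branches, with a toy character-level tokenizer standing in for the Hugging Face tokenizer:

import numpy

def toy_tokenize(text):
    # stand-in for tokenizer(text, return_tensors="np")["input_ids"][0]
    return numpy.array([ord(c) for c in text])

samples = ["first sample", "second sample text"]
max_text_length = -1    # None | -1 | positive int, mirroring the helper above

if max_text_length is None:
    # tokenize the fully joined text in one pass
    tokens = toy_tokenize("".join(samples))
elif max_text_length == -1:
    # per-sample tokenization, then concatenate the token arrays
    tokens = numpy.concatenate([toy_tokenize(s) for s in samples])
else:
    # slice the joined text into character chunks before tokenizing each chunk
    text = "".join(samples)
    chunks = [text[i:i + max_text_length] for i in range(0, len(text), max_text_length)]
    tokens = numpy.concatenate([toy_tokenize(c) for c in chunks])

print(tokens.shape)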
+ " Default set to 0.0", + ) + frequency_penalty: Optional[float] = Field( + default=0.0, + description="Penalty applied for generating new token. Existing" + " token frequencies summed to subtraction the logit of its" + " corresponding logit value. Default set to 0.0.", + ) + + +class GeneratedText(BaseModel): + text: str = Field( + description="The generated sequence for a given prompt. If " + "streaming is enabled, this will be the next generated token." + ) + score: Optional[Any] = Field( + description="The score for the generated token or sequence. " + "The scores have the shape [sequence_length, vocab_size]" + ) + finished: bool = Field(description="Whether generation has stopped.") + finished_reason: str = Field( + description="The reason for generation to stop. " + "Defined by FinishReason. One of stop, length, or time." + ) +# TODO: Pydantic aliases allow assignment but not reference. Still need to update. class TextGenerationOutput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The generated text sequences.", + created: datetime.datetime = Field(description="Time of inference creation.") + prompts: Union[str, List[str]] = Field( + description="Prompts used for the sequence generation. For multiple input " + "prompts, a list of prompts is returned" ) - logits: Optional[Any] = Field( # numpy array, set to Any for FastAPI compatibility - default=None, - description="The logits for the generated text sequence." - "The logits have dimensions " - "[batch_size, sequence_length, vocab_size]", + generations: Union[List[GeneratedText], List[List[GeneratedText]]] = Field( + description="For a single prompt, a single list of GeneratedText is returned. " + "If multiple prompts are given, a list of GeneratedText is returned for each " + "prompt provided. If streamng is enabled, the next generated token is returned." + "Otherwise, the full generated sequence is returned." ) input_tokens: Optional[ Any @@ -156,11 +220,6 @@ class TextGenerationPipeline(TransformersPipeline): from the probability distribution computed from the logits. Higher values will result in more random samples. Should be greater than 0.0. - :param max_generated_tokens: the maximum number of tokens to generate - given the input sequence. If None, the model will generate - tokens until the end of the sequence is reached. - Otherwise, it will generate up to the maximum number of tokens or end of - sequence is reached. :param sequence_length: sequence length to compile model and tokenizer for. This controls the maximum context length of the pipeline. Default is 512 :param prompt_sequence_length: For large prompts, the prompt is @@ -177,7 +236,6 @@ def __init__( self, deterministic: bool = True, sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, prompt_sequence_length: int = 64, sequence_length: int = 512, force_max_tokens: bool = False, @@ -216,16 +274,8 @@ def __init__( if "WAND_OPT_FLAGS" not in os.environ: os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" - if not self.cache_support_enabled and max_generated_tokens > 1: - raise ValueError( - "The model used for inference does not support kv cache. It is " - "assumed that it maps from the token sequence to predicted logits." - "Set `max_generated_tokens` to 1 to support that scenario." 
- ) - self.deterministic = deterministic self.sampling_temperature = sampling_temperature - self.max_generated_tokens = max_generated_tokens self.prompt_sequence_length = prompt_sequence_length self.force_max_tokens = force_max_tokens self.internal_kv_cache = internal_kv_cache @@ -280,8 +330,7 @@ def initialize_engines( # instantiation the multitoken engine or not if not self.enable_multitoken_prefill: warnings.warn( - "This ONNX graph does not support processing the prompt in " - "with processing length > 1. Creation of an auxiliary engine for " + "Creation of an auxiliary engine for " "processing the prompt at a larger processing length is disabled. " "The prompt will be processed in with processing length 1." ) @@ -382,6 +431,27 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: :param inputs: the input schema for the pipeline :return: the inputs for the engine """ + if not self.cache_support_enabled and inputs.max_tokens > 1: + raise ValueError( + "The model used for inference does not support kv cache. It is " + "assumed that it maps from the token sequence to predicted logits." + "Set `max_tokens` to 1 to support that scenario." + ) + + # If the num_generated_predictions > 1, repeat the prompt + # num_generated_predictions times. Also, update the engine so that deterministic + # is set to False. + original_inputs = inputs.sequences + if inputs.num_generated_predictions > 1: + if isinstance(inputs.sequences, str): + inputs.sequences = [inputs.sequences] + inputs.sequences = repeat_inputs( + inputs.sequences, inputs.num_generated_predictions + ) + if self.engine: + self.engine.deterministic = False + if self.multitoken_engine: + self.multitoken_engine.deterministic = False if inputs.fixed_sequences_length or not self.cache_support_enabled: # to enforce a fixed sequence length, we need to @@ -427,7 +497,9 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.engine.session_id = inputs.session_id self.multitoken_engine.session_id = inputs.session_id - postprocessing_kwargs = dict( + context = dict( + prompts=original_inputs, + num_generated_predictions=inputs.num_generated_predictions, return_logits=inputs.return_logits, return_input_tokens=inputs.return_input_tokens, input_tokens=input_tokens, @@ -435,11 +507,17 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: include_prompt_logits=inputs.include_prompt_logits, callback=inputs.callback, stop=inputs.stop, + top_p=inputs.top_p, + top_k=inputs.top_k, + presence_penalty=inputs.presence_penalty, + frequency_penalty=inputs.frequency_penalty, + max_tokens=inputs.max_tokens, ) - return engine_input, postprocessing_kwargs + + return engine_input, context def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs + self, engine_outputs: List[Union[numpy.ndarray, FinishReason]], **kwargs ) -> TextGenerationOutput: """ Convert the engine outputs to the output schema for the pipeline. 
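Regarding the prompt repetition just above: each prompt is repeated num_generated_predictions times before inference, and process_engine_outputs later regroups the flat list of generations back into one list per original prompt. A toy illustration of that round trip, with plain strings standing in for GeneratedText objects:

prompts = ["a", "b"]
num_preds = 3

# repetition, analogous to repeat_inputs(sequences, num_generated_predictions)
repeated = [p for p in prompts for _ in range(num_preds)]   # ['a', 'a', 'a', 'b', 'b', 'b']

# pretend each repeated prompt produced one generation
generations = ["gen-%s-%d" % (p, i % num_preds) for i, p in enumerate(repeated)]

# regrouping as done later in process_engine_outputs
grouped = [generations[n:n + num_preds] for n in range(0, len(generations), num_preds)]
print(grouped)   # [['gen-a-0', 'gen-a-1', 'gen-a-2'], ['gen-b-0', 'gen-b-1', 'gen-b-2']]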
@@ -447,22 +525,70 @@ def process_engine_outputs( :param engine_outputs: the outputs from the engine :return: the output schema for the pipeline """ - generated_tokens, generated_logits = engine_outputs + generated_tokens, generated_logits, finished_reason = engine_outputs + finished_reason = [f[0] for f in finished_reason] + sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) + num_preds = kwargs.get("num_generated_predictions", 1) + prompts = kwargs.get("prompts") + + def _create_generated_text_output( + sequence: str, + finish_reason: FinishReason, + logits: Optional[numpy.array] = None, + ): + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + logits = generated_logits if kwargs.get("return_logits") else None + + if logits is not None: + generations = list( + self.executor.map( + _create_generated_text_output, + sequences, + finished_reason, + logits, + ) + ) + else: + generations = list( + self.executor.map( + _create_generated_text_output, sequences, finished_reason + ) + ) + + # If the num_generated_predictions > 1, group the generations and return + # them as a list of lists where each list consists of the generated + # predictions for a given prompt, and all the lists are in the order matching + # the order that the prompts were given as inputs. + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + generations = grouped_generations + input_tokens = ( kwargs.get("input_tokens") if kwargs.get("return_input_tokens") else None ) return TextGenerationOutput( - sequences=sequences, logits=logits, input_tokens=input_tokens + created=datetime.datetime.now(), + prompts=prompts, + generations=generations, + input_tokens=input_tokens, ) def engine_forward( self, engine_inputs: List[numpy.ndarray], context: Dict - ) -> Tuple[numpy.ndarray, numpy.ndarray]: + ) -> Tuple[numpy.ndarray, numpy.ndarray, List[FinishReason]]: """ Run the forward pass on the engine. @@ -476,31 +602,46 @@ def engine_forward( # as such, a new context needs to be created since we are no longer in the # main thread. 
That is why `engine_` is prepended to each of the timer phase # names in this context + with self.timer_manager.new_timer_context(total_inference=False) as timer: streamer = context.get("streamer") + finished_reason = [] if not self.cache_support_enabled: - tokens, prompt_logits = self.multitoken_engine(engine_inputs) - return numpy.array([tokens]), prompt_logits + prompt_logits = self.multitoken_engine(engine_inputs) + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + deterministic=self.deterministic, + **context, + ) + for prompt_logit in prompt_logits: + token_generator.generate(prompt_logit) + return numpy.array([self.tokens]), prompt_logits else: # run the prompt through with timer.time(TextGenerationTimings.PROMPT_PREFILL): - tokens, prompt_logits = self.prompt_inference(engine_inputs) + prompt_logits = self.prompt_inference(engine_inputs) + + tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() + token_generator = TokenGenerator( + logits_shape=prompt_logits[-1].shape[-1], + tokens=tokens, + deterministic=self.deterministic, + **context, + ) + token_generator.generate(prompt_logits[-1][0, -1, :]) if streamer is not None: - streamer.put(numpy.array(tokens)) + streamer.put(numpy.array(token_generator.tokens)) # create the generated output - max_tokens = ( - self.max_generated_tokens - if self.max_generated_tokens and self.max_generated_tokens > 0 - else 100 * self.sequence_length - ) # set safety for absolute max generation + max_tokens = context.get("max_tokens", 0) + max_tokens = max_tokens if max_tokens > 0 else (100 * self.sequence_length) # last prompt token is the first generated token # add it to generated tokens, and the logits - generated_tokens = [tokens[-1]] + generated_tokens = [token_generator.tokens[-1]] generated_logits = ( prompt_logits if context.get("include_prompt_logits") @@ -512,8 +653,10 @@ def engine_forward( with timer.time(TextGenerationTimings.TOKEN_GENERATION): while len(generated_tokens) < max_tokens: with timer.time(TextGenerationTimings.TOKEN_GENERATION_SINGLE): - token, logits = self.autoregressive_inference(tokens) - tokens.append(token) + logits = self.autoregressive_inference( + tokens=token_generator.tokens + ) + token = token_generator.generate(logits=logits[0, -1, :]) generated_tokens.append(token) generated_logits.append(logits) @@ -524,6 +667,7 @@ def engine_forward( token == self.tokenizer.eos_token_id and not self.force_max_tokens ): + finished_reason.append(FinishReason.STOP) break if self._stop_token_generated(token, stop_tokens=stop): @@ -531,8 +675,10 @@ def engine_forward( "Stop token %s generated. Stopping generation." % self.tokenizer.decode(token) ) + finished_reason.append(FinishReason.STOP) break + # TODO: Add any generic callback reason? if callback is not None and callback(token) is False: _LOGGER.debug( "callback %s returned False, stopping generation." 
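A condensed sketch of the stopping logic in the token generation loop above: STOP is recorded when the eos token or a user-supplied stop token is generated, and, just below, LENGTH is recorded when the generation budget runs out. Engine calls and sampling are elided; this is an illustration, not the method itself.

from enum import Enum

class FinishReason(Enum):
    STOP = "stop"
    LENGTH = "length"
    TIME = "time"

def finish_reason_for(generated_tokens, max_tokens, eos_token_id, stop_token_ids, force_max_tokens=False):
    last = generated_tokens[-1]
    if last == eos_token_id and not force_max_tokens:
        return FinishReason.STOP      # natural end of sequence
    if last in stop_token_ids:
        return FinishReason.STOP      # user-supplied stop token
    if len(generated_tokens) >= max_tokens:
        return FinishReason.LENGTH    # generation budget exhausted
    return None                       # keep generating

print(finish_reason_for([5, 7, 2], max_tokens=16, eos_token_id=2, stop_token_ids=set()))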
@@ -540,15 +686,21 @@ def engine_forward( ) break + if len(generated_tokens) == max_tokens: + finished_reason.append(FinishReason.LENGTH) + if streamer is not None: streamer.end() - return numpy.array([generated_tokens]), numpy.concatenate( - generated_logits, axis=1 + return ( + numpy.array([generated_tokens]), + numpy.concatenate(generated_logits, axis=1), + finished_reason, ) def prompt_inference( - self, engine_inputs: List[numpy.ndarray] + self, + engine_inputs: List[numpy.ndarray], ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the @@ -565,13 +717,12 @@ def prompt_inference( tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() prompt_logits = [] - new_token = None num_tokens_processed = 0 if len(tokens) > self.prompt_sequence_length and self.enable_multitoken_prefill: self.multitoken_engine.reset_kv_cache() for engine_inputs in self.engine_inputs_for_prefill(tokens): - new_token, new_logits = self.multitoken_engine(engine_inputs) + new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed += self.prompt_sequence_length prompt_logits.append(new_logits) @@ -589,13 +740,11 @@ def prompt_inference( with self.timer_manager.current.time( TextGenerationTimings.PROMPT_PREFILL_SINGLE ): - new_token, new_logits = self.autoregressive_inference(run_tokens) + new_logits = self.autoregressive_inference(run_tokens) prompt_logits.append(new_logits) - tokens.append(new_token) - - return tokens, prompt_logits + return prompt_logits def autoregressive_inference( self, @@ -632,9 +781,9 @@ def autoregressive_inference( engine_inputs_map[name] for name in self.engine.onnx_input_names_no_cache ] - generated_token, generated_logits = self.engine(engine_inputs) + generated_logits = self.engine(engine_inputs) - return generated_token, generated_logits + return generated_logits def engine_inputs_for_prefill( self, tokens: List[int] @@ -736,8 +885,10 @@ def is_cache_support_enabled(self) -> bool: return any(default_cached_outputs(self.onnx_file_path)) def join_engine_outputs( - self, batch_outputs: List[List[numpy.ndarray]], orig_batch_size: int - ) -> List[numpy.ndarray]: + self, + batch_outputs: List[List[Union[numpy.ndarray, FinishReason]]], + orig_batch_size: int, + ) -> List[Union[numpy.ndarray, FinishReason]]: """ Takes a list of outputs (batches) from the engine and joins them into a single output. 
Asserts that @@ -748,7 +899,7 @@ def join_engine_outputs( :param orig_batch_size: The original batch size :return: A list of joined outputs """ - tokens, logits = zip(*batch_outputs) + tokens, logits, finish_reason = zip(*batch_outputs) if self.cache_support_enabled: # if the model has kv cache, we need to account for # the fact that the predicted outputs may have @@ -780,7 +931,7 @@ def join_engine_outputs( tokens = numpy.concatenate(tokens, axis=0) logits = numpy.concatenate(logits, axis=0) - return [tokens, logits] + return [tokens, logits, finish_reason] @staticmethod def causal_mask_input_present(model_path: str) -> bool: @@ -792,10 +943,17 @@ def causal_mask_input_present(model_path: str) -> bool: :param model_path: path to the model :return: True if causal_mask input is present, False otherwise """ - return any( + is_causal_mask_input = any( inp.name == "causal_mask" for inp in onnx.load(model_path, load_external_data=False).graph.input ) + if not is_causal_mask_input: + _LOGGER.warning( + "This ONNX graph does not support processing the prompt" + "with processing length > 1" + ) + + return is_causal_mask_input def _stop_token_generated( self, token, stop_tokens: Union[None, str, Sequence[str]] From 59b93c565e6043808d39f91e9a5710629b9d05df Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 14:15:33 -0400 Subject: [PATCH 34/62] Rebase w/ main --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index b9bacd8b88..bd3e668fc2 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -104,7 +104,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): engine_type=args.engine, num_cores=args.num_cores, sequence_length=args.max_sequence_length, - max_generated_tokens=1, trust_remote_code=args.trust_remote_code, ) @@ -138,6 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, + max_tokes=1, ) # Handle one sample at a time to make it simpler for masking From f4554b17e88331a13b8009f0cb2b93e5cbe9d3d8 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 20 Sep 2023 15:00:20 -0400 Subject: [PATCH 35/62] Fix typo --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index bd3e668fc2..71bbe70ff4 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -137,7 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_tokes=1, + max_tokens=1, ) # Handle one sample at a time to make it simpler for masking From c5bd3836f7fa195d180cd8f48685cc3863d652f1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 26 Sep 2023 13:45:24 -0400 Subject: [PATCH 36/62] Rebase --- .../transformers/eval_downstream.py | 6 ++-- .../transformers/pipelines/text_generation.py | 34 ++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 71bbe70ff4..def78d61fa 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ 
b/src/deepsparse/transformers/eval_downstream.py @@ -133,18 +133,18 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): # Perform single token generation prediction = text_generation( sequences=batch_samples, - return_logits=True, + output_scores=True, return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_tokens=1, + max_new_tokens=1, ) # Handle one sample at a time to make it simpler for masking for s in range(len(batch_samples)): # Need to remove tokens that were masked input_ids = prediction.input_tokens["input_ids"][s].flatten() - logits = prediction.generations.score[s] + logits = prediction.generations[s].score attention_mask = prediction.input_tokens["attention_mask"][s].flatten() effective_sequence_length = logits.shape[0] diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 882d171e2f..78e877fa2b 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -142,14 +142,15 @@ class Config: description="GenerationConfig file consisting of parameters used to control " "sequences generated for each prompt. The current supported parameters are: " "max_length, max_new_tokens, num_return_sequences, output_scores, top_p, " - "top_k, repetition_penalty, do_sample, temperature", + "top_k, repetition_penalty, do_sample, temperature. If None is provided, " + "deepsparse defaults will be used. For all other input types, HuggingFace " + "defaults for GenerationConfig will be used. ", ) - kwargs: Optional[Dict] = Field( + generation_kwargs: Optional[Dict] = Field( default=None, description="Any arguments to override generation_config arguments. Refer to " - "the generation_config argument for a full list of supported variables. Only " - "valid when generation_config is not None.", + "the generation_config argument for a full list of supported variables.", ) @@ -217,6 +218,12 @@ class TextGenerationPipeline(TransformersPipeline): of tokens supplied even if the stop token is reached. :param internal_kv_cache: if True, the pipeline will use the deepsparse kv cache for caching the model outputs. + :param generation_config: config file consisting of parameters used to control + sequences generated for each prompt. The current supported parameters are: + max_length, max_new_tokens, num_return_sequences, output_scores, top_p, + top_k, repetition_penalty, do_sample, temperature. If None is provided, + deepsparse defaults will be used. For all other input types, HuggingFace + defaults for GenerationConfig will be used. 
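To clarify the kwargs folding done in parse_inputs above: any loose keyword argument that matches an attribute on GenerationDefaults is moved into generation_kwargs, unless the caller already set that key there explicitly. A stripped-down illustration with a stand-in defaults class; GenerationDefaults itself lives elsewhere in deepsparse and is only mimicked here:

class ToyGenerationDefaults:
    # stand-in for GenerationDefaults; the real attribute set may differ
    max_new_tokens = 100
    output_scores = False
    top_p = 0.0

def fold_generation_kwargs(kwargs):
    generation_kwargs = kwargs.get("generation_kwargs", {})
    for k, v in kwargs.items():
        if not generation_kwargs.get(k) and hasattr(ToyGenerationDefaults, k):
            generation_kwargs[k] = v
    kwargs["generation_kwargs"] = generation_kwargs
    return kwargs

print(fold_generation_kwargs({"prompt": "hi", "output_scores": True}))
# {'prompt': 'hi', 'output_scores': True, 'generation_kwargs': {'output_scores': True}}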
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -425,6 +432,7 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput: if "sequences" in kwargs and "prompt" not in kwargs: # support prompt and sequences interchangeably kwargs["prompt"] = kwargs["sequences"] + if ( args and not isinstance(args[0], TextGenerationInput) @@ -435,6 +443,14 @@ def parse_inputs(self, *args, **kwargs) -> TextGenerationInput: kwargs["prompt"] = args[0] args = args[1:] + if kwargs: + generation_kwargs = kwargs.get("generation_kwargs", {}) + for k, v in kwargs.items(): + if not generation_kwargs.get(k) and hasattr(GenerationDefaults, k): + generation_kwargs[k] = v + + kwargs["generation_kwargs"] = generation_kwargs + return super().parse_inputs(*args, **kwargs) def process_inputs( @@ -450,7 +466,7 @@ def process_inputs( self.generation_config, inputs.generation_config, GenerationDefaults() ) - generation_config = override_config(inputs.kwargs, generation_config) + generation_config = override_config(inputs.generation_kwargs, generation_config) self.streaming = inputs.streaming if not self.cache_support_enabled and generation_config.max_length > 1: @@ -545,10 +561,10 @@ def _create_generated_text_output( finished=False, ) - def _stream_engine_outputs(self, engine_outputs, prompts, kwargs): + def _stream_engine_outputs(self, engine_outputs, prompts, generation_config): for output in engine_outputs: generated_tokens, generated_logits, finished_reason = output - logits = generated_logits if kwargs.get("return_logits") else None + logits = generated_logits if generation_config.output_scores else None generation = self._create_generated_text_output( self.tokenizer.batch_decode(generated_tokens)[0], finished_reason[0], @@ -575,7 +591,9 @@ def process_engine_outputs( streaming = kwargs.get("streaming") if streaming: - return self._stream_engine_outputs(engine_outputs, prompts, kwargs) + return self._stream_engine_outputs( + engine_outputs, prompts, generation_config + ) if self._debug: ( From 091aeca4ae6e3fc130ed0d4c770e4ae696434d4e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 26 Sep 2023 16:38:43 -0400 Subject: [PATCH 37/62] Use max_length instead of max_new_tokens --- src/deepsparse/transformers/eval_downstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index def78d61fa..8f07c0b242 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -137,7 +137,7 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, - max_new_tokens=1, + max_length=1, ) # Handle one sample at a time to make it simpler for masking From 6bc08bc996a0a8a244f16f63aa4dc357a3c0237f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:08:35 -0400 Subject: [PATCH 38/62] Rebase --- src/deepsparse/transformers/pipelines/text_generation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 43b7edf4ae..fdbd0b5213 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -192,9 +192,6 @@ class TextGenerationOutput(BaseModel): "both mapping to arrays of size " "[batch_size, sequence_length]", ) - session_id: Optional[str] = Field( - 
default=None, description="A string identifier for the kv cache session." - ) class Config: arbitrary_types_allowed = True From dc943d76bbaa72deba3764ef95a1e48c493917a1 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:22:36 -0400 Subject: [PATCH 39/62] Added typing and docstring --- .../transformers/utils/eval_helpers.py | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 1fc277340e..8b0d8dfe43 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,9 +1,37 @@ -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizerFast from datasets import load_dataset import numpy +from typing import Mapping, List, Union -def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, kwargs): +def process_concatenated_datasets( + dataset_name: str, + model_path: str, + max_sequence_length: int, + kwargs: Mapping, +) -> list: + """ + Concatenate text datasets and split them into chunks text that, after + tokenization, have size "max_sequence_length" tokens. + + Args: + dataset_name (str): The name of the dataset to process. Options: "wikitext2" or "c4". + model_path (str): The path to a pretrained transformer model for tokenization. + max_sequence_length (int): The maximum number of tokens in each sequence. + kwargs (mapping): Additional keyword arguments. + - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. + - bos (str, optional): The beginning-of-sentence token. Default is "". + - raw_samples (int, optional): The number of raw samples to use. Default is None. + - data_file (int, optional): The index of the data file to use for dataset. + Not used in wikitext2. Default is 0 for c4. + - max_text_length (int, optional): The maximum length of text to consider. + Returns: + list: A list of text sequences. + + Raises: + ValueError: If an invalid dataset_name is provided. + """ + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") @@ -40,7 +68,32 @@ def process_concatenated_datasets(dataset_name, model_path, max_sequence_length, ) -def _split_text_by_tokens(text, eos, bos, tokenizer, sequence_length, max_text_length): +def _split_text_by_tokens( + text: List[str], + eos: str, + bos: str, + tokenizer: PreTrainedTokenizerFast, + sequence_length: int, + max_text_length: Union[None, int], +) -> List[str]: + """ + Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. + + Args: + text (List[str]): List of concatenated text samples to be tokenized and split. + eos (str): The end-of-sentence token. + bos (str): The beginning-of-sentence token. + tokenizer (PreTrainedTokenizerFast): Tokenizer for tokenizing the text. + sequence_length (int): The maximum number of tokens in each section. + max_text_length (Union[None, int]): The maximum length of text to consider. + - If None, the entire text is tokenized and split. + - If -1, each sample is tokenized separately. + - If a positive integer, the text is split into sections of this length before tokenization. + + Returns: + List[str]: A list of sections where each section contains a maximum of "sequence_length" tokens. 
+ """ + text = [bos + sample + eos for sample in text] if max_text_length is None: From 8f3743afb8a84c1f4e6ba32f3b489a1dd1681a74 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:28:09 -0400 Subject: [PATCH 40/62] Added typing and docstring --- src/deepsparse/transformers/metrics.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 71683d6116..3b3b043258 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -17,7 +17,6 @@ """ from typing import Any, Dict, Optional - import numpy from scipy.special import log_softmax @@ -183,7 +182,25 @@ def compute(self) -> Dict[str, float]: return results -def _cross_entropy(predictions, targets, reduction="mean"): +def _cross_entropy( + predictions: numpy.ndarray, + targets: numpy.ndarray, + reduction: str = "mean", +) -> float: + """ + Calculate the cross-entropy loss between predicted probabilities and target labels. + + Args: + predictions (numpy.ndarray): Predicted logits. + targets (nnumpy.ndarray): Target class labels. + reduction (str, optional): Specifies the reduction method for the loss. + - "mean" (default): Computes the mean loss over all samples. + - "sum": Computes the sum of losses over all samples. + + Returns: + float: The computed cross-entropy loss. + """ + logp = log_softmax(predictions, axis=-1) neg_log_likelihoods = -1.0 * numpy.take_along_axis( logp, numpy.expand_dims(targets, axis=-1), axis=-1 From 5e1d8088d0401dd94a567bd622cdf131fb133b5f Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:37:56 -0400 Subject: [PATCH 41/62] Define concantenated datasets --- src/deepsparse/transformers/utils/eval_helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 8b0d8dfe43..3965d208ef 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -3,6 +3,7 @@ import numpy from typing import Mapping, List, Union +CONCATENATED_DATSETS = ["wikitext2", "c4"] def process_concatenated_datasets( dataset_name: str, @@ -32,6 +33,12 @@ def process_concatenated_datasets( ValueError: If an invalid dataset_name is provided. 
""" + if dataset_name not in CONCATENATED_DATSETS: + raise KeyError( + f"dataset {dataset_name} not supported for concatenated processing, " + f"available datasets are {list(CONCATENATED_DATSETS.keys())}" + ) + if dataset_name == "wikitext2": eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") From 0785321f2091e9f5e6881d49a092fe270f910bec Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 13:45:20 -0400 Subject: [PATCH 42/62] Add warning about batch-size not being a supported argument for some datasets --- src/deepsparse/transformers/eval_downstream.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 8f07c0b242..d0d17d9bea 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -67,6 +67,7 @@ import numpy from tqdm.auto import tqdm +import logging from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 @@ -76,6 +77,12 @@ from datasets import load_dataset, load_metric # isort: skip +_LOGGER = logging.getLogger(__name__) + + +PPL_DATASETS = ["wikitext2", "c4", "openai_humaneval"] + + def perplexity_eval(args, dataset_name="openai_humaneval"): if dataset_name in ["wikitext2", "c4"]: @@ -707,6 +714,12 @@ def _main(args): f"available datasets are {list(SUPPORTED_DATASETS.keys())}" ) + if dataset not in PPL_DATASETS: + _LOGGER.warning( + "Batch-size argument is not supported for this dataset." + "Will use default value of 1." + ) + if dataset == "mnli": mnli_metrics_matched, mnli_metrics_mismatched = mnli_eval(args) mnli_metrics_matched = mnli_metrics_matched.compute() From d8914f0b72203e5560e15ff1193ac66e1ea2c4c7 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 17:01:23 -0400 Subject: [PATCH 43/62] Add unit test for pipeline and generation in ppl eval --- .../pipelines/test_text_generation.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 5298c2f1dd..9acb8f32f5 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -666,6 +666,28 @@ def _test_kv_cache_state( x[:, :, -start_index:-end_index, :], y, atol=_PRECISION ) + def test_pipeline_for_ppl_eval(self, ): + pipeline = self.get_pipeline( + task="text-generation", + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + ) + inputs = dict( + prompt=self.prompt, + output_scores=True, + return_input_tokens=True, + fixed_sequences_length=True, + include_prompt_logits=True, + max_length=1, + ) + predictions = pipeline(**inputs) + assert hasattr(predictions, "generations") + assert hasattr(predictions.generations[0], "score") + assert hasattr(predictions.generations[0], "input_tokens") + assert "input_ids" in predictions.generations[0].input_tokens + assert "attention_mask" in predictions.generations[0].input_tokens + def test_streaming_mode_returns_generator(self, setup): pipeline = self.get_pipeline( task=self.pipeline_type, From 5bf076b917caca05b25db8109dffc27ee1ebe953 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Tue, 3 Oct 2023 17:26:31 -0400 Subject: [PATCH 44/62] Add lifecycle in docstring --- src/deepsparse/transformers/metrics.py | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 3b3b043258..a1ba162144 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -34,6 +34,15 @@ def __init__(self, accumulate: bool = False): """ Class for computing perplexity. + Each batch is processed via the "add_batches" method. + At the end the data is reduced to a single perplexity metric via the "compute" method. + + Example: + metric = Perplexity() + for prediction, target in samples: + metric.add_batch(prediction, target) + perplexity_value = metric.compute() + :param accumulate: If True, accumulate negative log-likelihood over samples. If False, perplexity is computed separately for each sampled and then averaged in the end. From ecf3b7775471eac92e8149b3d8d91e2c593dbf1d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 10:59:18 -0400 Subject: [PATCH 45/62] Add copyright --- src/deepsparse/transformers/utils/eval_helpers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 3965d208ef..b48308dbbb 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -1,3 +1,17 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from transformers import AutoTokenizer, PreTrainedTokenizerFast from datasets import load_dataset import numpy From fe37c32c035e98af3f113de48209d5bca3fcce42 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:49:25 -0400 Subject: [PATCH 46/62] Style fixes --- src/deepsparse/transformers/metrics.py | 6 +- .../transformers/pipelines/text_generation.py | 4 +- .../transformers/utils/eval_helpers.py | 55 ++++++++++--------- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index a1ba162144..cab1cbb5db 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -192,9 +192,9 @@ def compute(self) -> Dict[str, float]: def _cross_entropy( - predictions: numpy.ndarray, - targets: numpy.ndarray, - reduction: str = "mean", + predictions: numpy.ndarray, + targets: numpy.ndarray, + reduction: str = "mean", ) -> float: """ Calculate the cross-entropy loss between predicted probabilities and target labels. 
diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 0d0bacc717..5b8500e52f 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -661,8 +661,8 @@ def process_engine_outputs( ) outputs = dict( - created=datetime.datetime.now(), - prompts=prompts, + created=datetime.datetime.now(), + prompts=prompts, generations=generations, input_tokens=input_tokens, ) diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index b48308dbbb..5090a6c42f 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -19,11 +19,12 @@ CONCATENATED_DATSETS = ["wikitext2", "c4"] + def process_concatenated_datasets( - dataset_name: str, - model_path: str, - max_sequence_length: int, - kwargs: Mapping, + dataset_name: str, + model_path: str, + max_sequence_length: int, + kwargs: Mapping, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -68,7 +69,9 @@ def process_concatenated_datasets( raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - data_files={"validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz"}, + data_files={ + "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" + }, split="validation", ) else: @@ -85,17 +88,22 @@ def process_concatenated_datasets( # To split the dataset, first tokenize text tokenizer = AutoTokenizer.from_pretrained(model_path) return _split_text_by_tokens( - raw_text, eos, bos, tokenizer, max_sequence_length, kwargs.get("max_text_length", None) + raw_text, + eos, + bos, + tokenizer, + max_sequence_length, + kwargs.get("max_text_length", None), ) def _split_text_by_tokens( - text: List[str], - eos: str, - bos: str, - tokenizer: PreTrainedTokenizerFast, - sequence_length: int, - max_text_length: Union[None, int], + text: List[str], + eos: str, + bos: str, + tokenizer: PreTrainedTokenizerFast, + sequence_length: int, + max_text_length: Union[None, int], ) -> List[str]: """ Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. 
@@ -119,26 +127,23 @@ def _split_text_by_tokens( if max_text_length is None: text = "".join(text) - input_tokens = tokenizer(text, return_tensors="np")[ - "input_ids" - ][0] - elif max_text_length == -1: #per sample tokenization + input_tokens = tokenizer(text, return_tensors="np")["input_ids"][0] + elif max_text_length == -1: # per sample tokenization input_tokens = [] for slice in text: - input_tokens.append(tokenizer(slice, return_tensors="np")[ - "input_ids" - ][0]) + input_tokens.append(tokenizer(slice, return_tensors="np")["input_ids"][0]) input_tokens = numpy.concatenate(input_tokens) else: text = "".join(text) text_slices = len(text) // max_text_length - sliced_text = [text[i*max_text_length:(i+1)*max_text_length] for i in range(text_slices)] - sliced_text.append(text[text_slices*max_text_length:]) + sliced_text = [ + text[i * max_text_length : (i + 1) * max_text_length] + for i in range(text_slices) + ] + sliced_text.append(text[text_slices * max_text_length :]) input_tokens = [] for slice in sliced_text: - input_tokens.append(tokenizer(slice, return_tensors="np")[ - "input_ids" - ][0]) + input_tokens.append(tokenizer(slice, return_tensors="np")["input_ids"][0]) input_tokens = numpy.concatenate(input_tokens) # Then split the tokenized text into sections of size "max_sequence_length" and @@ -165,4 +170,4 @@ def _split_text_by_tokens( ) ) - return split_text \ No newline at end of file + return split_text From ddd0325240eaa9bf58153e18e37b02cec8789b30 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:53:28 -0400 Subject: [PATCH 47/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index d0d17d9bea..2075ea61c3 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -84,7 +84,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): - if dataset_name in ["wikitext2", "c4"]: if args.kwargs is None: kwargs = {} @@ -123,7 +122,6 @@ def perplexity_eval(args, dataset_name="openai_humaneval"): end_evaluation = False dataset_length = len(dataset) for idx, sample in _enumerate_progress(dataset, args.max_samples): - # Collect input sequence if dataset_name == "openai_humaneval": sample = sample["prompt"] + sample["canonical_solution"] From 24a91a32afb3a1bd46130319990f63a74bde4d95 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 11:56:21 -0400 Subject: [PATCH 48/62] Quality fixes --- .../transformers/pipelines/test_text_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 9acb8f32f5..f7715984d9 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -554,7 +554,6 @@ def _test_composition_same_session_ids( session_id_1, session_id_2, ): - tokenizer = pipeline.tokenizer config = GenerationConfig( output_scores=True, max_length=num_generated_tokens, top_k=0, top_p=0.0 @@ -607,7 +606,6 @@ def _test_output( max_logits_difference_threshold: Optional[float] = None, run_cache_validation: bool = True, ): - ( generated_logits, prompt_logits, @@ -666,7 +664,9 @@ def _test_kv_cache_state( x[:, :, -start_index:-end_index, :], y, atol=_PRECISION ) - def 
test_pipeline_for_ppl_eval(self, ): + def test_pipeline_for_ppl_eval( + self, + ): pipeline = self.get_pipeline( task="text-generation", model_path=self.model_stub, From 301115c142846dab878af6f6fe49ae5419e34fe9 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:00:19 -0400 Subject: [PATCH 49/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 5cd1cf0dfa..610e41a232 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -168,7 +168,11 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( + ( + truncated_onnx_path, + output_names, + _, + ) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From e402da9ddac12b21d2db57fac8014e658c7635a9 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:06:02 -0400 Subject: [PATCH 50/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 4 ++-- src/deepsparse/transformers/metrics.py | 1 + src/deepsparse/transformers/utils/eval_helpers.py | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 2075ea61c3..6bd1cc4175 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -62,19 +62,19 @@ import argparse import json +import logging from cProfile import Profile from pstats import Stats import numpy from tqdm.auto import tqdm -import logging from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets -from datasets import load_dataset, load_metric # isort: skip +from datasets import load_dataset, load_metric _LOGGER = logging.getLogger(__name__) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index cab1cbb5db..c6e2a583f6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -17,6 +17,7 @@ """ from typing import Any, Dict, Optional + import numpy from scipy.special import log_softmax diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 5090a6c42f..3df866bd20 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import List, Mapping, Union + +import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast + from datasets import load_dataset -import numpy -from typing import Mapping, List, Union + CONCATENATED_DATSETS = ["wikitext2", "c4"] From b48e05f9a06c202c4fe97138853aafd8ebf50592 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 12:16:34 -0400 Subject: [PATCH 51/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 610e41a232..00077ab78e 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -62,8 +62,8 @@ def get_model_onnx_path(model_stubs): onnx_path = model.onnx_model.path model_onnx_paths[model_name] = onnx_path - def _get_model_onnx_path(model_name): - return model_onnx_paths[model_name] + def _get_model_onnx_path(_model_name): + return model_onnx_paths[_model_name] return _get_model_onnx_path From 61b9c5c424220a71ce896d720a985c38c979da0e Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:01:34 -0400 Subject: [PATCH 52/62] Quality fixes --- src/deepsparse/transformers/eval_downstream.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 6bd1cc4175..f9835aa58e 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -69,14 +69,12 @@ import numpy from tqdm.auto import tqdm +from datasets import load_dataset, load_metric from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 from deepsparse.transformers.utils.eval_helpers import process_concatenated_datasets -from datasets import load_dataset, load_metric - - _LOGGER = logging.getLogger(__name__) From 34ee8f615c5907c3cdaa0b7afbde41a22ea593aa Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:05:58 -0400 Subject: [PATCH 53/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 00077ab78e..30309ff1be 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import onnx import pytest from deepsparse.transformers.helpers import ( - get_deployment_path, + get_hugging_face_configs, + get_onnx_path, get_transformer_layer_init_names, truncate_transformer_onnx_model, ) @@ -32,8 +35,20 @@ ), ], ) -def test_get_deployment_path(stub): - assert get_deployment_path(stub) +def test_get_onnx_path_and_configs_from_stub(stub): + onnx_path = get_onnx_path(stub) + config_dir, tokenizer_dir = get_hugging_face_configs(stub) + + assert onnx_path.endswith("model.onnx") + assert os.path.exists(onnx_path) + + config_dir_files = os.listdir(config_dir) + assert "config.json" in config_dir_files + + tokenizer_dir_files = os.listdir(tokenizer_dir) + assert "tokenizer.json" in tokenizer_dir_files + # make assert optional if stubs added for models with no known tokenizer_config + assert "tokenizer_config.json" in tokenizer_dir_files @pytest.fixture(scope="session") @@ -62,8 +77,8 @@ def get_model_onnx_path(model_stubs): onnx_path = model.onnx_model.path model_onnx_paths[model_name] = onnx_path - def _get_model_onnx_path(_model_name): - return model_onnx_paths[_model_name] + def _get_model_onnx_path(model_name): + return model_onnx_paths[model_name] return _get_model_onnx_path @@ -168,11 +183,7 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - ( - truncated_onnx_path, - output_names, - _, - ) = truncate_transformer_onnx_model( + (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From b032101b6c5e5a39e47d4be196cfa23feb9fdb25 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:09:49 -0400 Subject: [PATCH 54/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 30309ff1be..c2bf27985a 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -183,7 +183,11 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( + ( + truncated_onnx_path, + output_names, + _, + ) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From f3cbf3dacc969ac821f91a9195826ab122b96eb5 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Fri, 20 Oct 2023 14:15:29 -0400 Subject: [PATCH 55/62] Quality fixes --- tests/deepsparse/transformers/test_helpers.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index c2bf27985a..30309ff1be 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -183,11 +183,7 @@ def test_truncate_transformer_onnx_model( model_onnx_path = get_model_onnx_path(model_name) output_name = "embedding" - ( - truncated_onnx_path, - output_names, - _, - ) = truncate_transformer_onnx_model( + (truncated_onnx_path, output_names, _,) = truncate_transformer_onnx_model( model_path=model_onnx_path, emb_extraction_layer=emb_extraction_layer, hidden_layer_size=None, From 483449eb91c5c810a3af3e151a00bdc3bcc35f52 Mon Sep 17 00:00:00 2001 From: 
Alexandre Marques Date: Fri, 20 Oct 2023 14:22:11 -0400 Subject: [PATCH 56/62] Quality fixes --- src/deepsparse/transformers/metrics.py | 14 +++++++------ .../transformers/utils/eval_helpers.py | 21 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index c6e2a583f6..1952ec2155 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -36,7 +36,8 @@ def __init__(self, accumulate: bool = False): Class for computing perplexity. Each batch is processed via the "add_batches" method. - At the end the data is reduced to a single perplexity metric via the "compute" method. + At the end the data is reduced to a single perplexity + metric via the "compute" method. Example: metric = Perplexity() @@ -76,8 +77,8 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): """ if self._accumulate: - # If accumulate is True, every token from the batch contributes equally to the - # negative log-likelihood. + # If accumulate is True, every token from the batch contributes + # equally to the negative log-likelihood. # Thus, merge batch and sequence length dimensions and compute negative # log-likelihood for all tokens, and accumulate to total predictions = numpy.reshape(predictions, (-1, predictions.shape[-1])) @@ -91,9 +92,10 @@ def add_batch(self, predictions: numpy.ndarray, targets: numpy.ndarray): # Track number of tokens processed self._number_tokens += predictions.shape[0] else: - # If accumulate is False, compute perplexity for each sample individually. - # We assume that sequence length is uniform within a batch, but may vary from batch - # to batch. + # If accumulate is False, compute perplexity for + # each sample individually. + # We assume that sequence length is uniform within a batch, + # but may vary from batch to batch. # Create batch dimension if it doesn't exist if targets.ndim == 1: diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 3df866bd20..4c0e68b9de 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -34,13 +34,17 @@ def process_concatenated_datasets( tokenization, have size "max_sequence_length" tokens. Args: - dataset_name (str): The name of the dataset to process. Options: "wikitext2" or "c4". + dataset_name (str): The name of the dataset to process. + Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. kwargs (mapping): Additional keyword arguments. - - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. - - bos (str, optional): The beginning-of-sentence token. Default is "". - - raw_samples (int, optional): The number of raw samples to use. Default is None. + - eos (str, optional): The end-of-sentence token. + Default is "\n\n" for wikitext2 and "" for c4. + - bos (str, optional): The beginning-of-sentence token. + Default is "". + - raw_samples (int, optional): The number of raw samples to use. + Default is None. - data_file (int, optional): The index of the data file to use for dataset. Not used in wikitext2. Default is 0 for c4. - max_text_length (int, optional): The maximum length of text to consider. 
@@ -109,7 +113,8 @@ def _split_text_by_tokens( max_text_length: Union[None, int], ) -> List[str]: """ - Tokenizes and splits a list of concatenated text samples into sections of specified maximum token length. + Tokenizes and splits a list of concatenated text samples into + sections of specified maximum token length. Args: text (List[str]): List of concatenated text samples to be tokenized and split. @@ -120,10 +125,12 @@ def _split_text_by_tokens( max_text_length (Union[None, int]): The maximum length of text to consider. - If None, the entire text is tokenized and split. - If -1, each sample is tokenized separately. - - If a positive integer, the text is split into sections of this length before tokenization. + - If a positive integer, the text is split into sections of this + length before tokenization. Returns: - List[str]: A list of sections where each section contains a maximum of "sequence_length" tokens. + List[str]: A list of sections where each section contains a + maximum of "sequence_length" tokens. """ text = [bos + sample + eos for sample in text] From e6e78286ccbbe17e2629fcd6070831ca749e0cde Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 30 Oct 2023 17:42:39 -0400 Subject: [PATCH 57/62] Rebase --- .../pipelines/test_text_generation.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 5df51f9b59..c70c50a5ef 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -19,3 +19,98 @@ import pytest from deepsparse import Pipeline from deepsparse.transformers.utils.helpers import prepends_bos_token + + +@pytest.fixture +def pipeline(): + return Pipeline.create( + task="text_generation", + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ) + + +@pytest.fixture +def prompt(): + return "Never gonna give you up, never gonna let you down" + + +def test_freeze_first_position(pipeline): + # Test whether we should be "freezing" the first token after + # the kv cache is full + assert not prepends_bos_token(pipeline.tokenizer) + + +def test_run_same_prompt_multiple_times(pipeline, prompt): + # Test the scenario, where the same prompt is run multiple times + # Every run should produce the same output + output_1 = pipeline(prompt, output_scores=True) + output_2 = pipeline(prompt, output_scores=True) + + assert output_1.generations[0].text == output_2.generations[0].text + assert numpy.allclose( + output_1.generations[0].score, + output_2.generations[0].score, + atol=1e-3, + ) + + +def test_run_multiple_prompts_in_parallel(pipeline, prompt): + # Test the scenario, where multiple prompts are run in parallel + # Same two prompts should produce the same output + + output = pipeline([prompt, prompt], output_scores=True) + + logits_0 = output.generations[0].score + sequence_0 = output.generations[0].text + + logits_1 = output.generations[1].score + sequence_1 = output.generations[1].text + + assert numpy.allclose(logits_0, logits_1, atol=1e-3) + assert sequence_0 == sequence_1 + + +def test_num_generated_predictions(pipeline, prompt): + # Test the scenario, where multiple predictions are generated + # from the same prompt + + output_sequences = pipeline(prompt, num_return_sequences=2) + + assert len(output_sequences.generations) == 1 + assert len(output_sequences.generations[0]) == 2 + + output_sequences = 
pipeline([prompt, prompt], num_return_sequences=2) + assert len(output_sequences.generations) == 2 + + for generation in output_sequences.generations: + assert len(generation) == 2 + + +def test_token_generation_deterministic(pipeline, prompt): + inference = pipeline(prompt, num_return_sequences=3, do_sample=False) + generations = inference.generations + # Output should be the same from one another + text_outputs = [x.text for x in generations[0]] + assert len(set(text_outputs)) == 1 + + +def test_token_generation_non_deterministic(pipeline, prompt): + + inference = pipeline(prompt, num_return_sequences=3, do_sample=True) + generations = inference.generations + # Output should be different from one another + text_outputs = [x.text for x in generations[0]] + assert len(set(text_outputs)) == 3 + + +def test_streaming_mode_returns_generator(pipeline, prompt): + response_generator = pipeline(prompt, streaming=True) + assert inspect.isgenerator( + response_generator + ), "Pipeline should return a generator in streaming mode" + + assert all( + isinstance(response, pipeline.output_schema) for response in response_generator + ), "Pipeline should return a generator of output_schema \ + objects in streaming mode" From d7c6e5ad79bde07ba2bc34fd0aef3b138332f14b Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Mon, 30 Oct 2023 17:43:26 -0400 Subject: [PATCH 58/62] Rebase --- tests/deepsparse/transformers/test_helpers.py | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/tests/deepsparse/transformers/test_helpers.py b/tests/deepsparse/transformers/test_helpers.py index 30309ff1be..5cd1cf0dfa 100644 --- a/tests/deepsparse/transformers/test_helpers.py +++ b/tests/deepsparse/transformers/test_helpers.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import onnx import pytest from deepsparse.transformers.helpers import ( - get_hugging_face_configs, - get_onnx_path, + get_deployment_path, get_transformer_layer_init_names, truncate_transformer_onnx_model, ) @@ -35,20 +32,8 @@ ), ], ) -def test_get_onnx_path_and_configs_from_stub(stub): - onnx_path = get_onnx_path(stub) - config_dir, tokenizer_dir = get_hugging_face_configs(stub) - - assert onnx_path.endswith("model.onnx") - assert os.path.exists(onnx_path) - - config_dir_files = os.listdir(config_dir) - assert "config.json" in config_dir_files - - tokenizer_dir_files = os.listdir(tokenizer_dir) - assert "tokenizer.json" in tokenizer_dir_files - # make assert optional if stubs added for models with no known tokenizer_config - assert "tokenizer_config.json" in tokenizer_dir_files +def test_get_deployment_path(stub): + assert get_deployment_path(stub) @pytest.fixture(scope="session") From 21c6f0ddf5e349fd6a2891544d47848f57bd5772 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 10:26:48 -0500 Subject: [PATCH 59/62] Re-add unit test --- .../pipelines/test_text_generation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index c70c50a5ef..1a408fb92b 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,6 +104,29 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 +def test_pipeline_for_ppl_eval(self, ): + pipeline = self.get_pipeline( + task="text-generation", + model_path=self.model_stub, + sequence_length=self.sequence_length, + prompt_sequence_length=1, + ) + inputs = dict( + prompt=self.prompt, + output_scores=True, + return_input_tokens=True, + fixed_sequences_length=True, + include_prompt_logits=True, + max_length=1, + ) + predictions = pipeline(**inputs) + assert hasattr(predictions, "generations") + assert hasattr(predictions.generations[0], "score") + assert hasattr(predictions.generations[0], "input_tokens") + assert "input_ids" in predictions.generations[0].input_tokens + assert "attention_mask" in predictions.generations[0].input_tokens + + def test_streaming_mode_returns_generator(pipeline, prompt): response_generator = pipeline(prompt, streaming=True) assert inspect.isgenerator( From fa0cb4b416aa360af3984a8b66e224cf027f03a8 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 10:30:07 -0500 Subject: [PATCH 60/62] Style fix --- tests/deepsparse/transformers/pipelines/test_text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 1a408fb92b..74bdf26896 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,7 +104,7 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 -def test_pipeline_for_ppl_eval(self, ): +def test_pipeline_for_ppl_eval(self): pipeline = self.get_pipeline( task="text-generation", model_path=self.model_stub, From bf1b0cf3ac54957427f9fedc523f005f3168a21d Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 15:19:51 -0500 Subject: [PATCH 61/62] Update unit test --- 
.../transformers/pipelines/test_text_generation.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index 74bdf26896..b304e4a7f2 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -104,22 +104,15 @@ def test_token_generation_non_deterministic(pipeline, prompt): assert len(set(text_outputs)) == 3 -def test_pipeline_for_ppl_eval(self): - pipeline = self.get_pipeline( - task="text-generation", - model_path=self.model_stub, - sequence_length=self.sequence_length, - prompt_sequence_length=1, - ) - inputs = dict( - prompt=self.prompt, +def test_pipeline_for_ppl_eval(pipeline, prompt): + predictions = pipeline( + prompt, output_scores=True, return_input_tokens=True, fixed_sequences_length=True, include_prompt_logits=True, max_length=1, ) - predictions = pipeline(**inputs) assert hasattr(predictions, "generations") assert hasattr(predictions.generations[0], "score") assert hasattr(predictions.generations[0], "input_tokens") From 0c618a698d77738f2b41c323cfc95de9833d2448 Mon Sep 17 00:00:00 2001 From: Alexandre Marques Date: Wed, 8 Nov 2023 16:06:38 -0500 Subject: [PATCH 62/62] Update unit test --- .../transformers/pipelines/test_text_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/deepsparse/transformers/pipelines/test_text_generation.py b/tests/deepsparse/transformers/pipelines/test_text_generation.py index b304e4a7f2..fb25a33883 100644 --- a/tests/deepsparse/transformers/pipelines/test_text_generation.py +++ b/tests/deepsparse/transformers/pipelines/test_text_generation.py @@ -114,10 +114,10 @@ def test_pipeline_for_ppl_eval(pipeline, prompt): max_length=1, ) assert hasattr(predictions, "generations") + assert hasattr(predictions, "input_tokens") assert hasattr(predictions.generations[0], "score") - assert hasattr(predictions.generations[0], "input_tokens") - assert "input_ids" in predictions.generations[0].input_tokens - assert "attention_mask" in predictions.generations[0].input_tokens + assert "input_ids" in predictions.input_tokens + assert "attention_mask" in predictions.input_tokens def test_streaming_mode_returns_generator(pipeline, prompt):
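
With the pieces above in place, perplexity can be computed entirely outside the pipeline: the text generation pipeline hands back the tokenizer output and the prompt logits, and the Perplexity metric only ever sees numpy logits and target token ids. The sketch below walks through that flow for a single prompt. It is illustrative only; the choice of accumulate=True and the padding/shift handling are assumptions, and perplexity_eval in eval_downstream.py remains the reference implementation. The model stub and the prompt are the ones used in the unit tests.

# Minimal sketch of the evaluation flow enabled by this patch series.
# Assumptions (not taken from the patches): accumulate=True and the
# padding/shift handling below; see perplexity_eval for the real masking.
from deepsparse import Pipeline
from deepsparse.transformers.metrics import Perplexity

pipeline = Pipeline.create(
    task="text_generation",
    model_path="hf:mgoin/TinyStories-1M-deepsparse",  # same stub as the tests
    engine_type="onnxruntime",
)

prediction = pipeline(
    "Never gonna give you up, never gonna let you down",
    output_scores=True,            # logits come back as generations[i].score
    return_input_tokens=True,      # tokenizer output: input_ids + attention_mask
    fixed_sequences_length=True,
    include_prompt_logits=True,    # score covers the prompt tokens as well
    max_length=1,                  # single-token generation; only prompt logits matter
)

input_ids = prediction.input_tokens["input_ids"][0].flatten()
attention_mask = prediction.input_tokens["attention_mask"][0].flatten()
logits = prediction.generations[0].score

# Drop padded positions (left padding is assumed here), then shift so the
# logits at position i are scored against the token at position i + 1.
num_tokens = int(attention_mask.sum())
logits = logits[-num_tokens:, :]
input_ids = input_ids[-num_tokens:]

metric = Perplexity(accumulate=True)
metric.add_batch(predictions=logits[:-1, :], targets=input_ids[1:])
print(metric.compute())

Because the metric no longer knows anything about pipelines or tokenizers, the same add_batch and compute loop serves the concatenated wikitext2 and c4 datasets as well as openai_humaneval in eval_downstream.py.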