From 7a8be8b39d35ba6e71f45ba6f3e0a546bad61e57 Mon Sep 17 00:00:00 2001
From: Damian <damian@neuralmagic.com>
Date: Fri, 3 Nov 2023 07:52:22 +0000
Subject: [PATCH 1/3] initial commit

---
 src/deepsparse/eval/__init__.py               |  13 ++
 src/deepsparse/eval/evaluator.py              |  81 ++++++++++++
 src/deepsparse/eval/integrations/__init__.py  |  13 ++
 .../lm_cache/roneneldan/TinyStories-1M_.db    | Bin 0 -> 12288 bytes
 .../integrations/lm_evaluation_harness.py     | 118 ++++++++++++++++++
 5 files changed, 225 insertions(+)
 create mode 100644 src/deepsparse/eval/__init__.py
 create mode 100644 src/deepsparse/eval/evaluator.py
 create mode 100644 src/deepsparse/eval/integrations/__init__.py
 create mode 100644 src/deepsparse/eval/integrations/lm_cache/roneneldan/TinyStories-1M_.db
 create mode 100644 src/deepsparse/eval/integrations/lm_evaluation_harness.py

diff --git a/src/deepsparse/eval/__init__.py b/src/deepsparse/eval/__init__.py
new file mode 100644
index 0000000000..0c44f887a4
--- /dev/null
+++ b/src/deepsparse/eval/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/deepsparse/eval/evaluator.py b/src/deepsparse/eval/evaluator.py
new file mode 100644
index 0000000000..ee486a1abe
--- /dev/null
+++ b/src/deepsparse/eval/evaluator.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The main entrypoint for evaluating a model
+or a Pipeline on a requested dataset
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from pipeline import DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, Pipeline
+from src.deepsparse.eval.registry import EVAL_REGISTRY
+
+
+@dataclass
+class Metric:
+    type: str
+    value: float
+
+
+@dataclass
+class Dataset:
+    type: str
+    name: str
+    config: str
+    split: str
+
+
+@dataclass
+class EvalSample:
+    input: Any
+    output: Any
+
+
+@dataclass
+class Evaluation:
+    task: str
+    dataset: Dataset
+    metrics: List[Metric]
+    samples: List[EvalSample]
+
+
+def eval(
+    target: Union["Module", Pipeline],
+    datasets: Union[str, List[str]],
+    integration: str,
+    batch_size: int = 1,
+    target_args: Optional[Dict] = None,
+    engine: Union[None, DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE] = None,
+    engine_args: Optional[Dict] = None,
+    splits=None,
+    metrics=None,
+    **kwargs,
+) -> List[Evaluation]:
+    # TODO: Decide on the final types of arguments later
+
+    # TODO: Implement registry
+    eval_integration = EVAL_REGISTRY.resolve(target, integration, datasets)
+
+    return eval_integration(
+        target=target,
+        target_args=target_args,
+        datasets=datasets,
+        splits=splits,
+        metrics=metrics,
+        batch_size=batch_size,
+        engine=engine,
+        engine_args=engine_args,
+        **kwargs,
+    )
diff --git a/src/deepsparse/eval/integrations/__init__.py b/src/deepsparse/eval/integrations/__init__.py
new file mode 100644
index 0000000000..0c44f887a4
--- /dev/null
+++ b/src/deepsparse/eval/integrations/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/deepsparse/eval/integrations/lm_cache/roneneldan/TinyStories-1M_.db b/src/deepsparse/eval/integrations/lm_cache/roneneldan/TinyStories-1M_.db
new file mode 100644
index 0000000000000000000000000000000000000000..64ce058ea2f912dd57df07a2f112eac67bd0a4ff
GIT binary patch
literal 12288
zcmeI#&r8EF6u|MM$`E06w;g(U?4+QA_y=e!Ifxx<*MmEi$|4NA4mA}$`S<xJ_@8*R
z8QppJ@_mpWFCl?^a(c6ebm^=-uisXslM6LcTB|D&rIc>lUfZ_e^Zjkojy~TNTAf|~
z4#PhcgieJn{MmJYeFP9d009ILKmY**5I_I{1o|Q{(}Qso>2Fs)Rko??a%C6Y5L{;_
z&W+@8k{X$Gs}nhTv0sv#$6W5Sn`xZQ<<`tk<fE(_D@l5n9G?b*aU5y2tQYq4{k3YV
z`l4*ydXpF3Ma6kH@pb)y_Z<3e<U0ZZ1Q0*~0R#|0009ILKmY**`YLefA4dJ(*YD+B
X2q1s}0tg_000IagfB*srYy|!R+Ici5

literal 0
HcmV?d00001

diff --git a/src/deepsparse/eval/integrations/lm_evaluation_harness.py b/src/deepsparse/eval/integrations/lm_evaluation_harness.py
new file mode 100644
index 0000000000..d8cc4a146e
--- /dev/null
+++ b/src/deepsparse/eval/integrations/lm_evaluation_harness.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from deepsparse import Pipeline
+from lm_eval import evaluator
+from lm_eval.base import BaseLM
+from typing import Union
+
+
+class DeepSparseLM(BaseLM):
+    def __init__(self, stub, max_length: Optional[int] = None):
+        super().__init__()  # let the lm_eval base class set up its own state
+        if not isinstance(stub, Pipeline):  # a ready Pipeline is used as-is
+            stub = Pipeline.create(task="text_generation", model_path=stub)
+        self.model = stub
+        self.tokenizer = self.model.tokenizer  # tok_decode reads self.tokenizer
+        self.default_max_length = 1024
+        self._max_length = max_length
+
+    @classmethod
+    def from_pipeline(cls, pipeline: Pipeline):
+        return cls(pipeline)  # __init__ accepts an already-built Pipeline
+
+    @property
+    def batch_size(self):
+        return self.model._batch_size
+
+    @property
+    def eot_token(self) -> str:
+        pass  # TODO: not implemented yet (evaluates to None)
+
+    def _model_call(self, inps):
+        raise NotImplementedError()  # unused: _loglikelihood_tokens is overridden
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise NotImplementedError()  # unused: greedy_until is to be overridden
+
+    def _loglikelihood_tokens(self, requests, **kwargs):
+        pass  # TODO: not implemented yet (returns None)
+
+    @property
+    def device(self):
+        return "cpu"
+
+    @property
+    def eot_token_id(self) -> int:
+        pass  # TODO: not implemented yet (evaluates to None)
+
+    @property
+    def max_gen_toks(self):
+        pass  # TODO: not implemented yet (evaluates to None)
+
+    @property
+    def max_length(self):
+        return self._max_length or self.default_max_length
+
+    def tok_encode(self, string: str):
+        return self.model.tokenizer.encode(string)
+
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+
+def integration_eval(
+        target: Union[str, "Module"],
+        target_args,
+        datasets,
+        splits,
+        metrics,
+        batch_size,
+        engine,
+        engine_args,
+        **kwargs,
+):
+
+    if isinstance(target, str):
+        target = DeepSparseLM(stub=target)
+    else:
+        pass # model is a torch.Module
+
+    results = evaluator.simple_evaluate(
+          model=kwargs.get("model", target),
+          model_args=kwargs.get("model_args", target_args),
+          tasks=kwargs.get("tasks", datasets),
+    #     num_fewshot=num_fewshot,
+    #     batch_size=batch_size,
+    #     max_batch_size=max_batch_size,
+    #     device=device,
+    #     no_cache=no_cache,
+    #     limit=limit,
+    #     description_dict=description_dict,
+    #     decontamination_ngrams_path=decontamination_ngrams_path,
+    #     check_integrity=check_integrity,
+    #     write_out=write_out,
+    #     output_base_path=output_base_path,
+          **kwargs,
+    )
+
+
+if __name__ == "__main__":
+    target = "hf:mgoin/TinyStories-1M-deepsparse"
+    datasets = ["hellaswag"]
+    target_args = ""
+    limit = 2 # testing purposes
+    integration_eval(target=target, datasets=datasets, target_args=target_args, limit=limit, splits=None, metrics=None, batch_size=1, engine=None, engine_args=None)

From edead87766b35a35781cb7aa1354011eaf80e838 Mon Sep 17 00:00:00 2001
From: Damian <damian@neuralmagic.com>
Date: Fri, 3 Nov 2023 09:53:57 +0000
Subject: [PATCH 2/3] add first unittest

---
 src/deepsparse/eval/evaluator.py              | 41 ++++++++---
 ...n_harness.py => llm_evaluation_harness.py} | 73 ++++++++-----------
 tests/deepsparse/eval/__init__.py             | 13 ++++
 .../deepsparse/eval/integrations/__init__.py  | 13 ++++
 .../test_llm_evaluation_harness.py            | 26 +++++++
 5 files changed, 114 insertions(+), 52 deletions(-)
 rename src/deepsparse/eval/integrations/{lm_evaluation_harness.py => llm_evaluation_harness.py} (62%)
 create mode 100644 tests/deepsparse/eval/__init__.py
 create mode 100644 tests/deepsparse/eval/integrations/__init__.py
 create mode 100644 tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py

diff --git a/src/deepsparse/eval/evaluator.py b/src/deepsparse/eval/evaluator.py
index ee486a1abe..98c8596771 100644
--- a/src/deepsparse/eval/evaluator.py
+++ b/src/deepsparse/eval/evaluator.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-The main entrypoint for evaluating a model
-or a Pipeline on a requested dataset
+The main entrypoint for evaluating a target
+on a requested dataset
 """
 
 from dataclasses import dataclass
@@ -45,28 +45,51 @@ class EvalSample:
 
 @dataclass
 class Evaluation:
+    # TODO: How to handle serialization of the
+    # data structure (to yaml and json)
     task: str
     dataset: Dataset
     metrics: List[Metric]
     samples: List[EvalSample]
 
 
-def eval(
-    target: Union["Module", Pipeline],
+def evaluate(
+    target: str,
     datasets: Union[str, List[str]],
     integration: str,
+    engine_type: Union[None, DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE] = None,
     batch_size: int = 1,
     target_args: Optional[Dict] = None,
-    engine: Union[None, DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE] = None,
     engine_args: Optional[Dict] = None,
     splits=None,
     metrics=None,
     **kwargs,
 ) -> List[Evaluation]:
-    # TODO: Decide on the final types of arguments later
+    """
+    :param target: The target to evaluate. Can be a path to
+        a sparsezoo stub, hugging face path, or a path to a
+        local directory containing a model file
+    :param datasets: The datasets to evaluate on. Can be a string
+        for a single dataset or a list of strings for multiple datasets.
+    :param integration: The name of the evaluation integration to use.
+        Must be a valid integration name from the EVAL_REGISTRY.
+    :param engine_type: The engine to use for the evaluation.
+    :param batch_size: The batch size to use for the evaluation.
+    :param target_args: The arguments to alter the
+        behavior of the evaluated target.
+    :param engine_args: The arguments to pass to the engine.
+    :param splits: ...
+    :param metrics: ...
+    :param kwargs: Additional arguments to pass to the evaluation integration.
+    :return: A list of Evaluation objects containing the results of the evaluation.
+    """
 
-    # TODO: Implement registry
-    eval_integration = EVAL_REGISTRY.resolve(target, integration, datasets)
+    # TODO: Implement a function that checks for valid target
+    # TODO: Implement EVAL_REGISTRY
+    # TODO: Implement a function that checks for valid engine_type
+    # TODO: Clarify the type of missing arguments
+
+    eval_integration = EVAL_REGISTRY.get(target, integration, datasets)
 
     return eval_integration(
         target=target,
@@ -75,7 +98,7 @@ def eval(
         splits=splits,
         metrics=metrics,
         batch_size=batch_size,
-        engine=engine,
+        engine_type=engine_type,
         engine_args=engine_args,
         **kwargs,
     )
diff --git a/src/deepsparse/eval/integrations/lm_evaluation_harness.py b/src/deepsparse/eval/integrations/llm_evaluation_harness.py
similarity index 62%
rename from src/deepsparse/eval/integrations/lm_evaluation_harness.py
rename to src/deepsparse/eval/integrations/llm_evaluation_harness.py
index d8cc4a146e..719c9235f9 100644
--- a/src/deepsparse/eval/integrations/lm_evaluation_harness.py
+++ b/src/deepsparse/eval/integrations/llm_evaluation_harness.py
@@ -14,10 +14,35 @@
 
 from typing import Optional
 
+from transformers import AutoModelForCausalLM
+
 from deepsparse import Pipeline
 from lm_eval import evaluator
 from lm_eval.base import BaseLM
-from typing import Union
+
+
+def integration_eval(
+    target,
+    target_args,
+    datasets,
+    batch_size,
+    splits=None,
+    metrics=None,
+    engine_type=None,
+    engine_args=None,
+    **kwargs,
+):
+    model = initialize_model(target, target_args)
+    # pop (not get): a key left in kwargs would be passed twice -> TypeError
+    evaluator.simple_evaluate(
+        model=model,
+        model_args=kwargs.pop("model_args", target_args),
+        tasks=kwargs.pop("tasks", datasets),
+        batch_size=batch_size,
+        **kwargs,
+    )
+
+    return True  # NOTE: the evaluation results are currently discarded
 
 
 class DeepSparseLM(BaseLM):
@@ -74,45 +99,7 @@ def tok_decode(self, tokens):
         return self.tokenizer.decode(tokens)
 
 
-def integration_eval(
-        target: Union[str, "Module"],
-        target_args,
-        datasets,
-        splits,
-        metrics,
-        batch_size,
-        engine,
-        engine_args,
-        **kwargs,
-):
-
-    if isinstance(target, str):
-        target = DeepSparseLM(stub=target)
-    else:
-        pass # model is a torch.Module
-
-    results = evaluator.simple_evaluate(
-          model=kwargs.get("model", target),
-          model_args=kwargs.get("model_args", target_args),
-          tasks=kwargs.get("tasks", datasets),
-    #     num_fewshot=num_fewshot,
-    #     batch_size=batch_size,
-    #     max_batch_size=max_batch_size,
-    #     device=device,
-    #     no_cache=no_cache,
-    #     limit=limit,
-    #     description_dict=description_dict,
-    #     decontamination_ngrams_path=decontamination_ngrams_path,
-    #     check_integrity=check_integrity,
-    #     write_out=write_out,
-    #     output_base_path=output_base_path,
-          **kwargs,
-    )
-
-
-if __name__ == "__main__":
-    target = "hf:mgoin/TinyStories-1M-deepsparse"
-    datasets = ["hellaswag"]
-    target_args = ""
-    limit = 2 # testing purposes
-    integration_eval(target=target, datasets=datasets, target_args=target_args, limit=limit, splits=None, metrics=None, batch_size=1, engine=None, engine_args=None)
+def initialize_model(target, target_args):
+    # Currently always loads `target` via AutoModelForCausalLM.from_pretrained;
+    # `target_args` is unused. TODO: return DeepSparseLM or Module per target.
+    return AutoModelForCausalLM.from_pretrained(target)
diff --git a/tests/deepsparse/eval/__init__.py b/tests/deepsparse/eval/__init__.py
new file mode 100644
index 0000000000..0c44f887a4
--- /dev/null
+++ b/tests/deepsparse/eval/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/deepsparse/eval/integrations/__init__.py b/tests/deepsparse/eval/integrations/__init__.py
new file mode 100644
index 0000000000..0c44f887a4
--- /dev/null
+++ b/tests/deepsparse/eval/integrations/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py b/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py
new file mode 100644
index 0000000000..7f452859f3
--- /dev/null
+++ b/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from src.deepsparse.eval.integrations.llm_evaluation_harness import integration_eval
+
+
+@pytest.mark.parametrize(
+    "target,datasets, batch_size",
+    [("roneneldan/TinyStories-1M", ["hellaswag"], 1)],
+)
+def test_integration_eval(target, datasets, batch_size):
+    out = integration_eval(
+        target=target, datasets=datasets, batch_size=1, target_args=None, limit=5
+    )

From 627d4244257f5df8cfec782cb73912cc0f7e3117 Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Fri, 3 Nov 2023 10:56:36 +0100
Subject: [PATCH 3/3] Update
 tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py

---
 .../deepsparse/eval/integrations/test_llm_evaluation_harness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py b/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py
index 7f452859f3..faef9ca921 100644
--- a/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py
+++ b/tests/deepsparse/eval/integrations/test_llm_evaluation_harness.py
@@ -21,6 +21,6 @@
     [("roneneldan/TinyStories-1M", ["hellaswag"], 1)],
 )
 def test_integration_eval(target, datasets, batch_size):
-    out = integration_eval(
+    assert integration_eval(
         target=target, datasets=datasets, batch_size=1, target_args=None, limit=5
     )