Commit 9045dc4 (1 parent: 4c51521). Showing 25 changed files with 2,506 additions and 28 deletions.
@@ -0,0 +1,46 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Lint and Test Lemonade

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

permissions:
  contents: read

jobs:
  make-lemonade:
    env:
      LEMONADE_CI_MODE: "True"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Miniconda with 64-bit Python
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          activate-environment: lemon
          python-version: "3.10"
      - name: Install dependencies
        shell: bash -el {0}
        run: |
          python -m pip install --upgrade pip
          conda install pylint
          python -m pip check
          pip install -e .[llm]
      - name: Lint with PyLint
        shell: bash -el {0}
        run: |
          pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401
      - name: Run lemonade tests
        shell: bash -el {0}
        run: |
          lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10
          python test/llm_api.py
@@ -0,0 +1,100 @@
# Using the MMLU accuracy test tools

The Massive Multitask Language Understanding (MMLU) benchmark is a comprehensive evaluation framework designed to assess the capabilities of language models across a wide range of subjects and disciplines. It encompasses a diverse set of questions, covering topics from the humanities to the natural sciences, and aims to measure a model's depth and breadth of knowledge as well as its ability to generalize across different types of language understanding tasks. For a detailed list of the subjects tested, see [here](#detailed-list-of-subjects-categories-tested).

This tool provides an automated way to evaluate language models on the MMLU benchmark. It automates the process of downloading the dataset, preparing evaluation prompts, running the model to generate answers, and calculating accuracy metrics across the different subjects within the MMLU dataset.

## Dataset

The MMLU dataset is downloaded automatically by the script into the `mmlu_data` directory the first time you run the benchmark. The data is sourced from [here](https://people.eecs.berkeley.edu/~hendrycks/data.tar).

## Running the Benchmark

`lemonade -i facebook/opt-125m huggingface-load accuracy-mmlu --ntrain 5 --tests astronomy`
### Optional arguments

`--ntrain`: The number of training examples taken from a subject's development (dev) set and used as context in the evaluation prompts (default: 5).

In few-shot learning with language models, "shots" are the examples provided to the model to help it understand or adapt to the task at hand without explicit training. Setting `--ntrain` to 5 gives the standard 5-shot MMLU setting: the model is expected to answer each test question based on the context provided by the preceding question-answer pairs.

`--data-dir`: The directory where the MMLU data is stored (default: `<lemonade_cache_dir>/data`).

`--tests`: Specific tests to run, identified by their subject names. Accepts multiple test names.
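For example, to evaluate several subjects in a single run (this assumes the subject names are passed space-separated and use the lowercase, underscore-separated identifiers from the MMLU data files):

`lemonade -i facebook/opt-125m huggingface-load accuracy-mmlu --ntrain 5 --tests astronomy college_biology formal_logic`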
## How It Works

1. `Data Preparation:` On the first run, the script downloads the MMLU dataset and extracts it into the specified data directory. It then prepares the data by reading the development and test sets for the specified subjects.

1. `Prompt Generation:` For each subject, the script generates prompts from the development set to provide context for the test questions. This includes a configurable number of training examples (`--ntrain`) to help the model understand the task (see the prompt sketch after this list).

1. `Model Evaluation:` The specified language model is used to generate an answer to each test question. The testing methodology is adopted from [here](https://github.com/hendrycks/test).

1. `Accuracy Calculation:` The script compares the model-generated answers against the correct answers to calculate accuracy metrics for each subject.

1. `Saving Results:` Detailed results for each subject, including questions, prompts, correct and generated answers, and overall accuracy, are saved to CSV files in the specified results directory. A summary CSV file compiling accuracy metrics across all evaluated subjects is also generated in the cache directory.
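As a rough illustration of the prompt format, here is a minimal sketch of how a 5-shot MMLU prompt is typically assembled under the [hendrycks/test](https://github.com/hendrycks/test) methodology referenced above. The helper names and data layout are hypothetical; this is not the tool's actual code.

```python
# Minimal sketch of k-shot MMLU prompt assembly (hendrycks/test-style format).
# Function names and example data are illustrative, not the tool's own code.

CHOICES = ["A", "B", "C", "D"]

def format_example(question, options, answer=None):
    """Render one question with its four options; include the answer for dev examples."""
    text = question
    for letter, option in zip(CHOICES, options):
        text += f"\n{letter}. {option}"
    text += "\nAnswer:"
    if answer is not None:
        text += f" {answer}\n\n"
    return text

def build_prompt(subject, dev_examples, test_question, test_options, ntrain=5):
    """Prepend ntrain solved dev examples as context, then the unanswered test question."""
    prompt = f"The following are multiple choice questions (with answers) about {subject}.\n\n"
    for question, options, answer in dev_examples[:ntrain]:
        prompt += format_example(question, options, answer)
    prompt += format_example(test_question, test_options)
    return prompt
```

The model's continuation after the final `Answer:` is then compared against the correct letter when computing accuracy.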
## Detailed list of subjects/ categories tested

| Test Subject | Category |
|----------------------------------|-------------------|
| Abstract Algebra | Math |
| Anatomy | Health |
| Astronomy | Physics |
| Business Ethics | Business |
| Clinical Knowledge | Health |
| College Biology | Biology |
| College Chemistry | Chemistry |
| College Computer Science | Computer Science |
| College Mathematics | Math |
| College Medicine | Health |
| College Physics | Physics |
| Computer Security | Computer Science |
| Conceptual Physics | Physics |
| Econometrics | Economics |
| Electrical Engineering | Engineering |
| Elementary Mathematics | Math |
| Formal Logic | Philosophy |
| Global Facts | Other |
| High School Biology | Biology |
| High School Chemistry | Chemistry |
| High School Computer Science | Computer Science |
| High School European History | History |
| High School Geography | Geography |
| High School Government and Politics | Politics |
| High School Macroeconomics | Economics |
| High School Mathematics | Math |
| High School Microeconomics | Economics |
| High School Physics | Physics |
| High School Psychology | Psychology |
| High School Statistics | Math |
| High School US History | History |
| High School World History | History |
| Human Aging | Health |
| Human Sexuality | Culture |
| International Law | Law |
| Jurisprudence | Law |
| Logical Fallacies | Philosophy |
| Machine Learning | Computer Science |
| Management | Business |
| Marketing | Business |
| Medical Genetics | Health |
| Miscellaneous | Other |
| Moral Disputes | Philosophy |
| Moral Scenarios | Philosophy |
| Nutrition | Health |
| Philosophy | Philosophy |
| Prehistory | History |
| Professional Accounting | Other |
| Professional Law | Law |
| Professional Medicine | Health |
| Professional Psychology | Psychology |
| Public Relations | Politics |
| Security Studies | Politics |
| Sociology | Culture |
| US Foreign Policy | Politics |
| Virology | Health |
| World Religions | Philosophy |
@@ -0,0 +1,72 @@
# Perplexity Evaluation

## Overview

Perplexity is a measurement of how well a probability model predicts a sample. A lower perplexity indicates that the model is more confident in its predictions. In the context of language models, perplexity measures the likelihood of a sequence according to the model, given as:

`Perplexity (P) = exp(Average Negative Log-Likelihood)`

`where Average Negative Log-Likelihood = (1/N) * Sum[-log p(x_i) for i = 1 to N]`
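As a small worked example (illustrative numbers only): if a model assigns probabilities of 0.5, 0.25, and 0.125 to the three tokens of a sample, the average negative log-likelihood is `(0.693 + 1.386 + 2.079) / 3 ≈ 1.386`, so the perplexity is `exp(1.386) ≈ 4`.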
## Script Functionality

### Key Components

- **`max_length`**: The maximum input length the model can handle at once (set by the model's configuration).
- **`stride`**: The step size for the window, set to half of `max_length` to ensure some overlap and preserve context.
- **`seq_len`**: The total length of the tokenized input.

### Detailed Steps

1. **Load Model and Tokenizer**: Receives the model and tokenizer with the specified configurations.
2. **Load and Prepare Data**: Loads the "wikitext-2-raw-v1" dataset and concatenates its texts with double newlines. The data is then tokenized.
3. **Sliding Window Perplexity Calculation**: The script uses a sliding window approach (with a stride of half the window size) to calculate perplexity over subsets of the data, adjusting for the maximum input length of the model (see the sketch after these steps):
   - For each window, the input data is processed, and the corresponding labels are adjusted to mask out irrelevant parts (using `-100`).
   - The model computes the logits and loss for each window.
   - The predicted and actual words at the end of each window are logged for analysis.
4. **Logging to CSV**: Summarizes the context window, the predicted and actual next words, and the loss for each window into a CSV file for further analysis.
5. **Perplexity Calculation**: Accumulates the negative log-likelihood weighted by the effective token count of each window, averages over all tokens, and exponentiates the result to obtain the perplexity.
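The following is a minimal sketch of the sliding-window calculation described above, written against the Hugging Face `transformers` and `datasets` APIs. It is illustrative rather than the tool's actual implementation; in particular, the model name and the exact token-count bookkeeping are assumptions.

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model choice; the tool receives its model and tokenizer from an earlier stage.
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Load wikitext-2-raw-v1, join the texts with double newlines, and tokenize once.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

max_length = model.config.max_position_embeddings  # maximum input length the model accepts
stride = max_length // 2                           # half-window stride preserves context
seq_len = encodings.input_ids.size(1)              # total length of the tokenized input

nll_sum = 0.0
n_tokens = 0
prev_end = 0
for begin in range(0, seq_len, stride):
    end = min(begin + max_length, seq_len)
    trg_len = end - prev_end  # only the new (non-overlapping) tokens are scored
    input_ids = encodings.input_ids[:, begin:end]
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # mask the overlap so each token is scored once

    with torch.no_grad():
        loss = model(input_ids, labels=target_ids).loss  # mean NLL over the scored tokens

    nll_sum += loss.item() * trg_len  # re-weight by the number of scored tokens
    n_tokens += trg_len
    prev_end = end
    if end == seq_len:
        break

perplexity = torch.exp(torch.tensor(nll_sum / n_tokens))
print(f"Perplexity: {perplexity.item():.2f}")
```

The actual script additionally logs, for each window, the context, the predicted and actual next words, and the per-window loss to `summary_results.csv`, as described in the next section.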
### Example Outputs

The script outputs a CSV file named `summary_results.csv` with the following columns:

- **Context** (partial context displayed for brevity)
- **Predicted next word**
- **Actual next word**
- **Loss for this window**

These entries help in understanding how the model is performing at each step of the text.
## How to Interpret Perplexity Results

### Understanding Perplexity

**Definition:** Perplexity is defined as the exponential of the average negative log-likelihood of a model on a given test set.

**Lower Values are Better:** A lower perplexity score indicates that the model assigns a higher probability to the sample, suggesting better performance; the model is more certain about its predictions.

### Interpretation

**High Perplexity:** Indicates confusion or a high level of uncertainty in the model's predictions. High perplexity can suggest that the model's language understanding is poor or that the model is not well tuned for the given data.

**Low Perplexity:** Suggests that the model's predictions are more accurate and that it assigns higher probabilities to the actually observed outcomes. This is indicative of a model that has a good grasp of the language patterns seen in the test set.

### Practical Implications

**Model Comparison:** Perplexity is particularly useful for comparing different versions of the same model (e.g., before and after quantization, fine-tuning, or training on additional data). The model with the lower perplexity is generally considered better at modeling the language of the test corpus.

**Model Selection for Applications:** For applications involving language generation (such as machine translation, text summarization, or chatbots), selecting a model with lower perplexity is likely to yield more fluent, coherent, and contextually appropriate text output.

**Diagnosing Model Fit:** High perplexity can indicate underfitting, where the model is too simple to capture the complexity of the language data. It can also help diagnose whether the model is well suited to the specific domain of the text being modeled.

### Caveats in Interpretation

**Dependency on Test Set:** Perplexity is highly dependent on the test set used. A model can show very different perplexity scores on different datasets, so the nature and domain of the test set should be considered when evaluating perplexity.

**Not a Complete Measure:** While perplexity measures how uncertain a model is about its predictions, it does not directly measure how coherent or contextually appropriate generated text is. Other qualitative assessments and metrics may be necessary to fully evaluate a language model's output.

**Comparison Across Different Data:** Comparing perplexity scores across models trained or tested on different datasets can be misleading, because the intrinsic difficulty of the datasets affects the perplexity.
Oops, something went wrong.