Commit 8000dde: Fix streaming mode

jeremyfowers committed Jan 20, 2025
1 parent: 22fcc66

Showing 1 changed file with 39 additions and 27 deletions.

src/lemonade/tools/ort_genai/oga.py: 39 additions & 27 deletions
@@ -157,11 +157,18 @@ def generate(
        )
        params.try_graph_capture_with_max_batch_size(1)

+       if streamer:
+           # This LoC may cause trouble during the OGA <0.6 to 0.6 transition.
+           # However, streaming mode does not work without it.
+           # We are seeking help from Microsoft to understand what the
+           # correct API call would be to enable streaming mode.
+           params.input_ids = input_ids
+
        generator = og.Generator(self.model, params)
-       generator.append_tokens(input_ids)

        if streamer is None:
            prompt_start_time = time.perf_counter()
+           generator.append_tokens(input_ids)
            generator.generate_next_token()
            prompt_end_time = time.perf_counter()

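The hunk above appears to revolve around an API change in onnxruntime-genai: 0.6 replaced the GeneratorParams.input_ids attribute with Generator.append_tokens(), and, per the in-code comments, streaming only works here through the older attribute. A minimal sketch of a version-tolerant way to feed the prompt, assuming og is the onnxruntime_genai module and model, params, and input_ids are prepared as in the surrounding code; start_generator is a hypothetical helper, not part of this file:

    import onnxruntime_genai as og

    def start_generator(model, params, input_ids):
        # Hypothetical helper: feed the prompt through whichever API this
        # OGA version exposes (params.input_ids pre-0.6, append_tokens in 0.6+).
        if hasattr(og.Generator, "append_tokens"):
            generator = og.Generator(model, params)
            generator.append_tokens(input_ids)
        else:
            params.input_ids = input_ids
            generator = og.Generator(model, params)
        return generator
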
@@ -192,6 +199,11 @@ def generate(
            stop_early = False

            while not generator.is_done() and not stop_early:
+               # This LoC may cause trouble during the OGA <0.6 to 0.6 transition.
+               # However, streaming mode does not work without it.
+               # We are seeking help from Microsoft to understand what the
+               # correct API call would be to enable streaming mode.
+               generator.compute_logits()
                generator.generate_next_token()

                new_token = generator.get_next_tokens()[0]
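The same transition affects the decode loop: before 0.6, each step is compute_logits() followed by generate_next_token(), while 0.6 folds the logits computation into generate_next_token() itself. A hedged sketch of a streaming loop that tolerates both behaviors, assuming generator is built as above, tokenizer_stream is an og.TokenizerStream, and on_token is a hypothetical callback standing in for the streamer:

    def stream_tokens(generator, tokenizer_stream, on_token):
        # Step the generator one token at a time, decoding and emitting each
        # token; guard the compute_logits() call, which only pre-0.6 needs.
        while not generator.is_done():
            if hasattr(generator, "compute_logits"):
                generator.compute_logits()
            generator.generate_next_token()
            new_token = generator.get_next_tokens()[0]
            on_token(tokenizer_stream.decode(new_token))
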
@@ -208,32 +220,32 @@ def generate(

class OgaLoad(FirstTool):
    """
    Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers.

    Input: path to a checkpoint.

    Supported choices for cpu and igpu from HF model repository:
        LLM models on Huggingface supported by model_builder. See the documentation
        (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported
        models.
    Supported choices for npu from HF model repository:
        Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern.
    Local models for cpu, igpu, or npu:
        The specified checkpoint is converted to a local path, via mapping to lower case
        and replacing '/' with '_'. If this model already exists in the 'models' folder
        of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
        will be used. If the --force flag is used and the model is built with model_builder,
        then it will be rebuilt.

    Output:
        state.model: handle to a Huggingface-style LLM loaded on the DirectML device
        state.tokenizer: Huggingface-style LLM tokenizer instance
        state.dtype: data type of the model on the DirectML device
        state.checkpoint: name of the checkpoint used to load state.model

    Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
    If that library is not installed, this tool will not load.
"""

    unique_name = "oga-load"
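The checkpoint-to-cache mapping described in the docstring is mechanical enough to sketch. A minimal illustration, assuming the cache layout <cache>/models/<mapped-name>/<device>-<dtype> the docstring describes; local_model_dir is a hypothetical helper, not OgaLoad's actual implementation:

    import os

    def local_model_dir(cache_dir, checkpoint, device, dtype):
        # Map e.g. "amd/Llama-2-7b-hf-onnx-ryzen-strix" to
        # "<cache>/models/amd_llama-2-7b-hf-onnx-ryzen-strix/<device>-<dtype>"
        safe_name = checkpoint.lower().replace("/", "_")
        return os.path.join(cache_dir, "models", safe_name, f"{device}-{dtype}")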
