Commit 8000dde: Fix streaming mode

jeremyfowers committed Jan 20, 2025
1 parent: 22fcc66

Showing 1 changed file with 39 additions and 27 deletions.

src/lemonade/tools/ort_genai/oga.py: 39 additions & 27 deletions
@@ -157,11 +157,18 @@ def generate(
        )
        params.try_graph_capture_with_max_batch_size(1)

+       if streamer:
+           # This LoC may cause trouble during the OGA <0.6 to 0.6 transition.
+           # However, streaming mode does not work without it.
+           # We are seeking help from Microsoft to understand what the
+           # correct API call would be to enable streaming mode.
+           params.input_ids = input_ids
+
        generator = og.Generator(self.model, params)
-       generator.append_tokens(input_ids)

        if streamer is None:
            prompt_start_time = time.perf_counter()
+           generator.append_tokens(input_ids)
            generator.generate_next_token()
            prompt_end_time = time.perf_counter()

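The hunk above appears to revolve around an API change in onnxruntime-genai: 0.6 replaced the GeneratorParams.input_ids attribute with Generator.append_tokens(), and, per the in-code comments, streaming only works here through the older attribute. A minimal sketch of a version-tolerant way to feed the prompt, assuming og is the onnxruntime_genai module and model, params, and input_ids are prepared as in the surrounding code; start_generator is a hypothetical helper, not part of this file:

    import onnxruntime_genai as og

    def start_generator(model, params, input_ids):
        # Hypothetical helper: feed the prompt through whichever API this
        # OGA version exposes (params.input_ids pre-0.6, append_tokens in 0.6+).
        if hasattr(og.Generator, "append_tokens"):
            generator = og.Generator(model, params)
            generator.append_tokens(input_ids)
        else:
            params.input_ids = input_ids
            generator = og.Generator(model, params)
        return generator
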
@@ -192,6 +199,11 @@ def generate(
            stop_early = False

            while not generator.is_done() and not stop_early:
+               # This LoC may cause trouble during the OGA <0.6 to 0.6 transition.
+               # However, streaming mode does not work without it.
+               # We are seeking help from Microsoft to understand what the
+               # correct API call would be to enable streaming mode.
+               generator.compute_logits()
                generator.generate_next_token()

                new_token = generator.get_next_tokens()[0]
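The same transition affects the decode loop: before 0.6, each step is compute_logits() followed by generate_next_token(), while 0.6 folds the logits computation into generate_next_token() itself. A hedged sketch of a streaming loop that tolerates both behaviors, assuming generator is built as above, tokenizer_stream is an og.TokenizerStream, and on_token is a hypothetical callback standing in for the streamer:

    def stream_tokens(generator, tokenizer_stream, on_token):
        # Step the generator one token at a time, decoding and emitting each
        # token; guard the compute_logits() call, which only pre-0.6 needs.
        while not generator.is_done():
            if hasattr(generator, "compute_logits"):
                generator.compute_logits()
            generator.generate_next_token()
            new_token = generator.get_next_tokens()[0]
            on_token(tokenizer_stream.decode(new_token))
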
@@ -208,32 +220,32 @@ def generate(

class OgaLoad(FirstTool):
    """
    Tool that loads an LLM in OnnxRuntime-GenAI for use with CPU or DirectML execution providers.

    Input: path to a checkpoint.

    Supported choices for cpu and igpu from HF model repository:
        LLM models on Huggingface supported by model_builder. See the documentation
        (https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md) for supported
        models.
    Supported choices for npu from HF model repository:
        Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern.
    Local models for cpu, igpu, or npu:
        The specified checkpoint is converted to a local path, via mapping to lower case
        and replacing '/' with '_'. If this model already exists in the 'models' folder
        of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
        will be used. If the --force flag is used and the model is built with model_builder,
        then it will be rebuilt.

    Output:
        state.model: handle to a Huggingface-style LLM loaded on the DirectML device
        state.tokenizer: Huggingface-style LLM tokenizer instance
        state.dtype: data type of the model on the DirectML device
        state.checkpoint: name of the checkpoint used to load state.model

    Note: This tool expects the onnxruntime-genai-directml library to be pre-installed.
    If that library is not installed, this tool will not load.
"""

    unique_name = "oga-load"
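The checkpoint-to-cache mapping described in the docstring is mechanical enough to sketch. A minimal illustration, assuming the cache layout <cache>/models/<mapped-name>/<device>-<dtype> the docstring describes; local_model_dir is a hypothetical helper, not OgaLoad's actual implementation:

    import os

    def local_model_dir(cache_dir, checkpoint, device, dtype):
        # Map e.g. "amd/Llama-2-7b-hf-onnx-ryzen-strix" to
        # "<cache>/models/amd_llama-2-7b-hf-onnx-ryzen-strix/<device>-<dtype>"
        safe_name = checkpoint.lower().replace("/", "_")
        return os.path.join(cache_dir, "models", safe_name, f"{device}-{dtype}")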
