From 3ba21341854347e65319298d5e01fa9f1e90873c Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 15 Oct 2024 18:52:58 -0700 Subject: [PATCH 01/27] intermediate commit --- Package.swift | 20 +- .../LLMLocalContextParameters.swift | 218 ----------- .../Configuration/LLMLocalParameters.swift | 177 --------- .../LLMLocalPlatformConfiguration.swift | 39 +- .../LLMLocalSamplingParameters.swift | 357 ------------------ .../Helpers/LLMModel+numParameters.swift | 23 ++ .../ModelConfiguration+PromptFormat.swift | 20 + .../SpeziLLMLocal/Helpers/String+Cxx.swift | 30 -- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 32 +- .../LLMLocalSchema+PromptFormatting.swift | 275 -------------- Sources/SpeziLLMLocal/LLMLocalSchema.swift | 35 +- .../LLMLocalSession+Generate.swift | 92 +++++ .../LLMLocalSession+Generation.swift | 194 ---------- .../LLMLocalSession+Sampling.swift | 46 --- .../SpeziLLMLocal/LLMLocalSession+Setup.swift | 64 ++-- .../LLMLocalSession+Tokenization.swift | 81 ---- Sources/SpeziLLMLocal/LLMLocalSession.swift | 47 ++- .../Resources/Localizable.xcstrings | 1 + .../LLMLocalDownloadManager+DefaultUrls.swift | 92 ----- .../LLMLocalDownloadManager.swift | 79 ++-- .../LLMLocalDownloadManagerDelegate.swift | 86 ----- .../LLMLocalDownloadView.swift | 50 ++- .../LLMLocalLoadingManager.swift | 0 23 files changed, 342 insertions(+), 1716 deletions(-) delete mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift delete mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift delete mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift create mode 100644 Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift create mode 100644 Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift delete mode 100644 Sources/SpeziLLMLocal/Helpers/String+Cxx.swift delete mode 100644 Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift create mode 100644 Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift delete mode 100644 Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift delete mode 100644 Sources/SpeziLLMLocal/LLMLocalSession+Sampling.swift delete mode 100644 Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift delete mode 100644 Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift delete mode 100644 Sources/SpeziLLMLocalDownload/LLMLocalDownloadManagerDelegate.swift create mode 100644 Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift diff --git a/Package.swift b/Package.swift index b285f1bd..1f041b47 100644 --- a/Package.swift +++ b/Package.swift @@ -27,8 +27,10 @@ let package = Package( .library(name: "SpeziLLMFog", targets: ["SpeziLLMFog"]) ], dependencies: [ + .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.0"), + .package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.16.0"), + .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")), .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")), - .package(url: "https://github.com/StanfordBDHG/llama.cpp", .upToNextMinor(from: "0.3.3")), .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"), .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0-beta.3"), .package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"), @@ -49,19 +51,23 @@ let package = Package( name: "SpeziLLMLocal", dependencies: [ .target(name: "SpeziLLM"), - .product(name: "llama", package: 
"llama.cpp"), .product(name: "SpeziFoundation", package: "SpeziFoundation"), - .product(name: "Spezi", package: "Spezi") - ], - swiftSettings: [ - .interoperabilityMode(.Cxx) + .product(name: "Spezi", package: "Spezi"), + .product(name: "MLX", package: "mlx-swift"), + .product(name: "MLXFast", package: "mlx-swift"), + .product(name: "MLXNN", package: "mlx-swift"), + .product(name: "MLXOptimizers", package: "mlx-swift"), + .product(name: "MLXRandom", package: "mlx-swift"), + .product(name: "Transformers", package: "swift-transformers"), + .product(name: "LLM", package: "mlx-swift-examples") ] ), .target( name: "SpeziLLMLocalDownload", dependencies: [ .product(name: "SpeziOnboarding", package: "SpeziOnboarding"), - .product(name: "SpeziViews", package: "SpeziViews") + .product(name: "SpeziViews", package: "SpeziViews"), + .product(name: "LLM", package: "mlx-swift-examples") ] ), .target( diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift deleted file mode 100644 index e4ad4a92..00000000 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift +++ /dev/null @@ -1,218 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -@preconcurrency import llama - - -/// Represents the context parameters of the LLM. -/// -/// Internally, these data points are passed as a llama.cpp `llama_context_params` C struct to the LLM. -public struct LLMLocalContextParameters: Sendable { - // swiftlint:disable identifier_name - /// Swift representation of the `ggml_type` of llama.cpp, indicating data types within KV caches. 
- public enum GGMLType: UInt32 { - case f32 = 0 - case f16 - case q4_0 - case q4_1 - case q5_0 = 6 - case q5_1 - case q8_0 - case q8_1 - /// k-quantizations - case q2_k - case q3_k - case q4_k - case q5_k - case q6_k - case q8_k - case iq2_xxs - case iq2_xs - case i8 - case i16 - case i32 - } - // swiftlint:enable identifier_name - - - /// Wrapped C struct from the llama.cpp library, later-on passed to the LLM - private var wrapped: llama_context_params - - - /// Context parameters in llama.cpp's low-level C representation - var llamaCppRepresentation: llama_context_params { - wrapped - } - - /// RNG seed of the LLM - var seed: UInt32 { - get { - wrapped.seed - } - set { - wrapped.seed = newValue - } - } - - /// Context window size in tokens (0 = take default window size from model) - var contextWindowSize: UInt32 { - get { - wrapped.n_ctx - } - set { - wrapped.n_ctx = newValue - } - } - - /// Maximum batch size during prompt processing - var batchSize: UInt32 { - get { - wrapped.n_batch - } - set { - wrapped.n_batch = newValue - } - } - - /// Number of threads used by LLM for generation of output - var threadCount: UInt32 { - get { - wrapped.n_threads - } - set { - wrapped.n_threads = newValue - } - } - - /// Number of threads used by LLM for batch processing - var threadCountBatch: UInt32 { - get { - wrapped.n_threads_batch - } - set { - wrapped.n_threads_batch = newValue - } - } - - /// RoPE base frequency (0 = take default from model) - var ropeFreqBase: Float { - get { - wrapped.rope_freq_base - } - set { - wrapped.rope_freq_base = newValue - } - } - - /// RoPE frequency scaling factor (0 = take default from model) - var ropeFreqScale: Float { - get { - wrapped.rope_freq_scale - } - set { - wrapped.rope_freq_scale = newValue - } - } - - /// If `true`, offload the KQV ops (including the KV cache) to GPU - var offloadKQV: Bool { - get { - wrapped.offload_kqv - } - set { - wrapped.offload_kqv = newValue - } - } - - /// ``GGMLType`` of the key of the KV cache - var kvKeyType: GGMLType { - get { - GGMLType(rawValue: wrapped.type_k.rawValue) ?? .f16 - } - set { - wrapped.type_k = ggml_type(rawValue: newValue.rawValue) - } - } - - /// ``GGMLType`` of the value of the KV cache - var kvValueType: GGMLType { - get { - GGMLType(rawValue: wrapped.type_v.rawValue) ?? .f16 - } - set { - wrapped.type_v = ggml_type(rawValue: newValue.rawValue) - } - } - - /// If `true`, the (deprecated) `llama_eval()` call computes all logits, not just the last one - var computeAllLogits: Bool { - get { - wrapped.logits_all - } - set { - wrapped.logits_all = newValue - } - } - - /// If `true`, the mode is set to embeddings only - var embeddingsOnly: Bool { - get { - wrapped.embeddings - } - set { - wrapped.embeddings = newValue - } - } - - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. - /// - /// - Parameters: - /// - seed: RNG seed of the LLM, defaults to `4294967295` (which represents a random seed). - /// - contextWindowSize: Context window size in tokens, defaults to `1024`. - /// - batchSize: Maximum batch size during prompt processing, defaults to `1024` tokens. - /// - threadCount: Number of threads used by LLM for generation of output, defaults to the processor count of the device. - /// - threadCountBatch: Number of threads used by LLM for batch processing, defaults to the processor count of the device. 
- /// - ropeFreqBase: RoPE base frequency, defaults to `0` indicating the default from model. - /// - ropeFreqScale: RoPE frequency scaling factor, defaults to `0` indicating the default from model. - /// - offloadKQV: Offloads the KQV ops (including the KV cache) to GPU, defaults to `true`. - /// - kvKeyType: ``GGMLType`` of the key of the KV cache, defaults to ``GGMLType/f16``. - /// - kvValueType: ``GGMLType`` of the value of the KV cache, defaults to ``GGMLType/f16``. - /// - computeAllLogits: `llama_eval()` call computes all logits, not just the last one. Defaults to `false`. - /// - embeddingsOnly: Embedding-only mode, defaults to `false`. - public init( - seed: UInt32 = 4294967295, - contextWindowSize: UInt32 = 1024, - batchSize: UInt32 = 1024, - threadCount: UInt32 = .init(ProcessInfo.processInfo.processorCount), - threadCountBatch: UInt32 = .init(ProcessInfo.processInfo.processorCount), - ropeFreqBase: Float = 0.0, - ropeFreqScale: Float = 0.0, - offloadKQV: Bool = true, - kvKeyType: GGMLType = .f16, - kvValueType: GGMLType = .f16, - computeAllLogits: Bool = false, - embeddingsOnly: Bool = false - ) { - self.wrapped = llama_context_default_params() - - self.seed = seed - self.contextWindowSize = contextWindowSize - self.batchSize = batchSize - self.threadCount = threadCount - self.threadCountBatch = threadCountBatch - self.ropeFreqBase = ropeFreqBase - self.ropeFreqScale = ropeFreqScale - self.offloadKQV = offloadKQV - self.kvKeyType = kvKeyType - self.kvValueType = kvValueType - self.computeAllLogits = computeAllLogits - self.embeddingsOnly = embeddingsOnly - } -} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift deleted file mode 100644 index 2d5e8e5e..00000000 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ /dev/null @@ -1,177 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -@preconcurrency import llama - - -/// Represents the parameters of the LLM. -/// -/// Internally, these data points are passed as a llama.cpp `llama_model_params` C struct to the LLM. -public struct LLMLocalParameters: Sendable { - /// Typealias for an internal llama.cpp progress callback function - public typealias LlamaProgressCallback = (@convention(c) (Float, UnsafeMutableRawPointer?) -> Bool) - - - /// Defaults of possible LLMs parameter settings. - public enum Defaults { - /// Default system prompt for local LLMs. - public static let defaultSystemPrompt: String = { - String(localized: LocalizedStringResource("SPEZI_LLM_LOCAL_SYSTEM_PROMPT", bundle: .atURL(from: .module))) - }() - } - - - /// The to-be-used system prompt of the LLM - let systemPrompt: String? - /// Indicates the maximum output length generated by the LLM. - let maxOutputLength: Int - /// Indicates whether the BOS token is added by the LLM. If `nil`, the default from the model itself is taken. - let addBosToken: Bool - - - /// Wrapped C struct from the llama.cpp library, later-on passed to the LLM - private var wrapped: llama_model_params - - - /// Model parameters in llama.cpp's low-level C representation - var llamaCppRepresentation: llama_model_params { - wrapped - } - - /// Number of layers to store in VRAM - /// - Note: On iOS simulators, this property has to be set to 0 (which is automatically done by the library). 
- var gpuLayerCount: Int32 { - get { - wrapped.n_gpu_layers - } - set { - wrapped.n_gpu_layers = newValue - } - } - - /// Indicates the GPU that is used for scratch and small tensors. - var mainGpu: Int32 { - get { - wrapped.main_gpu - } - set { - wrapped.main_gpu = newValue - } - } - - /// Indicates how to split layers across multiple GPUs. - var tensorSplit: UnsafePointer? { - get { - wrapped.tensor_split - } - set { - wrapped.tensor_split = newValue - } - } - - /// Progress callback called with a progress value between 0 and 1 - var progressCallback: LlamaProgressCallback? { - get { - wrapped.progress_callback - } - set { - wrapped.progress_callback = newValue - } - } - - /// Context pointer that is passed to the progress callback - var progressCallbackUserData: UnsafeMutableRawPointer? { - get { - wrapped.progress_callback_user_data - } - set { - wrapped.progress_callback_user_data = newValue - } - } - - /// Indicates wether booleans should be kept together to avoid misalignment during copy-by-value. - var vocabOnly: Bool { - get { - wrapped.vocab_only - } - set { - wrapped.vocab_only = newValue - } - } - - /// Indicates if mmap should be used. - var useMmap: Bool { - get { - wrapped.use_mmap - } - set { - wrapped.use_mmap = newValue - } - } - - /// Forces the system to keep model in RAM. - var useMlock: Bool { - get { - wrapped.use_mlock - } - set { - wrapped.use_mlock = newValue - } - } - - - /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the LLM. - /// - /// - Parameters: - /// - systemPrompt: The to-be-used system prompt of the LLM enabling fine-tuning of the LLMs behaviour. Defaults to the regular default chat-based LLM system prompt. - /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. - /// - addBosToken: Indicates wether the BOS token is added by the Spezi LLM, defaults to `false`. - /// - gpuLayerCount: Number of layers to store in VRAM, defaults to `1`, meaning Apple's `Metal` framework is enabled. - /// - mainGpu: GPU that is used for scratch and small tensors, defaults to `0` representing the main GPU. - /// - tensorSplit: Split layers across multiple GPUs, defaults to `nil`, meaning no split. - /// - progressCallback: Progress callback called with a progress value between 0 and 1, defaults to `nil`. - /// - progressCallbackUserData: Context pointer that is passed to the progress callback, defaults to `nil`. - /// - vocabOnly: Indicates wether booleans should be kept together to avoid misalignment during copy-by-value., defaults to `false`. - /// - useMmap: Indicates if mmap should be used., defaults to `true`. - /// - useMlock: Forces the system to keep model in RAM, defaults to `false`. - public init( - systemPrompt: String? = Defaults.defaultSystemPrompt, - maxOutputLength: Int = 512, - addBosToken: Bool = false, - gpuLayerCount: Int32 = 1, - mainGpu: Int32 = 0, - tensorSplit: UnsafePointer? = nil, - progressCallback: LlamaProgressCallback? = nil, - progressCallbackUserData: UnsafeMutableRawPointer? 
= nil, - vocabOnly: Bool = false, - useMmap: Bool = true, - useMlock: Bool = false - ) { - self.wrapped = llama_model_default_params() - - self.systemPrompt = systemPrompt - self.maxOutputLength = maxOutputLength - self.addBosToken = addBosToken - - /// Overwrite `gpuLayerCount` in case of a simulator target environment - #if targetEnvironment(simulator) - self.gpuLayerCount = 0 // Disable Metal on simulator as crash otherwise - #else - self.gpuLayerCount = gpuLayerCount - #endif - self.mainGpu = mainGpu - self.tensorSplit = tensorSplit - self.progressCallback = progressCallback - self.progressCallbackUserData = progressCallbackUserData - self.vocabOnly = vocabOnly - self.useMmap = useMmap - self.useMlock = useMlock - } -} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift index 13d708dc..bd8f1c5c 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift @@ -7,43 +7,42 @@ // import Foundation -import llama - /// Represents the configuration of the Spezi ``LLMLocalPlatform``. public struct LLMLocalPlatformConfiguration: Sendable { - /// Wrapper around the `ggml_numa_strategy` type of llama.cpp, indicating the non-unified memory access configuration of the device. - public enum NonUniformMemoryAccess: UInt32, Sendable { - case disabled - case distributed - case isolated - case numaCtl - case mirror - case count - + public struct MemoryLimit: Sendable { + /// The memory limit in MB + let limit: Int - var wrappedValue: ggml_numa_strategy { - .init(rawValue: self.rawValue) - } + /// Calls to malloc will wait on scheduled tasks if the limit is exceeded. If + /// there are no more scheduled tasks an error will be raised if `relaxed` + /// is false or memory will be allocated (including the potential for + /// swap) if `relaxed` is true. + /// + /// The memory limit defaults to 1.5 times the maximum recommended working set + /// size reported by the device ([recommendedMaxWorkingSetSize](https://developer.apple.com/documentation/metal/mtldevice/2369280-recommendedmaxworkingsetsize)) + let relaxed: Bool } + /// The cache limit in MB, to disable set limit to 0 + let cacheLimit: Int + let memoryLimit: MemoryLimit? /// The task priority of the initiated LLM inference tasks. let taskPriority: TaskPriority - /// Indicates the non-unified memory access configuration of the device. - let nonUniformMemoryAccess: NonUniformMemoryAccess /// Creates the ``LLMLocalPlatformConfiguration`` which configures the Spezi ``LLMLocalPlatform``. /// /// - Parameters: /// - taskPriority: The task priority of the initiated LLM inference tasks, defaults to `.userInitiated`. - /// - nonUniformMemoryAccess: Indicates if this is a device with non-unified memory access. public init( - taskPriority: TaskPriority = .userInitiated, - nonUniformMemoryAccess: NonUniformMemoryAccess = .disabled + cacheLimit: Int = 20, + memoryLimit: MemoryLimit? 
= nil, + taskPriority: TaskPriority = .userInitiated ) { + self.cacheLimit = cacheLimit + self.memoryLimit = memoryLimit self.taskPriority = taskPriority - self.nonUniformMemoryAccess = nonUniformMemoryAccess } } diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift deleted file mode 100644 index bd1f941d..00000000 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift +++ /dev/null @@ -1,357 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -import llama - - -/// Represents the sampling parameters of the LLM. -/// -/// Internally, these data points are passed as a llama.cpp `llama_sampling_params` C struct to the LLM. -public struct LLMLocalSamplingParameters: Sendable { // swiftlint:disable:this type_body_length - /// Helper enum for the Mirostat sampling method - public enum Mirostat { - init(rawValue: Int, targetEntropy: Float = 5.0, learningRate: Float = 0.1) { - switch rawValue { - case 0: - self = .disabled - case 1: - self = .v1(targetEntropy: targetEntropy, learningRate: learningRate) - case 2: - self = .v2(targetEntropy: targetEntropy, learningRate: learningRate) - default: - self = .disabled - } - } - - - case disabled - case v1(targetEntropy: Float, learningRate: Float) // swiftlint:disable:this identifier_name - case v2(targetEntropy: Float, learningRate: Float) // swiftlint:disable:this identifier_name - - - var rawValue: Int { - switch self { - case .disabled: - return 0 - case .v1: - return 1 - case .v2: - return 2 - } - } - } - - public struct ClassifierFreeGuidance { - let negativePrompt: String? - let scale: Float - - - public init(negativePrompt: String? = nil, scale: Float = 1.0) { - self.negativePrompt = negativePrompt - self.scale = scale - } - } - - - /// Wrapped C struct from the llama.cpp library, later-on passed to the LLM. - private var wrapped: llama_sampling_params - - - /// Sampling parameters in llama.cpp's low-level C representation. - var llamaCppRepresentation: llama_sampling_params { - wrapped - } - - var llamaCppSamplingContext: UnsafeMutablePointer? { - llama_sampling_init(wrapped) - } - - /// Number of previous tokens to remember. - var rememberTokens: Int32 { - get { - wrapped.n_prev - } - set { - wrapped.n_prev = newValue - } - } - - /// If greater than 0, output the probabilities of top n\_probs tokens. - var outputProbabilities: Int32 { - get { - wrapped.n_probs - } - set { - wrapped.n_probs = newValue - } - } - - /// Top-K Sampling: K most likely next words (<= 0 to use vocab size). - var topK: Int32 { - get { - wrapped.top_k - } - set { - wrapped.top_k = newValue - } - } - - /// Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). - var topP: Float { - get { - wrapped.top_p - } - set { - wrapped.top_p = newValue - } - } - - /// Min-p Sampling (0.0 = disabled). - var minP: Float { - get { - wrapped.min_p - } - set { - wrapped.min_p = newValue - } - } - - /// Tail Free Sampling (1.0 = disabled). - var tfs: Float { - get { - wrapped.tfs_z - } - set { - wrapped.tfs_z = newValue - } - } - - /// Locally Typical Sampling. 
- var typicalP: Float { - get { - wrapped.typical_p - } - set { - wrapped.typical_p = newValue - } - } - - /// Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. - var temperature: Float { - get { - wrapped.temp - } - set { - wrapped.temp = newValue - } - } - - /// Last n tokens to penalize (0 = disable penalty, -1 = context size). - var penaltyLastTokens: Int32 { - get { - wrapped.penalty_last_n - } - set { - wrapped.penalty_last_n = newValue - } - } - - /// Penalize repeated tokens (1.0 = disabled). - var penaltyRepeat: Float { - get { - wrapped.penalty_repeat - } - set { - wrapped.penalty_repeat = newValue - } - } - - /// Penalize frequency (0.0 = disabled). - var penaltyFrequency: Float { - get { - wrapped.penalty_repeat - } - set { - wrapped.penalty_repeat = newValue - } - } - - /// Presence penalty (0.0 = disabled). - var penaltyPresence: Float { - get { - wrapped.penalty_present - } - set { - wrapped.penalty_present = newValue - } - } - - /// Penalize new lines. - var penalizeNewLines: Bool { - get { - wrapped.penalize_nl - } - set { - wrapped.penalize_nl = newValue - } - } - - /// Mirostat sampling. - var mirostat: Mirostat { - get { - .init( - rawValue: Int(wrapped.mirostat), - targetEntropy: wrapped.mirostat_tau, - learningRate: wrapped.mirostat_eta - ) - } - set { - wrapped.mirostat = Int32(newValue.rawValue) - - if case .v1(let targetEntropy, let learningRate) = mirostat { - wrapped.mirostat_tau = targetEntropy - wrapped.mirostat_eta = learningRate - } else if case .v2(let targetEntropy, let learningRate) = mirostat { - wrapped.mirostat_tau = targetEntropy - wrapped.mirostat_eta = learningRate - } else { - wrapped.mirostat_tau = 5.0 - wrapped.mirostat_eta = 0.1 - } - } - } - - // C++ vector doesn't conform to Swift sequence on VisionOS SDK (Swift C++ Interop bug), - // therefore requiring workaround for VisionSDK - #if !os(visionOS) - /// Classifier-Free Guidance. - var cfg: ClassifierFreeGuidance { - get { - .init( - negativePrompt: String(wrapped.cfg_negative_prompt), - scale: wrapped.cfg_scale - ) - } - set { - if let negativePrompt = newValue.negativePrompt { - wrapped.cfg_negative_prompt = std.string(negativePrompt) - } - wrapped.cfg_scale = newValue.scale - } - } - - - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. - /// - /// - Parameters: - /// - rememberTokens: Number of previous tokens to remember. - /// - outputProbabilities: If greater than 0, output the probabilities of top n\_probs tokens. - /// - topK: Top-K Sampling: K most likely next words (<= 0 to use vocab size). - /// - topP: Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). - /// - minP: Min-p Sampling (0.0 = disabled). - /// - tfs: Tail Free Sampling (1.0 = disabled). - /// - typicalP: Locally Typical Sampling. - /// - temperature: Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. - /// - penaltyLastTokens: Last n tokens to penalize (0 = disable penalty, -1 = context size). - /// - penaltyRepeat: Penalize repeated tokens (1.0 = disabled). - /// - penaltyFrequency: Penalize frequency (0.0 = disabled). - /// - penaltyPresence: Presence penalty (0.0 = disabled). - /// - penalizeNewLines: Penalize new lines. - /// - mirostat: Mirostat sampling. 
- /// - cfg: Classifier-Free Guidance. - public init( - rememberTokens: Int32 = 256, - outputProbabilities: Int32 = 0, - topK: Int32 = 40, - topP: Float = 0.95, - minP: Float = 0.05, - tfs: Float = 1.0, - typicalP: Float = 1.0, - temperature: Float = 0.8, - penaltyLastTokens: Int32 = 64, - penaltyRepeat: Float = 1.1, - penaltyFrequency: Float = 0.0, - penaltyPresence: Float = 0.0, - penalizeNewLines: Bool = true, - mirostat: Mirostat = .disabled, - cfg: ClassifierFreeGuidance = .init() - ) { - self.wrapped = llama_sampling_params() - - self.rememberTokens = rememberTokens - self.outputProbabilities = outputProbabilities - self.topK = topK - self.topP = topP - self.minP = minP - self.tfs = tfs - self.typicalP = typicalP - self.temperature = temperature - self.penaltyLastTokens = penaltyLastTokens - self.penaltyRepeat = penaltyRepeat - self.penaltyFrequency = penaltyFrequency - self.penaltyPresence = penaltyPresence - self.penalizeNewLines = penalizeNewLines - self.mirostat = mirostat - self.cfg = cfg - } - #else - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. - /// - /// - Parameters: - /// - rememberTokens: Number of previous tokens to remember. - /// - outputProbabilities: If greater than 0, output the probabilities of top n\_probs tokens. - /// - topK: Top-K Sampling: K most likely next words (<= 0 to use vocab size). - /// - topP: Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). - /// - minP: Min-p Sampling (0.0 = disabled). - /// - tfs: Tail Free Sampling (1.0 = disabled). - /// - typicalP: Locally Typical Sampling. - /// - temperature: Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. - /// - penaltyLastTokens: Last n tokens to penalize (0 = disable penalty, -1 = context size). - /// - penaltyRepeat: Penalize repeated tokens (1.0 = disabled). - /// - penaltyFrequency: Penalize frequency (0.0 = disabled). - /// - penaltyPresence: Presence penalty (0.0 = disabled). - /// - penalizeNewLines: Penalize new lines. - /// - mirostat: Mirostat sampling. 
- public init( - rememberTokens: Int32 = 256, - outputProbabilities: Int32 = 0, - topK: Int32 = 40, - topP: Float = 0.95, - minP: Float = 0.05, - tfs: Float = 1.0, - typicalP: Float = 1.0, - temperature: Float = 0.8, - penaltyLastTokens: Int32 = 64, - penaltyRepeat: Float = 1.1, - penaltyFrequency: Float = 0.0, - penaltyPresence: Float = 0.0, - penalizeNewLines: Bool = true, - mirostat: Mirostat = .disabled - ) { - self.wrapped = llama_sampling_params() - - self.rememberTokens = rememberTokens - self.outputProbabilities = outputProbabilities - self.topK = topK - self.topP = topP - self.minP = minP - self.tfs = tfs - self.typicalP = typicalP - self.temperature = temperature - self.penaltyLastTokens = penaltyLastTokens - self.penaltyRepeat = penaltyRepeat - self.penaltyFrequency = penaltyFrequency - self.penaltyPresence = penaltyPresence - self.penalizeNewLines = penalizeNewLines - self.mirostat = mirostat - } - #endif -} diff --git a/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift new file mode 100644 index 00000000..bcf4f678 --- /dev/null +++ b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift @@ -0,0 +1,23 @@ +// +// LLMModel+numParameters.swift +// SpeziLLM +// +// Created by Leon Nissen on 10/14/24. +// + +import MLXNN + +extension Module { + /// Compute the number of parameters in a possibly quantized model + public func numParameters() -> Int { + leafModules().flattenedValues().map { mod -> Int in + if let quantized = mod as? QuantizedLinear { + return quantized.scales.size * quantized.groupSize + } else if let quantized = mod as? QuantizedEmbedding { + return quantized.scales.size * quantized.groupSize + } else { + return mod.parameters().flattenedValues().reduce(0) { $0 + $1.size } + } + }.reduce(0, +) + } +} diff --git a/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift b/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift new file mode 100644 index 00000000..c3078782 --- /dev/null +++ b/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift @@ -0,0 +1,20 @@ +// +// ModelConfiguration+PromptFormat.swift +// SpeziLLM +// +// Created by Leon Nissen on 10/15/24. +// + +import MLXLLM + + +extension ModelConfiguration { + var foo: String { + switch self.name { + case ModelConfiguration.codeLlama13b4bit.name: + return "" + default: + return "" + } + } +} diff --git a/Sources/SpeziLLMLocal/Helpers/String+Cxx.swift b/Sources/SpeziLLMLocal/Helpers/String+Cxx.swift deleted file mode 100644 index 43679865..00000000 --- a/Sources/SpeziLLMLocal/Helpers/String+Cxx.swift +++ /dev/null @@ -1,30 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation - - -extension String { - /// Initializes a Swift `String` from a C++ `string`. - /// - /// - Parameters: - /// - cxxString: The given C++ `string` - /// - /// In the Release build mode, the Swift compiler is unable to choose the correct String initializer from the Swift stdlib. 
- /// Therefore, manual `String `extension by SpeziLLM that mirrors the C++ interop implementation within the Swift stdlib: https://github.com/apple/swift/blob/cf2a338afca54a787d59b83db6238b1568215b94/stdlib/public/Cxx/std/String.swift#L231-L239 - init(_ cxxString: std.string) { - let buffer = UnsafeBufferPointer( - start: cxxString.__c_strUnsafe(), - count: cxxString.size() - ) - self = buffer.withMemoryRebound(to: UInt8.self) { - String(decoding: $0, as: UTF8.self) - } - withExtendedLifetime(cxxString) {} - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index e6adc770..1d29c714 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -7,10 +7,10 @@ // import Foundation -import llama import Spezi import SpeziFoundation import SpeziLLM +import MLX /// LLM execution platform of an ``LLMLocalSchema``. @@ -39,13 +39,11 @@ import SpeziLLM /// } /// ``` public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { - /// Enforce only one concurrent execution of a local LLM. - private let semaphore = AsyncSemaphore(value: 1) + let configuration: LLMLocalPlatformConfiguration @MainActor public var state: LLMPlatformState = .idle - /// Creates an instance of the ``LLMLocalPlatform``. /// /// - Parameters: @@ -59,34 +57,18 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { self.init(configuration: .init()) } - public nonisolated func configure() { - // Initialize the llama.cpp backend - llama_backend_init() - llama_numa_init(configuration.nonUniformMemoryAccess.wrappedValue) + MLX.GPU.set(cacheLimit: configuration.cacheLimit * 1024 * 1024) + if let memoryLimit = configuration.memoryLimit { + MLX.GPU.set(memoryLimit: memoryLimit.limit, relaxed: memoryLimit.relaxed) + } } public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { LLMLocalSession(self, schema: llmSchema) } - nonisolated func exclusiveAccess() async throws { - try await semaphore.waitCheckingCancellation() - await MainActor.run { - state = .processing - } - } - - nonisolated func signal() async { - semaphore.signal() - await MainActor.run { - state = .idle - } - } - - deinit { - // Frees the llama.cpp backend - llama_backend_free() + MLX.GPU.clearCache() } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift b/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift deleted file mode 100644 index 0859cbe6..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift +++ /dev/null @@ -1,275 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import SpeziLLM - - -extension LLMLocalSchema { - /// Holds default prompt formatting strategies for [Llama2](https://ai.meta.com/llama/) as well as [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) models. - public enum PromptFormattingDefaults { - /// Prompt formatting closure for the [Llama3](https://ai.meta.com/llama/) model - public static let llama3: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BEGINOFTEXT = "<|begin_of_text|>" - /// The system identifier. - let SYSTEM = "system" - /// The user identifier. 
- let USER = "user" - /// The assistant identifier. - let ASSISTANT = "assistant" - /// The start token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let STARTHEADERID = "<|start_header_id|>" - /// The end token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let ENDHEADERID = "<|end_header_id|>" - /// The token that signifies the end of the message in a turn. - let EOTID = "<|eot_id|>" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama3 prompt structure - /// - /// Template of the prompt structure: - /// <|begin_of_text|> - /// <|start_header_id|>user<|end_header_id|> - /// {{ user_message }}<|eot_id|> - /// <|start_header_id|>assistant<|end_header_id|> - var prompt = """ - \(BEGINOFTEXT) - \(STARTHEADERID)\(SYSTEM)\(ENDHEADERID) - \(systemPrompts.joined(separator: " "))\(EOTID) - - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(initialUserPrompt)\(EOTID) - - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ + " " // Add a spacer to the generated output from the model - } - } - - prompt += - """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - """ - - return prompt - } - - /// Prompt formatting closure for the [Llama2](https://ai.meta.com/llama/) model - public static let llama2: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BOS = "" - /// EOS token of the LLM, used at the end of each prompt passage. - let EOS = "" - /// BOSYS token of the LLM, used at the start of the system prompt. - let BOSYS = "<>" - /// EOSYS token of the LLM, used at the end of the system prompt. - let EOSYS = "<>" - /// BOINST token of the LLM, used at the start of the instruction part of the prompt. - let BOINST = "[INST]" - /// EOINST token of the LLM, used at the end of the instruction part of the prompt. 
- let EOINST = "[/INST]" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// [INST] <> - /// {your_system_prompt} - /// <> - /// - /// {user_message_1} [/INST] - /// """ - var prompt = """ - \(BOS)\(BOINST) \(BOSYS) - \(systemPrompts.joined(separator: " ")) - \(EOSYS) - - \(initialUserPrompt) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// {user_message_1} [/INST]){model_reply_1} - prompt += """ - \(contextEntity.content)\(EOS) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// [INST] {user_message_2} [/INST] - prompt += """ - \(BOS)\(BOINST) \(contextEntity.content) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - } - } - - return prompt - } - - /// Prompt formatting closure for the [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) model - public static let phi2: (@Sendable (LLMContext) throws -> String) = { chat in - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Phi-2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// System: {your_system_prompt} - /// Instruct: {model_reply_1} - /// Output: {model_reply_1} - /// """ - var prompt = """ - System: \(systemPrompts.joined(separator: " ")) - Instruct: \(initialUserPrompt)\n - """ - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Output: \(contextEntity.content)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Instruct: \(contextEntity.content)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "Output: " - } - - return prompt - } - - /// Prompt formatting closure for the [Gemma](https://ai.google.dev/gemma/docs/formatting) models - /// - Important: System prompts are ignored as Gemma doesn't support them - public static let gemma: (@Sendable (LLMContext) throws -> String) = { chat in - /// Start token of Gemma - let startToken = "" - /// End token of Gemma - let endToken = "" - - /// Build the initial Gemma prompt structure - /// - /// A template of the 
prompt structure looks like: - /// """ - /// user - /// knock knock - /// model - /// who is there - /// user - /// Gemma - /// model - /// Gemma who? - /// """ - var prompt = "" - - for contextEntity in chat { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)model - \(contextEntity.content)\(endToken)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)user - \(contextEntity.content)\(endToken)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "\(startToken)model\n" - } - - return prompt - } - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 40204d23..bc2f64a8 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -9,6 +9,7 @@ import Foundation import SpeziChat import SpeziLLM +@preconcurrency import MLXLLM /// Defines the type and configuration of the ``LLMLocalSession``. @@ -20,17 +21,17 @@ import SpeziLLM public struct LLMLocalSchema: LLMSchema { public typealias Platform = LLMLocalPlatform + let generateParameters: GenerateParameters - /// The on-device `URL` where the model is located. - let modelPath: URL - /// Parameters of the llama.cpp LLM. - let parameters: LLMLocalParameters - /// Context parameters of the llama.cpp LLM. - let contextParameters: LLMLocalContextParameters - /// Sampling parameters of the llama.cpp LLM. - let samplingParameters: LLMLocalSamplingParameters + let maxTokens: Int + + let displayEveryNTokens: Int + + let configuration: ModelConfiguration /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let formatChat: (@Sendable (LLMContext) throws -> String) + + public let injectIntoContext: Bool @@ -44,17 +45,17 @@ public struct LLMLocalSchema: LLMSchema { /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. /// - formatChat: Closure to properly format the ``LLMLocalSession/context`` to a `String` which is tokenized and passed to the LLM, defaults to Llama2 prompt format. 
public init( - modelPath: URL, - parameters: LLMLocalParameters = .init(), - contextParameters: LLMLocalContextParameters = .init(), - samplingParameters: LLMLocalSamplingParameters = .init(), + configuration: ModelConfiguration, + generateParameters: GenerateParameters = GenerateParameters(temperature: 0.6), + maxTokens: Int = 2048, + displayEveryNTokens: Int = 4, injectIntoContext: Bool = false, - formatChat: @escaping (@Sendable (LLMContext) throws -> String) = PromptFormattingDefaults.llama2 + formatChat: @escaping (@Sendable (LLMContext) throws -> String) ) { - self.modelPath = modelPath - self.parameters = parameters - self.contextParameters = contextParameters - self.samplingParameters = samplingParameters + self.generateParameters = generateParameters + self.maxTokens = maxTokens + self.displayEveryNTokens = displayEveryNTokens + self.configuration = configuration self.injectIntoContext = injectIntoContext self.formatChat = formatChat } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift new file mode 100644 index 00000000..ea1138b2 --- /dev/null +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -0,0 +1,92 @@ +// +// LLMLocalSession+Generate.swift +// SpeziLLM +// +// Created by Leon Nissen on 10/15/24. +// +import Foundation +import os +import SpeziChat +import SpeziLLM +import MLXLLM +import MLX +import MLXRandom + +extension LLMLocalSession { + func _generate(continuation: AsyncThrowingStream.Continuation) async { + guard let modelContainer = await self.modelContainer else { + Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`") + await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation) + return + } + + let modelConfiguration = self.schema.configuration + + guard let formattedChat = try? await schema.formatChat(self.context) else { + Self.logger.error("SpeziLLMLocal: Failed to format chat with given context") + await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation) + return + } + + let prompt = modelConfiguration.prepare(prompt: formattedChat) + let promptTokens = await modelContainer.perform { _, tokenizer in + tokenizer.encode(text: prompt) + } + + // each time you generate you will get something new + MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000)) + + let extraEOSTokens = modelConfiguration.extraEOSTokens + + guard await !checkCancellation(on: continuation) else { + return + } + + let (result, tokenizer) = await modelContainer.perform { model, tokenizer in + // Execute the inference + let result = MLXLLM.generate( + promptTokens: promptTokens, + parameters: self.schema.generateParameters, + model: model, + tokenizer: tokenizer, + extraEOSTokens: extraEOSTokens + ) { tokens in + if Task.isCancelled { + return .stop + } + + if tokens.count >= self.schema.maxTokens { + continuation.finish() + Task { @MainActor in + self.state = .ready + } + return .stop + } + + if schema.injectIntoContext && tokens.count % schema.displayEveryNTokens == 0 { + let lastTokens = Array(tokens.suffix(schema.displayEveryNTokens)) + let text = " " + tokenizer.decode(tokens: lastTokens) + continuation.yield(text) + } + + return .more + } + + return (result, tokenizer) + } + + await MainActor.run { + if schema.injectIntoContext { + // Yielding every Nth token may result in missing the final tokens. 
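+                // Illustration: with `displayEveryNTokens == 4`, a 10-token response streams
+                // eight tokens through the callback above; the remaining two tokens are
+                // decoded and yielded here before the stream completes.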
+ let reaminingTokens = result.tokens.count % schema.displayEveryNTokens + let lastTokens = Array(result.tokens.suffix(reaminingTokens)) + let text = " " + tokenizer.decode(tokens: lastTokens) + continuation.yield(text) + context.completeAssistantStreaming() + } else { + context.append(assistantOutput: result.output, complete: true) + } + state = .ready + } + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift deleted file mode 100644 index cbdb0ade..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift +++ /dev/null @@ -1,194 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -import llama -import SpeziLLM - - -/// Extension of ``LLMLocalSession`` handling the text generation. -extension LLMLocalSession { - /// Typealias for the llama.cpp `llama_token`. - typealias LLMLocalToken = llama_token - - - /// Based on the input prompt, generate the output with llama.cpp - /// - /// - Parameters: - /// - continuation: A Swift `AsyncThrowingStream` that streams the generated output. - func _generate( // swiftlint:disable:this identifier_name function_body_length cyclomatic_complexity - continuation: AsyncThrowingStream.Continuation - ) async { - Self.logger.debug("SpeziLLMLocal: Local LLM started a new inference") - - await MainActor.run { - self.state = .generating - } - - // Log the most important parameters of the LLM - Self.logger.debug("SpeziLLMLocal: n_length = \(self.schema.parameters.maxOutputLength, privacy: .public), n_ctx = \(self.schema.contextParameters.contextWindowSize, privacy: .public), n_batch = \(self.schema.contextParameters.batchSize, privacy: .public), n_kv_req = \(self.schema.parameters.maxOutputLength, privacy: .public)") - - // Allocate new model context, if not already present - if self.modelContext == nil { - guard let context = llama_new_context_with_model(model, schema.contextParameters.llamaCppRepresentation) else { - Self.logger.error("SpeziLLMLocal: Failed to initialize context") - await finishGenerationWithError(LLMLocalError.generationError, on: continuation) - return - } - self.modelContext = context - } - - // Check if the maximal output generation length is smaller or equals to the context window size. - guard schema.parameters.maxOutputLength <= schema.contextParameters.contextWindowSize else { - Self.logger.error("SpeziLLMLocal: Error: n_kv_req \(self.schema.parameters.maxOutputLength, privacy: .public) > n_ctx, the required KV cache size is not big enough") - await finishGenerationWithError(LLMLocalError.generationError, on: continuation) - return - } - - // Tokenizes the entire context of the LLM - guard let tokens = try? await tokenize() else { - Self.logger.error(""" - SpeziLLMLocal: Tokenization failed as illegal context exists. - Ensure the content of the context is structured in: System Prompt, User prompt, and an - arbitrary number of assistant responses and follow up user prompts. - """) - await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation) - return - } - - guard await !checkCancellation(on: continuation) else { - return - } - - // Check if the input token count is smaller than the context window size decremented by 4 (space for end tokens). 
- guard tokens.count <= schema.contextParameters.contextWindowSize - 4 else { - Self.logger.error(""" - SpeziLLMLocal: Input prompt is too long with \(tokens.count, privacy: .public) tokens for the configured - context window size of \(self.schema.contextParameters.contextWindowSize, privacy: .public) tokens. - """) - await finishGenerationWithError(LLMLocalError.generationError, on: continuation) - return - } - - // Clear the KV cache in order to free up space for the incoming prompt (as we inject the entire history of the chat again) - llama_kv_cache_clear(self.modelContext) - - var batch = llama_batch_init(Int32(tokens.count), 0, 1) - defer { - llama_batch_free(batch) - } - - // Evaluate the initial prompt - for (tokenIndex, token) in tokens.enumerated() { - llama_batch_add(&batch, token, Int32(tokenIndex), getLlamaSeqIdVector(), false) - } - // llama_decode will output logits only for the last token of the prompt - batch.logits[Int(batch.n_tokens) - 1] = 1 - - guard await !checkCancellation(on: continuation) else { - return - } - - if llama_decode(self.modelContext, batch) != 0 { - Self.logger.error(""" - SpeziLLMLocal: Initial prompt decoding as failed! - """) - await finishGenerationWithError(LLMLocalError.generationError, on: continuation) - return - } - - guard await !checkCancellation(on: continuation) else { - return - } - - // Batch already includes tokens from the input prompt - var batchTokenIndex = batch.n_tokens - var decodedTokens = 0 - - // Calculate the token generation rate - let startTime = Date() - - while decodedTokens <= schema.parameters.maxOutputLength { - guard await !checkCancellation(on: continuation) else { - return - } - - let nextTokenId = sample(batchSize: batch.n_tokens) - - // Either finish the generation once EOS token appears, the maximum output length of the answer is reached or the context window is reached - if nextTokenId == llama_token_eos(self.model) - || decodedTokens == schema.parameters.maxOutputLength - || batchTokenIndex == schema.contextParameters.contextWindowSize { - continuation.finish() - await MainActor.run { - self.state = .ready - } - return - } - - var nextStringPiece = String(llama_token_to_piece(self.modelContext, nextTokenId, true)) - // As first character is sometimes randomly prefixed by a single space (even though prompt has an additional character) - if decodedTokens == 0 && nextStringPiece.starts(with: " ") { - nextStringPiece = String(nextStringPiece.dropFirst()) - } - - // Yield the response from the model to the Stream - Self.logger.debug(""" - SpeziLLMLocal: Yielded token: \(nextStringPiece, privacy: .public) - """) - - // Automatically inject the yielded string piece into the `LLMLocal/context` - if schema.injectIntoContext && nextTokenId != 0 { - let nextStringPiece = nextStringPiece - await MainActor.run { - context.append(assistantOutput: nextStringPiece) - } - } - - if nextTokenId != 0 { - continuation.yield(nextStringPiece) - } - - // Prepare the next batch - llama_batch_clear(&batch) - - // Push generated output token for the next evaluation round - llama_batch_add(&batch, nextTokenId, batchTokenIndex, getLlamaSeqIdVector(), true) - - decodedTokens += 1 - batchTokenIndex += 1 - - // Evaluate the current batch with the transformer model - let decodeOutput = llama_decode(self.modelContext, batch) - if decodeOutput != 0 { // = 0 Success, > 0 Warning, < 0 Error - Self.logger.error("SpeziLLMLocal: Decoding of generated output failed. 
Output: \(decodeOutput, privacy: .public)") - await finishGenerationWithError(LLMLocalError.generationError, on: continuation) - return - } - } - - let elapsedTime = Date().timeIntervalSince(startTime) - - Self.logger.debug("SpeziLLMLocal: Decoded \(decodedTokens, privacy: .public) tokens in \(String(format: "%.2f", elapsedTime), privacy: .public) s, speed: \(String(format: "%.2f", Double(decodedTokens) / elapsedTime), privacy: .public)) t/s") - - llama_print_timings(self.modelContext) - - continuation.finish() - if schema.injectIntoContext { - await MainActor.run { - context.completeAssistantStreaming() - } - } - - await MainActor.run { - self.state = .ready - } - - Self.logger.debug("SpeziLLMLocal: Local LLM completed an inference") - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Sampling.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Sampling.swift deleted file mode 100644 index 942e2826..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Sampling.swift +++ /dev/null @@ -1,46 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -import llama - - -extension LLMLocalSession { - /// Based on the current state of the context, sample the to be inferred output via the temperature method - /// - /// - Parameters: - /// - batchSize: The current size of the `llama_batch` - /// - Returns: A sampled `LLMLocalToken` - func sample(batchSize: Int32) -> LLMLocalToken { - let nVocab = llama_n_vocab(model) - let logits = llama_get_logits_ith(self.modelContext, batchSize - 1) - - var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(nVocab)) - - for tokenId in 0 ..< nVocab { - candidates.append(llama_token_data(id: tokenId, logit: logits?[Int(tokenId)] ?? 0, p: 0.0)) - } - - var candidatesP: llama_token_data_array = .init( - data: candidates.withUnsafeMutableBytes { $0.baseAddress?.assumingMemoryBound(to: llama_token_data.self) }, // &candidates - size: candidates.count, - sorted: false - ) - - // Sample via the temperature method - let minKeep = Int(max(1, schema.samplingParameters.outputProbabilities)) - llama_sample_top_k(modelContext, &candidatesP, schema.samplingParameters.topK, minKeep) - llama_sample_tail_free(modelContext, &candidatesP, schema.samplingParameters.tfs, minKeep) - llama_sample_typical(modelContext, &candidatesP, schema.samplingParameters.typicalP, minKeep) - llama_sample_top_p(modelContext, &candidatesP, schema.samplingParameters.topP, minKeep) - llama_sample_min_p(modelContext, &candidatesP, schema.samplingParameters.minP, minKeep) - llama_sample_temp(modelContext, &candidatesP, schema.samplingParameters.temperature) - - return llama_sample_token(modelContext, &candidatesP) - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index c60d2793..1fbf6bef 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -1,48 +1,60 @@ // -// This source file is part of the Stanford Spezi open source project +// LLMLocalSession+Setup.swift +// SpeziLLM // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT +// Created by Leon Nissen on 10/4/24. 
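+//
+//  Verifies that the model weights have been downloaded from the Hugging Face Hub
+//  and loads the MLX `ModelContainer` used by the session.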
// -import llama +import Foundation +@preconcurrency import MLXLLM +@preconcurrency import Hub extension LLMLocalSession { - /// Set up the local LLM execution environment via llama.cpp - /// - /// - Parameters: - /// - continuation: A Swift `AsyncThrowingStream` that streams the generated output. - /// - Returns: `true` if the setup was successful, `false` otherwise. + private func verifyModelDownload() -> Bool { + let repo = Hub.Repo(id: self.schema.configuration.name) + let url = HubApi.shared.localRepoLocation(repo) + let modelFileExtension = ".safetensors" + + do { + let contents = try FileManager.default.contentsOfDirectory(atPath: url.path()) + return contents.first(where: { $0.hasSuffix(modelFileExtension) }) != nil + } catch { + return false + } + } + + func setup(continuation: AsyncThrowingStream.Continuation) async -> Bool { Self.logger.debug("SpeziLLMLocal: Local LLM is being initialized") + await MainActor.run { - state = .loading + self.state = .loading } - guard let model = llama_load_model_from_file(schema.modelPath.path().cString(using: .utf8), schema.parameters.llamaCppRepresentation) else { + guard verifyModelDownload() else { await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation) Self.logger.error("SpeziLLMLocal: Local LLM file could not be opened, indicating that the model file doesn't exist") return false } - /// Check if model was trained for the configured context window size - guard schema.contextParameters.contextWindowSize <= llama_n_ctx_train(model) else { - await finishGenerationWithError(LLMLocalError.contextSizeMismatch, on: continuation) - Self.logger.error(""" - SpeziLLMLocal: Model was trained on only \(llama_n_ctx_train(model), privacy: .public) context tokens, - not the configured \(self.schema.contextParameters.contextWindowSize, privacy: .public) context tokens - """) + do { + let modelContainer = try await loadModelContainer(configuration: self.schema.configuration) + + let numParams = await modelContainer.perform { [] model, _ in + return model.numParameters() + } + + await MainActor.run { + self.modelContainer = modelContainer + self.numParameters = numParams + self.state = .ready + } + } catch { + continuation.yield(with: .failure(error)) + Self.logger.error("SpeziLLMLocal: Failed to load local `modelContainer`") return false } - - self.model = model - - await MainActor.run { - state = .ready - } - Self.logger.debug("SpeziLLMLocal: Local LLM finished initializing, now ready to use") return true } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift deleted file mode 100644 index 5ea001fb..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift +++ /dev/null @@ -1,81 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -import llama - - -/// Extension of ``LLMLocalSession`` handling the text tokenization. -extension LLMLocalSession { - /// Converts the current context of the model to the individual `LLMLocalToken`'s based on the model's dictionary. - /// This is a required tasks as LLMs internally processes tokens. - /// - /// - Returns: The tokenized `String` as `LLMLocalToken`'s. 
- func tokenize() async throws -> [LLMLocalToken] { - // Format the chat into a prompt that conforms to the prompt structure of the respective LLM - let formattedChat = try await schema.formatChat(self.context) - - // C++ vector doesn't conform to Swift sequence on VisionOS SDK (Swift C++ Interop bug), - // therefore requiring workaround for VisionSDK - #if !os(visionOS) - var tokens: [LLMLocalToken] = .init( - llama_tokenize_with_context(self.modelContext, std.string(formattedChat), schema.parameters.addBosToken, true) - ) - #else - // Swift String to C++ String buggy on VisionOS, workaround via C-based `char` array - guard let cString = formattedChat.cString(using: .utf8) else { - fatalError("SpeziLLMLocal: Couldn't bridge the LLM Swift-based String context to a C-based String.") - } - - let cxxTokensVector = llama_tokenize_with_context_from_char_array(self.modelContext, cString, schema.parameters.addBosToken, true) - - // Get C array from C++ vector containing the tokenized content - guard var cxxTokensArray = vectorToIntArray(cxxTokensVector) else { - fatalError("SpeziLLMLocal: Couldn't get C array containing the tokenized content from C++ vector.") - } - - // Extract tokens from C array to a Swift array - var tokens: [LLMLocalToken] = [] - - for _ in 0...cxxTokensVector.size() { - tokens.append(cxxTokensArray.pointee) - cxxTokensArray = cxxTokensArray.advanced(by: 1) - } - #endif - - // Truncate tokens if there wouldn't be enough context size for the generated output - if tokens.count > Int(schema.contextParameters.contextWindowSize) - schema.parameters.maxOutputLength { - tokens = Array(tokens.suffix(Int(schema.contextParameters.contextWindowSize) - schema.parameters.maxOutputLength)) - } - - // Output generation shouldn't run without any tokens - if tokens.isEmpty { - tokens.append(llama_token_bos(self.model)) - Self.logger.warning(""" - SpeziLLMLocal: The input prompt didn't map to any tokens, so the prompt was considered empty. - To mediate this issue, a BOS token was added to the prompt so that the output generation - doesn't run without any tokens. - """) - } - - return tokens - } - - /// Converts an array of `LLMLocalToken`s to an array of tupels of `LLMLocalToken`s as well as their `String` representation. - /// - /// - Parameters: - /// - tokens: An array of `LLMLocalToken`s that should be detokenized. - /// - Returns: An array of tupels of `LLMLocalToken`s as well as their `String` representation. - /// - /// - Note: Used only for debug purposes - func detokenize(tokens: [LLMLocalToken]) -> [(LLMLocalToken, String)] { - tokens.reduce(into: [(LLMLocalToken, String)]()) { partialResult, token in - partialResult.append((token, String(llama_token_to_piece(self.modelContext, token, true)))) - } - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index 6771ba84..aa4a589d 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -10,6 +10,9 @@ import Foundation import os import SpeziChat import SpeziLLM +import MLXLLM +import MLX +import MLXRandom /// Represents an ``LLMLocalSchema`` in execution. @@ -65,16 +68,19 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { let platform: LLMLocalPlatform let schema: LLMLocalSchema + @ObservationIgnored private var modelExist: Bool { + false + } + /// A task managing the ``LLMLocalSession`` output generation. @ObservationIgnored private var task: Task<(), Never>? 
@MainActor public var state: LLMState = .uninitialized @MainActor public var context: LLMContext = [] - /// A pointer to the allocated model via llama.cpp. - @ObservationIgnored var model: OpaquePointer? - /// A pointer to the allocated model context from llama.cpp. - @ObservationIgnored var modelContext: OpaquePointer? + @MainActor public var numParameters: Int? + @MainActor public var modelConfiguration: ModelConfiguration? + @MainActor public var modelContainer: ModelContainer? /// Creates an instance of a ``LLMLocalSession`` responsible for LLM inference. @@ -86,34 +92,19 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { init(_ platform: LLMLocalPlatform, schema: LLMLocalSchema) { self.platform = platform self.schema = schema - - // Inject system prompt into context - if let systemPrompt = schema.parameters.systemPrompt { - Task { @MainActor in - context.append(systemMessage: systemPrompt) - } - } } - @discardableResult public func generate() async throws -> AsyncThrowingStream { - try await platform.exclusiveAccess() - let (stream, continuation) = AsyncThrowingStream.makeStream(of: String.self) - // Execute the output generation of the LLM task = Task(priority: platform.configuration.taskPriority) { - // Unregister as soon as `Task` finishes - defer { - Task { - await platform.signal() - } - } - - // Setup the model, if not already done - if model == nil { + if await state == .uninitialized { guard await setup(continuation: continuation) else { + await MainActor.run { + state = .error(error: LLMLocalError.modelNotReadyYet) + } + await finishGenerationWithError(LLMLocalError.modelNotReadyYet, on: continuation) return } } @@ -122,18 +113,22 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { return } - // Execute the inference + await MainActor.run { + self.state = .generating + } + + // Execute the output generation of the LLM await _generate(continuation: continuation) } return stream } + public func cancel() { task?.cancel() } - deinit { cancel() } diff --git a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings index b2290765..335659bd 100644 --- a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings +++ b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings @@ -152,6 +152,7 @@ } }, "SPEZI_LLM_LOCAL_SYSTEM_PROMPT" : { + "extractionState" : "stale", "localizations" : { "en" : { "stringUnit" : { diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift deleted file mode 100644 index 07dad7a6..00000000 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift +++ /dev/null @@ -1,92 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation - - -extension LLMLocalDownloadManager { - /// Defaults of possible LLMs to download via the ``LLMLocalDownloadManager``. - public enum LLMUrlDefaults { - /// LLama 3 8B model with `Q4_K_M` quantization in its instruct variation (~5 GB) - public static var llama3InstructModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. 
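The rewritten `generate()` above lazily runs `setup` on first use and hands back an `AsyncThrowingStream`. As a rough usage sketch only — assuming the stream element type stays `String`, as in the previous llama.cpp-based implementation, and with a hypothetical helper name and session instance:

```swift
import SpeziLLMLocal

// Hypothetical consumer of an already constructed `LLMLocalSession`:
// stream the generated pieces as they arrive and print them.
func printResponse(of session: LLMLocalSession) async {
    do {
        for try await stringPiece in try await session.generate() {
            print(stringPiece, terminator: "")
        }
    } catch {
        print("Local LLM generation failed: \(error)")
    }
}
```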
- """) - } - - return url - } - - /// LLama 2 7B model with `Q4_K_M` quantization in its chat variation (~3.5GB) - public static var llama2ChatModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - - /// LLama 2 13B model with `Q4_K_M` quantization in its chat variation (~7GB) - public static var llama2Chat13BModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - - /// Phi-2 model with `Q5_K_M` quantization (~2GB) - public static var phi2ModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - - /// Gemma 7B model with `Q4_K_M` quantization (~5GB) - public static var gemma7BModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/rahuldshetty/gemma-7b-it-gguf-quantized/resolve/main/gemma-7b-it-Q4_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - - /// Gemma 2B model with `Q4_K_M` quantization (~1.5GB) - public static var gemma2BModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/rahuldshetty/gemma-2b-gguf-quantized/resolve/main/gemma-2b-Q4_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - - /// Tiny LLama 1.1B model with `Q5_K_M` quantization in its chat variation (~800MB) - public static var tinyLLama2ModelUrl: URL { - guard let url = URL(string: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf") else { - preconditionFailure(""" - SpeziLLM: Invalid LLMUrlDefaults LLM download URL. - """) - } - - return url - } - } -} diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift index e2788b2a..8080c472 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift @@ -9,7 +9,8 @@ import Foundation import Observation import SpeziViews - +import MLXLLM +import Hub /// Manages the download and storage of Large Language Models (LLM) to the local device. /// @@ -25,7 +26,7 @@ public final class LLMLocalDownloadManager: NSObject { /// An enum containing all possible states of the ``LLMLocalDownloadManager``. public enum DownloadState: Equatable { case idle - case downloading(progress: Double) + case downloading(progress: Progress) case downloaded(storageUrl: URL) case error(LocalizedError) @@ -41,41 +42,73 @@ public final class LLMLocalDownloadManager: NSObject { } } - /// The delegate handling the download manager tasks. - @ObservationIgnored private var downloadDelegate: LLMLocalDownloadManagerDelegate? // swiftlint:disable:this weak_delegate /// The `URLSessionDownloadTask` that handles the download of the model. - @ObservationIgnored private var downloadTask: URLSessionDownloadTask? - /// Remote `URL` from where the LLM file should be downloaded. 
- private let llmDownloadUrl: URL - /// Local `URL` where the downloaded model is stored. - let llmStorageUrl: URL + @ObservationIgnored private var downloadTask: Task<(), Never>? /// Indicates the current state of the ``LLMLocalDownloadManager``. @MainActor public var state: DownloadState = .idle + private let modelConfiguration: ModelConfiguration + @ObservationIgnored public var modelExists: Bool { + let repo = Hub.Repo(id: modelConfiguration.name) + let url = HubApi.shared.localRepoLocation(repo) + let modelFileExtension = ".safetensors" + + do { + let contents = try FileManager.default.contentsOfDirectory(atPath: url.path()) + return contents.first(where: { $0.hasSuffix(modelFileExtension) }) != nil + } catch { + return false + } + } /// Creates a ``LLMLocalDownloadManager`` that helps with downloading LLM files from remote servers. /// /// - Parameters: - /// - llmDownloadUrl: The remote `URL` from where the LLM file should be downloaded. - /// - llmStorageUrl: The local `URL` where the LLM file should be stored. - public init( - llmDownloadUrl: URL = LLMUrlDefaults.llama2ChatModelUrl, - llmStorageUrl: URL = .cachesDirectory.appending(path: "llm.gguf") - ) { - self.llmDownloadUrl = llmDownloadUrl - self.llmStorageUrl = llmStorageUrl + /// - modelConfiguration: TODO + public init(modelConfiguration: ModelConfiguration) { + self.modelConfiguration = modelConfiguration } + /// Creates a ``LLMLocalDownloadManager`` that helps with downloading LLM files from remote servers. + /// + /// - Parameters: + /// - modelID: TODO + public init(modelID: String) { + self.modelConfiguration = .init(id: modelID) + } /// Starts a `URLSessionDownloadTask` to download the specified model. public func startDownload() { - downloadTask?.cancel() - - downloadDelegate = LLMLocalDownloadManagerDelegate(manager: self, storageUrl: llmStorageUrl) - let session = URLSession(configuration: .default, delegate: downloadDelegate, delegateQueue: nil) - downloadTask = session.downloadTask(with: llmDownloadUrl) + if case let .directory(url) = modelConfiguration.id { + Task { @MainActor in + self.state = .downloaded(storageUrl: url) + } + return + } - downloadTask?.resume() + downloadTask?.cancel() + downloadTask = Task(priority: .userInitiated) { + do { + let _ = try await loadModelContainer(configuration: modelConfiguration) { progress in + Task { @MainActor in + self.state = .downloading(progress: progress) + } + } + + Task { @MainActor in + self.state = .downloaded(storageUrl: modelConfiguration.modelDirectory()) + } + } catch { + Task { @MainActor in + self.state = .error( + AnyLocalizedError( + error: error, + defaultErrorDescription: LocalizedStringResource("LLM_DOWNLOAD_FAILED_ERROR", bundle: .atURL(from: .module)) + ) + ) + } + } + } } /// Cancels the download of a specified model via a `URLSessionDownloadTask`. diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManagerDelegate.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManagerDelegate.swift deleted file mode 100644 index 780dc054..00000000 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManagerDelegate.swift +++ /dev/null @@ -1,86 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation -import os -import SpeziViews - - -/// Delegate of the ``LLMLocalDownloadManager`` implementing the methods of the`URLSessionDownloadDelegate` conformance. 
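To make the revised download API above concrete, a minimal call-site sketch; the model ID is just one of the MLX community repositories and the one-shot `switch` is illustrative (in practice the `@Observable` `state` would be observed from SwiftUI rather than polled once):

```swift
import SpeziLLMLocalDownload

@MainActor
func downloadIfNeeded() {
    // Example Hugging Face model ID; any repo hosting MLX weights works the same way.
    let manager = LLMLocalDownloadManager(modelID: "mlx-community/phi-2-hf-4bit-mlx")

    guard !manager.modelExists else {
        return   // Weights already present in the local Hub cache.
    }
    manager.startDownload()

    // Illustrative handling of the download state.
    switch manager.state {
    case .idle:
        break
    case .downloading(let progress):
        print("Downloading model: \(Int(progress.fractionCompleted * 100))%")
    case .downloaded(let storageUrl):
        print("Model available at \(storageUrl.path())")
    case .error(let error):
        print("Download failed: \(error.localizedDescription)")
    }
}
```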
-class LLMLocalDownloadManagerDelegate: NSObject, URLSessionDownloadDelegate { - /// A Swift `Logger` that logs important information from the `LocalLLMDownloadManager`. - private static let logger = Logger(subsystem: "edu.stanford.spezi", category: "SpeziLLM") - /// A `weak` reference to the ``LLMLocalDownloadManager``. - private weak var manager: LLMLocalDownloadManager? - /// The storage location `URL` of the downloaded LLM. - private let storageUrl: URL - - - /// Creates a new `LLMLocalDownloadManagerDelegate` - /// - Parameters: - /// - manager: The ``LLMLocalDownloadManager`` from which the `LLMLocalDownloadManagerDelegate` is initialized. - /// - storageUrl: The `URL` where the downloaded LLM should be stored. - init(manager: LLMLocalDownloadManager, storageUrl: URL) { - self.manager = manager - self.storageUrl = storageUrl - } - - - /// Indicates the progress of the current model download. - func urlSession( - _ session: URLSession, - downloadTask: URLSessionDownloadTask, - didWriteData bytesWritten: Int64, - totalBytesWritten: Int64, - totalBytesExpectedToWrite: Int64 - ) { - let progress = Double(totalBytesWritten) / Double(totalBytesExpectedToWrite) * 100 - Task { @MainActor in - self.manager?.state = .downloading(progress: progress) - } - } - - /// Indicates the completion of the model download including the downloaded file `URL`. - func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didFinishDownloadingTo location: URL) { - do { - _ = try FileManager.default.replaceItemAt(self.storageUrl, withItemAt: location) - Task { @MainActor in - self.manager?.state = .downloaded(storageUrl: self.storageUrl) - } - } catch { - Task { @MainActor in - self.manager?.state = .error( - AnyLocalizedError( - error: error, - defaultErrorDescription: - LocalizedStringResource("LLM_DOWNLOAD_FAILED_ERROR", bundle: .atURL(from: .module)) - ) - ) - } - Self.logger.error("\(String(describing: error))") - } - } - - /// Indicates an error during the model download - func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) { - // The `error` property is set for client-side errors (e.g. couldn't resolve host name), - // the `task.error` property is set in the case of server-side errors. - // If none of these properties are set, no error has occurred. - if let error = error ?? task.error { - Task { @MainActor in - self.manager?.state = .error( - AnyLocalizedError( - error: error, - defaultErrorDescription: LocalizedStringResource("LLM_DOWNLOAD_FAILED_ERROR", bundle: .atURL(from: .module)) - ) - ) - } - Self.logger.error("\(String(describing: error))") - } - } -} diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift index 2a2a0912..3a1c02ab 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift @@ -9,6 +9,7 @@ import SpeziOnboarding import SpeziViews import SwiftUI +import MLXLLM /// Provides an onboarding view for downloading locally executed Spezi LLMs to the device. 
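For orientation, a sketch of how the configuration-based initializers introduced in this file's diff are meant to be used from an onboarding step; it mirrors the `.phi3_4bit` preview at the end of the diff, while the view name and description text are made up:

```swift
import SpeziLLMLocalDownload
import SwiftUI

// Hypothetical onboarding step built on the new `model:`-based initializer.
struct ModelDownloadStep: View {
    var body: some View {
        LLMLocalDownloadView(
            model: .phi3_4bit,
            downloadDescription: "The Phi-3 model needs to be downloaded before it can run on-device."
        ) {
            // Continue the onboarding flow once the user taps the primary button.
        }
    }
}
```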
@@ -155,7 +156,7 @@ public struct LLMLocalDownloadView: View { /// Represents the download progress of the model in percent (from 0 to 100) @MainActor private var downloadProgress: Double { if case .downloading(let progress) = self.downloadManager.state { - return progress + return progress.fractionCompleted * 100 } else if case .downloaded = self.downloadManager.state { return 100.0 } @@ -165,9 +166,7 @@ public struct LLMLocalDownloadView: View { /// A `Bool` flag indicating if the model already exists on the device private var modelExists: Bool { - FileManager.default.fileExists( - atPath: self.downloadManager.llmStorageUrl.path() - ) + self.downloadManager.modelExists } @@ -179,16 +178,12 @@ public struct LLMLocalDownloadView: View { /// - llmDownloadLocation: The local `URL` where the LLM file should be stored. /// - action: The action that should be performed when pressing the primary button of the view. public init( + model modelConfiguration: ModelConfiguration, downloadDescription: LocalizedStringResource, - llmDownloadUrl: URL = LLMLocalDownloadManager.LLMUrlDefaults.llama2ChatModelUrl, - llmStorageUrl: URL = .cachesDirectory.appending(path: "llm.gguf"), action: @escaping () async throws -> Void ) { self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager( - llmDownloadUrl: llmDownloadUrl, - llmStorageUrl: llmStorageUrl - ) + wrappedValue: LLMLocalDownloadManager(modelConfiguration: modelConfiguration) ) self.downloadDescription = Text(downloadDescription) self.action = action @@ -203,16 +198,38 @@ public struct LLMLocalDownloadView: View { /// - action: The action that should be performed when pressing the primary button of the view. @_disfavoredOverload public init( + model modelConfiguration: ModelConfiguration, + downloadDescription: S, + action: @escaping () async throws -> Void + ) { + self._downloadManager = State( + wrappedValue: LLMLocalDownloadManager(modelConfiguration: modelConfiguration) + ) + self.downloadDescription = Text(verbatim: String(downloadDescription)) + self.action = action + } + + @_disfavoredOverload + public init( + model modelID: String, + downloadDescription: LocalizedStringResource, + action: @escaping () async throws -> Void + ) { + self._downloadManager = State( + wrappedValue: LLMLocalDownloadManager(modelID: modelID) + ) + self.downloadDescription = Text(downloadDescription) + self.action = action + } + + @_disfavoredOverload + public init( + model modelID: String, downloadDescription: S, - llmDownloadUrl: URL = LLMLocalDownloadManager.LLMUrlDefaults.llama2ChatModelUrl, - llmStorageUrl: URL = .cachesDirectory.appending(path: "llm.gguf"), action: @escaping () async throws -> Void ) { self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager( - llmDownloadUrl: llmDownloadUrl, - llmStorageUrl: llmStorageUrl - ) + wrappedValue: LLMLocalDownloadManager(modelID: modelID) ) self.downloadDescription = Text(verbatim: String(downloadDescription)) self.action = action @@ -223,6 +240,7 @@ public struct LLMLocalDownloadView: View { #if DEBUG #Preview { LLMLocalDownloadView( + model: .phi3_4bit, downloadDescription: "LLM_DOWNLOAD_DESCRIPTION".localized(.module), action: {} ) diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift new file mode 100644 index 00000000..e69de29b From 519dac17f6b35864d8fe8ae8d1773d93670cba10 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Wed, 16 Oct 2024 11:05:33 -0700 Subject: [PATCH 02/27] add simulator check --- 
Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index 1d29c714..b8f3d0b0 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -58,6 +58,10 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { } public nonisolated func configure() { +#if targetEnvironment(simulator) + assertionFailure("SpeziLLMLocal: Code cannot be run on simulator.") +#endif + MLX.GPU.set(cacheLimit: configuration.cacheLimit * 1024 * 1024) if let memoryLimit = configuration.memoryLimit { MLX.GPU.set(memoryLimit: memoryLimit.limit, relaxed: memoryLimit.relaxed) From 85df6f1e592bb4cae975bdcca7dc06129b907919 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Thu, 17 Oct 2024 19:04:59 -0700 Subject: [PATCH 03/27] improve code to PR comments --- README.md | 32 +------------------ .../Helpers/LLMModel+numParameters.swift | 21 +++++++----- .../ModelConfiguration+PromptFormat.swift | 20 ------------ Sources/SpeziLLMLocal/LLMLocalError.swift | 2 +- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 5 ++- Sources/SpeziLLMLocal/LLMLocalSchema.swift | 25 +++++++-------- .../LLMLocalSession+Generate.swift | 18 +++++++---- .../SpeziLLMLocal/LLMLocalSession+Setup.swift | 15 +++++---- Sources/SpeziLLMLocal/LLMLocalSession.swift | 9 +++--- .../Resources/Localizable.xcstrings | 11 ------- .../LLMLocalDownloadManager.swift | 16 +++++----- .../LLMLocalDownloadView.swift | 4 +-- .../LLMLocalLoadingManager.swift | 0 13 files changed, 63 insertions(+), 115 deletions(-) delete mode 100644 Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift delete mode 100644 Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift diff --git a/README.md b/README.md index f173a3a8..a664e406 100644 --- a/README.md +++ b/README.md @@ -57,37 +57,7 @@ The section below highlights the setup and basic use of the [SpeziLLMLocal](http ### Spezi LLM Local -The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device via the [llama.cpp framework](https://github.com/ggerganov/llama.cpp). The module allows you to interact with the locally run LLM via purely Swift-based APIs, no interaction with low-level C or C++ code is necessary, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). - -> [!IMPORTANT] -> Important: In order to use the LLM local target, one needs to set build parameters in the consuming Xcode project or the consuming SPM package to enable the [Swift / C++ Interop](https://www.swift.org/documentation/cxx-interop/), introduced in Xcode 15 and Swift 5.9. Keep in mind that this is true for nested dependencies, one needs to set this configuration recursivly for the entire dependency tree towards the llama.cpp SPM package. -> -> **For Xcode projects:** -> - Open your [build settings in Xcode](https://developer.apple.com/documentation/xcode/configuring-the-build-settings-of-a-target/) by selecting *PROJECT_NAME > TARGET_NAME > Build Settings*. -> - Within the *Build Settings*, search for the `C++ and Objective-C Interoperability` setting and set it to `C++ / Objective-C++`. This enables the project to use the C++ headers from llama.cpp. 
-> -> **For SPM packages:** -> - Open the `Package.swift` file of your [SPM package]((https://www.swift.org/documentation/package-manager/)) -> - Within the package `target` that consumes the llama.cpp package, add the `interoperabilityMode(_:)` Swift build setting like that: -> ```swift -> /// Adds the dependency to the Spezi LLM SPM package -> dependencies: [ -> .package(url: "https://github.com/StanfordSpezi/SpeziLLM", .upToNextMinor(from: "0.6.0")) -> ], -> targets: [ -> .target( -> name: "ExampleConsumingTarget", -> /// State the dependence of the target to SpeziLLMLocal -> dependencies: [ -> .product(name: "SpeziLLMLocal", package: "SpeziLLM") -> ], -> /// Important: Configure the `.interoperabilityMode(_:)` within the `swiftSettings` -> swiftSettings: [ -> .interoperabilityMode(.Cxx) -> ] -> ) -> ] -> ``` +The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device. The module allows you to interact with the locally run LLM via purely Swift-based APIs, no interaction with low-level code is necessary, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). #### Setup diff --git a/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift index bcf4f678..da6d89eb 100644 --- a/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift +++ b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift @@ -10,14 +10,19 @@ import MLXNN extension Module { /// Compute the number of parameters in a possibly quantized model public func numParameters() -> Int { - leafModules().flattenedValues().map { mod -> Int in - if let quantized = mod as? QuantizedLinear { - return quantized.scales.size * quantized.groupSize - } else if let quantized = mod as? QuantizedEmbedding { - return quantized.scales.size * quantized.groupSize - } else { - return mod.parameters().flattenedValues().reduce(0) { $0 + $1.size } + leafModules() + .flattenedValues() + .map { mod -> Int in + if let quantized = mod as? QuantizedLinear { + return quantized.scales.size * quantized.groupSize + } else if let quantized = mod as? QuantizedEmbedding { + return quantized.scales.size * quantized.groupSize + } else { + return mod.parameters() + .flattenedValues() + .reduce(0) { $0 + $1.size } + } } - }.reduce(0, +) + .reduce(0, +) } } diff --git a/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift b/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift deleted file mode 100644 index c3078782..00000000 --- a/Sources/SpeziLLMLocal/Helpers/ModelConfiguration+PromptFormat.swift +++ /dev/null @@ -1,20 +0,0 @@ -// -// ModelConfiguration+PromptFormat.swift -// SpeziLLM -// -// Created by Leon Nissen on 10/15/24. 
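The trimmed README above now defers to the `#### Setup` section. As a minimal sketch of that setup after this change — assuming the usual Spezi `Configuration` pattern also used by the test app later in this series, with a hypothetical delegate name:

```swift
import Spezi
import SpeziLLM
import SpeziLLMLocal

// Hypothetical application delegate registering the MLX-backed local platform.
class LocalLLMAppDelegate: SpeziAppDelegate {
    override var configuration: Configuration {
        Configuration {
            LLMRunner {
                LLMLocalPlatform()
            }
        }
    }
}
```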
-// - -import MLXLLM - - -extension ModelConfiguration { - var foo: String { - switch self.name { - case ModelConfiguration.codeLlama13b4bit.name: - return "" - default: - return "" - } - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalError.swift b/Sources/SpeziLLMLocal/LLMLocalError.swift index 4f3b96a8..c6f7ada4 100644 --- a/Sources/SpeziLLMLocal/LLMLocalError.swift +++ b/Sources/SpeziLLMLocal/LLMLocalError.swift @@ -1,7 +1,7 @@ // // This source file is part of the Stanford Spezi open source project // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // // SPDX-License-Identifier: MIT // diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index b8f3d0b0..66d51dd1 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -1,16 +1,16 @@ // // This source file is part of the Stanford Spezi open source project // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // // SPDX-License-Identifier: MIT // import Foundation +import MLX import Spezi import SpeziFoundation import SpeziLLM -import MLX /// LLM execution platform of an ``LLMLocalSchema``. @@ -39,7 +39,6 @@ import MLX /// } /// ``` public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { - let configuration: LLMLocalPlatformConfiguration @MainActor public var state: LLMPlatformState = .idle diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index bc2f64a8..0a41abab 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -1,15 +1,15 @@ // // This source file is part of the Stanford Spezi open source project // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // // SPDX-License-Identifier: MIT // import Foundation +import MLXLLM import SpeziChat import SpeziLLM -@preconcurrency import MLXLLM /// Defines the type and configuration of the ``LLMLocalSession``. @@ -20,33 +20,32 @@ import SpeziLLM /// - Tip: For more information, refer to the documentation of the `LLMSchema` from SpeziLLM. public struct LLMLocalSchema: LLMSchema { public typealias Platform = LLMLocalPlatform - + /// Parameters controlling the LLM generation process. let generateParameters: GenerateParameters - + /// Maximum number of tokens to generate in a single output. let maxTokens: Int - + /// Interval for displaying output after every N tokens generated. let displayEveryNTokens: Int - + /// Configuration settings for the model being used. let configuration: ModelConfiguration /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let formatChat: (@Sendable (LLMContext) throws -> String) - - + /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool /// Creates an instance of the ``LLMLocalSchema`` containing all necessary configuration for local LLM inference. /// /// - Parameters: - /// - modelPath: A local `URL` where the LLM file is stored. 
The format of the LLM must be in the llama.cpp `.gguf` format. - /// - parameters: Parameterize the LLM via ``LLMLocalParameters``. - /// - contextParameters: Configure the context of the LLM via ``LLMLocalContextParameters``. - /// - samplingParameters: Parameterize the sampling methods of the LLM via ``LLMLocalSamplingParameters``. + /// - configuration: A local `URL` where the LLM file is stored. The format of the LLM must be in the llama.cpp `.gguf` format. + /// - generateParameters: Parameters controlling the LLM generation process. + /// - maxTokens: Maximum number of tokens to generate in a single output, defaults to 2048. + /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to 4 (improve by ~15% compared to update at every token). /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. /// - formatChat: Closure to properly format the ``LLMLocalSession/context`` to a `String` which is tokenized and passed to the LLM, defaults to Llama2 prompt format. public init( configuration: ModelConfiguration, - generateParameters: GenerateParameters = GenerateParameters(temperature: 0.6), + generateParameters: GenerateParameters = GenerateParameters(), maxTokens: Int = 2048, displayEveryNTokens: Int = 4, injectIntoContext: Bool = false, diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index ea1138b2..2756a39e 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -1,18 +1,22 @@ // -// LLMLocalSession+Generate.swift -// SpeziLLM +// This source file is part of the Stanford Spezi open source project // -// Created by Leon Nissen on 10/15/24. +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // +// SPDX-License-Identifier: MIT +// + import Foundation +import MLX +import MLXLLM +import MLXRandom import os import SpeziChat import SpeziLLM -import MLXLLM -import MLX -import MLXRandom + extension LLMLocalSession { + // swiftlint:disable:next identifier_name function_body_length func _generate(continuation: AsyncThrowingStream.Continuation) async { guard let modelContainer = await self.modelContainer else { Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`") @@ -63,7 +67,7 @@ extension LLMLocalSession { return .stop } - if schema.injectIntoContext && tokens.count % schema.displayEveryNTokens == 0 { + if schema.injectIntoContext && tokens.count.isMultiple(of: schema.displayEveryNTokens) { let lastTokens = Array(tokens.suffix(schema.displayEveryNTokens)) let text = " " + tokenizer.decode(tokens: lastTokens) continuation.yield(text) diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index 1fbf6bef..edbbbb60 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -1,13 +1,14 @@ // -// LLMLocalSession+Setup.swift -// SpeziLLM +// This source file is part of the Stanford Spezi open source project // -// Created by Leon Nissen on 10/4/24. 
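A short sketch of constructing the schema with the initializer documented above; the model choice and parameter values are illustrative, and the plain-concatenation `formatChat` closure mirrors the one used in the UI test app later in this series:

```swift
import MLXLLM
import SpeziLLM
import SpeziLLMLocal

// Hypothetical schema: Phi-3 with a slightly higher sampling temperature.
let schema = LLMLocalSchema(
    configuration: .phi3_4bit,
    generateParameters: GenerateParameters(temperature: 0.8),
    maxTokens: 1024,
    displayEveryNTokens: 4,
    injectIntoContext: false,
    formatChat: { context in
        // Concatenate all user messages into a single prompt string.
        context
            .filter { $0.role == .user }
            .map { $0.content }
            .joined(separator: " ")
    }
)
```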
+// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT // import Foundation -@preconcurrency import MLXLLM -@preconcurrency import Hub +import Hub +import MLXLLM extension LLMLocalSession { @@ -18,7 +19,7 @@ extension LLMLocalSession { do { let contents = try FileManager.default.contentsOfDirectory(atPath: url.path()) - return contents.first(where: { $0.hasSuffix(modelFileExtension) }) != nil + return contents.contains { $0.hasSuffix(modelFileExtension) } } catch { return false } @@ -42,7 +43,7 @@ extension LLMLocalSession { let modelContainer = try await loadModelContainer(configuration: self.schema.configuration) let numParams = await modelContainer.perform { [] model, _ in - return model.numParameters() + model.numParameters() } await MainActor.run { diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index aa4a589d..feea9f85 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -1,18 +1,19 @@ // // This source file is part of the Stanford Spezi open source project // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // // SPDX-License-Identifier: MIT // + import Foundation +import MLX +import MLXLLM +import MLXRandom import os import SpeziChat import SpeziLLM -import MLXLLM -import MLX -import MLXRandom /// Represents an ``LLMLocalSchema`` in execution. diff --git a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings index 335659bd..074d0133 100644 --- a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings +++ b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings @@ -150,17 +150,6 @@ } } } - }, - "SPEZI_LLM_LOCAL_SYSTEM_PROMPT" : { - "extractionState" : "stale", - "localizations" : { - "en" : { - "stringUnit" : { - "state" : "translated", - "value" : "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe and still concise. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." - } - } - } } }, "version" : "1.0" diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift index 8080c472..24824422 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift @@ -7,10 +7,10 @@ // import Foundation +import Hub +import MLXLLM import Observation import SpeziViews -import MLXLLM -import Hub /// Manages the download and storage of Large Language Models (LLM) to the local device. 
/// @@ -55,24 +55,24 @@ public final class LLMLocalDownloadManager: NSObject { do { let contents = try FileManager.default.contentsOfDirectory(atPath: url.path()) - return contents.first(where: { $0.hasSuffix(modelFileExtension) }) != nil + return contents.contains { $0.hasSuffix(modelFileExtension) } } catch { return false } } - /// Creates a ``LLMLocalDownloadManager`` that helps with downloading LLM files from remote servers. + /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of LLM files from the remote server. /// /// - Parameters: - /// - modelConfiguration: TODO + /// - modelConfiguration: The configuration specifying the parameters and settings for the LLM that needs to be downloaded. public init(modelConfiguration: ModelConfiguration) { self.modelConfiguration = modelConfiguration } - /// Creates a ``LLMLocalDownloadManager`` that helps with downloading LLM files from remote servers. + /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of Large Language Model (LLM) files from remote servers. /// /// - Parameters: - /// - modelID: TODO + /// - modelID: The Huggingface model ID of the LLM that needs to be downloaded. public init(modelID: String) { self.modelConfiguration = .init(id: modelID) } @@ -89,7 +89,7 @@ public final class LLMLocalDownloadManager: NSObject { downloadTask?.cancel() downloadTask = Task(priority: .userInitiated) { do { - let _ = try await loadModelContainer(configuration: modelConfiguration) { progress in + _ = try await loadModelContainer(configuration: modelConfiguration) { progress in Task { @MainActor in self.state = .downloading(progress: progress) } diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift index 3a1c02ab..e3c86b36 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift @@ -1,15 +1,15 @@ // // This source file is part of the Stanford Spezi open source project // -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) // // SPDX-License-Identifier: MIT // +import MLXLLM import SpeziOnboarding import SpeziViews import SwiftUI -import MLXLLM /// Provides an onboarding view for downloading locally executed Spezi LLMs to the device. diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalLoadingManager.swift deleted file mode 100644 index e69de29b..00000000 From d74b64709e488984a3b4d79a5bb921330a611999 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Thu, 17 Oct 2024 19:08:53 -0700 Subject: [PATCH 04/27] fix REUSE --- .../SpeziLLMLocal/Helpers/LLMModel+numParameters.swift | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift index da6d89eb..aa01bd4b 100644 --- a/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift +++ b/Sources/SpeziLLMLocal/Helpers/LLMModel+numParameters.swift @@ -1,12 +1,14 @@ // -// LLMModel+numParameters.swift -// SpeziLLM +// This source file is part of the Stanford Spezi open source project // -// Created by Leon Nissen on 10/14/24. 
+// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT // import MLXNN + extension Module { /// Compute the number of parameters in a possibly quantized model public func numParameters() -> Int { From 03fc955a0f0fa68c9b0c4f57fdf745088ecf72dd Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 22 Oct 2024 13:31:15 -0700 Subject: [PATCH 05/27] adjust UITest project add note to README --- Package.swift | 2 +- README.md | 3 +++ .../LLMLocalDownloadView.swift | 1 - .../LLMLocal/LLMLocalChatTestView.swift | 25 +++++++++++-------- .../LLMLocalOnboardingDownloadView.swift | 7 +++--- .../Onboarding/LLMLocalOnboardingFlow.swift | 2 +- Tests/UITests/TestApp/TestAppDelegate.swift | 1 - .../UITests/UITests.xcodeproj/project.pbxproj | 2 -- 8 files changed, 22 insertions(+), 21 deletions(-) diff --git a/Package.swift b/Package.swift index 1f041b47..a1f3fc7e 100644 --- a/Package.swift +++ b/Package.swift @@ -27,7 +27,7 @@ let package = Package( .library(name: "SpeziLLMFog", targets: ["SpeziLLMFog"]) ], dependencies: [ - .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.0"), + .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.1"), .package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.16.0"), .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")), .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")), diff --git a/README.md b/README.md index a664e406..03a43af4 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,9 @@ The section below highlights the setup and basic use of the [SpeziLLMLocal](http The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device. The module allows you to interact with the locally run LLM via purely Swift-based APIs, no interaction with low-level code is necessary, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). +> [!IMPORTANT] +> Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) package encounters a crash when initializing the GPU as of version 0.18.1. + #### Setup You can configure the Spezi Local LLM execution within the typical `SpeziAppDelegate`. diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift index e3c86b36..61bae01e 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift @@ -54,7 +54,6 @@ public struct LLMLocalDownloadView: View { private let action: () async throws -> Void /// Description of the to-be-downloaded model shown in the ``LLMLocalDownloadView``. private let downloadDescription: Text - /// Indicates the state of the view, get's derived from the ``LLMLocalDownloadManager/state``. 
@State private var viewState: ViewState = .idle diff --git a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift index 34b5797b..684f143c 100644 --- a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift +++ b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift @@ -25,18 +25,21 @@ struct LLMLocalChatTestView: View { } else { LLMChatViewSchema( with: LLMLocalSchema( - modelPath: .cachesDirectory.appending(path: "llm.gguf"), - parameters: .init(maxOutputLength: 512), - contextParameters: .init(contextWindowSize: 1024), - formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3 + configuration: .phi3_4bit, + formatChat: { context in + context + .filter { $0.role == .user } + .map { $0.content } + .joined(separator: " ") + } ) ) } } - .navigationTitle("LLM_LOCAL_CHAT_VIEW_TITLE") + .navigationTitle("LLM_LOCAL_CHAT_VIEW_TITLE") } - - + + init(mockMode: Bool = false) { self.mockMode = mockMode } @@ -48,10 +51,10 @@ struct LLMLocalChatTestView: View { NavigationStack { LLMLocalChatTestView(mockMode: true) } - .previewWith { - LLMRunner { - LLMMockPlatform() - } + .previewWith { + LLMRunner { + LLMMockPlatform() } + } } #endif diff --git a/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingDownloadView.swift b/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingDownloadView.swift index dfbcb4d6..37bbe71b 100644 --- a/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingDownloadView.swift +++ b/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingDownloadView.swift @@ -19,11 +19,10 @@ struct LLMLocalOnboardingDownloadView: View { var body: some View { LLMLocalDownloadView( + model: .phi3_4bit, downloadDescription: "LLM_DOWNLOAD_DESCRIPTION", - llmDownloadUrl: LLMLocalDownloadManager.LLMUrlDefaults.llama3InstructModelUrl /// By default, download the Llama3 model - ) { - onboardingNavigationPath.nextStep() - } + action: onboardingNavigationPath.nextStep + ) } } diff --git a/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingFlow.swift b/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingFlow.swift index f0a52acb..104392e7 100644 --- a/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingFlow.swift +++ b/Tests/UITests/TestApp/LLMLocal/Onboarding/LLMLocalOnboardingFlow.swift @@ -23,7 +23,7 @@ struct LLMLocalOnboardingFlow: View { LLMLocalOnboardingDownloadView() } } - .interactiveDismissDisabled(!completedOnboardingFlow) + .interactiveDismissDisabled(!completedOnboardingFlow) } } diff --git a/Tests/UITests/TestApp/TestAppDelegate.swift b/Tests/UITests/TestApp/TestAppDelegate.swift index 2f4d84a8..f4cc6369 100644 --- a/Tests/UITests/TestApp/TestAppDelegate.swift +++ b/Tests/UITests/TestApp/TestAppDelegate.swift @@ -46,7 +46,6 @@ class TestAppDelegate: SpeziAppDelegate { LLMRunner { LLMMockPlatform() - LLMLocalPlatform() // No CA certificate (meaning no encrypted traffic) for development purposes, see `caCertificateUrl` above LLMFogPlatform(configuration: .init(host: "spezillmfog.local", caCertificate: nil)) LLMOpenAIPlatform() diff --git a/Tests/UITests/UITests.xcodeproj/project.pbxproj b/Tests/UITests/UITests.xcodeproj/project.pbxproj index e271b199..020fccc8 100644 --- a/Tests/UITests/UITests.xcodeproj/project.pbxproj +++ b/Tests/UITests/UITests.xcodeproj/project.pbxproj @@ -605,7 +605,6 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_INTEROP_MODE = objcxx; SWIFT_STRICT_CONCURRENCY = 
complete; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2,7"; @@ -648,7 +647,6 @@ SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO; SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_INTEROP_MODE = objcxx; SWIFT_STRICT_CONCURRENCY = complete; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2,7"; From accb8eff5d4a57857a620e83c17ce83a1f63a6a9 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Fri, 25 Oct 2024 13:23:26 -0700 Subject: [PATCH 06/27] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 03a43af4..ffa9d241 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The section below highlights the setup and basic use of the [SpeziLLMLocal](http The target enables developers to easily execute medium-size Language Models (LLMs) locally on-device. The module allows you to interact with the locally run LLM via purely Swift-based APIs, no interaction with low-level code is necessary, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). > [!IMPORTANT] -> Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) package encounters a crash when initializing the GPU as of version 0.18.1. +> Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that. #### Setup From b2ceaa5dfddb3d50e6be361b5d3026183d2743f4 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Sun, 27 Oct 2024 12:44:22 -0700 Subject: [PATCH 07/27] intermediate commit --- Package.swift | 1 + README.md | 3 + .../LLMLocalContextParameters.swift | 33 +++ .../Configuration/LLMLocalModel.swift | 68 +++++ .../Configuration/LLMLocalParameters.swift | 51 ++++ .../LLMLocalSamplingParameters.swift | 42 +++ .../LLMLocalSchema+PromptFormatting.swift | 275 ++++++++++++++++++ Sources/SpeziLLMLocal/LLMLocalSchema.swift | 35 +-- .../LLMLocalSession+Generate.swift | 36 ++- Sources/SpeziLLMLocal/LLMLocalSession.swift | 7 + .../Resources/Localizable.xcstrings | 3 + .../LLMLocalDownloadManager.swift | 19 +- .../LLMLocalDownloadView.swift | 37 +-- 13 files changed, 542 insertions(+), 68 deletions(-) create mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift create mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift create mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift create mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift create mode 100644 Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift diff --git a/Package.swift b/Package.swift index a1f3fc7e..032458aa 100644 --- a/Package.swift +++ b/Package.swift @@ -67,6 +67,7 @@ let package = Package( dependencies: [ .product(name: "SpeziOnboarding", package: "SpeziOnboarding"), .product(name: "SpeziViews", package: "SpeziViews"), + .target(name: "SpeziLLMLocal"), .product(name: "LLM", package: "mlx-swift-examples") ] ), diff --git a/README.md b/README.md index ffa9d241..b6bde0ea 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ The target enables developers to easily execute medium-size Language Models (LLM > [!IMPORTANT] > Spezi LLM Local is not compatible with simulators. 
The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that. +> [!IMPORTANT] +> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project. + #### Setup You can configure the Spezi Local LLM execution within the typical `SpeziAppDelegate`. diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift new file mode 100644 index 00000000..7db1506b --- /dev/null +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift @@ -0,0 +1,33 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation + + +/// Represents the context parameters of the LLM. +public struct LLMLocalContextParameters: Sendable { + /// RNG seed of the LLM + var seed: UInt64? + + /// If `true`, the mode is set to embeddings only + var embeddingsOnly: Bool + + /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. + /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. + /// + /// - Parameters: + /// - seed: RNG seed of the LLM, defaults to a random seed. + /// - embeddingsOnly: Embedding-only mode, defaults to `false`. + public init( + seed: UInt64? = nil, + embeddingsOnly: Bool = false + ) { + self.seed = seed + self.embeddingsOnly = embeddingsOnly + } +} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift new file mode 100644 index 00000000..9d53a81a --- /dev/null +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift @@ -0,0 +1,68 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + + +public enum LLMLocalModel { + case llama3_1_8B_4bit + case llama3_8B_4bit + case llama3_2_1B_4bit + case llama3_2_3B_4bit + case mistralNeMo4bit + case smolLM_135M_4bit + case mistral7B4bit + case codeLlama13b4bit + case phi4bit + case phi3_4bit + case phi3_5_4bit + case gemma2bQuantized + case gemma_2_9b_it_4bit + case gemma_2_2b_it_4bit + case qwen205b4bit + case openelm270m4bit + /// Set the Huggingface ID of the model. e.g. 
"\/\" + case custom(id: String) + + public var hubID: String { + switch self { + case .llama3_1_8B_4bit: + return "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" + case .llama3_8B_4bit: + return "mlx-community/Meta-Llama-3-8B-Instruct-4bit" + case .llama3_2_1B_4bit: + return "mlx-community/Llama-3.2-1B-Instruct-4bit" + case .llama3_2_3B_4bit: + return "mlx-community/Llama-3.2-3B-Instruct-4bit" + case .mistralNeMo4bit: + return "mlx-community/Mistral-Nemo-Instruct-2407-4bit" + case .smolLM_135M_4bit: + return "mlx-community/SmolLM-135M-Instruct-4bit" + case .mistral7B4bit: + return "mlx-community/Mistral-7B-Instruct-v0.3-4bit" + case .codeLlama13b4bit: + return "mlx-community/CodeLlama-13b-Instruct-hf-4bit-MLX" + case .phi4bit: + return "mlx-community/phi-2-hf-4bit-mlx" + case .phi3_4bit: + return "mlx-community/Phi-3-mini-4k-instruct-4bit-no-q-embed" + case .phi3_5_4bit: + return "mlx-community/Phi-3.5-mini-instruct-4bit" + case .gemma2bQuantized: + return "mlx-community/quantized-gemma-2b-it" + case .gemma_2_9b_it_4bit: + return "mlx-community/gemma-2-9b-it-4bit" + case .gemma_2_2b_it_4bit: + return "mlx-community/gemma-2-2b-it-4bit" + case .qwen205b4bit: + return "mlx-community/Qwen1.5-0.5B-Chat-4bit" + case .openelm270m4bit: + return "mlx-community/OpenELM-270M-Instruct" + case .custom(let id): + return id + } + } +} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift new file mode 100644 index 00000000..9a654f2b --- /dev/null +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -0,0 +1,51 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation + +/// Represents the parameters of the LLM. +public struct LLMLocalParameters: Sendable { + + /// Defaults of possible LLMs parameter settings. + public enum Defaults { + /// Default system prompt for local LLMs. + public static let defaultSystemPrompt: String = { + String(localized: LocalizedStringResource("SPEZI_LLM_LOCAL_SYSTEM_PROMPT", bundle: .atURL(from: .module))) + }() + } + + + /// The to-be-used system prompt of the LLM + let systemPrompt: String? + /// Indicates the maximum output length generated by the LLM. + let maxOutputLength: Int + + let extraEOSTokens: Set + /// Interval for displaying output after every N tokens generated. + let displayEveryNTokens: Int + + /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. + /// Is passed to the underlying llama.cpp model in order to configure the LLM. + /// + /// - Parameters: + /// - systemPrompt: The to-be-used system prompt of the LLM enabling fine-tuning of the LLMs behaviour. Defaults to the regular default chat-based LLM system prompt. + /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. + /// - extraEOSTokens: TODO + /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to `4`. + public init( + systemPrompt: String? 
= Defaults.defaultSystemPrompt, + maxOutputLength: Int = 512, + extraEOSTokens: Set = [], + displayEveryNTokens: Int = 4 + ) { + self.systemPrompt = systemPrompt + self.maxOutputLength = maxOutputLength + self.extraEOSTokens = extraEOSTokens + self.displayEveryNTokens = displayEveryNTokens + } +} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift new file mode 100644 index 00000000..16198850 --- /dev/null +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift @@ -0,0 +1,42 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation + + +/// Represents the sampling parameters of the LLM. +public struct LLMLocalSamplingParameters: Sendable { // swiftlint:disable:this type_body_length + /// Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). + let topP: Float + /// Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. + let temperature: Float + /// Penalize repeated tokens (nil = disabled). + let penaltyRepeat: Float? + /// Number of tokens to consider for repetition penalty + let repetitionContextSize: Int + + + /// Creates the ``LLMLocalContextParameters`` + /// + /// - Parameters: + /// - topP: Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). + /// - temperature: Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. + /// - penaltyRepeat: Penalize repeated tokens (nil = disabled). + /// - repetitionContextSize: Number of tokens to consider for repetition penalty + public init( + topP: Float = 1.0, + temperature: Float = 0.6, + penaltyRepeat: Float? = nil, + repetitionContextSize: Int = 20 + ) { + self.topP = topP + self.temperature = temperature + self.penaltyRepeat = penaltyRepeat + self.repetitionContextSize = repetitionContextSize + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift b/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift new file mode 100644 index 00000000..0859cbe6 --- /dev/null +++ b/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift @@ -0,0 +1,275 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import SpeziLLM + + +extension LLMLocalSchema { + /// Holds default prompt formatting strategies for [Llama2](https://ai.meta.com/llama/) as well as [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) models. + public enum PromptFormattingDefaults { + /// Prompt formatting closure for the [Llama3](https://ai.meta.com/llama/) model + public static let llama3: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length + /// BOS token of the LLM, used at the start of each prompt passage. + let BEGINOFTEXT = "<|begin_of_text|>" + /// The system identifier. + let SYSTEM = "system" + /// The user identifier. + let USER = "user" + /// The assistant identifier. 
+ let ASSISTANT = "assistant" + /// The start token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> + let STARTHEADERID = "<|start_header_id|>" + /// The end token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> + let ENDHEADERID = "<|end_header_id|>" + /// The token that signifies the end of the message in a turn. + let EOTID = "<|eot_id|>" + + guard chat.first?.role == .system else { + throw LLMLocalError.illegalContext + } + + var systemPrompts: [String] = [] + var initialUserPrompt: String = "" + + for contextEntity in chat { + if contextEntity.role != .system { + if contextEntity.role == .user { + initialUserPrompt = contextEntity.content + break + } else { + throw LLMLocalError.illegalContext + } + } + + systemPrompts.append(contextEntity.content) + } + + /// Build the initial Llama3 prompt structure + /// + /// Template of the prompt structure: + /// <|begin_of_text|> + /// <|start_header_id|>user<|end_header_id|> + /// {{ user_message }}<|eot_id|> + /// <|start_header_id|>assistant<|end_header_id|> + var prompt = """ + \(BEGINOFTEXT) + \(STARTHEADERID)\(SYSTEM)\(ENDHEADERID) + \(systemPrompts.joined(separator: " "))\(EOTID) + + \(STARTHEADERID)\(USER)\(ENDHEADERID) + \(initialUserPrompt)\(EOTID) + + """ + " " // Add a spacer to the generated output from the model + + for contextEntity in chat.dropFirst(2) { + if contextEntity.role == .assistant() { + /// Append response from assistant to the Llama3 prompt structure + prompt += """ + \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) + \(contextEntity.content) + \(EOTID) + """ + } else if contextEntity.role == .user { + /// Append response from user to the Llama3 prompt structure + prompt += """ + \(STARTHEADERID)\(USER)\(ENDHEADERID) + \(contextEntity.content) + \(EOTID) + """ + " " // Add a spacer to the generated output from the model + } + } + + prompt += + """ + \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) + """ + + return prompt + } + + /// Prompt formatting closure for the [Llama2](https://ai.meta.com/llama/) model + public static let llama2: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length + /// BOS token of the LLM, used at the start of each prompt passage. + let BOS = "" + /// EOS token of the LLM, used at the end of each prompt passage. + let EOS = "" + /// BOSYS token of the LLM, used at the start of the system prompt. + let BOSYS = "<>" + /// EOSYS token of the LLM, used at the end of the system prompt. + let EOSYS = "<>" + /// BOINST token of the LLM, used at the start of the instruction part of the prompt. + let BOINST = "[INST]" + /// EOINST token of the LLM, used at the end of the instruction part of the prompt. 
+ let EOINST = "[/INST]" + + guard chat.first?.role == .system else { + throw LLMLocalError.illegalContext + } + + var systemPrompts: [String] = [] + var initialUserPrompt: String = "" + + for contextEntity in chat { + if contextEntity.role != .system { + if contextEntity.role == .user { + initialUserPrompt = contextEntity.content + break + } else { + throw LLMLocalError.illegalContext + } + } + + systemPrompts.append(contextEntity.content) + } + + /// Build the initial Llama2 prompt structure + /// + /// A template of the prompt structure looks like: + /// """ + /// [INST] <> + /// {your_system_prompt} + /// <> + /// + /// {user_message_1} [/INST] + /// """ + var prompt = """ + \(BOS)\(BOINST) \(BOSYS) + \(systemPrompts.joined(separator: " ")) + \(EOSYS) + + \(initialUserPrompt) \(EOINST) + """ + " " // Add a spacer to the generated output from the model + + for contextEntity in chat.dropFirst(2) { + if contextEntity.role == .assistant() { + /// Append response from assistant to the Llama2 prompt structure + /// + /// A template for appending an assistant response to the overall prompt looks like: + /// {user_message_1} [/INST]){model_reply_1} + prompt += """ + \(contextEntity.content)\(EOS) + """ + } else if contextEntity.role == .user { + /// Append response from user to the Llama2 prompt structure + /// + /// A template for appending an assistant response to the overall prompt looks like: + /// [INST] {user_message_2} [/INST] + prompt += """ + \(BOS)\(BOINST) \(contextEntity.content) \(EOINST) + """ + " " // Add a spacer to the generated output from the model + } + } + + return prompt + } + + /// Prompt formatting closure for the [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) model + public static let phi2: (@Sendable (LLMContext) throws -> String) = { chat in + guard chat.first?.role == .system else { + throw LLMLocalError.illegalContext + } + + var systemPrompts: [String] = [] + var initialUserPrompt: String = "" + + for contextEntity in chat { + if contextEntity.role != .system { + if contextEntity.role == .user { + initialUserPrompt = contextEntity.content + break + } else { + throw LLMLocalError.illegalContext + } + } + + systemPrompts.append(contextEntity.content) + } + + /// Build the initial Phi-2 prompt structure + /// + /// A template of the prompt structure looks like: + /// """ + /// System: {your_system_prompt} + /// Instruct: {model_reply_1} + /// Output: {model_reply_1} + /// """ + var prompt = """ + System: \(systemPrompts.joined(separator: " ")) + Instruct: \(initialUserPrompt)\n + """ + + for contextEntity in chat.dropFirst(2) { + if contextEntity.role == .assistant() { + /// Append response from assistant to the Phi-2 prompt structure + prompt += """ + Output: \(contextEntity.content)\n + """ + } else if contextEntity.role == .user { + /// Append response from assistant to the Phi-2 prompt structure + prompt += """ + Instruct: \(contextEntity.content)\n + """ + } + } + + /// Model starts responding after + if chat.last?.role == .user { + prompt += "Output: " + } + + return prompt + } + + /// Prompt formatting closure for the [Gemma](https://ai.google.dev/gemma/docs/formatting) models + /// - Important: System prompts are ignored as Gemma doesn't support them + public static let gemma: (@Sendable (LLMContext) throws -> String) = { chat in + /// Start token of Gemma + let startToken = "" + /// End token of Gemma + let endToken = "" + + /// Build the initial Gemma prompt structure + /// + /// A template of the 
prompt structure looks like: + /// """ + /// user + /// knock knock + /// model + /// who is there + /// user + /// Gemma + /// model + /// Gemma who? + /// """ + var prompt = "" + + for contextEntity in chat { + if contextEntity.role == .assistant() { + /// Append response from assistant to the Gemma prompt structure + prompt += """ + \(startToken)model + \(contextEntity.content)\(endToken)\n + """ + } else if contextEntity.role == .user { + /// Append response from assistant to the Gemma prompt structure + prompt += """ + \(startToken)user + \(contextEntity.content)\(endToken)\n + """ + } + } + + /// Model starts responding after + if chat.last?.role == .user { + prompt += "\(startToken)model\n" + } + + return prompt + } + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 0a41abab..ac1d3fcf 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -20,20 +20,21 @@ import SpeziLLM /// - Tip: For more information, refer to the documentation of the `LLMSchema` from SpeziLLM. public struct LLMLocalSchema: LLMSchema { public typealias Platform = LLMLocalPlatform - /// Parameters controlling the LLM generation process. - let generateParameters: GenerateParameters - /// Maximum number of tokens to generate in a single output. - let maxTokens: Int - /// Interval for displaying output after every N tokens generated. - let displayEveryNTokens: Int - /// Configuration settings for the model being used. - let configuration: ModelConfiguration + + /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. + let parameters: LLMLocalParameters + /// Context parameters of the llama.cpp LLM. + let contextParameters: LLMLocalContextParameters + /// Sampling parameters of the llama.cpp LLM. + let samplingParameters: LLMLocalSamplingParameters /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let formatChat: (@Sendable (LLMContext) throws -> String) /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool + package let configuration: ModelConfiguration + /// Creates an instance of the ``LLMLocalSchema`` containing all necessary configuration for local LLM inference. /// /// - Parameters: @@ -44,18 +45,18 @@ public struct LLMLocalSchema: LLMSchema { /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. /// - formatChat: Closure to properly format the ``LLMLocalSession/context`` to a `String` which is tokenized and passed to the LLM, defaults to Llama2 prompt format. 
public init( - configuration: ModelConfiguration, - generateParameters: GenerateParameters = GenerateParameters(), - maxTokens: Int = 2048, - displayEveryNTokens: Int = 4, + model: LLMLocalModel, + parameters: LLMLocalParameters = .init(), + contextParameters: LLMLocalContextParameters = .init(), + samplingParameters: LLMLocalSamplingParameters = .init(), injectIntoContext: Bool = false, formatChat: @escaping (@Sendable (LLMContext) throws -> String) ) { - self.generateParameters = generateParameters - self.maxTokens = maxTokens - self.displayEveryNTokens = displayEveryNTokens - self.configuration = configuration - self.injectIntoContext = injectIntoContext + self.parameters = parameters + self.contextParameters = contextParameters + self.samplingParameters = samplingParameters self.formatChat = formatChat + self.injectIntoContext = injectIntoContext + self.configuration = .init(id: model.hubID) } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 2756a39e..6c147c28 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -37,8 +37,7 @@ extension LLMLocalSession { tokenizer.encode(text: prompt) } - // each time you generate you will get something new - MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000)) + MLXRandom.seed(self.schema.contextParameters.seed ?? UInt64(Date.timeIntervalSinceReferenceDate * 1000)) let extraEOSTokens = modelConfiguration.extraEOSTokens @@ -46,11 +45,18 @@ extension LLMLocalSession { return } + let parameters: GenerateParameters = .init( + temperature: schema.samplingParameters.temperature, + topP: schema.samplingParameters.topP, + repetitionPenalty: schema.samplingParameters.penaltyRepeat, + repetitionContextSize: schema.samplingParameters.repetitionContextSize + ) + let (result, tokenizer) = await modelContainer.perform { model, tokenizer in // Execute the inference let result = MLXLLM.generate( promptTokens: promptTokens, - parameters: self.schema.generateParameters, + parameters: parameters, model: model, tokenizer: tokenizer, extraEOSTokens: extraEOSTokens @@ -59,7 +65,7 @@ extension LLMLocalSession { return .stop } - if tokens.count >= self.schema.maxTokens { + if tokens.count >= self.schema.parameters.maxOutputLength { continuation.finish() Task { @MainActor in self.state = .ready @@ -67,9 +73,11 @@ extension LLMLocalSession { return .stop } - if schema.injectIntoContext && tokens.count.isMultiple(of: schema.displayEveryNTokens) { - let lastTokens = Array(tokens.suffix(schema.displayEveryNTokens)) - let text = " " + tokenizer.decode(tokens: lastTokens) + if schema.injectIntoContext && tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) { + let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens)) + let text = tokenizer.decode(tokens: lastTokens) + + Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)") continuation.yield(text) } @@ -79,17 +87,27 @@ extension LLMLocalSession { return (result, tokenizer) } + Self.logger.debug( + """ + SpeziLLMLocal: + Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public) + Generation tokens per second: \(result.tokensPerSecond, privacy: .public) + """ + ) + await MainActor.run { if schema.injectIntoContext { // Yielding every Nth token may result in missing the final tokens. 
- let reaminingTokens = result.tokens.count % schema.displayEveryNTokens + let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens let lastTokens = Array(result.tokens.suffix(reaminingTokens)) - let text = " " + tokenizer.decode(tokens: lastTokens) + let text = tokenizer.decode(tokens: lastTokens) continuation.yield(text) context.completeAssistantStreaming() } else { context.append(assistantOutput: result.output, complete: true) } + + continuation.finish() state = .ready } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index feea9f85..7c87c7bc 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -93,6 +93,13 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { init(_ platform: LLMLocalPlatform, schema: LLMLocalSchema) { self.platform = platform self.schema = schema + + // Inject system prompt into context + if let systemPrompt = schema.parameters.systemPrompt { + Task { @MainActor in + context.append(systemMessage: systemPrompt) + } + } } @discardableResult diff --git a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings index 074d0133..0ee6a86e 100644 --- a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings +++ b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings @@ -150,6 +150,9 @@ } } } + }, + "SPEZI_LLM_LOCAL_SYSTEM_PROMPT" : { + } }, "version" : "1.0" diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift index 24824422..4ffe465f 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift @@ -11,6 +11,7 @@ import Hub import MLXLLM import Observation import SpeziViews +import SpeziLLMLocal /// Manages the download and storage of Large Language Models (LLM) to the local device. /// @@ -49,7 +50,11 @@ public final class LLMLocalDownloadManager: NSObject { private let modelConfiguration: ModelConfiguration @ObservationIgnored public var modelExists: Bool { - let repo = Hub.Repo(id: modelConfiguration.name) + LLMLocalDownloadManager.modelExsist(model: .custom(id: modelConfiguration.name)) + } + + public static func modelExsist(model: LLMLocalModel) -> Bool { + let repo = Hub.Repo(id: model.hubID) let url = HubApi.shared.localRepoLocation(repo) let modelFileExtension = ".safetensors" @@ -61,20 +66,12 @@ public final class LLMLocalDownloadManager: NSObject { } } - /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of LLM files from the remote server. - /// - /// - Parameters: - /// - modelConfiguration: The configuration specifying the parameters and settings for the LLM that needs to be downloaded. - public init(modelConfiguration: ModelConfiguration) { - self.modelConfiguration = modelConfiguration - } - /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of Large Language Model (LLM) files from remote servers. /// /// - Parameters: /// - modelID: The Huggingface model ID of the LLM that needs to be downloaded. - public init(modelID: String) { - self.modelConfiguration = .init(id: modelID) + public init(model: LLMLocalModel) { + self.modelConfiguration = .init(id: model.hubID) } /// Starts a `URLSessionDownloadTask` to download the specified model. 
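The download manager hunk above keys everything off an `LLMLocalModel` instead of a raw model URL or ID. A rough sketch of how that surface could be driven from app code, relying only on the `init(model:)`, `modelExsist(model:)` (spelling as introduced by this patch), and `startDownload()` members shown here; the helper function, its `@MainActor` isolation, and the chosen model are illustrative assumptions, not part of the patch:

```swift
import SpeziLLMLocal
import SpeziLLMLocalDownload

/// Hypothetical helper that makes sure a model is available on device before inference.
@MainActor
func ensureModelIsAvailable() {
    // Any `LLMLocalModel` case works; `.custom(id:)` accepts an arbitrary Hugging Face ID.
    let model: LLMLocalModel = .llama3_2_1B_4bit

    // Skip the download if the model files are already in the local Hub cache.
    guard !LLMLocalDownloadManager.modelExsist(model: model) else {
        return
    }

    // The manager resolves the Hub repo from the model's `hubID` and starts the download.
    let downloadManager = LLMLocalDownloadManager(model: model)
    downloadManager.startDownload()
}
```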
diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift index 61bae01e..86bd51c0 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift @@ -10,6 +10,7 @@ import MLXLLM import SpeziOnboarding import SpeziViews import SwiftUI +import SpeziLLMLocal /// Provides an onboarding view for downloading locally executed Spezi LLMs to the device. @@ -138,7 +139,7 @@ public struct LLMLocalDownloadView: View { .progressViewStyle(LinearProgressViewStyle()) .padding() - Text("Downloaded \(String(format: "%.2f", downloadProgress))% of 100%.", bundle: .module) + Text("Downloaded \(String(format: "%.0f", downloadProgress))% of 100%.", bundle: .module) .padding(.top, 5) } } @@ -177,12 +178,12 @@ public struct LLMLocalDownloadView: View { /// - llmDownloadLocation: The local `URL` where the LLM file should be stored. /// - action: The action that should be performed when pressing the primary button of the view. public init( - model modelConfiguration: ModelConfiguration, + model: LLMLocalModel, downloadDescription: LocalizedStringResource, action: @escaping () async throws -> Void ) { self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager(modelConfiguration: modelConfiguration) + wrappedValue: LLMLocalDownloadManager(model: model) ) self.downloadDescription = Text(downloadDescription) self.action = action @@ -197,38 +198,12 @@ public struct LLMLocalDownloadView: View { /// - action: The action that should be performed when pressing the primary button of the view. @_disfavoredOverload public init( - model modelConfiguration: ModelConfiguration, + model: LLMLocalModel, downloadDescription: S, action: @escaping () async throws -> Void ) { self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager(modelConfiguration: modelConfiguration) - ) - self.downloadDescription = Text(verbatim: String(downloadDescription)) - self.action = action - } - - @_disfavoredOverload - public init( - model modelID: String, - downloadDescription: LocalizedStringResource, - action: @escaping () async throws -> Void - ) { - self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager(modelID: modelID) - ) - self.downloadDescription = Text(downloadDescription) - self.action = action - } - - @_disfavoredOverload - public init( - model modelID: String, - downloadDescription: S, - action: @escaping () async throws -> Void - ) { - self._downloadManager = State( - wrappedValue: LLMLocalDownloadManager(modelID: modelID) + wrappedValue: LLMLocalDownloadManager(model: model) ) self.downloadDescription = Text(verbatim: String(downloadDescription)) self.action = action From 426075667b08949cddb37d5ea80cfa2bee23839c Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Sun, 27 Oct 2024 13:34:35 -0700 Subject: [PATCH 08/27] fix liniting issues --- .../Configuration/LLMLocalModel.swift | 19 ++++++++++++++++ .../Configuration/LLMLocalParameters.swift | 3 +-- .../LLMLocalSamplingParameters.swift | 2 +- .../LLMLocalSession+Generate.swift | 4 ++-- .../LLMLocalDownloadManager.swift | 22 +++++++++++-------- .../LLMLocalDownloadView.swift | 2 +- 6 files changed, 37 insertions(+), 15 deletions(-) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift index 9d53a81a..e011a09f 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift @@ 
-7,26 +7,45 @@ // +// swiftlint:disable identifier_name +/// The Local LLM Model that need to be used public enum LLMLocalModel { + /// mlx-community/Meta-Llama-3.1-8B-Instruct-4bit case llama3_1_8B_4bit + /// mlx-community/Meta-Llama-3-8B-Instruct-4bit case llama3_8B_4bit + /// mlx-community/Llama-3.2-1B-Instruct-4bit case llama3_2_1B_4bit + /// mlx-community/Llama-3.2-3B-Instruct-4bit case llama3_2_3B_4bit + /// mlx-community/Mistral-Nemo-Instruct-2407-4bit case mistralNeMo4bit + /// mlx-community/SmolLM-135M-Instruct-4bit case smolLM_135M_4bit + /// mlx-community/Mistral-7B-Instruct-v0.3-4bit case mistral7B4bit + /// mlx-community/CodeLlama-13b-Instruct-hf-4bit-MLX case codeLlama13b4bit + /// mlx-community/phi-2-hf-4bit-mlx case phi4bit + /// mlx-community/Phi-3-mini-4k-instruct-4bit-no-q-embed case phi3_4bit + /// mlx-community/Phi-3.5-mini-instruct-4bit case phi3_5_4bit + /// mlx-community/quantized-gemma-2b-it case gemma2bQuantized + /// mlx-community/gemma-2-9b-it-4bit case gemma_2_9b_it_4bit + /// mlx-community/gemma-2-2b-it-4bit case gemma_2_2b_it_4bit + /// mlx-community/Qwen1.5-0.5B-Chat-4bit case qwen205b4bit + /// mlx-community/OpenELM-270M-Instruct case openelm270m4bit /// Set the Huggingface ID of the model. e.g. "\/\" case custom(id: String) + /// The Huggingface ID for the model public var hubID: String { switch self { case .llama3_1_8B_4bit: diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index 9a654f2b..01c9fcf3 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -10,7 +10,6 @@ import Foundation /// Represents the parameters of the LLM. public struct LLMLocalParameters: Sendable { - /// Defaults of possible LLMs parameter settings. public enum Defaults { /// Default system prompt for local LLMs. @@ -35,7 +34,7 @@ public struct LLMLocalParameters: Sendable { /// - Parameters: /// - systemPrompt: The to-be-used system prompt of the LLM enabling fine-tuning of the LLMs behaviour. Defaults to the regular default chat-based LLM system prompt. /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. - /// - extraEOSTokens: TODO + /// - extraEOSTokens: Additional tokens to use for end of string /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to `4`. public init( systemPrompt: String? = Defaults.defaultSystemPrompt, diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift index 16198850..24b3474e 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalSamplingParameters.swift @@ -10,7 +10,7 @@ import Foundation /// Represents the sampling parameters of the LLM. -public struct LLMLocalSamplingParameters: Sendable { // swiftlint:disable:this type_body_length +public struct LLMLocalSamplingParameters: Sendable { /// Top-p Sampling: Smallest possible set of words whose cumulative probability exceeds the probability p (1.0 = disabled). let topP: Float /// Temperature Sampling: A higher value indicates more creativity of the model but also more hallucinations. 
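The parameter documentation cleaned up above is easiest to judge with the types side by side. A minimal, illustrative configuration follows; the model choice and every numeric value are assumptions, `extraEOSTokens` and the context parameters are left at their defaults, and `formatChat` uses the Llama 3 default added earlier in this series:

```swift
import SpeziLLMLocal

// Illustrative only: the values are not recommendations, just a demonstration of the API shape.
let schema = LLMLocalSchema(
    model: .llama3_2_3B_4bit,
    parameters: LLMLocalParameters(
        systemPrompt: "You are a concise, helpful assistant.", // overrides the localized default prompt
        maxOutputLength: 256,                                  // stop after 256 generated tokens
        displayEveryNTokens: 4                                 // stream output every 4 tokens
    ),
    samplingParameters: LLMLocalSamplingParameters(
        topP: 0.9,              // nucleus sampling cutoff
        temperature: 0.7,       // higher values increase variability
        penaltyRepeat: 1.1,     // penalize repeated tokens
        repetitionContextSize: 20
    ),
    formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3
)
```

From there the schema is handed to the `LLMRunner` to obtain an `LLMLocalSession`, as in the README example.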
diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 6c147c28..3e72e957 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -89,8 +89,8 @@ extension LLMLocalSession { Self.logger.debug( """ - SpeziLLMLocal: - Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public) + SpeziLLMLocal: + Prompt Tokens per second: \(result.promptTokensPerSecond, privacy: .public) Generation tokens per second: \(result.tokensPerSecond, privacy: .public) """ ) diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift index 4ffe465f..1e1b2b89 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager.swift @@ -10,8 +10,8 @@ import Foundation import Hub import MLXLLM import Observation -import SpeziViews import SpeziLLMLocal +import SpeziViews /// Manages the download and storage of Large Language Models (LLM) to the local device. /// @@ -53,6 +53,18 @@ public final class LLMLocalDownloadManager: NSObject { LLMLocalDownloadManager.modelExsist(model: .custom(id: modelConfiguration.name)) } + /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of Large Language Model (LLM) files from remote servers. + /// + /// - Parameters: + /// - modelID: The Huggingface model ID of the LLM that needs to be downloaded. + public init(model: LLMLocalModel) { + self.modelConfiguration = .init(id: model.hubID) + } + + /// Checks if a model is already downloaded to the local device. + /// + /// - Parameter model: The model to check for local existence. + /// - Returns: A Boolean value indicating whether the model exists on the device. public static func modelExsist(model: LLMLocalModel) -> Bool { let repo = Hub.Repo(id: model.hubID) let url = HubApi.shared.localRepoLocation(repo) @@ -66,14 +78,6 @@ public final class LLMLocalDownloadManager: NSObject { } } - /// Initializes a ``LLMLocalDownloadManager`` instance to manage the download of Large Language Model (LLM) files from remote servers. - /// - /// - Parameters: - /// - modelID: The Huggingface model ID of the LLM that needs to be downloaded. - public init(model: LLMLocalModel) { - self.modelConfiguration = .init(id: model.hubID) - } - /// Starts a `URLSessionDownloadTask` to download the specified model. public func startDownload() { if case let .directory(url) = modelConfiguration.id { diff --git a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift index 86bd51c0..f1eec999 100644 --- a/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift +++ b/Sources/SpeziLLMLocalDownload/LLMLocalDownloadView.swift @@ -7,10 +7,10 @@ // import MLXLLM +import SpeziLLMLocal import SpeziOnboarding import SpeziViews import SwiftUI -import SpeziLLMLocal /// Provides an onboarding view for downloading locally executed Spezi LLMs to the device. 
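Since this patch reduces `LLMLocalDownloadView` to the two `LLMLocalModel`-based initializers, a hedged usage sketch of the remaining API may help reviewers; the view name, model choice, and description text below are placeholders, not part of the patch:

```swift
import SpeziLLMLocal
import SpeziLLMLocalDownload
import SwiftUI

/// Hypothetical onboarding step wrapping the reworked download view.
struct LLMDownloadOnboardingStep: View {
    var body: some View {
        LLMLocalDownloadView(
            model: .phi3_5_4bit,
            downloadDescription: "The Phi 3.5 model will be downloaded to your device."
        ) {
            // Advance the onboarding flow once the model download has completed.
        }
    }
}
```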
From e99b35d2adb4004d902584490cac4856d0ff83c1 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Sun, 27 Oct 2024 14:38:52 -0700 Subject: [PATCH 09/27] add comments fix memory selection --- .../LLMLocalPlatformConfiguration.swift | 23 +++++++++++++++---- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 5 ++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift index bd8f1c5c..86523817 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalPlatformConfiguration.swift @@ -10,8 +10,9 @@ import Foundation /// Represents the configuration of the Spezi ``LLMLocalPlatform``. public struct LLMLocalPlatformConfiguration: Sendable { + /// Represents the memory limit for the MLX GPU. public struct MemoryLimit: Sendable { - /// The memory limit in MB + /// The memory limit in MB. let limit: Int /// Calls to malloc will wait on scheduled tasks if the limit is exceeded. If @@ -22,11 +23,21 @@ public struct LLMLocalPlatformConfiguration: Sendable { /// The memory limit defaults to 1.5 times the maximum recommended working set /// size reported by the device ([recommendedMaxWorkingSetSize](https://developer.apple.com/documentation/metal/mtldevice/2369280-recommendedmaxworkingsetsize)) let relaxed: Bool + + /// Creates the `MemoryLimit` which configures the GPU used by MLX. + /// + /// - Parameters: + /// - limit: The memory limit in MB. + /// - relaxed: See `relaxed` in ``LLMLocalPlatformConfiguration/MemoryLimit``. + public init(limit: Int, relaxed: Bool = false) { + self.limit = limit + self.relaxed = relaxed + } } - /// The cache limit in MB, to disable set limit to 0 - let cacheLimit: Int - + /// The cache limit in MB, to disable set limit to `0`. + let cacheLimit: Int? + /// The memory limit for the GPU used by MLX. let memoryLimit: MemoryLimit? /// The task priority of the initiated LLM inference tasks. let taskPriority: TaskPriority @@ -35,9 +46,11 @@ public struct LLMLocalPlatformConfiguration: Sendable { /// Creates the ``LLMLocalPlatformConfiguration`` which configures the Spezi ``LLMLocalPlatform``. /// /// - Parameters: + /// - cacheLimit: The cache limit for the GPU used by MLX, defaults to `nil`. + /// - memoryLimit: The memory limit for the GPU used by MLX, defaults to `nil`. /// - taskPriority: The task priority of the initiated LLM inference tasks, defaults to `.userInitiated`. public init( - cacheLimit: Int = 20, + cacheLimit: Int? = nil, memoryLimit: MemoryLimit? 
= nil, taskPriority: TaskPriority = .userInitiated ) { diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index 66d51dd1..a8930de3 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -60,8 +60,9 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { #if targetEnvironment(simulator) assertionFailure("SpeziLLMLocal: Code cannot be run on simulator.") #endif - - MLX.GPU.set(cacheLimit: configuration.cacheLimit * 1024 * 1024) + if let cacheLimit = configuration.cacheLimit { + MLX.GPU.set(cacheLimit: cacheLimit * 1024 * 1024) + } if let memoryLimit = configuration.memoryLimit { MLX.GPU.set(memoryLimit: memoryLimit.limit, relaxed: memoryLimit.relaxed) } From fded07ff62c87204151c4bbe2f57d86555142252 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 28 Oct 2024 12:26:28 -0700 Subject: [PATCH 10/27] intermediate commit --- .../Configuration/LLMLocalModel.swift | 34 +++++++++---------- Sources/SpeziLLMLocal/LLMLocalSchema.swift | 5 ++- .../LLMLocalSession+Generate.swift | 9 +++-- .../SpeziLLMLocal/LLMLocalSession+Setup.swift | 10 +++--- Sources/SpeziLLMLocal/LLMLocalSession.swift | 13 ++++++- .../Resources/Localizable.xcstrings | 9 ++++- 6 files changed, 49 insertions(+), 31 deletions(-) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift index e011a09f..0e6f9bcf 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalModel.swift @@ -8,39 +8,39 @@ // swiftlint:disable identifier_name -/// The Local LLM Model that need to be used +/// Represents the available LLM models. public enum LLMLocalModel { - /// mlx-community/Meta-Llama-3.1-8B-Instruct-4bit + /// Llama 3.1, 8 Billion Parameters, Instruct Mode, 4-bit Version case llama3_1_8B_4bit - /// mlx-community/Meta-Llama-3-8B-Instruct-4bit + /// Llama 3, 8 Billion Parameters, Instruction-Tuned, 4-bit Version case llama3_8B_4bit - /// mlx-community/Llama-3.2-1B-Instruct-4bit + /// Llama 3.2, 1 Billion Parameters, Instruction-Tuned, 4-bit Version case llama3_2_1B_4bit - /// mlx-community/Llama-3.2-3B-Instruct-4bit + /// Llama 3.2, 3 Billion Parameters, Instruction-Tuned, 4-bit Version case llama3_2_3B_4bit - /// mlx-community/Mistral-Nemo-Instruct-2407-4bit + /// Mistral Nemo, Instruction-Tuned, Model 2407, 4-bit Version case mistralNeMo4bit - /// mlx-community/SmolLM-135M-Instruct-4bit + /// SmolLM, 135 Million Parameters, Instruction-Tuned, 4-bit Version case smolLM_135M_4bit - /// mlx-community/Mistral-7B-Instruct-v0.3-4bit + /// Mistral, 7 Billion Parameters, Instruction-Tuned, Version 0.3, 4-bit Version case mistral7B4bit - /// mlx-community/CodeLlama-13b-Instruct-hf-4bit-MLX + /// Code Llama, 13 Billion Parameters, Instruction-Tuned, Hugging Face Format, 4-bit, MLX Version case codeLlama13b4bit - /// mlx-community/phi-2-hf-4bit-mlx + /// Phi 2, Hugging Face Format, 4-bit, MLX Version case phi4bit - /// mlx-community/Phi-3-mini-4k-instruct-4bit-no-q-embed + /// Phi 3 Mini, 4K Context Window, Instruction-Tuned, 4-bit Version, No Q-Embedding case phi3_4bit - /// mlx-community/Phi-3.5-mini-instruct-4bit + /// Phi 3.5 Mini, Instruction-Tuned, 4-bit Version case phi3_5_4bit - /// mlx-community/quantized-gemma-2b-it + /// Quantized Gemma, 2 Billion Parameters, Instruction-Tuned case gemma2bQuantized - /// mlx-community/gemma-2-9b-it-4bit + /// Gemma 2, 9 Billion Parameters, 
Instruction-Tuned, 4-bit Version case gemma_2_9b_it_4bit - /// mlx-community/gemma-2-2b-it-4bit + /// Gemma 2, 2 Billion Parameters, Instruction-Tuned, 4-bit Version case gemma_2_2b_it_4bit - /// mlx-community/Qwen1.5-0.5B-Chat-4bit + /// Qwen 1.5, 0.5 Billion Parameters, Chat-Tuned, 4-bit Version case qwen205b4bit - /// mlx-community/OpenELM-270M-Instruct + /// OpenELM, 270 Million Parameters, Instruction-Tuned case openelm270m4bit /// Set the Huggingface ID of the model. e.g. "\/\" case custom(id: String) diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index ac1d3fcf..3bf3ca24 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -31,9 +31,8 @@ public struct LLMLocalSchema: LLMSchema { let formatChat: (@Sendable (LLMContext) throws -> String) /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool - - - package let configuration: ModelConfiguration + /// The models configuration which is based on `mlx-libraries` + internal let configuration: ModelConfiguration /// Creates an instance of the ``LLMLocalSchema`` containing all necessary configuration for local LLM inference. /// diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 3e72e957..8f9d6e31 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -17,7 +17,7 @@ import SpeziLLM extension LLMLocalSession { // swiftlint:disable:next identifier_name function_body_length - func _generate(continuation: AsyncThrowingStream.Continuation) async { + internal func _generate(continuation: AsyncThrowingStream.Continuation) async { guard let modelContainer = await self.modelContainer else { Self.logger.error("SpeziLLMLocal: Failed to load `modelContainer`") await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation) @@ -53,7 +53,6 @@ extension LLMLocalSession { ) let (result, tokenizer) = await modelContainer.perform { model, tokenizer in - // Execute the inference let result = MLXLLM.generate( promptTokens: promptTokens, parameters: parameters, @@ -66,6 +65,7 @@ extension LLMLocalSession { } if tokens.count >= self.schema.parameters.maxOutputLength { + Self.logger.debug("SpeziLLMLocal: Max output length exceeded.") continuation.finish() Task { @MainActor in self.state = .ready @@ -102,11 +102,10 @@ extension LLMLocalSession { let lastTokens = Array(result.tokens.suffix(reaminingTokens)) let text = tokenizer.decode(tokens: lastTokens) continuation.yield(text) - context.completeAssistantStreaming() - } else { - context.append(assistantOutput: result.output, complete: true) } + context.append(assistantOutput: result.output, complete: true) + context.completeAssistantStreaming() continuation.finish() state = .ready } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index edbbbb60..b099cf87 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -25,8 +25,8 @@ extension LLMLocalSession { } } - - func setup(continuation: AsyncThrowingStream.Continuation) async -> Bool { + // swiftlint:disable:next identifier_name function_body_length + internal func _setup(continuation: AsyncThrowingStream.Continuation?) 
async -> Bool { Self.logger.debug("SpeziLLMLocal: Local LLM is being initialized") await MainActor.run { @@ -34,7 +34,9 @@ extension LLMLocalSession { } guard verifyModelDownload() else { - await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation) + if let continuation { + await finishGenerationWithError(LLMLocalError.modelNotFound, on: continuation) + } Self.logger.error("SpeziLLMLocal: Local LLM file could not be opened, indicating that the model file doesn't exist") return false } @@ -52,7 +54,7 @@ extension LLMLocalSession { self.state = .ready } } catch { - continuation.yield(with: .failure(error)) + continuation?.yield(with: .failure(error)) Self.logger.error("SpeziLLMLocal: Failed to load local `modelContainer`") return false } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index 7c87c7bc..026859f7 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -102,13 +102,24 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { } } + /// Initializes the model in advance. + /// Calling this method before user interaction prepares the model, which leads to reduced response time for the first prompt. + public func setup() async throws { + guard await _setup(continuation: nil) else { + throw LLMLocalError.modelNotReadyYet + } + } + + + /// Based on the input prompt, generate the output. + /// - Returns: A Swift `AsyncThrowingStream` that streams the generated output. @discardableResult public func generate() async throws -> AsyncThrowingStream { let (stream, continuation) = AsyncThrowingStream.makeStream(of: String.self) task = Task(priority: platform.configuration.taskPriority) { if await state == .uninitialized { - guard await setup(continuation: continuation) else { + guard await _setup(continuation: continuation) else { await MainActor.run { state = .error(error: LLMLocalError.modelNotReadyYet) } diff --git a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings index 0ee6a86e..b2290765 100644 --- a/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings +++ b/Sources/SpeziLLMLocal/Resources/Localizable.xcstrings @@ -152,7 +152,14 @@ } }, "SPEZI_LLM_LOCAL_SYSTEM_PROMPT" : { - + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe and still concise. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 
+ } + } + } } }, "version" : "1.0" From 4480b8ccb3936129c9ff2ec5e9d5e32c264b3da0 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 28 Oct 2024 12:27:34 -0700 Subject: [PATCH 11/27] remote swiftlint --- Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index b099cf87..4ebb4573 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -25,7 +25,7 @@ extension LLMLocalSession { } } - // swiftlint:disable:next identifier_name function_body_length + // swiftlint:disable:next identifier_name internal func _setup(continuation: AsyncThrowingStream.Continuation?) async -> Bool { Self.logger.debug("SpeziLLMLocal: Local LLM is being initialized") From 768cbfc6e109cdbd717ef78c527589805cc4a8f4 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 28 Oct 2024 18:15:11 -0700 Subject: [PATCH 12/27] intermediate commit --- README.md | 3 ++- .../Configuration/LLMLocalContextParameters.swift | 8 +------- .../TestApp/LLMLocal/LLMLocalChatTestView.swift | 9 ++------- .../TestAppUITests/TestAppLLMLocalUITests.swift | 11 ++++++++++- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b6bde0ea..2de78184 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,8 @@ struct LLMLocalDemoView: View { // Instantiate the `LLMLocalSchema` to an `LLMLocalSession` via the `LLMRunner`. let llmSession: LLMLocalSession = runner( with: LLMLocalSchema( - modelPath: URL(string: "URL to the local model file")! + model: .llama3_8B_4bit, + formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3 ) ) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift index 7db1506b..a707e839 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift @@ -14,20 +14,14 @@ public struct LLMLocalContextParameters: Sendable { /// RNG seed of the LLM var seed: UInt64? - /// If `true`, the mode is set to embeddings only - var embeddingsOnly: Bool - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. /// /// - Parameters: /// - seed: RNG seed of the LLM, defaults to a random seed. - /// - embeddingsOnly: Embedding-only mode, defaults to `false`. public init( - seed: UInt64? = nil, - embeddingsOnly: Bool = false + seed: UInt64? 
= nil ) { self.seed = seed - self.embeddingsOnly = embeddingsOnly } } diff --git a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift index 684f143c..ad6999cf 100644 --- a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift +++ b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift @@ -25,13 +25,8 @@ struct LLMLocalChatTestView: View { } else { LLMChatViewSchema( with: LLMLocalSchema( - configuration: .phi3_4bit, - formatChat: { context in - context - .filter { $0.role == .user } - .map { $0.content } - .joined(separator: " ") - } + model: .llama3_8B_4bit, + formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3 ) ) } diff --git a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift index d089935f..7e82bea1 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift @@ -43,8 +43,17 @@ class TestAppLLMLocalUITests: XCTestCase { sleep(1) // Chat + let inputTextfield = app.textViews["Message Input Textfield"] + XCTAssertTrue(inputTextfield.exists) + #if !os(macOS) - try app.textViews["Message Input Textfield"].enter(value: "New Message!", options: [.disableKeyboardDismiss]) + if UIDevice.current.userInterfaceIdiom == .pad { + inputTextfield.tap() + sleep(1) + inputTextfield.typeText("New Message!") + } else { + try inputTextfield.enter(value: "New Message!", options: [.disableKeyboardDismiss]) + } #else try app.textFields["Message Input Textfield"].enter(value: "New Message!", options: [.disableKeyboardDismiss]) #endif From 3ea48b0fd0464647bfa39637a242575a6c06157d Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 28 Oct 2024 19:00:48 -0700 Subject: [PATCH 13/27] skip test on release ipad --- Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift index 7e82bea1..09a00849 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift @@ -46,8 +46,13 @@ class TestAppLLMLocalUITests: XCTestCase { let inputTextfield = app.textViews["Message Input Textfield"] XCTAssertTrue(inputTextfield.exists) + #if !os(macOS) if UIDevice.current.userInterfaceIdiom == .pad { + #if RELEASE + throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") + #endif + inputTextfield.tap() sleep(1) inputTextfield.typeText("New Message!") From a251d8a287bff7484d12a1e56a7cd3aedbf173fd Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 29 Oct 2024 07:10:54 -0700 Subject: [PATCH 14/27] skip openai test due to same issue --- .../UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift index 134ad9ec..8e08625b 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift @@ -29,6 +29,10 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIOnboarding() throws { // swiftlint:disable:this function_body_length let app = XCUIApplication() + if UIDevice.current.userInterfaceIdiom == .pad { + throw XCTSkip("Skipped on iPad, see: 
https://github.com/StanfordBDHG/XCTestExtensions/issues/27") + } + XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() @@ -141,6 +145,10 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIChat() throws { let app = XCUIApplication() + if UIDevice.current.userInterfaceIdiom == .pad { + throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") + } + XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() From d44f38d4f9de5804803ae0a0b7cc59d6935b7f2b Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 18 Nov 2024 11:22:00 -0800 Subject: [PATCH 15/27] bump to mlx-swift-examples 1.18.1 --- .../SpeziLLM/Models/LLMContextEntity.swift | 2 +- ...LLMContext+formatForTransformersChat.swift | 20 ++ .../LLMLocalSchema+PromptFormatting.swift | 275 ------------------ Sources/SpeziLLMLocal/LLMLocalSchema.swift | 7 +- .../LLMLocalSession+Generate.swift | 41 ++- .../SpeziLLMLocal/LLMLocalSession+Setup.swift | 2 + 6 files changed, 43 insertions(+), 304 deletions(-) create mode 100644 Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift delete mode 100644 Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift diff --git a/Sources/SpeziLLM/Models/LLMContextEntity.swift b/Sources/SpeziLLM/Models/LLMContextEntity.swift index 4f88e0d1..1b93c957 100644 --- a/Sources/SpeziLLM/Models/LLMContextEntity.swift +++ b/Sources/SpeziLLM/Models/LLMContextEntity.swift @@ -46,7 +46,7 @@ public struct LLMContextEntity: Codable, Equatable, Hashable, Identifiable { case tool(id: String, name: String) - var rawValue: String { + public var rawValue: String { switch self { case .user: "user" case .assistant: "assistant" diff --git a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift new file mode 100644 index 00000000..d8da53b0 --- /dev/null +++ b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift @@ -0,0 +1,20 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import SpeziLLM + +public extension LLMContext { + func formatForTransformersChat() -> [[String: String]] { + self.map { entry in + return [ + "role": entry.role.rawValue, + "content": entry.content + ] + } + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift b/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift deleted file mode 100644 index 0859cbe6..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift +++ /dev/null @@ -1,275 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import SpeziLLM - - -extension LLMLocalSchema { - /// Holds default prompt formatting strategies for [Llama2](https://ai.meta.com/llama/) as well as [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) models. 
- public enum PromptFormattingDefaults { - /// Prompt formatting closure for the [Llama3](https://ai.meta.com/llama/) model - public static let llama3: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BEGINOFTEXT = "<|begin_of_text|>" - /// The system identifier. - let SYSTEM = "system" - /// The user identifier. - let USER = "user" - /// The assistant identifier. - let ASSISTANT = "assistant" - /// The start token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let STARTHEADERID = "<|start_header_id|>" - /// The end token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let ENDHEADERID = "<|end_header_id|>" - /// The token that signifies the end of the message in a turn. - let EOTID = "<|eot_id|>" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama3 prompt structure - /// - /// Template of the prompt structure: - /// <|begin_of_text|> - /// <|start_header_id|>user<|end_header_id|> - /// {{ user_message }}<|eot_id|> - /// <|start_header_id|>assistant<|end_header_id|> - var prompt = """ - \(BEGINOFTEXT) - \(STARTHEADERID)\(SYSTEM)\(ENDHEADERID) - \(systemPrompts.joined(separator: " "))\(EOTID) - - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(initialUserPrompt)\(EOTID) - - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ + " " // Add a spacer to the generated output from the model - } - } - - prompt += - """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - """ - - return prompt - } - - /// Prompt formatting closure for the [Llama2](https://ai.meta.com/llama/) model - public static let llama2: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BOS = "" - /// EOS token of the LLM, used at the end of each prompt passage. - let EOS = "" - /// BOSYS token of the LLM, used at the start of the system prompt. - let BOSYS = "<>" - /// EOSYS token of the LLM, used at the end of the system prompt. - let EOSYS = "<>" - /// BOINST token of the LLM, used at the start of the instruction part of the prompt. - let BOINST = "[INST]" - /// EOINST token of the LLM, used at the end of the instruction part of the prompt. 
- let EOINST = "[/INST]" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// [INST] <> - /// {your_system_prompt} - /// <> - /// - /// {user_message_1} [/INST] - /// """ - var prompt = """ - \(BOS)\(BOINST) \(BOSYS) - \(systemPrompts.joined(separator: " ")) - \(EOSYS) - - \(initialUserPrompt) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// {user_message_1} [/INST]){model_reply_1} - prompt += """ - \(contextEntity.content)\(EOS) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// [INST] {user_message_2} [/INST] - prompt += """ - \(BOS)\(BOINST) \(contextEntity.content) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - } - } - - return prompt - } - - /// Prompt formatting closure for the [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) model - public static let phi2: (@Sendable (LLMContext) throws -> String) = { chat in - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Phi-2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// System: {your_system_prompt} - /// Instruct: {model_reply_1} - /// Output: {model_reply_1} - /// """ - var prompt = """ - System: \(systemPrompts.joined(separator: " ")) - Instruct: \(initialUserPrompt)\n - """ - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Output: \(contextEntity.content)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Instruct: \(contextEntity.content)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "Output: " - } - - return prompt - } - - /// Prompt formatting closure for the [Gemma](https://ai.google.dev/gemma/docs/formatting) models - /// - Important: System prompts are ignored as Gemma doesn't support them - public static let gemma: (@Sendable (LLMContext) throws -> String) = { chat in - /// Start token of Gemma - let startToken = "" - /// End token of Gemma - let endToken = "" - - /// Build the initial Gemma prompt structure - /// - /// A template of the 
prompt structure looks like: - /// """ - /// user - /// knock knock - /// model - /// who is there - /// user - /// Gemma - /// model - /// Gemma who? - /// """ - var prompt = "" - - for contextEntity in chat { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)model - \(contextEntity.content)\(endToken)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)user - \(contextEntity.content)\(endToken)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "\(startToken)model\n" - } - - return prompt - } - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 3bf3ca24..8b303568 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -27,8 +27,6 @@ public struct LLMLocalSchema: LLMSchema { let contextParameters: LLMLocalContextParameters /// Sampling parameters of the llama.cpp LLM. let samplingParameters: LLMLocalSamplingParameters - /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. - let formatChat: (@Sendable (LLMContext) throws -> String) /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool /// The models configuration which is based on `mlx-libraries` @@ -42,19 +40,16 @@ public struct LLMLocalSchema: LLMSchema { /// - maxTokens: Maximum number of tokens to generate in a single output, defaults to 2048. /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to 4 (improve by ~15% compared to update at every token). /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. - /// - formatChat: Closure to properly format the ``LLMLocalSession/context`` to a `String` which is tokenized and passed to the LLM, defaults to Llama2 prompt format. public init( model: LLMLocalModel, parameters: LLMLocalParameters = .init(), contextParameters: LLMLocalContextParameters = .init(), samplingParameters: LLMLocalSamplingParameters = .init(), - injectIntoContext: Bool = false, - formatChat: @escaping (@Sendable (LLMContext) throws -> String) + injectIntoContext: Bool = false ) { self.parameters = parameters self.contextParameters = contextParameters self.samplingParameters = samplingParameters - self.formatChat = formatChat self.injectIntoContext = injectIntoContext self.configuration = .init(id: model.hubID) } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 8f9d6e31..c7948d84 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -26,25 +26,22 @@ extension LLMLocalSession { let modelConfiguration = self.schema.configuration - guard let formattedChat = try? await schema.formatChat(self.context) else { + let messages = await self.context.formatForTransformersChat() + guard let promptTokens = try? 
await modelContainer.perform({ _, tokenizer in + try tokenizer.applyChatTemplate(messages: messages) + }) else { Self.logger.error("SpeziLLMLocal: Failed to format chat with given context") await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation) return } - let prompt = modelConfiguration.prepare(prompt: formattedChat) - let promptTokens = await modelContainer.perform { _, tokenizer in - tokenizer.encode(text: prompt) - } - MLXRandom.seed(self.schema.contextParameters.seed ?? UInt64(Date.timeIntervalSinceReferenceDate * 1000)) - let extraEOSTokens = modelConfiguration.extraEOSTokens - guard await !checkCancellation(on: continuation) else { return } + let extraEOSTokens = modelConfiguration.extraEOSTokens let parameters: GenerateParameters = .init( temperature: schema.samplingParameters.temperature, topP: schema.samplingParameters.topP, @@ -52,7 +49,7 @@ extension LLMLocalSession { repetitionContextSize: schema.samplingParameters.repetitionContextSize ) - let (result, tokenizer) = await modelContainer.perform { model, tokenizer in + let result = await modelContainer.perform { model, tokenizer in let result = MLXLLM.generate( promptTokens: promptTokens, parameters: parameters, @@ -66,10 +63,6 @@ extension LLMLocalSession { if tokens.count >= self.schema.parameters.maxOutputLength { Self.logger.debug("SpeziLLMLocal: Max output length exceeded.") - continuation.finish() - Task { @MainActor in - self.state = .ready - } return .stop } @@ -84,7 +77,15 @@ extension LLMLocalSession { return .more } - return (result, tokenizer) + if schema.injectIntoContext { + // Yielding every Nth token may result in missing the final tokens. + let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens + let lastTokens = Array(result.tokens.suffix(reaminingTokens)) + let text = tokenizer.decode(tokens: lastTokens) + continuation.yield(text) + } + + return result } Self.logger.debug( @@ -96,16 +97,12 @@ extension LLMLocalSession { ) await MainActor.run { - if schema.injectIntoContext { - // Yielding every Nth token may result in missing the final tokens. 
- let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens - let lastTokens = Array(result.tokens.suffix(reaminingTokens)) - let text = tokenizer.decode(tokens: lastTokens) - continuation.yield(text) - } - context.append(assistantOutput: result.output, complete: true) context.completeAssistantStreaming() + + if !schema.injectIntoContext { + continuation.yield(result.output) + } continuation.finish() state = .ready } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index 4ebb4573..715ba732 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -58,6 +58,8 @@ extension LLMLocalSession { Self.logger.error("SpeziLLMLocal: Failed to load local `modelContainer`") return false } + + Self.logger.debug("SpeziLLMLocal: Local LLM has finished initializing") return true } } From 89c52c8f6e991116af8327d6962cc6ae3ec5b4ab Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 18 Nov 2024 11:24:05 -0800 Subject: [PATCH 16/27] bump version --- Package.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Package.swift b/Package.swift index 032458aa..163532d9 100644 --- a/Package.swift +++ b/Package.swift @@ -28,7 +28,7 @@ let package = Package( ], dependencies: [ .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.1"), - .package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.16.0"), + .package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.18.1"), .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")), .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")), .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"), From 1374334f2642eb655e329fb9d7091f08e1037c17 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Wed, 20 Nov 2024 11:42:30 -0800 Subject: [PATCH 17/27] add chatTemplate overwrite --- .../SpeziLLMLocal/Configuration/LLMLocalParameters.swift | 6 +++++- Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index 01c9fcf3..52df110d 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -18,6 +18,7 @@ public struct LLMLocalParameters: Sendable { }() } + let chatTempalte: String? /// The to-be-used system prompt of the LLM let systemPrompt: String? @@ -36,15 +37,18 @@ public struct LLMLocalParameters: Sendable { /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. /// - extraEOSTokens: Additional tokens to use for end of string /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to `4`. + /// - chatTemplate: Can be set to manually overwrite the chatTemplate within the `swift-transformers` package. public init( systemPrompt: String? = Defaults.defaultSystemPrompt, maxOutputLength: Int = 512, extraEOSTokens: Set = [], - displayEveryNTokens: Int = 4 + displayEveryNTokens: Int = 4, + chatTemplate: String? 
= nil ) { self.systemPrompt = systemPrompt self.maxOutputLength = maxOutputLength self.extraEOSTokens = extraEOSTokens self.displayEveryNTokens = displayEveryNTokens + self.chatTempalte = chatTemplate } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index c7948d84..572e4e89 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -28,7 +28,11 @@ extension LLMLocalSession { let messages = await self.context.formatForTransformersChat() guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in - try tokenizer.applyChatTemplate(messages: messages) + if let chatTempalte = self.schema.parameters.chatTempalte { + return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte) + } else { + return try tokenizer.applyChatTemplate(messages: messages) + } }) else { Self.logger.error("SpeziLLMLocal: Failed to format chat with given context") await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation) From 547556df7a4c099607b1e0a44be3040a63f6bd8c Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Sun, 1 Dec 2024 13:51:56 -0800 Subject: [PATCH 18/27] intermediate commit --- .../Configuration/LLMLocalContextParameters.swift | 2 +- .../Configuration/LLMLocalParameters.swift | 7 ++++--- .../SpeziLLMLocal/LLMLocalSession+Generate.swift | 14 ++++++++------ Sources/SpeziLLMLocal/LLMLocalSession.swift | 2 ++ 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift index a707e839..f0547fb7 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift @@ -12,7 +12,7 @@ import Foundation /// Represents the context parameters of the LLM. public struct LLMLocalContextParameters: Sendable { /// RNG seed of the LLM - var seed: UInt64? + let seed: UInt64? /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index 52df110d..e619751c 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -18,8 +18,6 @@ public struct LLMLocalParameters: Sendable { }() } - let chatTempalte: String? - /// The to-be-used system prompt of the LLM let systemPrompt: String? /// Indicates the maximum output length generated by the LLM. @@ -28,6 +26,9 @@ public struct LLMLocalParameters: Sendable { let extraEOSTokens: Set /// Interval for displaying output after every N tokens generated. let displayEveryNTokens: Int + /// The chat template to use for the model in the Jinja format + let chatTemplate: String? + /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. /// Is passed to the underlying llama.cpp model in order to configure the LLM. 
@@ -49,6 +50,6 @@ public struct LLMLocalParameters: Sendable { self.maxOutputLength = maxOutputLength self.extraEOSTokens = extraEOSTokens self.displayEveryNTokens = displayEveryNTokens - self.chatTempalte = chatTemplate + self.chatTemplate = chatTemplate } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 572e4e89..085d51c1 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -24,12 +24,15 @@ extension LLMLocalSession { return } - let modelConfiguration = self.schema.configuration + let messages = if await !self.customContext.isEmpty { + await self.customContext + } else { + await self.context.formatForTransformersChat() + } - let messages = await self.context.formatForTransformersChat() guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in - if let chatTempalte = self.schema.parameters.chatTempalte { - return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte) + if let chatTempalte = self.schema.parameters.chatTemplate { + return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTempalte) } else { return try tokenizer.applyChatTemplate(messages: messages) } @@ -45,7 +48,6 @@ extension LLMLocalSession { return } - let extraEOSTokens = modelConfiguration.extraEOSTokens let parameters: GenerateParameters = .init( temperature: schema.samplingParameters.temperature, topP: schema.samplingParameters.topP, @@ -59,7 +61,7 @@ extension LLMLocalSession { parameters: parameters, model: model, tokenizer: tokenizer, - extraEOSTokens: extraEOSTokens + extraEOSTokens: schema.parameters.extraEOSTokens ) { tokens in if Task.isCancelled { return .stop diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index 026859f7..bd60758a 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -78,6 +78,8 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { @MainActor public var state: LLMState = .uninitialized @MainActor public var context: LLMContext = [] + /// Overrides the `context` with a custom highly customizable context in the `swift-transformers` format. + @MainActor public var customContext: [[String: String]] = [] @MainActor public var numParameters: Int? @MainActor public var modelConfiguration: ModelConfiguration? From 8f2984ff9d1e3c9d384565a419a7784af1bf44e7 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Sun, 1 Dec 2024 19:24:55 -0800 Subject: [PATCH 19/27] intermediate commit --- Sources/SpeziLLMLocal/LLMLocalSchema.swift | 19 +++++++++++-- .../LLMLocalSession+Update.swift | 28 +++++++++++++++++++ Sources/SpeziLLMLocal/LLMLocalSession.swift | 2 +- 3 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 Sources/SpeziLLMLocal/LLMLocalSession+Update.swift diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 8b303568..71b28d30 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -23,9 +23,9 @@ public struct LLMLocalSchema: LLMSchema { /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let parameters: LLMLocalParameters - /// Context parameters of the llama.cpp LLM. + /// Context parameters of the LLM. 
let contextParameters: LLMLocalContextParameters - /// Sampling parameters of the llama.cpp LLM. + /// Sampling parameters of the LLM. let samplingParameters: LLMLocalSamplingParameters /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool @@ -53,4 +53,19 @@ public struct LLMLocalSchema: LLMSchema { self.injectIntoContext = injectIntoContext self.configuration = .init(id: model.hubID) } + + @_disfavoredOverload + internal init( + configuration: ModelConfiguration, + parameters: LLMLocalParameters = .init(), + contextParameters: LLMLocalContextParameters = .init(), + samplingParameters: LLMLocalSamplingParameters = .init(), + injectIntoContext: Bool = false + ) { + self.configuration = configuration + self.parameters = parameters + self.contextParameters = contextParameters + self.samplingParameters = samplingParameters + self.injectIntoContext = injectIntoContext + } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift new file mode 100644 index 00000000..427e9f18 --- /dev/null +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift @@ -0,0 +1,28 @@ +// +// LLMUpdateableLocalSchema.swift +// SpeziLLM +// +// Created by Leon Nissen on 12/1/24. +// + +import Foundation +import SpeziLLM + +extension LLMLocalSession { + public func update( + parameters: LLMLocalParameters? = nil, + contextParameters: LLMLocalContextParameters? = nil, + samplingParameters: LLMLocalSamplingParameters? = nil, + injectIntoContext: Bool? = nil + ) { + cancel() + + self.schema = .init( + configuration: self.schema.configuration, + parameters: parameters ?? self.schema.parameters, + contextParameters: contextParameters ?? self.schema.contextParameters, + samplingParameters: samplingParameters ?? self.schema.samplingParameters, + injectIntoContext: injectIntoContext ?? 
self.schema.injectIntoContext + ) + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index bd60758a..70a53016 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -67,7 +67,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { static let logger = Logger(subsystem: "edu.stanford.spezi", category: "SpeziLLMLocal") let platform: LLMLocalPlatform - let schema: LLMLocalSchema + var schema: LLMLocalSchema @ObservationIgnored private var modelExist: Bool { false From 1f9feb78582c03c8a62c9a130c0f4b1d4c96dee9 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Thu, 5 Dec 2024 13:21:48 -0800 Subject: [PATCH 20/27] intermediate commit --- .../LLMLocalContextParameters.swift | 27 ------------------- .../Configuration/LLMLocalParameters.swift | 7 ++++- ...LLMContext+formatForTransformersChat.swift | 5 ++++ Sources/SpeziLLMLocal/LLMLocalSchema.swift | 13 +++------ .../LLMLocalSession+Generate.swift | 2 +- .../LLMLocalSession+Update.swift | 21 +++++++++++---- 6 files changed, 31 insertions(+), 44 deletions(-) delete mode 100644 Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift deleted file mode 100644 index f0547fb7..00000000 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift +++ /dev/null @@ -1,27 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation - - -/// Represents the context parameters of the LLM. -public struct LLMLocalContextParameters: Sendable { - /// RNG seed of the LLM - let seed: UInt64? - - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. - /// - /// - Parameters: - /// - seed: RNG seed of the LLM, defaults to a random seed. - public init( - seed: UInt64? = nil - ) { - self.seed = seed - } -} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index e619751c..096611fa 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -22,12 +22,14 @@ public struct LLMLocalParameters: Sendable { let systemPrompt: String? /// Indicates the maximum output length generated by the LLM. let maxOutputLength: Int - + /// Additional End of Sequence tokens at which the generation will be stoped. let extraEOSTokens: Set /// Interval for displaying output after every N tokens generated. let displayEveryNTokens: Int /// The chat template to use for the model in the Jinja format let chatTemplate: String? + /// RNG seed of the LLM + let seed: UInt64? /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. @@ -38,18 +40,21 @@ public struct LLMLocalParameters: Sendable { /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. 
/// - extraEOSTokens: Additional tokens to use for end of string /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to `4`. + /// - seed: RNG seed of the LLM, defaults to a random seed. /// - chatTemplate: Can be set to manually overwrite the chatTemplate within the `swift-transformers` package. public init( systemPrompt: String? = Defaults.defaultSystemPrompt, maxOutputLength: Int = 512, extraEOSTokens: Set = [], displayEveryNTokens: Int = 4, + seed: UInt64? = nil, chatTemplate: String? = nil ) { self.systemPrompt = systemPrompt self.maxOutputLength = maxOutputLength self.extraEOSTokens = extraEOSTokens self.displayEveryNTokens = displayEveryNTokens + self.seed = seed self.chatTemplate = chatTemplate } } diff --git a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift index d8da53b0..88848d83 100644 --- a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift +++ b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift @@ -9,6 +9,11 @@ import SpeziLLM public extension LLMContext { + /// Formats the current ``LLMContext`` for compatibility with Transformers-based chat models. + /// + /// - Returns: An array of dictionaries where each dictionary represents a message in the format: + /// - `role`: The role of the message (e.g., "user", "assistant"), derived from the `rawValue` of the entry's `role`. + /// - `content`: The textual content of the message. func formatForTransformersChat() -> [[String: String]] { self.map { entry in return [ diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 71b28d30..a17b2605 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -23,8 +23,6 @@ public struct LLMLocalSchema: LLMSchema { /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let parameters: LLMLocalParameters - /// Context parameters of the LLM. - let contextParameters: LLMLocalContextParameters /// Sampling parameters of the LLM. let samplingParameters: LLMLocalSamplingParameters /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. @@ -35,20 +33,17 @@ public struct LLMLocalSchema: LLMSchema { /// Creates an instance of the ``LLMLocalSchema`` containing all necessary configuration for local LLM inference. /// /// - Parameters: - /// - configuration: A local `URL` where the LLM file is stored. The format of the LLM must be in the llama.cpp `.gguf` format. - /// - generateParameters: Parameters controlling the LLM generation process. - /// - maxTokens: Maximum number of tokens to generate in a single output, defaults to 2048. - /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to 4 (improve by ~15% compared to update at every token). + /// - model: The `LLMLocalModel` to be used by the schema. + /// - parameters: Parameters controlling the LLM generation process. + /// - samplingParameters: Represents the sampling parameters of the LLM. /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. 
public init( model: LLMLocalModel, parameters: LLMLocalParameters = .init(), - contextParameters: LLMLocalContextParameters = .init(), samplingParameters: LLMLocalSamplingParameters = .init(), injectIntoContext: Bool = false ) { self.parameters = parameters - self.contextParameters = contextParameters self.samplingParameters = samplingParameters self.injectIntoContext = injectIntoContext self.configuration = .init(id: model.hubID) @@ -58,13 +53,11 @@ public struct LLMLocalSchema: LLMSchema { internal init( configuration: ModelConfiguration, parameters: LLMLocalParameters = .init(), - contextParameters: LLMLocalContextParameters = .init(), samplingParameters: LLMLocalSamplingParameters = .init(), injectIntoContext: Bool = false ) { self.configuration = configuration self.parameters = parameters - self.contextParameters = contextParameters self.samplingParameters = samplingParameters self.injectIntoContext = injectIntoContext } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 085d51c1..ec8f2c0b 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -42,7 +42,7 @@ extension LLMLocalSession { return } - MLXRandom.seed(self.schema.contextParameters.seed ?? UInt64(Date.timeIntervalSinceReferenceDate * 1000)) + MLXRandom.seed(self.schema.parameters.seed ?? UInt64(Date.timeIntervalSinceReferenceDate * 1000)) guard await !checkCancellation(on: continuation) else { return diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift index 427e9f18..c5982dcf 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift @@ -1,17 +1,29 @@ // -// LLMUpdateableLocalSchema.swift -// SpeziLLM +// This source file is part of the Stanford Spezi open source project // -// Created by Leon Nissen on 12/1/24. +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT // import Foundation import SpeziLLM extension LLMLocalSession { + + /// Updates the existing instance of the ``LLMLocalSchema`` with new parameters. + /// + /// - Parameters: + /// - model: An instance of `LLMLocalModel` to be used by the schema. + /// - parameters: A dictionary or object containing parameters that control the LLM generation process. + /// - samplingParameters: An object representing the sampling parameters for the LLM. + /// - injectIntoContext: A Boolean value indicating whether the inference output from the ``LLMLocalSession`` + /// should be automatically inserted into the ``LLMLocalSession/context``. Defaults to `false`. + /// + /// - Important: Calling this method automatically invokes `cancel()`, stopping all running tasks associated + /// with the current session. public func update( parameters: LLMLocalParameters? = nil, - contextParameters: LLMLocalContextParameters? = nil, samplingParameters: LLMLocalSamplingParameters? = nil, injectIntoContext: Bool? = nil ) { @@ -20,7 +32,6 @@ extension LLMLocalSession { self.schema = .init( configuration: self.schema.configuration, parameters: parameters ?? self.schema.parameters, - contextParameters: contextParameters ?? self.schema.contextParameters, samplingParameters: samplingParameters ?? self.schema.samplingParameters, injectIntoContext: injectIntoContext ?? 
self.schema.injectIntoContext ) From 305f05c0b4945cec9f1f9b8ee8108e9445c0bb16 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Thu, 5 Dec 2024 13:25:23 -0800 Subject: [PATCH 21/27] fix swiftlint --- .../Helpers/LLMContext+formatForTransformersChat.swift | 6 +++--- Sources/SpeziLLMLocal/LLMLocalSession+Update.swift | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift index 88848d83..66c2366b 100644 --- a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift +++ b/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift @@ -8,15 +8,15 @@ import SpeziLLM -public extension LLMContext { +extension LLMContext { /// Formats the current ``LLMContext`` for compatibility with Transformers-based chat models. /// /// - Returns: An array of dictionaries where each dictionary represents a message in the format: /// - `role`: The role of the message (e.g., "user", "assistant"), derived from the `rawValue` of the entry's `role`. /// - `content`: The textual content of the message. - func formatForTransformersChat() -> [[String: String]] { + public func formatForTransformersChat() -> [[String: String]] { self.map { entry in - return [ + [ "role": entry.role.rawValue, "content": entry.content ] diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift index c5982dcf..a935d046 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift @@ -10,7 +10,6 @@ import Foundation import SpeziLLM extension LLMLocalSession { - /// Updates the existing instance of the ``LLMLocalSchema`` with new parameters. /// /// - Parameters: @@ -25,7 +24,7 @@ extension LLMLocalSession { public func update( parameters: LLMLocalParameters? = nil, samplingParameters: LLMLocalSamplingParameters? = nil, - injectIntoContext: Bool? = nil + injectIntoContext: Bool? = nil // swiftlint:disable:this discouraged_optional_boolean ) { cancel() From 3eb07e1f5e28d935dae2eeb31500b671a5f645ce Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Thu, 5 Dec 2024 14:42:20 -0800 Subject: [PATCH 22/27] intermediate commit --- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 18 ++- .../Mock/LLMLocalMockSession.swift | 111 ++++++++++++++++++ .../LLMFog/Account/AccountSetupHeader.swift | 4 +- .../LLMLocal/LLMLocalChatTestView.swift | 3 +- .../TestAppLLMLocalUITests.swift | 2 +- 5 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index a8930de3..f9ee358f 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -11,7 +11,9 @@ import MLX import Spezi import SpeziFoundation import SpeziLLM - +#if targetEnvironment(simulator) +import OSLog +#endif /// LLM execution platform of an ``LLMLocalSchema``. /// @@ -58,18 +60,26 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { public nonisolated func configure() { #if targetEnvironment(simulator) - assertionFailure("SpeziLLMLocal: Code cannot be run on simulator.") -#endif + Logger( + subsystem: "Spezi", + category: "LLMLocalPlatform" + ).warning("SpeziLLMLocal is only supported on physical devices. 
Use `LLMMockPlatform` instead.") +#else if let cacheLimit = configuration.cacheLimit { MLX.GPU.set(cacheLimit: cacheLimit * 1024 * 1024) } if let memoryLimit = configuration.memoryLimit { MLX.GPU.set(memoryLimit: memoryLimit.limit, relaxed: memoryLimit.relaxed) } +#endif } - public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { + public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> some LLMSession { + #if targetEnvironment(simulator) + LLMLocalMockSession(self, schema: llmSchema) + #else LLMLocalSession(self, schema: llmSchema) + #endif } deinit { diff --git a/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift new file mode 100644 index 00000000..768b5bdf --- /dev/null +++ b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift @@ -0,0 +1,111 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation +import Observation +import SpeziLLM + + +/// A mock ``LLMLocalMockSession``, used for testing purposes. +/// +/// See `LLMMockSession` for more details +@Observable +public final class LLMLocalMockSession: LLMSession, @unchecked Sendable { + let platform: LLMLocalPlatform + let schema: LLMLocalSchema + + @ObservationIgnored private var task: Task<(), Never>? + + @MainActor public var state: LLMState = .uninitialized + @MainActor public var context: LLMContext = [] + + + /// Initializer for the ``LLMMockSession``. + /// + /// - Parameters: + /// - platform: The mock LLM platform. + /// - schema: The mock LLM schema. + init(_ platform: LLMLocalPlatform, schema: LLMLocalSchema) { + self.platform = platform + self.schema = schema + } + + + @discardableResult + public func generate() async throws -> AsyncThrowingStream { + let (stream, continuation) = AsyncThrowingStream.makeStream(of: String.self) + + // swiftlint:disable:next closure_body_length + task = Task { + await MainActor.run { + self.state = .loading + } + try? await Task.sleep(for: .seconds(1)) + guard await !checkCancellation(on: continuation) else { + return + } + + /// Generate mock messages + await MainActor.run { + self.state = .generating + } + await injectAndYield("Mock ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("Message ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("from ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("SpeziLLM!", on: continuation) + + try? 
await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("Using SpeziLLMLocal only works on physical devices.", on: continuation) + + + continuation.finish() + await MainActor.run { + context.completeAssistantStreaming() + self.state = .ready + } + } + + return stream + } + + public func cancel() { + task?.cancel() + } + + private func injectAndYield(_ piece: String, on continuation: AsyncThrowingStream.Continuation) async { + continuation.yield(piece) + if schema.injectIntoContext { + await MainActor.run { + context.append(assistantOutput: piece) + } + } + } + + + deinit { + cancel() + } +} diff --git a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift index 2536ca9f..ff4fd0d6 100644 --- a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift +++ b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift @@ -13,7 +13,7 @@ import SwiftUI struct AccountSetupHeader: View { @Environment(Account.self) private var account - @Environment(\.accountSetupState) private var setupState + @Environment(\._accountSetupState) private var setupState var body: some View { @@ -25,7 +25,7 @@ struct AccountSetupHeader: View { .padding(.top, 30) Text("ACCOUNT_SUBTITLE") .padding(.bottom, 8) - if account.signedIn, case .presentingExistingAccount = setupState { + if account.signedIn, case .loadingExistingAccount = setupState { Text("ACCOUNT_SIGNED_IN_DESCRIPTION") } else { Text("ACCOUNT_SETUP_DESCRIPTION") diff --git a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift index ad6999cf..03b7881d 100644 --- a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift +++ b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift @@ -25,8 +25,7 @@ struct LLMLocalChatTestView: View { } else { LLMChatViewSchema( with: LLMLocalSchema( - model: .llama3_8B_4bit, - formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3 + model: .llama3_8B_4bit ) ) } diff --git a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift index 09a00849..ceecfeb7 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMLocalUITests.swift @@ -43,7 +43,7 @@ class TestAppLLMLocalUITests: XCTestCase { sleep(1) // Chat - let inputTextfield = app.textViews["Message Input Textfield"] + let inputTextfield = app.textFields["Message Input Textfield"] XCTAssertTrue(inputTextfield.exists) From bd71161cec918c723059ea675a94b6fd7aebef6f Mon Sep 17 00:00:00 2001 From: Paul Schmiedmayer Date: Sun, 8 Dec 2024 17:03:30 -0800 Subject: [PATCH 23/27] Update Dependencies & Merge --- Package.swift | 8 ++++---- .../UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift | 4 ++++ Tests/UITests/UITests.xcodeproj/project.pbxproj | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Package.swift b/Package.swift index 163532d9..515241a8 100644 --- a/Package.swift +++ b/Package.swift @@ -27,12 +27,12 @@ let package = Package( .library(name: "SpeziLLMFog", targets: ["SpeziLLMFog"]) ], dependencies: [ - .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.1"), - .package(url: "https://github.com/ml-explore/mlx-swift-examples", from: "1.18.1"), - .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")), + .package(url: 
"https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")), + .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "1.18.1"), + .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.14")), .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")), .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"), - .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0-beta.3"), + .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0"), .package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"), .package(url: "https://github.com/StanfordSpezi/SpeziOnboarding", from: "1.1.1"), .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.1")), diff --git a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift index aa2b2e77..e4c2fc3f 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift @@ -29,9 +29,11 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIOnboarding() throws { // swiftlint:disable:this function_body_length let app = XCUIApplication() + #if canImport(UIKit) if UIDevice.current.userInterfaceIdiom == .pad { throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") } + #endif XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() @@ -145,9 +147,11 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIChat() throws { let app = XCUIApplication() + #if canImport(UIKit) if UIDevice.current.userInterfaceIdiom == .pad { throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") } + #endif XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() diff --git a/Tests/UITests/UITests.xcodeproj/project.pbxproj b/Tests/UITests/UITests.xcodeproj/project.pbxproj index 020fccc8..c66a1500 100644 --- a/Tests/UITests/UITests.xcodeproj/project.pbxproj +++ b/Tests/UITests/UITests.xcodeproj/project.pbxproj @@ -733,7 +733,7 @@ repositoryURL = "https://github.com/StanfordBDHG/XCTestExtensions.git"; requirement = { kind = upToNextMajorVersion; - minimumVersion = 1.0.0; + minimumVersion = 1.1.0; }; }; 9770F28F2BB3C40C00478571 /* XCRemoteSwiftPackageReference "SpeziFirebase" */ = { @@ -741,7 +741,7 @@ repositoryURL = "https://github.com/StanfordSpezi/SpeziFirebase"; requirement = { kind = upToNextMajorVersion; - minimumVersion = "2.0.0-beta.4"; + minimumVersion = 2.0.0; }; }; 979D418E2BB3EBD8001953BD /* XCRemoteSwiftPackageReference "SpeziAccount" */ = { @@ -749,7 +749,7 @@ repositoryURL = "https://github.com/StanfordSpezi/SpeziAccount"; requirement = { kind = upToNextMajorVersion; - minimumVersion = "2.0.0-beta.8"; + minimumVersion = 2.1.0; }; }; /* End XCRemoteSwiftPackageReference section */ From 865dc3b2115a1aa6a535478bbdf0d09a4deb7bd2 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Mon, 9 Dec 2024 13:33:18 -0800 Subject: [PATCH 24/27] adjust to PR comments --- Sources/SpeziLLM/Models/LLMContextEntity.swift | 2 +- Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift | 5 +++-- ...TransformersChat.swift => LLMContext+FormattedChat.swift} | 2 +- Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift | 2 +- Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift | 2 
+- 5 files changed, 7 insertions(+), 6 deletions(-) rename Sources/SpeziLLMLocal/Helpers/{LLMContext+formatForTransformersChat.swift => LLMContext+FormattedChat.swift} (92%) diff --git a/Sources/SpeziLLM/Models/LLMContextEntity.swift b/Sources/SpeziLLM/Models/LLMContextEntity.swift index 1b93c957..a842ad17 100644 --- a/Sources/SpeziLLM/Models/LLMContextEntity.swift +++ b/Sources/SpeziLLM/Models/LLMContextEntity.swift @@ -46,7 +46,7 @@ public struct LLMContextEntity: Codable, Equatable, Hashable, Identifiable { case tool(id: String, name: String) - public var rawValue: String { + package var rawValue: String { switch self { case .user: "user" case .assistant: "assistant" diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index 096611fa..70da6ad6 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -18,6 +18,7 @@ public struct LLMLocalParameters: Sendable { }() } + /// The to-be-used system prompt of the LLM let systemPrompt: String? /// Indicates the maximum output length generated by the LLM. @@ -26,10 +27,10 @@ public struct LLMLocalParameters: Sendable { let extraEOSTokens: Set /// Interval for displaying output after every N tokens generated. let displayEveryNTokens: Int - /// The chat template to use for the model in the Jinja format - let chatTemplate: String? /// RNG seed of the LLM let seed: UInt64? + /// The chat template to use for the model in the Jinja format + let chatTemplate: String? /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. diff --git a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift b/Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift similarity index 92% rename from Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift rename to Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift index 66c2366b..d55594b1 100644 --- a/Sources/SpeziLLMLocal/Helpers/LLMContext+formatForTransformersChat.swift +++ b/Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift @@ -14,7 +14,7 @@ extension LLMContext { /// - Returns: An array of dictionaries where each dictionary represents a message in the format: /// - `role`: The role of the message (e.g., "user", "assistant"), derived from the `rawValue` of the entry's `role`. /// - `content`: The textual content of the message. - public func formatForTransformersChat() -> [[String: String]] { + package var formattedChat: [[String: String]] { self.map { entry in [ "role": entry.role.rawValue, diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index ec8f2c0b..e758c296 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -27,7 +27,7 @@ extension LLMLocalSession { let messages = if await !self.customContext.isEmpty { await self.customContext } else { - await self.context.formatForTransformersChat() + await self.context.formattedChat } guard let promptTokens = try? 
await modelContainer.perform({ _, tokenizer in diff --git a/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift index 768b5bdf..690c425b 100644 --- a/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift +++ b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift @@ -72,7 +72,7 @@ public final class LLMLocalMockSession: LLMSession, @unchecked Sendable { guard await !checkCancellation(on: continuation) else { return } - await injectAndYield("SpeziLLM!", on: continuation) + await injectAndYield("SpeziLLM! ", on: continuation) try? await Task.sleep(for: .milliseconds(500)) guard await !checkCancellation(on: continuation) else { From 79470e5553500a289ff76839c8778013cab72ae7 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 10 Dec 2024 12:19:33 -0800 Subject: [PATCH 25/27] adapt to PR comments --- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 2 +- .../LLMLocalSession+Generate.swift | 29 +++++++----- Sources/SpeziLLMLocal/LLMLocalSession.swift | 1 + .../SpeziLLMLocal.docc/SpeziLLMLocal.md | 45 +++++-------------- .../LLMFog/Account/AccountSetupHeader.swift | 4 +- 5 files changed, 30 insertions(+), 51 deletions(-) diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index f9ee358f..6e80e03b 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -74,7 +74,7 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { #endif } - public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> some LLMSession { + public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { #if targetEnvironment(simulator) LLMLocalMockSession(self, schema: llmSchema) #else diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index e758c296..54f55110 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -72,23 +72,34 @@ extension LLMLocalSession { return .stop } - if schema.injectIntoContext && tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) { + if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) { let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens)) let text = tokenizer.decode(tokens: lastTokens) Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)") continuation.yield(text) + + if schema.injectIntoContext { + Task { @MainActor in + context.append(assistantOutput: text) + } + } } return .more } + // Yielding every Nth token may result in missing the final tokens. + let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens + let lastTokens = Array(result.tokens.suffix(reaminingTokens)) + let text = tokenizer.decode(tokens: lastTokens) + continuation.yield(text) + if schema.injectIntoContext { - // Yielding every Nth token may result in missing the final tokens. 
- let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens - let lastTokens = Array(result.tokens.suffix(reaminingTokens)) - let text = tokenizer.decode(tokens: lastTokens) - continuation.yield(text) + Task { @MainActor in + context.append(assistantOutput: text) + context.completeAssistantStreaming() + } } return result @@ -103,12 +114,6 @@ extension LLMLocalSession { ) await MainActor.run { - context.append(assistantOutput: result.output, complete: true) - context.completeAssistantStreaming() - - if !schema.injectIntoContext { - continuation.yield(result.output) - } continuation.finish() state = .ready } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index 70a53016..93bab019 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -79,6 +79,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { @MainActor public var state: LLMState = .uninitialized @MainActor public var context: LLMContext = [] /// Overrides the `context` with a custom highly customizable context in the `swift-transformers` format. + /// - Important: When using the `customContext`, `injectToContext` will have no effect, and the assistant output will **not** be added to the `customContext` @MainActor public var customContext: [[String: String]] = [] @MainActor public var numParameters: Int? diff --git a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md index c6823d3f..641f8b3f 100644 --- a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md +++ b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md @@ -4,7 +4,7 @@ # # This source file is part of the Stanford Spezi open source project # -# SPDX-FileCopyrightText: 2023 Stanford University and the project authors (see CONTRIBUTORS.md) +# SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) # # SPDX-License-Identifier: MIT # @@ -14,7 +14,8 @@ Provides local Language Model execution capabilities on-device. ## Overview -The ``SpeziLLMLocal`` target enables the usage of locally executed Language Models (LLMs) directly on-device, without the need for any kind of internet connection and no data every leaving the local device. The underlying technology used for the LLM inference is [llama.cpp](https://github.com/ggerganov/llama.cpp), a C/C++ library for executing [LLaMa models](https://ai.meta.com/llama/). ``SpeziLLMLocal`` provides a pure Swift-based API for interacting with the locally executed model, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). + +The ``SpeziLLMLocal`` target enables the usage of locally executed Language Models (LLMs) directly on-device, without the need for any kind of internet connection and no data every leaving the local device. The underlying technology used for the LLM inference is [`mlx-swift`](https://github.com/ml-explore/mlx-swift). ``SpeziLLMLocal`` provides a pure Swift-based API for interacting with the locally executed model, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). 
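+
+In a nutshell, local inference consists of describing the model and its generation settings via an ``LLMLocalSchema`` and executing it through the Spezi `LLMRunner`. A minimal sketch of the configuration surface is shown below (the parameter values are purely illustrative); a complete SwiftUI example is shown further below:
+
+```swift
+import SpeziLLMLocal
+
+// Illustrative values only; every parameter falls back to a sensible default.
+let schema = LLMLocalSchema(
+    model: .llama3_1_8B_4bit,                        // model fetched via SpeziLLMLocalDownload
+    parameters: .init(
+        systemPrompt: "You are a helpful assistant.",
+        maxOutputLength: 512,                        // cap on generated tokens
+        seed: 42,                                    // fixed RNG seed for reproducible generation
+        chatTemplate: nil                            // optionally override the Jinja chat template
+    ),
+    injectIntoContext: true                          // stream inference output into the session context
+)
+```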
## Setup @@ -26,45 +27,19 @@ You need to add the SpeziLLM Swift package to > Important: If your application is not yet configured to use Spezi, follow the [Spezi setup article](https://swiftpackageindex.com/stanfordspezi/spezi/documentation/spezi/initial-setup) to set up the core Spezi infrastructure. -> Important: In order to use the LLM local target, one needs to set build parameters in the consuming Xcode project or the consuming SPM package to enable the [Swift / C++ Interop](https://www.swift.org/documentation/cxx-interop/), introduced in Xcode 15 and Swift 5.9. Keep in mind that this is true for nested dependencies, one needs to set this configuration recursivly for the entire dependency tree towards the llama.cpp SPM package. -> -> **For Xcode projects:** -> - Open your [build settings in Xcode](https://developer.apple.com/documentation/xcode/configuring-the-build-settings-of-a-target/) by selecting *PROJECT_NAME > TARGET_NAME > Build Settings*. -> - Within the *Build Settings*, search for the `C++ and Objective-C Interoperability` setting and set it to `C++ / Objective-C++`. This enables the project to use the C++ headers from llama.cpp. -> -> **For SPM packages:** -> - Open the `Package.swift` file of your [SPM package]((https://www.swift.org/documentation/package-manager/)) -> - Within the package `target` that consumes the llama.cpp package, add the `interoperabilityMode(_:)` Swift build setting like that: -> ```swift -> /// Adds the dependency to the Spezi LLM SPM package -> dependencies: [ -> .package(url: "https://github.com/StanfordSpezi/SpeziLLM", .upToNextMinor(from: "0.6.0")) -> ], -> targets: [ -> .target( -> name: "ExampleConsumingTarget", -> /// State the dependence of the target to SpeziLLMLocal -> dependencies: [ -> .product(name: "SpeziLLMLocal", package: "SpeziLLM") -> ], -> /// Important: Configure the `.interoperabilityMode(_:)` within the `swiftSettings` -> swiftSettings: [ -> .interoperabilityMode(.Cxx) -> ] -> ) -> ] ->``` +> Important: Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that. + +> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project. ## Spezi LLM Local Components -The core components of the ``SpeziLLMLocal`` target are ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform``. They heavily utilize the [llama.cpp library](https://github.com/ggerganov/llama.cpp) to perform the inference of the Language Model. ``LLMLocalSchema`` defines the type and configuration of the LLM, ``LLMLocalSession`` represents the ``LLMLocalSchema`` in execution while ``LLMLocalPlatform`` is the LLM execution platform. +The core components of the ``SpeziLLMLocal`` target are ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform``. They heavily utilize [mlx-swift-examples](https://github.com/ml-explore/mlx-swift-examples) to perform the inference of the Language Model. ``LLMLocalSchema`` defines the type and configuration of the LLM, ``LLMLocalSession`` represents the ``LLMLocalSchema`` in execution while ``LLMLocalPlatform`` is the LLM execution platform. -> Important: To execute a LLM locally, the model file must be present on the local device. 
-> The model must be in the popular `.gguf` format introduced by the [llama.cpp library](https://github.com/ggerganov/llama.cpp) +> Important: To execute a LLM locally, the model file must be present on the local device. > Tip: In order to download the model file of the Language model to the local device, SpeziLLM provides the [SpeziLLMLocalDownload](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmlocaldownload) target which provides model download and storage functionalities. -``LLMLocalSchema`` offers a variety of configuration possibilities, such as the used model file, the context window, the maximum output size or the batch size. These options can be set via the ``LLMLocalSchema/init(modelPath:parameters:contextParameters:samplingParameters:injectIntoContext:formatChat:)`` initializer and the ``LLMLocalParameters``, ``LLMLocalContextParameters``, and ``LLMLocalSamplingParameters`` types. Keep in mind that the model file must be in the popular `.gguf` format! +``LLMLocalSchema`` offers a variety of configuration possibilities, such as the used model file, the context window, the maximum output size or the batch size. These options can be set via the ``LLMLocalSchema/init(model:parameters:samplingParameters:injectIntoContext:)`` initializer and the ``LLMLocalParameters``, and ``LLMLocalSamplingParameters`` types. - Important: ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform`` shouldn't be used on it's own but always used together with the Spezi `LLMRunner`! @@ -107,7 +82,7 @@ struct LLMLocalDemoView: View { // Instantiate the `LLMLocalSchema` to an `LLMLocalSession` via the `LLMRunner`. let llmSession: LLMLocalSession = runner( with: LLMLocalSchema( - modelPath: URL(string: "URL to the local model file")! 
+ model: .llama3_1_8B_4bit ) ) diff --git a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift index ff4fd0d6..a297ceb0 100644 --- a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift +++ b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift @@ -13,8 +13,6 @@ import SwiftUI struct AccountSetupHeader: View { @Environment(Account.self) private var account - @Environment(\._accountSetupState) private var setupState - var body: some View { VStack { @@ -25,7 +23,7 @@ struct AccountSetupHeader: View { .padding(.top, 30) Text("ACCOUNT_SUBTITLE") .padding(.bottom, 8) - if account.signedIn, case .loadingExistingAccount = setupState { + if account.signedIn, case .loadingExistingAccount = account.details { Text("ACCOUNT_SIGNED_IN_DESCRIPTION") } else { Text("ACCOUNT_SETUP_DESCRIPTION") From c87abd2a0ecc13731c27355e97caa06dc5bec4a4 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 10 Dec 2024 12:28:38 -0800 Subject: [PATCH 26/27] fix swiftlint --- Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 54f55110..c7785d72 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -55,6 +55,7 @@ extension LLMLocalSession { repetitionContextSize: schema.samplingParameters.repetitionContextSize ) + // swiftlint:disable:next closure_body_length let result = await modelContainer.perform { model, tokenizer in let result = MLXLLM.generate( promptTokens: promptTokens, From 021fe773adbfb175e1415a7f96cdb72dbb60a0d5 Mon Sep 17 00:00:00 2001 From: Leon Nissen <> Date: Tue, 10 Dec 2024 14:00:34 -0800 Subject: [PATCH 27/27] fix LLMLocalMockSession vs LLMLocalSession --- Sources/SpeziLLMLocal/LLMLocalPlatform.swift | 10 ++++++---- .../TestApp/LLMFog/Account/AccountSetupHeader.swift | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index 6e80e03b..83a23dd1 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -74,13 +74,15 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { #endif } - public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { - #if targetEnvironment(simulator) +#if targetEnvironment(simulator) + public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalMockSession { LLMLocalMockSession(self, schema: llmSchema) - #else + } +#else + public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { LLMLocalSession(self, schema: llmSchema) - #endif } +#endif deinit { MLX.GPU.clearCache() diff --git a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift index a297ceb0..cf1ea4dd 100644 --- a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift +++ b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift @@ -13,6 +13,7 @@ import SwiftUI struct AccountSetupHeader: View { @Environment(Account.self) private var account + @Environment(\.accountSetupState) var setupState var body: some View { VStack { @@ -23,7 +24,7 @@ struct AccountSetupHeader: View { .padding(.top, 30) Text("ACCOUNT_SUBTITLE") .padding(.bottom, 8) - if 
account.signedIn, case .loadingExistingAccount = account.details { + if account.signedIn, case .loadingExistingAccount = setupState { Text("ACCOUNT_SIGNED_IN_DESCRIPTION") } else { Text("ACCOUNT_SETUP_DESCRIPTION")
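
For context on how the session-level APIs introduced in the patches above fit together, here is a minimal sketch of a hypothetical call site (not part of this patch series): it reconfigures a running `LLMLocalSession` via `update(parameters:samplingParameters:injectIntoContext:)` and drives generation through the `customContext` override in the `swift-transformers` message format.

```swift
import SpeziLLM
import SpeziLLMLocal

// Hypothetical helper, for illustration only.
func regenerateWithCustomContext(session: LLMLocalSession) async throws {
    // `update` cancels any in-flight generation before swapping the configuration.
    session.update(
        parameters: .init(maxOutputLength: 1024, seed: 42),
        injectIntoContext: false
    )

    // Bypass the managed `context` with a raw chat in the `swift-transformers` format.
    // Note: with `customContext`, the assistant output is not appended back automatically.
    await MainActor.run {
        session.customContext = [
            ["role": "user", "content": "Summarize the change set in one sentence."]
        ]
    }

    for try await token in try await session.generate() {
        print(token, terminator: "")
    }
}
```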