diff --git a/documentation/example_conf/llamacpp.conf b/documentation/example_conf/llamacpp.conf
new file mode 100644
index 0000000..53c5580
--- /dev/null
+++ b/documentation/example_conf/llamacpp.conf
@@ -0,0 +1,17 @@
+{
+    "single" :
+    {
+        "framework" : "llamacpp",
+        "model" : ["rocket-3b.Q4_0.gguf"],
+        "input_info" : [
+            {
+                "format" : "flexible"
+            }
+        ],
+        "output_info" : [
+            {
+                "format" : "flexible"
+            }
+        ]
+    }
+}
diff --git a/ml_inference_offloading/src/main/assets/models/README.md b/ml_inference_offloading/src/main/assets/models/README.md
index 49ca3db..c751fd2 100644
--- a/ml_inference_offloading/src/main/assets/models/README.md
+++ b/ml_inference_offloading/src/main/assets/models/README.md
@@ -4,6 +4,12 @@
 
 ### yolov8s_float32
 
+### llamacpp
+
+To run a llamacpp model, copy a GGUF (`.gguf`) file into this directory.
+You can download a small LLM in GGUF format, such as rocket-3B, [here](https://huggingface.co/TheBloke/rocket-3B-GGUF).
+To enable the optimized GEMM/GEMV kernels in the [prebuilt libraries](https://github.com/nnstreamer/nnstreamer-android-resource), use a model converted from Q4_0 to one of the Q4_0_x_x formats.
+
 ### llama2c
 
 To run llama2c model, copy model.bin and tokenizer.bin file into this directory.
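For reference, the download-and-copy step described in the README hunk above could look like the sketch below. It assumes the Q4_0 filename referenced in `llamacpp.conf` and the standard Hugging Face `resolve/main` URL pattern; verify the exact filename on the model page before relying on it.

```sh
# Download the Q4_0 GGUF build of rocket-3B from Hugging Face
# (URL pattern assumed; check the model page for the exact filename).
wget https://huggingface.co/TheBloke/rocket-3B-GGUF/resolve/main/rocket-3b.Q4_0.gguf

# Place the file in the app's assets directory so it is bundled
# into the APK at build time.
cp rocket-3b.Q4_0.gguf ml_inference_offloading/src/main/assets/models/
```

This covers only the bundled-asset route; deploying the model to a device at run time instead would follow the app's own loading path.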