Update to 3.0.5 (#69)
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
edamamez and github-actions[bot] authored Aug 13, 2024
1 parent 2581b51 commit aa2b82e
Showing 10 changed files with 373 additions and 106 deletions.
5 changes: 1 addition & 4 deletions lamini/__init__.py
@@ -3,10 +3,6 @@
 # isort: off
 from lamini.error import error
 
-from lamini.runners.llama_v2_runner import LlamaV2Runner
-from lamini.runners.llama_v3_runner import LlamaV3Runner
-from lamini.runners.basic_model_runner import BasicModelRunner
-from lamini.runners.mistral_runner import MistralRunner
 from lamini.api.lamini import Lamini
 from lamini.api.classifier import Classifier
 from lamini.api.embedding import Embedding
@@ -32,3 +28,4 @@
 max_workers = int(os.environ.get("LAMINI_MAX_WORKERS", 4))
 batch_size = int(os.environ.get("LAMINI_BATCH_SIZE", 5))
 static_batching = bool(os.environ.get("LAMINI_STATIC_BATCHING", False))
+bypass_reservation = bool(os.environ.get("LAMINI_BYPASS_RESERVATION", False))
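
The new bypass_reservation flag follows the same pattern as the other module-level settings: each is read from the environment once, at import time. A minimal sketch of how these settings could be exercised (values illustrative); note that bool(os.environ.get(...)) treats any non-empty string, even "False", as truthy, so leaving the variable unset is the only way to get False:

import os

# Export before importing lamini; the values are read at import time.
os.environ["LAMINI_MAX_WORKERS"] = "8"
os.environ["LAMINI_BYPASS_RESERVATION"] = "1"

import lamini

print(lamini.max_workers)         # 8
print(lamini.bypass_reservation)  # True ("1" is a non-empty string)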
40 changes: 14 additions & 26 deletions lamini/api/lamini.py
@@ -10,7 +10,7 @@
 from lamini.api.rest_requests import get_version
 from lamini.api.train import Train
 from lamini.api.utils.completion import Completion
-from lamini.api.utils.upload_client import get_dataset_name, upload_to_blob
+from lamini.api.utils.upload_client import upload_to_blob
 
 logger = logging.getLogger(__name__)

@@ -98,37 +98,36 @@ def get_data_str(d):
         output = self.trainer.get_upload_base_path()
         self.upload_base_path = output["upload_base_path"]
 
-        dataset_id = get_dataset_name()
-
         try:
             if self.upload_base_path == "azure":
                 data_str = get_data_str(data)
-                output = self.trainer.create_blob_dataset_location(
-                    self.upload_base_path, dataset_id, is_public
+                response = self.trainer.create_blob_dataset_location(
+                    self.upload_base_path, is_public
                 )
-                self.upload_file_path = output["dataset_location"]
+                self.upload_file_path = response["dataset_location"]
                 upload_to_blob(data_str, self.upload_file_path)
                 self.trainer.update_blob_dataset_num_datapoints(
-                    dataset_id, num_datapoints
+                    response["dataset_id"], num_datapoints
                 )
                 print("Data pairs uploaded to blob.")
             else:
-                output = self.trainer.upload_dataset_locally(
-                    self.upload_base_path, dataset_id, is_public, data
+                response = self.trainer.upload_dataset_locally(
+                    self.upload_base_path, is_public, data
                 )
-                self.upload_file_path = output["dataset_location"]
+                self.upload_file_path = response["dataset_location"]
                 print("Data pairs uploaded to local.")
 
+            print(response)
             print(
-                f"\nYour dataset id is: {dataset_id} . Consider using this in the future to train using the same data. \nEg: "
-                f"llm.train(dataset_id='{dataset_id}')"
+                f"\nYour dataset id is: {response['dataset_id']} . Consider using this in the future to train using the same data. \nEg: "
+                f"llm.train(data_or_dataset_id='{response['dataset_id']}')"
             )
 
         except Exception as e:
             print(f"Error uploading data pairs: {e}")
             raise e
 
-        return dataset_id
+        return response["dataset_id"]
 
     def upload_file(
         self, file_path: str, input_key: str = "input", output_key: str = "output"
@@ -186,10 +185,8 @@ def train(
         ],
         finetune_args: Optional[dict] = None,
         gpu_config: Optional[dict] = None,
-        peft_args: Optional[dict] = None,
         is_public: Optional[bool] = None,
-        use_cached_model: Optional[bool] = None,
-        multi_node: Optional[bool] = None,
+        **kwargs,
     ):
         if isinstance(data_or_dataset_id, str):
             dataset_id = data_or_dataset_id
@@ -199,7 +196,7 @@ def train(
         base_path = self.trainer.get_upload_base_path()
         self.upload_base_path = base_path["upload_base_path"]
         existing_dataset = self.trainer.get_existing_dataset(
-            dataset_id, self.upload_base_path, is_public
+            dataset_id, self.upload_base_path
         )
         self.upload_file_path = existing_dataset["dataset_location"]

@@ -209,10 +206,7 @@
             upload_file_path=self.upload_file_path,
             finetune_args=finetune_args,
             gpu_config=gpu_config,
-            peft_args=peft_args,
             is_public=is_public,
-            use_cached_model=use_cached_model,
-            multi_node=multi_node,
         )
         job["dataset_id"] = dataset_id
         return job
@@ -228,20 +222,14 @@ def train_and_wait(
         ],
         finetune_args: Optional[dict] = None,
         gpu_config: Optional[dict] = None,
-        peft_args: Optional[dict] = None,
         is_public: Optional[bool] = None,
-        use_cached_model: Optional[bool] = None,
-        multi_node: Optional[bool] = None,
+        **kwargs,
     ):
         job = self.train(
             data_or_dataset_id,
             finetune_args=finetune_args,
             gpu_config=gpu_config,
-            peft_args=peft_args,
             is_public=is_public,
-            use_cached_model=use_cached_model,
-            multi_node=multi_node,
         )
 
         try:
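
Taken together, the lamini.py changes move dataset-id generation to the server: the upload response now carries the id, and the train methods absorb the removed peft_args, use_cached_model, and multi_node arguments via **kwargs. A minimal usage sketch of the updated flow; the model name and data pairs are illustrative, and the uploading method containing the get_data_str hunk above is assumed to be Lamini.upload_data:

from lamini import Lamini

llm = Lamini(model_name="meta-llama/Meta-Llama-3-8B-Instruct")  # illustrative model

# Hypothetical training pairs; the expected keys depend on your data schema.
data = [{"input": "What is a llama?", "output": "A domesticated South American camelid."}]

dataset_id = llm.upload_data(data)  # assumed method name; returns the server-generated dataset id
job = llm.train(data_or_dataset_id=dataset_id, finetune_args={"max_steps": 60})
print(job["dataset_id"])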
24 changes: 3 additions & 21 deletions lamini/api/train.py
@@ -28,10 +28,7 @@ def train(
         upload_file_path: Optional[str] = None,
         finetune_args: Optional[dict] = None,
         gpu_config: Optional[dict] = None,
-        peft_args: Optional[dict] = None,
         is_public: Optional[bool] = None,
-        use_cached_model: Optional[bool] = None,
-        multi_node: Optional[bool] = None,
     ):
         req_data = {"model_name": model_name}
         req_data["dataset_id"] = dataset_id
@@ -41,14 +38,8 @@
             req_data["finetune_args"] = finetune_args
         if gpu_config is not None:
             req_data["gpu_config"] = gpu_config
-        if peft_args is not None:
-            req_data["peft_args"] = peft_args
         if is_public is not None:
             req_data["is_public"] = is_public
-        if use_cached_model is not None:
-            req_data["use_cached_model"] = use_cached_model
-        if multi_node is not None:
-            req_data["multi_node"] = multi_node
         url = self.api_prefix + "train"
 
         job = make_web_request(self.api_key, url, "post", req_data)
@@ -102,21 +93,15 @@ def evaluate(self, job_id=None):
 
         return make_web_request(self.api_key, url, "get")
 
-    def create_blob_dataset_location(
-        self, upload_base_path, dataset_id, is_public, data=None
-    ):
+    def create_blob_dataset_location(self, upload_base_path, is_public):
         url = self.api_prefix + "data"
         req_data = {
             "upload_base_path": upload_base_path,
-            "dataset_id": dataset_id,
         }
 
         if is_public is not None:
             req_data["is_public"] = is_public
 
-        if data is not None:
-            req_data["data"] = data
-
         return make_web_request(
             self.api_key,
             url,
@@ -142,11 +127,10 @@ def get_upload_base_path(self):
         url = self.api_prefix + "get-upload-base-path"
         return make_web_request(self.api_key, url, "get")
 
-    def upload_dataset_locally(self, upload_base_path, dataset_id, is_public, data):
+    def upload_dataset_locally(self, upload_base_path, is_public, data):
         url = self.api_prefix + "local-data"
         req_data = {}
         req_data["upload_base_path"] = upload_base_path
-        req_data["dataset_id"] = dataset_id
         req_data["data"] = SerializableGenerator(data)
         if is_public is not None:
             req_data["is_public"] = is_public
@@ -157,12 +141,10 @@ def upload_dataset_locally(self, upload_base_path, dataset_id, is_public, data):
             req_data,
         )
 
-    def get_existing_dataset(self, dataset_id, upload_base_path, is_public):
+    def get_existing_dataset(self, dataset_id, upload_base_path):
         url = self.api_prefix + "existing-data"
         req_data = {"dataset_id": dataset_id}
         req_data["upload_base_path"] = upload_base_path
-        if is_public is not None:
-            req_data["is_public"] = is_public
         return make_web_request(
             self.api_key,
             url,
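
With these removals, the body that Train.train() posts to the train endpoint shrinks to the keys below. An illustrative sketch of the assembled payload (all values hypothetical):

# Payload assembled by Train.train() after this change; optional keys are
# included only when the corresponding argument is not None.
req_data = {
    "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",  # hypothetical model
    "dataset_id": "d1b6c0",                               # id returned at upload time
    "upload_file_path": "azure-blob-or-local-path",       # hypothetical location
    "finetune_args": {"max_steps": 60},
    "gpu_config": {"gpus": 1},
    "is_public": False,
}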
120 changes: 112 additions & 8 deletions lamini/api/utils/completion.py
@@ -1,17 +1,55 @@
 import logging
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Dict, Any
 
 import aiohttp
 import lamini
 from lamini.api.lamini_config import get_config, get_configured_key, get_configured_url
 from lamini.api.rest_requests import make_async_web_request, make_web_request
 
 logger = logging.getLogger(__name__)
 
 
 class Completion:
-    def __init__(self, api_key, api_url):
+    """Handler for formatting and POST requests for the completions
+    and streaming_completions API endpoints.
+
+    Parameters
+    ----------
+    api_key: Optional[str]
+        Lamini platform API key; if not provided, the key stored
+        within ~/.lamini/configure.yaml will be used. If neither
+        exists, an error is raised.
+
+    api_url: Optional[str]
+        Lamini platform API url, only needed if a different url is needed outside of the
+        defined ones here: https://github.com/lamini-ai/lamini-platform/blob/main/sdk/lamini/api/lamini_config.py#L68
+        i.e. localhost, staging.lamini.ai, or api.lamini.ai
+        Additionally, LLAMA_ENVIRONMENT can be set as an environment variable
+        that will be grabbed for the url before any of the above defaults
+    """
+
+    def __init__(self, api_key, api_url) -> None:
+
+        """
+        Configuration dictionary for platform metadata provided by the following function:
+        https://github.com/lamini-ai/lamini-platform/blob/main/sdk/lamini/api/lamini_config.py
+        Configurations currently hold the following keys and data as a yaml format:
+            local:
+                url: <url>
+            staging:
+                url: <url>
+            production:
+                url: <url>
+
+            local:
+                key: <auth-key>
+            staging:
+                key: <auth-key>
+            production:
+                key: <auth-key>
+        """
         self.config = get_config()
 
         self.api_key = api_key or lamini.api_key or get_configured_key(self.config)
         self.api_url = api_url or lamini.api_url or get_configured_url(self.config)
         self.api_prefix = self.api_url + "/v1/"
@@ -23,7 +61,33 @@ def generate(
         output_type: Optional[dict] = None,
         max_tokens: Optional[int] = None,
         max_new_tokens: Optional[int] = None,
-    ):
+    ) -> Dict[str, Any]:
+        """Handles construction of the POST request headers and body,
+        then makes a web request and returns the response.
+
+        Parameters
+        ----------
+        prompt: Union[str, List[str]]
+            Input prompt for the LLM
+
+        model_name: str
+            LLM model name from HuggingFace
+
+        output_type: Optional[dict] = None
+            Json format for the LLM output
+
+        max_tokens: Optional[int] = None
+            Upper limit in total tokens
+
+        max_new_tokens: Optional[int] = None
+            Upper limit for newly generated tokens
+
+        Returns
+        -------
+        resp: Dict[str, Any]
+            Json data returned from POST request
+        """
+
         req_data = self.make_llm_req_map(
             prompt=prompt,
             model_name=model_name,
@@ -36,7 +100,23 @@ def generate(
         )
         return resp
 
-    async def async_generate(self, params, client: aiohttp.ClientSession = None):
+    async def async_generate(self, params: Dict[str, Any], client: aiohttp.ClientSession = None) -> Dict[str, Any]:
+        """
+        Parameters
+        ----------
+        params: Dict[str, Any]
+            POST Request input parameters
+
+        client: aiohttp.ClientSession = None
+            ClientSession handler
+
+        Returns
+        -------
+        resp: Dict[str, Any]
+            Json data returned from POST request
+        """
+
         if client is not None:
             assert isinstance(client, aiohttp.ClientSession)
             resp = await make_async_web_request(
@@ -61,14 +141,38 @@ def make_llm_req_map(
         output_type: Optional[dict] = None,
         max_tokens: Optional[int] = None,
         max_new_tokens: Optional[int] = None,
-    ):
+    ) -> Dict[str, Any]:
+        """Returns a dict of parameters for calling the remote LLM inference API.
+
+        NOTE: Copied from lamini.py.
+        TODO: Create a helper function that accepts all values and returns a dict,
+        and replace callers of self.make_llm_req_map() with calls to that free function.
+
+        Parameters
+        ----------
+        model_name: str
+            LLM model name from HuggingFace
+
+        prompt: Union[str, List[str]]
+            Input prompt for the LLM
+
+        output_type: Optional[dict] = None
+            Json format for the LLM output
+
+        max_tokens: Optional[int] = None
+            Upper limit in total tokens
+
+        max_new_tokens: Optional[int] = None
+            Upper limit for newly generated tokens
+
+        Returns
+        -------
+        req_data: Dict[str, Any]
+            Constructed dictionary mapping the provided parameters to the
+            keys expected by the REST request.
+        """
+
         req_data = {}
         req_data["model_name"] = model_name
         # TODO: prompt should be named prompt to signal it's a batch.
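
A short usage sketch for the newly documented handler, relying only on the signatures shown in this diff; the API key is assumed to come from ~/.lamini/configure.yaml, and the model name is illustrative:

from lamini.api.utils.completion import Completion

# Passing None falls through to lamini.api_key / lamini.api_url, then the stored config.
completion = Completion(api_key=None, api_url=None)

resp = completion.generate(
    prompt="What is a llama?",
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",  # hypothetical model
    max_new_tokens=64,
)
print(resp)  # JSON response from the completions endpoint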
18 changes: 15 additions & 3 deletions lamini/api/utils/iterators.py
@@ -1,7 +1,19 @@
-from typing import Iterator
+from typing import Iterator, AsyncGenerator, Any
 
 
-async def async_iter(normal_iter: Iterator):
-    """Adapt an normal iterator to an async iterator"""
+async def async_iter(normal_iter: Iterator) -> AsyncGenerator[Any, None]:
+    """Adapt a normal iterator to an async iterator
+
+    Parameters
+    ----------
+    normal_iter: Iterator
+        Iterator to wrap with a yield generator
+
+    Yields
+    ------
+    item: Any
+        Items within the provided normal iterator
+    """
+
     for item in normal_iter:
         yield item
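
A minimal check of the adapter under its new signature, wrapping a plain iterator and consuming it with async for:

import asyncio

from lamini.api.utils.iterators import async_iter

async def main() -> None:
    # Items come back in the original order, one per iteration.
    async for item in async_iter(iter([1, 2, 3])):
        print(item)

asyncio.run(main())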