Initial SDG REST API definitions #80

Draft · wants to merge 12 commits into base: main
9 changes: 9 additions & 0 deletions .spellcheck-en-custom.txt
@@ -3,8 +3,12 @@
Abhishek
Akash
AMDGPU
API
API's
api
arge
arXiv
ascii
backend
backends
benchmarking
@@ -37,6 +41,7 @@ Eval
Excalidraw
exfiltrate
exfiltrating
extensibility
Finetuning
formedness
GFX
@@ -109,7 +114,9 @@ Shivchander
Signoff
Srivastava
subdirectory
submodule
Sudalairaj
sync'ed
Taj
tatsu
TBD
@@ -123,6 +130,7 @@ triager's
triagers
unquantized
USM
utf
UX
venv
watsonx
@@ -135,3 +143,4 @@ XT
XTX
Xu
YAML
yaml
3 changes: 3 additions & 0 deletions api-definitions/common/README.md
@@ -0,0 +1,3 @@
# Common

This section of the API definitions holds common structures that are shared across multiple service definitions.
35 changes: 35 additions & 0 deletions api-definitions/common/file-path.yaml
@@ -0,0 +1,35 @@
################################################################################
# This schema defines the common ways to reference files and directories held in
# the various supported storage media.
################################################################################

schemas:
  LocalPath:
    required: ['path']
    properties:
      path:
        type: string
        description: The name of the file on disk

  ObjectStoragePath:
    allOf:
      - $ref: './object-storage-connection.yaml#/schemas/Bucket'
      - type: object
        description: 'Path within an object storage bucket'
        required: ['path']
        properties:
          path:
            type: string
            description: The path within the bucket

  FilePath:
    description: Path to an individual file
    oneOf:
      - $ref: '#/schemas/LocalPath'
      - $ref: '#/schemas/ObjectStoragePath'

  DirectoryPath:
    description: Path to a directory
    oneOf:
      - $ref: '#/schemas/LocalPath'
      - $ref: '#/schemas/ObjectStoragePath'
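A consumer resolving the `FilePath` `oneOf` has to work out which variant a payload matches. A minimal sketch of that discrimination in Python (the `classify_file_path` helper and the example values are hypothetical, not part of the API definitions):

```python
def classify_file_path(payload: dict) -> str:
    """Crude discriminator mirroring the FilePath oneOf (illustrative only)."""
    if "bucket" in payload and "path" in payload:
        # ObjectStoragePath: Bucket fields plus a path within the bucket
        return "ObjectStoragePath"
    if "path" in payload:
        # LocalPath: only requires the name of the file on disk
        return "LocalPath"
    raise ValueError("payload matches neither FilePath variant")

# Hypothetical example payloads
local = {"path": "/data/seed.jsonl"}
remote = {
    "endpoint": "https://s3.example.com",
    "credentials": {"access_key_id": "AKIA-EXAMPLE", "secret_key": "example"},
    "bucket": "sdg-output",
    "path": "runs/latest/seed.jsonl",
}
print(classify_file_path(local))   # LocalPath
print(classify_file_path(remote))  # ObjectStoragePath
```

A real implementation would validate against the resolved OpenAPI schemas rather than key-sniffing, but the shape of the decision is the same.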
29 changes: 29 additions & 0 deletions api-definitions/common/job-status.yaml
@@ -0,0 +1,29 @@
################################################################################
# This schema defines the common elements of a Job Status response. Individual
# job types may extend this model with task-specific properties, but these
# common properties must be present in the response to all job status queries.
################################################################################

schemas:
  JobStatus:
    type: object
    description: The status of a job in the system
    properties:
      job_id:
        type: string
        description: Unique identifier for a single job
      status:
        type: string
        enum:
          - QUEUED
          - RUNNING
          - COMPLETED
          - CANCELED
          - ERRORED
Comment on lines +18 to +22

Contributor: Should we map with whatever Kubernetes says on a Job's status?

Author: Hmm, good question. From my read of the Job status API, there's no equivalent enum in k8s, since some of these states are represented by the absence of certain fields (e.g., QUEUED == missing status.startTime). For a REST API, I think an enum is a more logical way to represent this, but we could tweak the words to be a bit more in line with k8s terminology:

QUEUED -> PENDING
RUNNING -> STARTED
COMPLETED -> SUCCEEDED
CANCELED -> DELETED (I don't like this one because in k8s deletion is an actual -X DELETE)
ERRORED -> FAILED

Do we feel we need to model anything for when a job goes through a "temporary failure" and, say, goes through a retry? Would we just go from FAILED to QUEUED again, or would we consider that "process" another job entirely?

Just thinking through how we would like to model what could happen when a job hits a transient failure (say, due to part of it running on bad infrastructure that is then replaced) and a retry of that is scheduled.

Author: Good question. I think there are probably a lot of detailed error semantics that could shake out of the different usage patterns, but they would probably loosely fall into the 4XX (user error) vs. 5XX (system error) camps. I don't think we want to be too prescriptive with the job framework's error handling in the API (some implementations may retry whereas others may not), but it might be reasonable to consider having two errored states for user vs. system errors. The challenge will then be figuring out how to encode those different error types in the backend library implementing the job body.

Contributor: I like the enum, as well as the remap from the Kube terminology. Thanks!

        description: >
          Status of the job in the system:
          * QUEUED: The job has not started and is waiting to be scheduled
          * RUNNING: The job is actively running as expected
          * COMPLETED: The job has completed successfully
          * CANCELED: The job was canceled by user action
          * ERRORED: The job terminated in an error state and is not running
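The enum above can be mirrored directly in client code. A sketch in Python (the `JobState` class, the `TERMINAL` set, and `is_active` are illustrative assumptions; the API itself only defines the five string values):

```python
from enum import Enum

class JobState(str, Enum):
    """The five status values from the JobStatus schema."""
    QUEUED = "QUEUED"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    CANCELED = "CANCELED"
    ERRORED = "ERRORED"

# Assumption for this sketch: these three states are terminal, so a
# status poller can stop once it sees one of them.
TERMINAL = {JobState.COMPLETED, JobState.CANCELED, JobState.ERRORED}

def is_active(state: JobState) -> bool:
    """True while a status poller should keep polling."""
    return state not in TERMINAL

print(is_active(JobState("RUNNING")))  # True
```

If the k8s-style renames from this thread (PENDING/STARTED/SUCCEEDED/FAILED) were adopted, only the member values would change.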
51 changes: 51 additions & 0 deletions api-definitions/common/object-storage-connection.yaml
@@ -0,0 +1,51 @@
################################################################################
# This schema defines the common components used to reference content held in a
# cloud object store using an S3 interface.
################################################################################

schemas:
  HMACCredentials:
    type: object
    properties:
      access_key_id:
        type: string
        description: The public Access Key ID
      secret_key:
        type: string
        description: The private Secret Key

  IAMCredentials:
    type: object
    properties:
      # TODO: What else goes here?
      apikey:
        type: string
        description: The IAM apikey

  Service:
    type: object
    description: Pointer to an object storage service
    required: ['endpoint', 'credentials']
    properties:
      endpoint:
        type: string
        description: The qualified endpoint of the object storage service (http://, https://)
      credentials:
        oneOf:
          - $ref: '#/schemas/HMACCredentials'
          - $ref: '#/schemas/IAMCredentials'
      region:
        type: string
        description: The region qualifier for this service

  Bucket:
    allOf:
      - $ref: '#/schemas/Service'
      - type: object
        description: Pointer to an object storage bucket
        required: ['bucket']
        properties:
          bucket:
            type: string
            description: The name of the bucket
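A client constructing a `Bucket` reference needs the `Service` fields plus the bucket name, with `credentials` matching exactly one of the two credential shapes. A hand-rolled structural check, sketched in Python (the `validate_bucket` helper is hypothetical; a real implementation would validate against the resolved OpenAPI schema):

```python
def validate_bucket(payload: dict) -> None:
    """Structural check for the Bucket schema (Service fields + bucket).
    Hand-rolled stand-in for real OpenAPI validation; illustrative only."""
    for field in ("endpoint", "credentials", "bucket"):
        if field not in payload:
            raise ValueError(f"missing required field: {field}")
    creds = payload["credentials"]
    # credentials is a oneOf: HMAC (access_key_id + secret_key) or IAM (apikey)
    is_hmac = {"access_key_id", "secret_key"} <= creds.keys()
    is_iam = "apikey" in creds
    if is_hmac == is_iam:  # matches neither shape, or ambiguously both
        raise ValueError("credentials must match exactly one of HMAC or IAM")

# Hypothetical bucket reference; region is optional per the schema
bucket = {
    "endpoint": "https://s3.example.com",
    "credentials": {"apikey": "example-iam-key"},
    "region": "us-east",
    "bucket": "sdg-seed-data",
}
validate_bucket(bucket)  # no exception: structurally valid
```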
21 changes: 21 additions & 0 deletions api-definitions/platform.yaml
@@ -0,0 +1,21 @@
openapi: '3.0.2'
info:
  title: InstructLab Backend Platform
  version: '0.1.0'

paths:
  ## Inference #################################################################

  ## Customization #############################################################

  ## Data Jobs #################################################################

  #########
  ## SDG ##
  #########
  /synthetic-data-generations:
    $ref: './platform/synthetic-data-generations.yaml#/paths/~1synthetic-data-generations'
  /synthetic-data-generations/{job_id}:
    $ref: './platform/synthetic-data-generations.yaml#/paths/~1synthetic-data-generations~1{job_id}'
  /synthetic-data-generations/tasks:
    $ref: './platform/synthetic-data-generations.yaml#/paths/~1synthetic-data-generations~1tasks'
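The `~1` sequences inside the `$ref` values are JSON Pointer escapes (RFC 6901): `~1` encodes `/` and `~0` encodes `~`, so `~1synthetic-data-generations~1{job_id}` resolves to the path `/synthetic-data-generations/{job_id}`. A minimal decoder in Python:

```python
def unescape_json_pointer_token(token: str) -> str:
    """Decode a JSON Pointer token per RFC 6901: '~1' first, then '~0'."""
    return token.replace("~1", "/").replace("~0", "~")

print(unescape_json_pointer_token("~1synthetic-data-generations~1{job_id}"))
# /synthetic-data-generations/{job_id}
```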
104 changes: 104 additions & 0 deletions api-definitions/platform/synthetic-data-generations.yaml
@@ -0,0 +1,104 @@
openapi: '3.0.2'
info:
  title: Synthetic Data Generation
  version: '0.1.0'

paths:
  /synthetic-data-generations:
    post:
      summary: Initialize a Synthetic Data Generation job
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/SDGJobBody'
      responses:
        "201":
          description: A successfully submitted job
          content:
            application/json:
              schema:
                $ref: '../common/job-status.yaml#/schemas/JobStatus'
  /synthetic-data-generations/{job_id}:
    get:
      summary: Retrieve the status of a Synthetic Data Generation job
      responses:
        "200":
          description: The status for the job
          content:
            application/json:
              schema:
                $ref: '../common/job-status.yaml#/schemas/JobStatus'

    delete:
      summary: Cancel a running Synthetic Data Generation job
      responses:
        "200":
          description: The status for the job after cancellation
          content:
            application/json:
              schema:
                $ref: '../common/job-status.yaml#/schemas/JobStatus'

  /synthetic-data-generations/tasks:
    get:
      summary: List the currently supported SDG tasks
      responses:
        "200":
          description: The set of currently supported SDG tasks
          content:
            application/json:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/SDGTaskDefinition'

components:
  schemas:
    SDGJobBody:
      type: object
      required: ['output_directory', 'tasks', 'seed_data']
      properties:
        output_directory:
          description: Location to place the output in
          $ref: '../common/file-path.yaml#/schemas/DirectoryPath'

        tasks:
          description: Mapping from task name to task config. The config will be validated against the task's config schema.

Review comment:

I see there's the potential for multiple "tasks" within one job: what do you visualize that being long term? Would it be something along the lines of a "generate" task, a "mix" task to do random mixing of that data, and/or a "filter" task to then filter some of the data, with all of that still living within the context of one SDGJob?

Author: Good question. I took this from the CLI args to the library we're thinking of for the generic platform SDG implementation. I think for InstructLab, we'll likely only ever run a single Task (dataset + config + generation algorithm) per job.

          type: object
          additionalProperties:
            type: object
            description: Config for the given task. This will be deep-merged over the default values.

        seed_data:
          description: The file or directory containing the seed data
          oneOf:
            - $ref: '../common/file-path.yaml#/schemas/FilePath'
            - $ref: '../common/file-path.yaml#/schemas/DirectoryPath'

    SDGTaskDefinition:
      type: object
      required: ['name', 'data_json_schema', 'config_json_schema']
      properties:
        name:
          type: string
          description: The name of the task
        data_json_schema:
          type: object
          description: The json schema for input data files for this task
          # TODO: This doesn't render cleanly for some reason, but the body here
          # must be a valid JSON Schema
          # $ref: 'https://json-schema.org/draft-04/schema#'
        data_example:
          type: string
          description: Example of an input data file for this task
        config_json_schema:

Review comment:

By "config", is this where you visualize a flexible "json blob" where users could request "advanced parameters" when necessary to feed into SDG that maybe aren't default (lower-level things like num samples, the algorithm utilized in SDG, etc.)?

I like the idea of it starting out really flexible; I guess different implementations, at different moments, would only allow a given subset of what can be sent at the config level.

Author: Yep, that's exactly the idea here. We imagine the set of tasks to be extensible. Initially, this would be "build time," where the owner of the docker image would rebuild with new task implementations, but eventually we'd imagine users creating their own tasks by binding proprietary datasets/prompts/etc. to existing generation algorithms. Each generation algorithm has a set of lower-level configs that can theoretically be overridden for each job, so the idea with this API is that when creating the job, the config overrides are an opaque blob, but you can query the system beforehand to understand the right schema for that blob. This avoids the need for us to keep a giant oneOf in the API definitions while still giving the user the ability to know the acceptable schemas that will be used for validation.

          type: object
          description: The json schema for the config of this task
          # TODO: This doesn't render cleanly for some reason, but the body here
          # must be a valid JSON Schema
          # $ref: 'https://json-schema.org/draft-04/schema#'
        config_defaults:
          type: object
          description: Default values for all config values
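The `tasks` config is described above as being deep-merged over the task's `config_defaults`. One plausible reading of that merge semantics (the `deep_merge` helper and the config keys in the example are illustrative assumptions, not the actual implementation):

```python
def deep_merge(defaults: dict, overrides: dict) -> dict:
    """Overlay overrides onto defaults: nested dicts merge recursively,
    every other value is replaced wholesale. Illustrative semantics only."""
    merged = dict(defaults)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

# Hypothetical config keys, loosely inspired by the review discussion
defaults = {"generation": {"num_samples": 100, "temperature": 0.7}, "seed": 42}
overrides = {"generation": {"num_samples": 500}}
print(deep_merge(defaults, overrides))
# {'generation': {'num_samples': 500, 'temperature': 0.7}, 'seed': 42}
```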
58 changes: 58 additions & 0 deletions docs/backend/api-definitions-guidelines.md
@@ -0,0 +1,58 @@
# API Definitions Guidelines

This document describes how service APIs will be managed for `InstructLab` and the sub-components of the `InstructLab` backend.

## What parts of InstructLab need service APIs?

There are two primary classes of service APIs needed to support InstructLab:

* `Platform APIs`: These are APIs that are ignorant of `InstructLab` and provide generic AI platform capabilities (e.g., Fine Tuning, SDG, Eval)
* `InstructLab APIs`: These are the APIs that reflect the user-facing functionality of `InstructLab` itself. They are aware of the end-to-end `InstructLab` workflow.

The `InstructLab APIs` are essential for hosting `InstructLab` as a service in a repeatable way. The `Platform APIs` are critical for component reuse and extensibility (e.g., new SDG algorithms for new taxonomy data types), but they are not strictly required for hosting `InstructLab` as a service.

## How will service APIs be defined?

Service APIs will be defined using [OpenAPI](https://www.openapis.org/) format in [YAML](https://yaml.org/). For structural and style guidelines, see [api-definitions](../../api-definitions/README.md).

## Where will service API definitions live?

Service API definitions will live in a new repository, github.com/instructlab/service-api-definitions. This repo will have two primary responsibilities:

1. House the static service API definitions
2. Build and publish any language-specific generated packages for consumption by service implementation projects (see below)

## How will service implementations reference shared APIs?

When a project chooses to implement one or more service APIs, there are three acceptable methods for doing so, listed in order of preference:

1. Consume a supported language-specific package. The `service-api-definitions` repo will build consumable packages with generated code for supported languages. This is the preferred method of consumption as it avoids repository references and code duplication.
2. For languages without a supported package, the `service-api-definitions` repo may be held as a [git submodule](https://www.git-scm.com/book/en/v2/Git-Tools-Submodules).
3. It is also acceptable for an implementation to copy the relevant API definitions to the local project repository. Any changes made in the central repository will need to be sync'ed by the project owners, and any new APIs added in the project will not be considered usable until they have been integrated into the central API definitions.

## Style Guidelines

* Use `kebab-case` for path elements
* All characters must be in the [ascii](https://www.ascii-code.com/) character set to avoid percent encoding in URIs
* All letters must be lowercase
* Words are separated by the `-` (dash) character
* Use `snake_case` for properties
* All characters must be in the [utf-8](https://www.w3schools.com/charsets/ref_html_utf8.asp) character set for simple `json` encoding
* Words are separated by the `_` (underscore) character
* Use `UpperCamelCase` for internal reusable schema names
* These are internal names, so the character set is not limited
* Words are capitalized and concatenated with no separator
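The three casing rules above can be sketched as simple converters in Python (the helper names are hypothetical; this is just to illustrate the conventions):

```python
import re

def to_kebab(name: str) -> str:
    """UpperCamelCase or snake_case -> kebab-case (for path elements)."""
    return re.sub(r"(?<!^)(?=[A-Z])", "-", name).replace("_", "-").lower()

def to_snake(name: str) -> str:
    """UpperCamelCase -> snake_case (for properties)."""
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()

print(to_kebab("SyntheticDataGenerations"))  # synthetic-data-generations
print(to_snake("JobId"))                     # job_id
```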

## API Layout

* There will be two main portions of the APIs:
* `instructlab.yaml`: This defines the user-facing `InstructLab` REST API
* `platform.yaml`: This defines the platform-level APIs used by the `InstructLab` workflow.
* Each platform `Capability` should own its own fully-functional sub-API file that can be used by individual capability service implementations
* Any schema object that is reused between endpoints should be housed in a schema file under the central `common` directory.

## Versioning and Stability

**WARNING** At this stage in development, we make no guarantees about stability and support for APIs!

**FUTURE**: Once stabilized, the APIs will follow an agreed-upon form of [semantic versioning](https://semver.org/) so that users can rely on the API's stability. The decision of how to version the API and at what granularity to do so is still under discussion.