diff --git a/demo.py b/demo.py
index a12f1d6c396..fcbe9d78dec 100644
--- a/demo.py
+++ b/demo.py
@@ -17,23 +17,30 @@ print(account.usages)
 
 # Make a request
-request = Request(model="ai21/j1-large", prompt="Life is like a box of", echo_prompt=True)
+request = Request(
+    model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", echo_prompt=True
+)
 request_result: RequestResult = service.make_request(auth, request)
 print(request_result.completions[0].text)
 
 # Expect different responses for the same request but with different values for `random`.
 # Passing in the same value for `random` guarantees the same results.
-request = Request(prompt="Life is like a box of", random="1")
+request = Request(model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", random="1")
 request_result = service.make_request(auth, request)
 print(request_result.completions[0].text)
 
 # How to get the embedding for some text
-request = Request(model="openai/text-similarity-ada-001", prompt="Life is like a box of", embedding=True)
+request = Request(
+    model="openai/text-similarity-ada-002",
+    model_deployment="openai/text-similarity-ada-002",
+    prompt="Life is like a box of",
+    embedding=True,
+)
 request_result = service.make_request(auth, request)
 print(request_result.embedding)
 
 # Tokenize
-request = TokenizationRequest(tokenizer="ai21/j1-jumbo", text="Tokenize me please.")
+request = TokenizationRequest(tokenizer="ai21/j2-jumbo", text="Tokenize me please.")
 tokenization_request_result: TokenizationRequestResult = service.tokenize(auth, request)
 print(f"Number of tokens: {len(tokenization_request_result.tokens)}")
diff --git a/docs/get_helm_rank.md b/docs/get_helm_rank.md
new file mode 100644
index 00000000000..cf5e26345cc
--- /dev/null
+++ b/docs/get_helm_rank.md
@@ -0,0 +1,84 @@
+# Get Your Model's Leaderboard Rank
+
+This tutorial shows you how to add your model to the HELM leaderboard locally, in three steps:
+
+## Download HELM leaderboard results
+
+First, to compare your model against the latest and greatest models on the [HELM leaderboard](https://crfm.stanford.edu/helm/latest/?group=core_scenarios), choose the leaderboard version whose results you want to download:
+
+```bash
+export LEADERBOARD_VERSION=v0.3.0
+```
+
+Then fetch the zip file of all previous HELM results and expand it into HELM's results directory:
+
+```bash
+curl -O https://storage.googleapis.com/crfm-helm-public/benchmark_output/archives/$LEADERBOARD_VERSION/run_stats.zip &&\
+mkdir -p benchmark_output/runs/$LEADERBOARD_VERSION && unzip run_stats.zip -d benchmark_output/runs/$LEADERBOARD_VERSION
+```
+
+Now that the files are in your results directory, all HELM models will be shown in the UI alongside your model.
+
+## Run Efficient-HELM
+
+According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf), a paper from IBM that systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimate of a full run (Perlitz et al., 2023).
+
+Specifically, the authors calculated the $95\%$ CI of the rank location against the true ranks as a function of the number of examples used per scenario, and arrived at the following tradeoffs[^1]:
+
+| Examples Per Scenario | $95\%$ CI of Rank Location | Compute saved |
+| :-------------------: | :------------------------: | :-----------: |
+| $10$ | $\pm5$ | $\times400$ |
+| $20$ | $\pm4$ | $\times200$ |
+| $50$ | $\pm3$ | $\times80$ |
+| $200$ | $\pm2$ | $\times20$ |
+| $1000$ | $\pm1$ | $\times4$ |
+| All | $\pm1$ | $\times1$ |
+
+
+Choose your point on the tradeoff: how accurate does your rank estimate need to be, and how long are you willing to wait? Once you have chosen, set the number of examples per scenario and the model you want to run:
+```bash
+export EXAMPLES_PER_SCENARIO=10 && \
+export MODEL_TO_RUN=huggingface/gpt2
+```
+
+Then run the following to download the matching config file:
+
+```bash
+wget https://raw.githubusercontent.com/stanford-crfm/helm/main/src/helm/benchmark/presentation/run_specs_core_scenarios_$EXAMPLES_PER_SCENARIO.conf -O run_specs_$EXAMPLES_PER_SCENARIO.conf
+```
+
+and this one to run the benchmark:
+
+```bash
+helm-run \
+--conf-paths run_specs_$EXAMPLES_PER_SCENARIO.conf \
+--suite $LEADERBOARD_VERSION \
+--max-eval-instances $EXAMPLES_PER_SCENARIO \
+--models-to-run $MODEL_TO_RUN \
+--cache-instances \
+--num-train-trials 1 \
+--skip-completed-runs
+```
+
+The first run will take some time, since all the data (regardless of the number of examples chosen) has to be downloaded and prepared.
+
+
+## Summarize and serve your results
+
+To see how your model fits in with the latest leaderboard, process and aggregate your results with:
+
+```bash
+helm-summarize --suite $LEADERBOARD_VERSION
+```
+
+And serve with:
+
+```bash
+helm-server
+```
+
+## References
+
+Perlitz, Y., Bandel, E., Gera, A., Arviv, O., Ein-Dor, L., Shnarch, E., Slonim, N., Shmueli-Scheuer, M. and Choshen, L., 2023. Efficient Benchmarking (of Language Models). arXiv preprint arXiv:2308.11696.
+
+[^1]: Note that the quantities in the table are the $95\%$ CI of the rank location and are thus very conservative estimates. In our experiments, we did not observe deviations above $\pm2$ for any of the options above.
diff --git a/docs/index.md b/docs/index.md
index c1d090537d1..cd79581fa41 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -11,6 +11,7 @@ To run the code, refer to the User Guide's chapters:
 - [Installation](installation.md)
 - [Quick Start](quick_start.md)
 - [Tutorial](tutorial.md)
+- [Get Your Model's Leaderboard Rank](get_helm_rank.md)
 
 To add new models and scenarios, refer to the Developer Guide's chapters:
 
diff --git a/docs/quick_start.md b/docs/quick_start.md
index b9463733cb2..f8d35026306 100644
--- a/docs/quick_start.md
+++ b/docs/quick_start.md
@@ -18,3 +18,4 @@ helm-server
 
 Then go to http://localhost:8000/ in your browser.
 
+**Next steps:** see [Get Your Model's Leaderboard Rank](get_helm_rank.md) to find out how to run the full benchmark and get your model's leaderboard rank.
\ No newline at end of file
diff --git a/docs/tutorial.md b/docs/tutorial.md
index d02fd9cb5cc..cc103e11f6d 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -2,20 +2,20 @@
 
 This tutorial will explain how to use the HELM command line tools to run benchmarks, aggregate statistics, and visualize results.
 
-We will run two runs using the `mmlu` scenario on the `huggingface/gpt-2` model. The `mmlu` scenario implements the **Massive Multitask Language (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy.
+We will run two runs using the `mmlu` scenario on the `openai/gpt2` model. The `mmlu` scenario implements the **Massive Multitask Language Understanding (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy.
 
 ## Using `helm-run`
 
 `helm-run` is a command line tool for running benchmarks.
 
-To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describes the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=huggingface/gpt-2` (for anatomy) and `mmlu:subject=philosophy,model=huggingface/gpt-2` (for philosophy).
+To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describe the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=openai/gpt2` (for anatomy) and `mmlu:subject=philosophy,model=openai/gpt2` (for philosophy).
 
 Next, we need to create a **run spec configuration file** contining these run spec descriptions. A run spec configuration file is a text file containing `RunEntries` serialized to JSON, where each entry in `RunEntries` contains a run spec description. The `description` field of each entry should be a **run spec description**. Create a text file named `run_specs.conf` with the following contents:
 
 ```
 entries: [
-  {description: "mmlu:subject=anatomy,model=huggingface/gpt2", priority: 1},
-  {description: "mmlu:subject=philosophy,model=huggingface/gpt2", priority: 1},
+  {description: "mmlu:subject=anatomy,model=openai/gpt2", priority: 1},
+  {description: "mmlu:subject=philosophy,model=openai/gpt2", priority: 1},
 ]
 ```
 
@@ -35,7 +35,7 @@ The meaning of the additional arguments are as follows:
 - The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
 - The output directory is `benchmark_output/` by default and can be set using `--output-path`.
 
-After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
+After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`.
Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`. Each output sub-directory will contain several JSON files that were generated during the corresponding run: diff --git a/scripts/compute_request_limits.py b/scripts/compute_request_limits.py index a5060e37255..55cd813d70c 100644 --- a/scripts/compute_request_limits.py +++ b/scripts/compute_request_limits.py @@ -1,10 +1,12 @@ # This script is used to find out the max_prompt_length and max_prompt_length_plus_tokens for a given model. # You must set max_attempts to 1 in retry.py to make it work. # Example usage: -# python compute_request_limits.py --model_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" +# python compute_request_limits.py --model_deployment_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" from typing import Any, Optional, Dict from helm.proxy.clients.auto_client import AutoClient +from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment +from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer from helm.common.request import Request from helm.common.tokenization_request import TokenizationRequest @@ -40,6 +42,7 @@ def get_number_of_tokens(prompt: str, tokenizer: Tokenizer, tokenizer_name: str) def try_request( client: Any, + model_deployment_name: str, model_name: str, tokenizer_name: str, tokenizer: Tokenizer, @@ -58,6 +61,7 @@ def try_request( try: request = Request( model=model_name, + model_deployment=model_deployment_name, prompt=prefix + " ".join(["hello"] * (sequence_length - num_tokens_prefix - num_tokens_suffix)) + suffix, max_tokens=num_tokens, ) @@ -78,6 +82,8 @@ class RequestLimits: def figure_out_max_prompt_length( client: AutoClient, + auto_tokenizer: AutoTokenizer, + model_deployment_name: str, model_name: str, tokenizer_name: str, upper_bound: int = 9500, @@ -85,7 +91,7 @@ def figure_out_max_prompt_length( prefix: str = "", suffix: str = "", ) -> RequestLimits: - tokenizer = client._get_tokenizer(tokenizer_name) + tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name) num_tokens_prefix = get_number_of_tokens(prefix, tokenizer, tokenizer_name) num_tokens_suffix = get_number_of_tokens(suffix, tokenizer, tokenizer_name) @@ -95,7 +101,9 @@ def figure_out_max_prompt_length( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix): + if try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix + ): lower_bound = middle else: upper_bound = middle - 1 @@ -117,6 +125,7 @@ def figure_out_max_prompt_length( def figure_out_max_prompt_length_plus_tokens( client: Any, # Client, + model_deployment_name: str, model_name: str, tokenizer_name: str, max_prompt_length: int, @@ -130,6 +139,7 @@ def figure_out_max_prompt_length_plus_tokens( # Check if there is a limit (some model accept as many tokens as you want) if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -148,7 +158,17 @@ def figure_out_max_prompt_length_plus_tokens( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, middle, prefix, suffix): + if try_request( + 
client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length, + middle, + prefix, + suffix, + ): lower_bound = middle else: upper_bound = middle - 1 @@ -159,20 +179,24 @@ def figure_out_max_prompt_length_plus_tokens( def check_limits( client: AutoClient, + auto_tokenizer: AutoTokenizer, + model_deployment_name: str, model_name: str, tokenizer_name: str, limits: RequestLimits, prefix: str = "", suffix: str = "", ) -> bool: - tokenizer = client._get_tokenizer(tokenizer_name) + tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name) result: bool = True # Check the max_prompt_length max_prompt_length = limits.max_prompt_length if max_prompt_length < 0: print("No limit on the number of tokens") - if not try_request(client, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix + ): print(f"There is a limit on the number of tokens. Params: max_prompt_length={2**32 - 2}, max_tokens=1") result = False else: @@ -180,15 +204,37 @@ def check_limits( # If there is no limit on the number of tokens, max_prompt_length should be -1 # And we should not be here # Check that max_prompt_length is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix + ): print(f"max_prompt_length is too big. Params: max_prompt_length={max_prompt_length}, max_tokens=1") result = False # Check that max_prompt_length + 1 is not ok - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length + 1, 0, prefix, suffix): + if try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length + 1, + 0, + prefix, + suffix, + ): print(f"max_prompt_length could be bigger. Params: max_prompt_length={max_prompt_length+1}, max_tokens=1") result = False # Check that max_prompt_length - 1 is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length - 1, 0, prefix, suffix): + if not try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length - 1, + 0, + prefix, + suffix, + ): print( f"max_prompt_length ssems to be inconsistent. max_prompt_length={max_prompt_length} " f"is ok but max_prompt_length={max_prompt_length-1} is not, with max_tokens=0" @@ -203,7 +249,15 @@ def check_limits( if max_prompt_length_plus_tokens < 0: print("No limit on the number of tokens") if not try_request( - client, model_name, tokenizer_name, tokenizer, max(1, max_prompt_length), 2**32 - 2, prefix, suffix + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max(1, max_prompt_length), + 2**32 - 2, + prefix, + suffix, ): print( f"There is a limit on the number of tokens. 
Params: max_prompt_length={max_prompt_length}," @@ -216,6 +270,7 @@ def check_limits( # If there is no limit on the number of tokens, we skip this test if not try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -231,6 +286,7 @@ def check_limits( result = False if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -251,7 +307,8 @@ def check_limits( def get_args(): # model_name, tokenizer_name, prefix and suffix are passed as arguments parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_deployment_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_name", type=str, default="") parser.add_argument("--tokenizer_name", type=str, default="Writer/palmyra-base") parser.add_argument( "--prefix", @@ -268,6 +325,10 @@ def get_args(): parser.add_argument("--credentials_path", type=str, default="../prod_env/credentials.conf") parser.add_argument("--cache_path", type=str, default="../prod_env/cache") args = parser.parse_args() + + if args.model_name == "": + model_deployment: ModelDeployment = get_model_deployment(args.model_deployment_name) + args.model_name = model_deployment.model_name return args @@ -284,10 +345,16 @@ def main(): print(f"cache_path: {cache_path}") client = AutoClient(credentials=credentials, cache_path=cache_path) + auto_tokenizer = AutoTokenizer(credentials=credentials, cache_path=cache_path) print("client successfully created") print("Making short request...") - request = Request(model=args.model_name, prompt=args.prefix + "hello" + args.suffix, max_tokens=1) + request = Request( + model=args.model_name, + model_deployment=args.model_deployment_name, + prompt=args.prefix + "hello" + args.suffix, + max_tokens=1, + ) response = client.make_request(request) if not response.success: raise ValueError("Request failed") @@ -305,7 +372,13 @@ def main(): print("========== Figure out max_prompt_length ==========") limits: RequestLimits = figure_out_max_prompt_length( - client, args.model_name, args.tokenizer_name, prefix=args.prefix, suffix=args.suffix + client, + auto_tokenizer, + args.model_deployment_name, + args.model_name, + args.tokenizer_name, + prefix=args.prefix, + suffix=args.suffix, ) print(f"max_prompt_length: {limits.max_prompt_length}") print("===================================================") @@ -314,6 +387,7 @@ def main(): print("========== Figure out max_prompt_length_plus_tokens ==========") max_prompt_length_plus_tokens: int = figure_out_max_prompt_length_plus_tokens( client, + args.model_deployment_name, args.model_name, args.tokenizer_name, max_prompt_length=limits.max_prompt_length, @@ -328,7 +402,14 @@ def main(): # Check the limits print("========== Check the limits ==========") result: bool = check_limits( - client, args.model_name, args.tokenizer_name, limits, prefix=args.prefix, suffix=args.suffix + client, + auto_tokenizer, + args.model_deployment_name, + args.model_name, + args.tokenizer_name, + limits, + prefix=args.prefix, + suffix=args.suffix, ) if result: print("All limits are respected") diff --git a/scripts/efficiency/generate_instances.py b/scripts/efficiency/generate_instances.py index 615b0600825..626bb6a7be6 100644 --- a/scripts/efficiency/generate_instances.py +++ b/scripts/efficiency/generate_instances.py @@ -17,8 +17,8 @@ DecodeRequestResult, TokenizationToken, ) -from helm.proxy.clients.client import Client -from helm.proxy.clients.auto_client import 
AutoClient +from helm.proxy.tokenizers.tokenizer import Tokenizer +from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer from helm.proxy.services.service import ( CACHE_DIR, ) @@ -40,25 +40,28 @@ } -def _count_prompt_tokens(client: Client, prompt: str, tokenizer: str): - request: TokenizationRequest = TokenizationRequest(text=prompt, tokenizer=tokenizer) - result: TokenizationRequestResult = client.tokenize(request) +def _count_prompt_tokens(tokenizer: Tokenizer, prompt: str, tokenizer_name: str): + request: TokenizationRequest = TokenizationRequest(text=prompt, tokenizer=tokenizer_name) + result: TokenizationRequestResult = tokenizer.tokenize(request) return len(result.tokens) -def get_client(base_path: str = "prod_env"): +def get_tokenizer(base_path: str = "prod_env") -> AutoTokenizer: credentials = get_credentials(base_path) cache_path = os.path.join(base_path, CACHE_DIR) ensure_directory_exists(cache_path) # TODO: Pass mongo_uri to AutoClient - client = AutoClient(credentials, cache_path) + tokenizer = AutoTokenizer(credentials, cache_path) - return client + return tokenizer def tokenize_text( - client: AutoClient, tokenizer: str, output_path: str = "synthetic_efficiency_instances", base_path: str = "prod_env" + tokenizer: AutoTokenizer, + tokenizer_name: str, + output_path: str = "synthetic_efficiency_instances", + base_path: str = "prod_env", ) -> Tuple[Dict[str, List[TokenizationToken]], Dict[str, List[str]]]: """Tokenizes each book using the requested tokenizer service.""" sources = { @@ -72,7 +75,7 @@ def tokenize_text( tokens: Dict[str, List[TokenizationToken]] = {} text_chunks: Dict[str, List[str]] = {} - tokenizer_organization: str = tokenizer.split("/")[0] + tokenizer_organization: str = tokenizer_name.split("/")[0] ai21_tokenizer: bool = tokenizer_organization == "ai21" # Extract tokens from book sources @@ -96,9 +99,9 @@ def tokenize_text( batch = " ".join(text[i * batch_size : (i + 1) * batch_size]) while True: request: TokenizationRequest = TokenizationRequest( - text=batch, tokenizer=tokenizer, encode=(not ai21_tokenizer) + text=batch, tokenizer=tokenizer_name, encode=(not ai21_tokenizer) ) - result: TokenizationRequestResult = client.tokenize(request) + result: TokenizationRequestResult = tokenizer.tokenize(request) tokens_ = frozenset([token.value for token in result.tokens]) if tokens_ not in seen_tokens: seen_tokens.add(tokens_) @@ -116,15 +119,15 @@ def tokenize_text( def generate_synthetic_efficiency_instances( tokens: Dict[str, List[TokenizationToken]], text_chunks: Dict[str, List[str]], - client: Client, + tokenizer: Tokenizer, num_instances: int, num_prompt_tokens: int, - tokenizer: str, + tokenizer_name: str, output_path: str = "synthetic_efficiency_instances", base_path: str = "prod_env", ): """Generates the synthetic efficiency instances given the tokenized book sources.""" - tokenizer_organization: str = tokenizer.split("/")[0] + tokenizer_organization: str = tokenizer_name.split("/")[0] ai21_tokenizer: bool = tokenizer_organization == "ai21" books = list(tokens.keys()) @@ -155,13 +158,13 @@ def generate_synthetic_efficiency_instances( prompt = "".join(per_instance_tokens) else: decode_request: DecodeRequest = DecodeRequest(tokens=per_instance_tokens) # type: ignore - decode_result: DecodeRequestResult = client.decode(decode_request) + decode_result: DecodeRequestResult = tokenizer.decode(decode_request) prompt = decode_result.text if prompt == "": num_generated_tokens = 0 else: - num_generated_tokens = _count_prompt_tokens(client, prompt, tokenizer) 
+ num_generated_tokens = _count_prompt_tokens(tokenizer, prompt, tokenizer_name) if num_generated_tokens != num_prompt_tokens: temp_num_tokens = num_generated_tokens while temp_num_tokens < num_prompt_tokens: @@ -190,7 +193,7 @@ def generate_synthetic_efficiency_instances( if not finished: print( f"Requested {num_prompt_tokens}, got {num_generated_tokens} for " - f"book {books[j]}, instance #{orig_i}, tokenizer={tokenizer}, " + f"book {books[j]}, instance #{orig_i}, tokenizer={tokenizer_name}, " "trying again with a new span of text..." ) attempt_num += 1 @@ -199,15 +202,15 @@ def generate_synthetic_efficiency_instances( for i, prompt in enumerate(prompts): for k, v in TOKENIZER_REPLACEMENTS.items(): - tokenizer = tokenizer.replace(k, v) - name = f"num_prompt_tokens={num_prompt_tokens}," f"tokenizer={tokenizer.replace('/', '_')}," f"id={i}.txt" + tokenizer_name = tokenizer_name.replace(k, v) + name = f"num_prompt_tokens={num_prompt_tokens}," f"tokenizer={tokenizer_name.replace('/', '_')}," f"id={i}.txt" write(os.path.join(output_path, name), prompt) if __name__ == "__main__": - client = get_client() + tokenizer = get_tokenizer() - for tokenizer in [ + for tokenizer_name in [ "huggingface/gpt2", "ai21/j1", "cohere/cohere", @@ -221,13 +224,13 @@ def generate_synthetic_efficiency_instances( "EleutherAI/gpt-neox-20b", "EleutherAI/gpt-j-6B", ]: - tokens, text_chunks = tokenize_text(tokenizer=tokenizer, client=client) + tokens, text_chunks = tokenize_text(tokenizer=tokenizer, tokenizer_name=tokenizer_name) for num_prompt_tokens in NUM_INPUT_TOKENS: generate_synthetic_efficiency_instances( tokens=tokens, text_chunks=text_chunks, - client=client, + tokenizer=tokenizer, num_instances=30, num_prompt_tokens=num_prompt_tokens, - tokenizer=tokenizer, + tokenizer_name=tokenizer_name, ) diff --git a/setup.cfg b/setup.cfg index ac7edc699ff..764a864116a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -147,7 +147,7 @@ all = dev = pytest~=7.2.0 black~=22.10.0 - mypy~=0.982 + mypy~=1.5.1 pre-commit~=2.20.0 flake8~=5.0.4 diff --git a/src/helm-frontend/README.md b/src/helm-frontend/README.md index c8c5ac97318..d1cc19541a0 100644 --- a/src/helm-frontend/README.md +++ b/src/helm-frontend/README.md @@ -1,13 +1,11 @@ -React Frontend for HELM -------------------------- +## React Frontend for HELM This directory contains the files for building and developing an alternative React based frontend for HELM. If you are looking for the current frontend deployed to https://crfm.stanford.edu/helm/latest/ you will want to look in `helm/benchmark/static` and `helm/benchmark/proxy/static`. If you are looking to make changes to the alternative React frontend, then you are in the correct place. This app makes use of [React](https://react.dev/) + [TypeScript](https://www.typescriptlang.org/) and built with [vite](https://vitejs.dev/). [Tailwindcss](https://tailwindcss.com/) is used for CSS along with some help from the UI frameworks [daisyUI](https://daisyui.com/) and [tremor](https://www.tremor.so/). [React Testing Library](https://testing-library.com/docs/react-testing-library/intro/) is used for tests. - - ### Installation + ```bash npm Install ``` @@ -38,6 +36,12 @@ npm run test npm run build ``` +### Deployment + +For deployment, you can use the build directory out of the box (using the previous command), but will have to replace every leading / in href and src in order for deploys to GitHub Pages to work correctly. 
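+
+As a rough sketch of that step (assuming GNU sed and that the production build lands in `dist/`, Vite's default output directory; neither of which this README guarantees), the leading slashes could be rewritten after the build like so:
+
+```bash
+# Hypothetical post-build step: turn absolute href/src URLs into relative ones for GitHub Pages
+find dist -type f \( -name '*.html' -o -name '*.js' -o -name '*.css' \) \
+  -exec sed -i 's|href="/|href="./|g; s|src="/|src="./|g' {} +
+```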
+ +You can rename the build directory to the desired release name and upload it to GitHub pages to add a new front-end release. Currently the GitHub workflow for deploying the React front-end does not do this and is not being used. + ### Linting ```bash @@ -46,6 +50,8 @@ npm run lint ### Formatting +If you don't have prettier configured in your IDE or Node environment, you will have to run the following before commiting, in order to pass tests. + ```bash npm run format ``` diff --git a/src/helm-frontend/index.html b/src/helm-frontend/index.html index 787b4e58dd9..b728d7f97f9 100644 --- a/src/helm-frontend/index.html +++ b/src/helm-frontend/index.html @@ -6,6 +6,7 @@ Holistic Evaluation of Language Models (HELM) +
diff --git a/src/helm-frontend/public/config.js b/src/helm-frontend/public/config.js new file mode 100644 index 00000000000..87668190bc7 --- /dev/null +++ b/src/helm-frontend/public/config.js @@ -0,0 +1,4 @@ +window.BENCHMARK_OUTPUT_BASE_URL = + "https://storage.googleapis.com/crfm-helm-public/"; +window.SUITE = "v0.2.4"; +window.RELEASE = "v0.3.0"; diff --git a/src/helm-frontend/src/App.tsx b/src/helm-frontend/src/App.tsx index 8d6cdd91356..3808668c153 100644 --- a/src/helm-frontend/src/App.tsx +++ b/src/helm-frontend/src/App.tsx @@ -1,26 +1,28 @@ import "./App.css"; import { HashRouter as Router, Route, Routes } from "react-router-dom"; import Layout from "@/layouts/Main"; -import Home from "@/routes/Home"; import Models from "@/routes/Models"; import Scenarios from "@/routes/Scenarios"; import Groups from "@/routes/Groups"; import Group from "@/routes/Group"; import Runs from "@/routes/Runs"; import Run from "@/routes/Run"; +import Landing from "@/routes/Landing"; +import Leaderboard from "@/routes/Leaderboard"; export default function App() { return ( - }> - } /> - } /> - } /> - } /> - } /> - } /> - } /> + }> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> diff --git a/src/helm-frontend/src/assets/heim-logo.png b/src/helm-frontend/src/assets/heim-logo.png new file mode 100644 index 00000000000..ee23761a2bc Binary files /dev/null and b/src/helm-frontend/src/assets/heim-logo.png differ diff --git a/src/helm-frontend/src/assets/helmhero.png b/src/helm-frontend/src/assets/helmhero.png new file mode 100644 index 00000000000..05b9da5ebd8 Binary files /dev/null and b/src/helm-frontend/src/assets/helmhero.png differ diff --git a/src/helm-frontend/src/components/Alert.tsx b/src/helm-frontend/src/components/Alert.tsx new file mode 100644 index 00000000000..e6ab0aa6656 --- /dev/null +++ b/src/helm-frontend/src/components/Alert.tsx @@ -0,0 +1,39 @@ +import { useState } from "react"; +import Link from "./Link"; + +export default function Alert() { + const [visible, setVisible] = useState(true); + + const handleClose = () => { + setVisible(false); + }; + + return ( + visible && ( +
+
+ + Welcome to the new results view, + + for the old view, + + click here + +
+ + Close + +
+ ) + ); +} diff --git a/src/helm-frontend/src/components/Card.tsx b/src/helm-frontend/src/components/Card.tsx index 7f4703e1f52..03bed335aed 100644 --- a/src/helm-frontend/src/components/Card.tsx +++ b/src/helm-frontend/src/components/Card.tsx @@ -3,7 +3,7 @@ export default function Card() {
Total Models
-
123
+
1
); diff --git a/src/helm-frontend/src/components/Footer.tsx b/src/helm-frontend/src/components/Footer.tsx index adb2c0fb25d..068419a95d2 100644 --- a/src/helm-frontend/src/components/Footer.tsx +++ b/src/helm-frontend/src/components/Footer.tsx @@ -1,7 +1,8 @@ -import getBenchmarkSuite from "@/utils/getBenchmarkSuite"; +import getBenchmarkRelease from "@/utils/getBenchmarkRelease"; +//import getBenchmarkSuite from "@/utils/getBenchmarkSuite"; export default function Footer() { - const version = getBenchmarkSuite(); + const version = getBenchmarkRelease(); return (

Version {version}

diff --git a/src/helm-frontend/src/components/Hero.tsx b/src/helm-frontend/src/components/Hero.tsx new file mode 100644 index 00000000000..886b33196f8 --- /dev/null +++ b/src/helm-frontend/src/components/Hero.tsx @@ -0,0 +1,44 @@ +import helmHero from "@/assets/helmhero.png"; +import { Link } from "react-router-dom"; + +export default function Hero() { + return ( +
+ {/* Left side content */} +
+ {" "} + {/* Added flex and justify-center */} +
+

+ + A holistic framework for evaluating foundation models. + +

+
+
+ + + + + {" "} + {/* Added margin-left for spacing */} + + +
+
+ + {/* Right side image */} +
+ {" "} + {/* Added mx-4 for horizontal margin */} + HELM Hero +
+
+ ); +} diff --git a/src/helm-frontend/src/components/LeaderboardTables.tsx b/src/helm-frontend/src/components/LeaderboardTables.tsx new file mode 100644 index 00000000000..3580fda1e4b --- /dev/null +++ b/src/helm-frontend/src/components/LeaderboardTables.tsx @@ -0,0 +1,128 @@ +import { useEffect, useState } from "react"; +import { ChevronUpDownIcon } from "@heroicons/react/24/solid"; +import type GroupsTable from "@/types/GroupsTable"; +import RowValue from "@/components/RowValue"; + +interface Props { + groupsTables: GroupsTable[]; + activeGroup: number; + ignoreHref?: boolean; + sortable?: boolean; + sortFirstMetric?: boolean; +} + +export default function LeaderboardTables({ + groupsTables, + activeGroup, + ignoreHref = false, + sortable = true, + sortFirstMetric = true, +}: Props) { + const [activeSortColumn, setActiveSortColumn] = useState( + sortFirstMetric ? 1 : undefined, + ); + const [activeGroupsTable, setActiveGroupsTable] = useState({ + ...groupsTables[activeGroup], + }); + const [sortDirection, setSortDirection] = useState(1); + + useEffect(() => { + setActiveGroupsTable({ ...groupsTables[activeGroup] }); + }, [activeGroup, groupsTables]); + + const handleSort = (columnIndex: number) => { + let sort = sortDirection; + if (activeSortColumn === columnIndex) { + sort = sort * -1; + } else { + sort = 1; + } + setActiveSortColumn(columnIndex); + setSortDirection(sort); + + setActiveGroupsTable((prev) => { + const group = { ...prev }; + group.rows.sort((a, b) => { + const av = a[columnIndex]?.value; + const bv = b[columnIndex]?.value; + if (av !== undefined && bv === undefined) { + return -1; + } + if (bv !== undefined && av === undefined) { + return 1; + } + if (typeof av === "number" && typeof bv === "number") { + return (av - bv) * sort; + } + if (typeof av === "string" && typeof bv === "string") { + if (sort === 1) { + return av.localeCompare(bv); + } + return bv.localeCompare(av); + } + + return 0; + }); + + return group; + }); + }; + useEffect(() => { + if (sortFirstMetric && activeSortColumn) { + handleSort(activeSortColumn); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [sortFirstMetric, activeSortColumn]); + + return ( +
+
+ + + + {activeGroupsTable.header.map((headerValue, idx) => ( + + ))} + + + + {activeGroupsTable.rows.map((row, idx) => ( + + {" "} + {/* Added alternating row highlighting */} + {row.map((rowValue, cellIdx) => ( + + ))} + + ))} + +
+
+ {headerValue.value} + {sortable ? ( + + ) : null} +
+
+ +
+
+
+ ); +} diff --git a/src/helm-frontend/src/components/MetricsList.tsx b/src/helm-frontend/src/components/MetricsList.tsx index 98d711661d0..0fdc996a559 100644 --- a/src/helm-frontend/src/components/MetricsList.tsx +++ b/src/helm-frontend/src/components/MetricsList.tsx @@ -1,5 +1,6 @@ import type Metric from "@/types/Metric"; import type MetricGroup from "@/types/MetricGroup"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { metrics: Metric[]; @@ -9,18 +10,21 @@ interface Props { export default function MetricList({ metrics, metricGroups }: Props) { return (
-

{metrics.length} Metrics

+

{metrics.length} metrics

    {metricGroups.map((metricGroup, idx) => ( -
  • +
  • {metrics.filter((metric) => metricGroup.metrics.some((m) => m.name === metric.name), ).length > 0 ? ( -

    - {metricGroup.display_name} ({metricGroup.name}) -

    + +

    {metricGroup.display_name}

    +
    ) : null} -
      +
        {metrics .filter((metric) => metricGroup.metrics.some((m) => m.name === metric.name), diff --git a/src/helm-frontend/src/components/ModelsList.tsx b/src/helm-frontend/src/components/ModelsList.tsx index d6c36f2ac63..10b0bde9f82 100644 --- a/src/helm-frontend/src/components/ModelsList.tsx +++ b/src/helm-frontend/src/components/ModelsList.tsx @@ -1,4 +1,5 @@ import type Model from "@/types/Model"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { models: Model[]; @@ -7,17 +8,19 @@ interface Props { export default function ModelsList({ models }: Props) { return (
        -

        {models.length} Models

        +

        {models.length} models

          {models.map((model, idx) => model.todo ? ( -
        • +
        • {model.creator_organization} / {model.display_name}
        • ) : ( -
        • - {model.creator_organization} / {model.display_name} -
        • + +
        • + {model.creator_organization} / {model.display_name} +
        • +
          ), )}
        diff --git a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx index bf49b3504d8..cbd2f650cc6 100644 --- a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx +++ b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx @@ -11,6 +11,6 @@ test("displays nav bar", () => { ); expect(screen.getByRole("navigation")).toHaveTextContent( - "ModelsScenariosResultsRaw RunsModelsScenariosResultsRaw Runs", + "LeaderboardModelsScenariosExplore PredictionsLeaderboardModelsScenariosExplore Predictions", ); }); diff --git a/src/helm-frontend/src/components/NavBar/NavBar.tsx b/src/helm-frontend/src/components/NavBar/NavBar.tsx index 340285cbc22..354aa714552 100644 --- a/src/helm-frontend/src/components/NavBar/NavBar.tsx +++ b/src/helm-frontend/src/components/NavBar/NavBar.tsx @@ -1,7 +1,8 @@ import { Link } from "react-router-dom"; import { Bars3Icon } from "@heroicons/react/24/outline"; import crfmLogo from "@/assets/crfm-logo.png"; -import helmLogo from "@/assets/helm-logo-simple.png"; +//import helmLogo from "@/assets/helm-logo-simple.png"; +import NavDropdown from "@/components/NavDropdown"; export default function NavBar() { return ( @@ -19,17 +20,17 @@ export default function NavBar() { className="menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow" >
      • - Models + Leaderboard
      • - Scenarios + Models
      • - Results + Scenarios
      • - Raw Runs + Explore Predictions
      @@ -39,23 +40,21 @@ export default function NavBar() { - - - +
diff --git a/src/helm-frontend/src/components/NavDropdown.tsx b/src/helm-frontend/src/components/NavDropdown.tsx new file mode 100644 index 00000000000..8eb9e72c00b --- /dev/null +++ b/src/helm-frontend/src/components/NavDropdown.tsx @@ -0,0 +1,80 @@ +import { useState } from "react"; +import { Link } from "react-router-dom"; + +function NavDropdown() { + const [dropdownOpen, setDropdownOpen] = useState(false); + + return ( +
+
+ + Image 1 + + + {/* Chevron Button */} + +
+ + {dropdownOpen && ( +
+
+
+ +
+ + HELM: Holistic evaluation of language + models + +
+ +
+
+ +
+ + HEIM: Holistic evaluation of image models + +
+ +
+
+
+ )} +
+ ); +} + +export default NavDropdown; diff --git a/src/helm-frontend/src/components/ScenariosList.tsx b/src/helm-frontend/src/components/ScenariosList.tsx index 4dc4fc2cdf0..20afd793792 100644 --- a/src/helm-frontend/src/components/ScenariosList.tsx +++ b/src/helm-frontend/src/components/ScenariosList.tsx @@ -1,4 +1,5 @@ import type RunGroup from "@/types/RunGroup"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { runGroups: RunGroup[]; @@ -23,7 +24,7 @@ export default function ScenariosList({ runGroups }: Props) { return (
-

{runGroups.length} Scenarios

+

{runGroups.length} scenarios