diff --git a/demo.py b/demo.py
index a12f1d6c396..fcbe9d78dec 100644
--- a/demo.py
+++ b/demo.py
@@ -17,23 +17,30 @@ print(account.usages)
 
 # Make a request
-request = Request(model="ai21/j1-large", prompt="Life is like a box of", echo_prompt=True)
+request = Request(
+    model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", echo_prompt=True
+)
 request_result: RequestResult = service.make_request(auth, request)
 print(request_result.completions[0].text)
 
 # Expect different responses for the same request but with different values for `random`.
 # Passing in the same value for `random` guarantees the same results.
-request = Request(prompt="Life is like a box of", random="1")
+request = Request(model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", random="1")
 request_result = service.make_request(auth, request)
 print(request_result.completions[0].text)
 
 # How to get the embedding for some text
-request = Request(model="openai/text-similarity-ada-001", prompt="Life is like a box of", embedding=True)
+request = Request(
+    model="openai/text-similarity-ada-002",
+    model_deployment="openai/text-similarity-ada-002",
+    prompt="Life is like a box of",
+    embedding=True,
+)
 request_result = service.make_request(auth, request)
 print(request_result.embedding)
 
 # Tokenize
-request = TokenizationRequest(tokenizer="ai21/j1-jumbo", text="Tokenize me please.")
+request = TokenizationRequest(tokenizer="ai21/j2-jumbo", text="Tokenize me please.")
 tokenization_request_result: TokenizationRequestResult = service.tokenize(auth, request)
 print(f"Number of tokens: {len(tokenization_request_result.tokens)}")
diff --git a/docs/get_helm_rank.md b/docs/get_helm_rank.md
new file mode 100644
index 00000000000..cf5e26345cc
--- /dev/null
+++ b/docs/get_helm_rank.md
@@ -0,0 +1,84 @@
+# Get Your Model's Leaderboard Rank
+
+This tutorial shows you how to add your model to the HELM leaderboard locally, in three steps:
+
+## Download HELM leaderboard results
+
+First, to compare your model against the latest and greatest models on the [HELM leaderboard](https://crfm.stanford.edu/helm/latest/?group=core_scenarios), choose the leaderboard version whose results you want to download:
+
+```bash
+export LEADERBOARD_VERSION=v0.3.0
+```
+
+Then fetch the zip file of all previous HELM results and expand it into HELM's results directory:
+
+```bash
+curl -O https://storage.googleapis.com/crfm-helm-public/benchmark_output/archives/$LEADERBOARD_VERSION/run_stats.zip &&\
+mkdir -p benchmark_output/runs/$LEADERBOARD_VERSION && unzip run_stats.zip -d benchmark_output/runs/$LEADERBOARD_VERSION
+```
+
+Now that the files are in your results directory, all HELM models will be shown in the UI alongside your model.
+
+## Run Efficient-HELM
+
+According to [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf), a paper from IBM that systematically analysed benchmark design choices using the HELM benchmark as an example, one can run the HELM benchmark with a fraction of the examples and still get a reliable estimate of a full run (Perlitz et al., 2023).
+
+Specifically, the authors calculated the $95\%$ CI of the rank location against the true ranks as a function of the number of examples used per scenario, and arrived at the following tradeoffs[^1]:
+
+| Examples Per Scenario | $95\%$ CI of Rank Location | Compute saved |
+| :-------------------: | :------------------------: | :-----------: |
+| $10$ | $\pm5$ | $\times400$ |
+| $20$ | $\pm4$ | $\times200$ |
+| $50$ | $\pm3$ | $\times80$ |
+| $200$ | $\pm2$ | $\times20$ |
+| $1000$ | $\pm1$ | $\times4$ |
+| All | $\pm1$ | $\times1$ |
+
+
+Choose your point on the tradeoff: how accurate does your rank estimate need to be, and how long are you willing to wait? Once you have chosen, set the number of examples per scenario and the model you want to run:
+```bash
+export EXAMPLES_PER_SCENARIO=10 && \
+export MODEL_TO_RUN=huggingface/gpt2
+```
+
+Then run the following to download the matching config file:
+
+```bash
+wget https://raw.githubusercontent.com/stanford-crfm/helm/main/src/helm/benchmark/presentation/run_specs_core_scenarios_$EXAMPLES_PER_SCENARIO.conf -O run_specs_$EXAMPLES_PER_SCENARIO.conf
+```
+
+and this one to run the benchmark:
+
+```bash
+helm-run \
+--conf-paths run_specs_$EXAMPLES_PER_SCENARIO.conf \
+--suite $LEADERBOARD_VERSION \
+--max-eval-instances $EXAMPLES_PER_SCENARIO \
+--models-to-run $MODEL_TO_RUN \
+--cache-instances \
+--num-train-trials 1 \
+--skip-completed-runs
+```
+
+The first run will take some time, since all the data (regardless of the number of examples chosen) has to be downloaded and prepared.
+
+
+## Summarize and serve your results
+
+To see how your model fits in with the latest leaderboard, process and aggregate your results with:
+
+```bash
+helm-summarize --suite $LEADERBOARD_VERSION
+```
+
+And serve with:
+
+```bash
+helm-server
+```
+
+## References
+
+Perlitz, Y., Bandel, E., Gera, A., Arviv, O., Ein-Dor, L., Shnarch, E., Slonim, N., Shmueli-Scheuer, M. and Choshen, L., 2023. Efficient Benchmarking (of Language Models). arXiv preprint arXiv:2308.11696.
+
+[^1]: Note that the quantities in the table are the $95\%$ CI of the rank location and are thus very conservative estimates. In our experiments, we did not observe deviations above $\pm2$ for any of the options above.
diff --git a/docs/index.md b/docs/index.md
index c1d090537d1..cd79581fa41 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -11,6 +11,7 @@ To run the code, refer to the User Guide's chapters:
 - [Installation](installation.md)
 - [Quick Start](quick_start.md)
 - [Tutorial](tutorial.md)
+- [Get Your Model's Leaderboard Rank](get_helm_rank.md)
 
 To add new models and scenarios, refer to the Developer Guide's chapters:
 
diff --git a/docs/quick_start.md b/docs/quick_start.md
index b9463733cb2..f8d35026306 100644
--- a/docs/quick_start.md
+++ b/docs/quick_start.md
@@ -18,3 +18,4 @@ helm-server
 
 Then go to http://localhost:8000/ in your browser.
 
+**Next steps:** see [Get Your Model's Leaderboard Rank](get_helm_rank.md) to find out how to run the full benchmark and get your model's leaderboard rank.
\ No newline at end of file
diff --git a/docs/tutorial.md b/docs/tutorial.md
index d02fd9cb5cc..cc103e11f6d 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -2,20 +2,20 @@
 
 This tutorial will explain how to use the HELM command line tools to run benchmarks, aggregate statistics, and visualize results.
 
-We will run two runs using the `mmlu` scenario on the `huggingface/gpt-2` model. The `mmlu` scenario implements the **Massive Multitask Language (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy.
+We will run two runs using the `mmlu` scenario on the `openai/gpt2` model. The `mmlu` scenario implements the **Massive Multitask Language Understanding (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy.
 
 ## Using `helm-run`
 
 `helm-run` is a command line tool for running benchmarks.
 
-To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describes the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=huggingface/gpt-2` (for anatomy) and `mmlu:subject=philosophy,model=huggingface/gpt-2` (for philosophy).
+To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describe the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=openai/gpt2` (for anatomy) and `mmlu:subject=philosophy,model=openai/gpt2` (for philosophy).
 
 Next, we need to create a **run spec configuration file** contining these run spec descriptions. A run spec configuration file is a text file containing `RunEntries` serialized to JSON, where each entry in `RunEntries` contains a run spec description. The `description` field of each entry should be a **run spec description**. Create a text file named `run_specs.conf` with the following contents:
 
 ```
 entries: [
-  {description: "mmlu:subject=anatomy,model=huggingface/gpt2", priority: 1},
-  {description: "mmlu:subject=philosophy,model=huggingface/gpt2", priority: 1},
+  {description: "mmlu:subject=anatomy,model=openai/gpt2", priority: 1},
+  {description: "mmlu:subject=philosophy,model=openai/gpt2", priority: 1},
 ]
 ```
 
@@ -35,7 +35,7 @@ The meaning of the additional arguments are as follows:
 - The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
 - The output directory is `benchmark_output/` by default and can be set using `--output-path`.
 
-After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
+After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`.
Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`. Each output sub-directory will contain several JSON files that were generated during the corresponding run: diff --git a/scripts/compute_request_limits.py b/scripts/compute_request_limits.py index a5060e37255..55cd813d70c 100644 --- a/scripts/compute_request_limits.py +++ b/scripts/compute_request_limits.py @@ -1,10 +1,12 @@ # This script is used to find out the max_prompt_length and max_prompt_length_plus_tokens for a given model. # You must set max_attempts to 1 in retry.py to make it work. # Example usage: -# python compute_request_limits.py --model_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" +# python compute_request_limits.py --model_deployment_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" from typing import Any, Optional, Dict from helm.proxy.clients.auto_client import AutoClient +from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment +from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer from helm.common.request import Request from helm.common.tokenization_request import TokenizationRequest @@ -40,6 +42,7 @@ def get_number_of_tokens(prompt: str, tokenizer: Tokenizer, tokenizer_name: str) def try_request( client: Any, + model_deployment_name: str, model_name: str, tokenizer_name: str, tokenizer: Tokenizer, @@ -58,6 +61,7 @@ def try_request( try: request = Request( model=model_name, + model_deployment=model_deployment_name, prompt=prefix + " ".join(["hello"] * (sequence_length - num_tokens_prefix - num_tokens_suffix)) + suffix, max_tokens=num_tokens, ) @@ -78,6 +82,8 @@ class RequestLimits: def figure_out_max_prompt_length( client: AutoClient, + auto_tokenizer: AutoTokenizer, + model_deployment_name: str, model_name: str, tokenizer_name: str, upper_bound: int = 9500, @@ -85,7 +91,7 @@ def figure_out_max_prompt_length( prefix: str = "", suffix: str = "", ) -> RequestLimits: - tokenizer = client._get_tokenizer(tokenizer_name) + tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name) num_tokens_prefix = get_number_of_tokens(prefix, tokenizer, tokenizer_name) num_tokens_suffix = get_number_of_tokens(suffix, tokenizer, tokenizer_name) @@ -95,7 +101,9 @@ def figure_out_max_prompt_length( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix): + if try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix + ): lower_bound = middle else: upper_bound = middle - 1 @@ -117,6 +125,7 @@ def figure_out_max_prompt_length( def figure_out_max_prompt_length_plus_tokens( client: Any, # Client, + model_deployment_name: str, model_name: str, tokenizer_name: str, max_prompt_length: int, @@ -130,6 +139,7 @@ def figure_out_max_prompt_length_plus_tokens( # Check if there is a limit (some model accept as many tokens as you want) if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -148,7 +158,17 @@ def figure_out_max_prompt_length_plus_tokens( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, middle, prefix, suffix): + if try_request( + 
client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length, + middle, + prefix, + suffix, + ): lower_bound = middle else: upper_bound = middle - 1 @@ -159,20 +179,24 @@ def figure_out_max_prompt_length_plus_tokens( def check_limits( client: AutoClient, + auto_tokenizer: AutoTokenizer, + model_deployment_name: str, model_name: str, tokenizer_name: str, limits: RequestLimits, prefix: str = "", suffix: str = "", ) -> bool: - tokenizer = client._get_tokenizer(tokenizer_name) + tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name) result: bool = True # Check the max_prompt_length max_prompt_length = limits.max_prompt_length if max_prompt_length < 0: print("No limit on the number of tokens") - if not try_request(client, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix + ): print(f"There is a limit on the number of tokens. Params: max_prompt_length={2**32 - 2}, max_tokens=1") result = False else: @@ -180,15 +204,37 @@ def check_limits( # If there is no limit on the number of tokens, max_prompt_length should be -1 # And we should not be here # Check that max_prompt_length is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix + ): print(f"max_prompt_length is too big. Params: max_prompt_length={max_prompt_length}, max_tokens=1") result = False # Check that max_prompt_length + 1 is not ok - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length + 1, 0, prefix, suffix): + if try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length + 1, + 0, + prefix, + suffix, + ): print(f"max_prompt_length could be bigger. Params: max_prompt_length={max_prompt_length+1}, max_tokens=1") result = False # Check that max_prompt_length - 1 is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length - 1, 0, prefix, suffix): + if not try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length - 1, + 0, + prefix, + suffix, + ): print( f"max_prompt_length ssems to be inconsistent. max_prompt_length={max_prompt_length} " f"is ok but max_prompt_length={max_prompt_length-1} is not, with max_tokens=0" @@ -203,7 +249,15 @@ def check_limits( if max_prompt_length_plus_tokens < 0: print("No limit on the number of tokens") if not try_request( - client, model_name, tokenizer_name, tokenizer, max(1, max_prompt_length), 2**32 - 2, prefix, suffix + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max(1, max_prompt_length), + 2**32 - 2, + prefix, + suffix, ): print( f"There is a limit on the number of tokens. 
Params: max_prompt_length={max_prompt_length}," @@ -216,6 +270,7 @@ def check_limits( # If there is no limit on the number of tokens, we skip this test if not try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -231,6 +286,7 @@ def check_limits( result = False if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -251,7 +307,8 @@ def check_limits( def get_args(): # model_name, tokenizer_name, prefix and suffix are passed as arguments parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_deployment_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_name", type=str, default="") parser.add_argument("--tokenizer_name", type=str, default="Writer/palmyra-base") parser.add_argument( "--prefix", @@ -268,6 +325,10 @@ def get_args(): parser.add_argument("--credentials_path", type=str, default="../prod_env/credentials.conf") parser.add_argument("--cache_path", type=str, default="../prod_env/cache") args = parser.parse_args() + + if args.model_name == "": + model_deployment: ModelDeployment = get_model_deployment(args.model_deployment_name) + args.model_name = model_deployment.model_name return args @@ -284,10 +345,16 @@ def main(): print(f"cache_path: {cache_path}") client = AutoClient(credentials=credentials, cache_path=cache_path) + auto_tokenizer = AutoTokenizer(credentials=credentials, cache_path=cache_path) print("client successfully created") print("Making short request...") - request = Request(model=args.model_name, prompt=args.prefix + "hello" + args.suffix, max_tokens=1) + request = Request( + model=args.model_name, + model_deployment=args.model_deployment_name, + prompt=args.prefix + "hello" + args.suffix, + max_tokens=1, + ) response = client.make_request(request) if not response.success: raise ValueError("Request failed") @@ -305,7 +372,13 @@ def main(): print("========== Figure out max_prompt_length ==========") limits: RequestLimits = figure_out_max_prompt_length( - client, args.model_name, args.tokenizer_name, prefix=args.prefix, suffix=args.suffix + client, + auto_tokenizer, + args.model_deployment_name, + args.model_name, + args.tokenizer_name, + prefix=args.prefix, + suffix=args.suffix, ) print(f"max_prompt_length: {limits.max_prompt_length}") print("===================================================") @@ -314,6 +387,7 @@ def main(): print("========== Figure out max_prompt_length_plus_tokens ==========") max_prompt_length_plus_tokens: int = figure_out_max_prompt_length_plus_tokens( client, + args.model_deployment_name, args.model_name, args.tokenizer_name, max_prompt_length=limits.max_prompt_length, @@ -328,7 +402,14 @@ def main(): # Check the limits print("========== Check the limits ==========") result: bool = check_limits( - client, args.model_name, args.tokenizer_name, limits, prefix=args.prefix, suffix=args.suffix + client, + auto_tokenizer, + args.model_deployment_name, + args.model_name, + args.tokenizer_name, + limits, + prefix=args.prefix, + suffix=args.suffix, ) if result: print("All limits are respected") diff --git a/scripts/efficiency/generate_instances.py b/scripts/efficiency/generate_instances.py index 615b0600825..626bb6a7be6 100644 --- a/scripts/efficiency/generate_instances.py +++ b/scripts/efficiency/generate_instances.py @@ -17,8 +17,8 @@ DecodeRequestResult, TokenizationToken, ) -from helm.proxy.clients.client import Client -from helm.proxy.clients.auto_client import 
AutoClient +from helm.proxy.tokenizers.tokenizer import Tokenizer +from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer from helm.proxy.services.service import ( CACHE_DIR, ) @@ -40,25 +40,28 @@ } -def _count_prompt_tokens(client: Client, prompt: str, tokenizer: str): - request: TokenizationRequest = TokenizationRequest(text=prompt, tokenizer=tokenizer) - result: TokenizationRequestResult = client.tokenize(request) +def _count_prompt_tokens(tokenizer: Tokenizer, prompt: str, tokenizer_name: str): + request: TokenizationRequest = TokenizationRequest(text=prompt, tokenizer=tokenizer_name) + result: TokenizationRequestResult = tokenizer.tokenize(request) return len(result.tokens) -def get_client(base_path: str = "prod_env"): +def get_tokenizer(base_path: str = "prod_env") -> AutoTokenizer: credentials = get_credentials(base_path) cache_path = os.path.join(base_path, CACHE_DIR) ensure_directory_exists(cache_path) # TODO: Pass mongo_uri to AutoClient - client = AutoClient(credentials, cache_path) + tokenizer = AutoTokenizer(credentials, cache_path) - return client + return tokenizer def tokenize_text( - client: AutoClient, tokenizer: str, output_path: str = "synthetic_efficiency_instances", base_path: str = "prod_env" + tokenizer: AutoTokenizer, + tokenizer_name: str, + output_path: str = "synthetic_efficiency_instances", + base_path: str = "prod_env", ) -> Tuple[Dict[str, List[TokenizationToken]], Dict[str, List[str]]]: """Tokenizes each book using the requested tokenizer service.""" sources = { @@ -72,7 +75,7 @@ def tokenize_text( tokens: Dict[str, List[TokenizationToken]] = {} text_chunks: Dict[str, List[str]] = {} - tokenizer_organization: str = tokenizer.split("/")[0] + tokenizer_organization: str = tokenizer_name.split("/")[0] ai21_tokenizer: bool = tokenizer_organization == "ai21" # Extract tokens from book sources @@ -96,9 +99,9 @@ def tokenize_text( batch = " ".join(text[i * batch_size : (i + 1) * batch_size]) while True: request: TokenizationRequest = TokenizationRequest( - text=batch, tokenizer=tokenizer, encode=(not ai21_tokenizer) + text=batch, tokenizer=tokenizer_name, encode=(not ai21_tokenizer) ) - result: TokenizationRequestResult = client.tokenize(request) + result: TokenizationRequestResult = tokenizer.tokenize(request) tokens_ = frozenset([token.value for token in result.tokens]) if tokens_ not in seen_tokens: seen_tokens.add(tokens_) @@ -116,15 +119,15 @@ def tokenize_text( def generate_synthetic_efficiency_instances( tokens: Dict[str, List[TokenizationToken]], text_chunks: Dict[str, List[str]], - client: Client, + tokenizer: Tokenizer, num_instances: int, num_prompt_tokens: int, - tokenizer: str, + tokenizer_name: str, output_path: str = "synthetic_efficiency_instances", base_path: str = "prod_env", ): """Generates the synthetic efficiency instances given the tokenized book sources.""" - tokenizer_organization: str = tokenizer.split("/")[0] + tokenizer_organization: str = tokenizer_name.split("/")[0] ai21_tokenizer: bool = tokenizer_organization == "ai21" books = list(tokens.keys()) @@ -155,13 +158,13 @@ def generate_synthetic_efficiency_instances( prompt = "".join(per_instance_tokens) else: decode_request: DecodeRequest = DecodeRequest(tokens=per_instance_tokens) # type: ignore - decode_result: DecodeRequestResult = client.decode(decode_request) + decode_result: DecodeRequestResult = tokenizer.decode(decode_request) prompt = decode_result.text if prompt == "": num_generated_tokens = 0 else: - num_generated_tokens = _count_prompt_tokens(client, prompt, tokenizer) 
+ num_generated_tokens = _count_prompt_tokens(tokenizer, prompt, tokenizer_name) if num_generated_tokens != num_prompt_tokens: temp_num_tokens = num_generated_tokens while temp_num_tokens < num_prompt_tokens: @@ -190,7 +193,7 @@ def generate_synthetic_efficiency_instances( if not finished: print( f"Requested {num_prompt_tokens}, got {num_generated_tokens} for " - f"book {books[j]}, instance #{orig_i}, tokenizer={tokenizer}, " + f"book {books[j]}, instance #{orig_i}, tokenizer={tokenizer_name}, " "trying again with a new span of text..." ) attempt_num += 1 @@ -199,15 +202,15 @@ def generate_synthetic_efficiency_instances( for i, prompt in enumerate(prompts): for k, v in TOKENIZER_REPLACEMENTS.items(): - tokenizer = tokenizer.replace(k, v) - name = f"num_prompt_tokens={num_prompt_tokens}," f"tokenizer={tokenizer.replace('/', '_')}," f"id={i}.txt" + tokenizer_name = tokenizer_name.replace(k, v) + name = f"num_prompt_tokens={num_prompt_tokens}," f"tokenizer={tokenizer_name.replace('/', '_')}," f"id={i}.txt" write(os.path.join(output_path, name), prompt) if __name__ == "__main__": - client = get_client() + tokenizer = get_tokenizer() - for tokenizer in [ + for tokenizer_name in [ "huggingface/gpt2", "ai21/j1", "cohere/cohere", @@ -221,13 +224,13 @@ def generate_synthetic_efficiency_instances( "EleutherAI/gpt-neox-20b", "EleutherAI/gpt-j-6B", ]: - tokens, text_chunks = tokenize_text(tokenizer=tokenizer, client=client) + tokens, text_chunks = tokenize_text(tokenizer=tokenizer, tokenizer_name=tokenizer_name) for num_prompt_tokens in NUM_INPUT_TOKENS: generate_synthetic_efficiency_instances( tokens=tokens, text_chunks=text_chunks, - client=client, + tokenizer=tokenizer, num_instances=30, num_prompt_tokens=num_prompt_tokens, - tokenizer=tokenizer, + tokenizer_name=tokenizer_name, ) diff --git a/setup.cfg b/setup.cfg index ac7edc699ff..764a864116a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -147,7 +147,7 @@ all = dev = pytest~=7.2.0 black~=22.10.0 - mypy~=0.982 + mypy~=1.5.1 pre-commit~=2.20.0 flake8~=5.0.4 diff --git a/src/helm-frontend/README.md b/src/helm-frontend/README.md index c8c5ac97318..d1cc19541a0 100644 --- a/src/helm-frontend/README.md +++ b/src/helm-frontend/README.md @@ -1,13 +1,11 @@ -React Frontend for HELM -------------------------- +## React Frontend for HELM This directory contains the files for building and developing an alternative React based frontend for HELM. If you are looking for the current frontend deployed to https://crfm.stanford.edu/helm/latest/ you will want to look in `helm/benchmark/static` and `helm/benchmark/proxy/static`. If you are looking to make changes to the alternative React frontend, then you are in the correct place. This app makes use of [React](https://react.dev/) + [TypeScript](https://www.typescriptlang.org/) and built with [vite](https://vitejs.dev/). [Tailwindcss](https://tailwindcss.com/) is used for CSS along with some help from the UI frameworks [daisyUI](https://daisyui.com/) and [tremor](https://www.tremor.so/). [React Testing Library](https://testing-library.com/docs/react-testing-library/intro/) is used for tests. - - ### Installation + ```bash npm Install ``` @@ -38,6 +36,12 @@ npm run test npm run build ``` +### Deployment + +For deployment, you can use the build directory out of the box (using the previous command), but will have to replace every leading / in href and src in order for deploys to GitHub Pages to work correctly. 
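+
+As a rough sketch of that step (assuming GNU sed and that the production build lands in `dist/`, Vite's default output directory; neither of which this README guarantees), the leading slashes could be rewritten after the build like so:
+
+```bash
+# Hypothetical post-build step: turn absolute href/src URLs into relative ones for GitHub Pages
+find dist -type f \( -name '*.html' -o -name '*.js' -o -name '*.css' \) \
+  -exec sed -i 's|href="/|href="./|g; s|src="/|src="./|g' {} +
+```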
+ +You can rename the build directory to the desired release name and upload it to GitHub pages to add a new front-end release. Currently the GitHub workflow for deploying the React front-end does not do this and is not being used. + ### Linting ```bash @@ -46,6 +50,8 @@ npm run lint ### Formatting +If you don't have prettier configured in your IDE or Node environment, you will have to run the following before commiting, in order to pass tests. + ```bash npm run format ``` diff --git a/src/helm-frontend/index.html b/src/helm-frontend/index.html index 787b4e58dd9..b728d7f97f9 100644 --- a/src/helm-frontend/index.html +++ b/src/helm-frontend/index.html @@ -6,6 +6,7 @@ Holistic Evaluation of Language Models (HELM) +
diff --git a/src/helm-frontend/public/config.js b/src/helm-frontend/public/config.js new file mode 100644 index 00000000000..87668190bc7 --- /dev/null +++ b/src/helm-frontend/public/config.js @@ -0,0 +1,4 @@ +window.BENCHMARK_OUTPUT_BASE_URL = + "https://storage.googleapis.com/crfm-helm-public/"; +window.SUITE = "v0.2.4"; +window.RELEASE = "v0.3.0"; diff --git a/src/helm-frontend/src/App.tsx b/src/helm-frontend/src/App.tsx index 8d6cdd91356..3808668c153 100644 --- a/src/helm-frontend/src/App.tsx +++ b/src/helm-frontend/src/App.tsx @@ -1,26 +1,28 @@ import "./App.css"; import { HashRouter as Router, Route, Routes } from "react-router-dom"; import Layout from "@/layouts/Main"; -import Home from "@/routes/Home"; import Models from "@/routes/Models"; import Scenarios from "@/routes/Scenarios"; import Groups from "@/routes/Groups"; import Group from "@/routes/Group"; import Runs from "@/routes/Runs"; import Run from "@/routes/Run"; +import Landing from "@/routes/Landing"; +import Leaderboard from "@/routes/Leaderboard"; export default function App() { return ( - }> - } /> - } /> - } /> - } /> - } /> - } /> - } /> + }> + } /> + } /> + } /> + } /> + } /> + } /> + } /> + } /> diff --git a/src/helm-frontend/src/assets/heim-logo.png b/src/helm-frontend/src/assets/heim-logo.png new file mode 100644 index 00000000000..ee23761a2bc Binary files /dev/null and b/src/helm-frontend/src/assets/heim-logo.png differ diff --git a/src/helm-frontend/src/assets/helmhero.png b/src/helm-frontend/src/assets/helmhero.png new file mode 100644 index 00000000000..05b9da5ebd8 Binary files /dev/null and b/src/helm-frontend/src/assets/helmhero.png differ diff --git a/src/helm-frontend/src/components/Alert.tsx b/src/helm-frontend/src/components/Alert.tsx new file mode 100644 index 00000000000..e6ab0aa6656 --- /dev/null +++ b/src/helm-frontend/src/components/Alert.tsx @@ -0,0 +1,39 @@ +import { useState } from "react"; +import Link from "./Link"; + +export default function Alert() { + const [visible, setVisible] = useState(true); + + const handleClose = () => { + setVisible(false); + }; + + return ( + visible && ( +
+
+ + Welcome to the new results view, + + for the old view, + + click here + +
+ + Close + +
+ ) + ); +} diff --git a/src/helm-frontend/src/components/Card.tsx b/src/helm-frontend/src/components/Card.tsx index 7f4703e1f52..03bed335aed 100644 --- a/src/helm-frontend/src/components/Card.tsx +++ b/src/helm-frontend/src/components/Card.tsx @@ -3,7 +3,7 @@ export default function Card() {
Total Models
-
123
+
1
); diff --git a/src/helm-frontend/src/components/Footer.tsx b/src/helm-frontend/src/components/Footer.tsx index adb2c0fb25d..068419a95d2 100644 --- a/src/helm-frontend/src/components/Footer.tsx +++ b/src/helm-frontend/src/components/Footer.tsx @@ -1,7 +1,8 @@ -import getBenchmarkSuite from "@/utils/getBenchmarkSuite"; +import getBenchmarkRelease from "@/utils/getBenchmarkRelease"; +//import getBenchmarkSuite from "@/utils/getBenchmarkSuite"; export default function Footer() { - const version = getBenchmarkSuite(); + const version = getBenchmarkRelease(); return (

Version {version}

diff --git a/src/helm-frontend/src/components/Hero.tsx b/src/helm-frontend/src/components/Hero.tsx new file mode 100644 index 00000000000..886b33196f8 --- /dev/null +++ b/src/helm-frontend/src/components/Hero.tsx @@ -0,0 +1,44 @@ +import helmHero from "@/assets/helmhero.png"; +import { Link } from "react-router-dom"; + +export default function Hero() { + return ( +
+ {/* Left side content */} +
+ {" "} + {/* Added flex and justify-center */} +
+

+ + A holistic framework for evaluating foundation models. + +

+
+
+ + + + + {" "} + {/* Added margin-left for spacing */} + + +
+
+ + {/* Right side image */} +
+ {" "} + {/* Added mx-4 for horizontal margin */} + HELM Hero +
+
+ ); +} diff --git a/src/helm-frontend/src/components/LeaderboardTables.tsx b/src/helm-frontend/src/components/LeaderboardTables.tsx new file mode 100644 index 00000000000..3580fda1e4b --- /dev/null +++ b/src/helm-frontend/src/components/LeaderboardTables.tsx @@ -0,0 +1,128 @@ +import { useEffect, useState } from "react"; +import { ChevronUpDownIcon } from "@heroicons/react/24/solid"; +import type GroupsTable from "@/types/GroupsTable"; +import RowValue from "@/components/RowValue"; + +interface Props { + groupsTables: GroupsTable[]; + activeGroup: number; + ignoreHref?: boolean; + sortable?: boolean; + sortFirstMetric?: boolean; +} + +export default function LeaderboardTables({ + groupsTables, + activeGroup, + ignoreHref = false, + sortable = true, + sortFirstMetric = true, +}: Props) { + const [activeSortColumn, setActiveSortColumn] = useState( + sortFirstMetric ? 1 : undefined, + ); + const [activeGroupsTable, setActiveGroupsTable] = useState({ + ...groupsTables[activeGroup], + }); + const [sortDirection, setSortDirection] = useState(1); + + useEffect(() => { + setActiveGroupsTable({ ...groupsTables[activeGroup] }); + }, [activeGroup, groupsTables]); + + const handleSort = (columnIndex: number) => { + let sort = sortDirection; + if (activeSortColumn === columnIndex) { + sort = sort * -1; + } else { + sort = 1; + } + setActiveSortColumn(columnIndex); + setSortDirection(sort); + + setActiveGroupsTable((prev) => { + const group = { ...prev }; + group.rows.sort((a, b) => { + const av = a[columnIndex]?.value; + const bv = b[columnIndex]?.value; + if (av !== undefined && bv === undefined) { + return -1; + } + if (bv !== undefined && av === undefined) { + return 1; + } + if (typeof av === "number" && typeof bv === "number") { + return (av - bv) * sort; + } + if (typeof av === "string" && typeof bv === "string") { + if (sort === 1) { + return av.localeCompare(bv); + } + return bv.localeCompare(av); + } + + return 0; + }); + + return group; + }); + }; + useEffect(() => { + if (sortFirstMetric && activeSortColumn) { + handleSort(activeSortColumn); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [sortFirstMetric, activeSortColumn]); + + return ( +
+
+ + + + {activeGroupsTable.header.map((headerValue, idx) => ( + + ))} + + + + {activeGroupsTable.rows.map((row, idx) => ( + + {" "} + {/* Added alternating row highlighting */} + {row.map((rowValue, cellIdx) => ( + + ))} + + ))} + +
+
+ {headerValue.value} + {sortable ? ( + + ) : null} +
+
+ +
+
+
+ ); +} diff --git a/src/helm-frontend/src/components/MetricsList.tsx b/src/helm-frontend/src/components/MetricsList.tsx index 98d711661d0..0fdc996a559 100644 --- a/src/helm-frontend/src/components/MetricsList.tsx +++ b/src/helm-frontend/src/components/MetricsList.tsx @@ -1,5 +1,6 @@ import type Metric from "@/types/Metric"; import type MetricGroup from "@/types/MetricGroup"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { metrics: Metric[]; @@ -9,18 +10,21 @@ interface Props { export default function MetricList({ metrics, metricGroups }: Props) { return (
-

{metrics.length} Metrics

+

{metrics.length} metrics

    {metricGroups.map((metricGroup, idx) => ( -
  • +
  • {metrics.filter((metric) => metricGroup.metrics.some((m) => m.name === metric.name), ).length > 0 ? ( -

    - {metricGroup.display_name} ({metricGroup.name}) -

    + +

    {metricGroup.display_name}

    +
    ) : null} -
      +
        {metrics .filter((metric) => metricGroup.metrics.some((m) => m.name === metric.name), diff --git a/src/helm-frontend/src/components/ModelsList.tsx b/src/helm-frontend/src/components/ModelsList.tsx index d6c36f2ac63..10b0bde9f82 100644 --- a/src/helm-frontend/src/components/ModelsList.tsx +++ b/src/helm-frontend/src/components/ModelsList.tsx @@ -1,4 +1,5 @@ import type Model from "@/types/Model"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { models: Model[]; @@ -7,17 +8,19 @@ interface Props { export default function ModelsList({ models }: Props) { return (
        -

        {models.length} Models

        +

        {models.length} models

          {models.map((model, idx) => model.todo ? ( -
        • +
        • {model.creator_organization} / {model.display_name}
        • ) : ( -
        • - {model.creator_organization} / {model.display_name} -
        • + +
        • + {model.creator_organization} / {model.display_name} +
        • +
          ), )}
        diff --git a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx index bf49b3504d8..cbd2f650cc6 100644 --- a/src/helm-frontend/src/components/NavBar/NavBar.test.tsx +++ b/src/helm-frontend/src/components/NavBar/NavBar.test.tsx @@ -11,6 +11,6 @@ test("displays nav bar", () => { ); expect(screen.getByRole("navigation")).toHaveTextContent( - "ModelsScenariosResultsRaw RunsModelsScenariosResultsRaw Runs", + "LeaderboardModelsScenariosExplore PredictionsLeaderboardModelsScenariosExplore Predictions", ); }); diff --git a/src/helm-frontend/src/components/NavBar/NavBar.tsx b/src/helm-frontend/src/components/NavBar/NavBar.tsx index 340285cbc22..354aa714552 100644 --- a/src/helm-frontend/src/components/NavBar/NavBar.tsx +++ b/src/helm-frontend/src/components/NavBar/NavBar.tsx @@ -1,7 +1,8 @@ import { Link } from "react-router-dom"; import { Bars3Icon } from "@heroicons/react/24/outline"; import crfmLogo from "@/assets/crfm-logo.png"; -import helmLogo from "@/assets/helm-logo-simple.png"; +//import helmLogo from "@/assets/helm-logo-simple.png"; +import NavDropdown from "@/components/NavDropdown"; export default function NavBar() { return ( @@ -19,17 +20,17 @@ export default function NavBar() { className="menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow" >
      • - Models + Leaderboard
      • - Scenarios + Models
      • - Results + Scenarios
      • - Raw Runs + Explore Predictions
      @@ -39,23 +40,21 @@ export default function NavBar() { - - - +
diff --git a/src/helm-frontend/src/components/NavDropdown.tsx b/src/helm-frontend/src/components/NavDropdown.tsx new file mode 100644 index 00000000000..8eb9e72c00b --- /dev/null +++ b/src/helm-frontend/src/components/NavDropdown.tsx @@ -0,0 +1,80 @@ +import { useState } from "react"; +import { Link } from "react-router-dom"; + +function NavDropdown() { + const [dropdownOpen, setDropdownOpen] = useState(false); + + return ( +
+
+ + Image 1 + + + {/* Chevron Button */} + +
+ + {dropdownOpen && ( +
+
+
+ +
+ + HELM: Holistic evaluation of language + models + +
+ +
+
+ +
+ + HEIM: Holistic evaluation of image models + +
+ +
+
+
+ )} +
+ ); +} + +export default NavDropdown; diff --git a/src/helm-frontend/src/components/ScenariosList.tsx b/src/helm-frontend/src/components/ScenariosList.tsx index 4dc4fc2cdf0..20afd793792 100644 --- a/src/helm-frontend/src/components/ScenariosList.tsx +++ b/src/helm-frontend/src/components/ScenariosList.tsx @@ -1,4 +1,5 @@ import type RunGroup from "@/types/RunGroup"; +import { Link as ReactRouterLink } from "react-router-dom"; interface Props { runGroups: RunGroup[]; @@ -23,7 +24,7 @@ export default function ScenariosList({ runGroups }: Props) { return (
-

{runGroups.length} Scenarios

+

{runGroups.length} scenarios