@@ -93,13 +98,10 @@ export default function Scenarios() {
))}
-
+
- Total Scenarios
+ Total scenarios
{runGroups.length}
diff --git a/src/helm-frontend/src/services/getDisplayPredictionsByName.ts b/src/helm-frontend/src/services/getDisplayPredictionsByName.ts
index b6e1a3eadc8..ce0fd464225 100644
--- a/src/helm-frontend/src/services/getDisplayPredictionsByName.ts
+++ b/src/helm-frontend/src/services/getDisplayPredictionsByName.ts
@@ -5,11 +5,14 @@ import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
export default async function getDisplayPredictionsByName(
runName: string,
signal: AbortSignal,
+ suite?: string,
): Promise<DisplayPrediction[]> {
try {
const displayPrediction = await fetch(
getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/display_predictions.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/display_predictions.json`,
),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getDisplayRequestsByName.ts b/src/helm-frontend/src/services/getDisplayRequestsByName.ts
index 0bdfa6c6baf..a336b57e002 100644
--- a/src/helm-frontend/src/services/getDisplayRequestsByName.ts
+++ b/src/helm-frontend/src/services/getDisplayRequestsByName.ts
@@ -2,14 +2,17 @@ import type DisplayRequest from "@/types/DisplayRequest";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
-export default async function getDisplayRequestssByName(
+export default async function getDisplayRequestsByName(
runName: string,
signal: AbortSignal,
+ suite?: string,
): Promise<DisplayRequest[]> {
try {
const displayRequest = await fetch(
getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/display_requests.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/display_requests.json`,
),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getGroupTablesByName.ts b/src/helm-frontend/src/services/getGroupTablesByName.ts
index 29d31852d3a..ba11840a688 100644
--- a/src/helm-frontend/src/services/getGroupTablesByName.ts
+++ b/src/helm-frontend/src/services/getGroupTablesByName.ts
@@ -1,6 +1,6 @@
import type GroupsTable from "@/types/GroupsTable";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
-import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
+import getVersionBaseUrl from "@/utils/getVersionBaseUrl";
export default async function getGroupsTablesByName(
groupName: string,
@@ -8,9 +8,7 @@ export default async function getGroupsTablesByName(
): Promise<GroupsTable[]> {
try {
const group = await fetch(
- getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/groups/${groupName}.json`,
- ),
+ getBenchmarkEndpoint(`${getVersionBaseUrl()}/groups/${groupName}.json`),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getGroupsMetadata.ts b/src/helm-frontend/src/services/getGroupsMetadata.ts
index 8c9d6685a28..c4316099cd1 100644
--- a/src/helm-frontend/src/services/getGroupsMetadata.ts
+++ b/src/helm-frontend/src/services/getGroupsMetadata.ts
@@ -1,15 +1,13 @@
import type GroupsMetadata from "@/types/GroupsMetadata";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
-import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
+import getVersionBaseUrl from "@/utils/getVersionBaseUrl";
export default async function getGroupsMetadata(
signal: AbortSignal,
): Promise<GroupsMetadata> {
try {
const groups = await fetch(
- getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/groups_metadata.json`,
- ),
+ getBenchmarkEndpoint(`${getVersionBaseUrl()}/groups_metadata.json`),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getGroupsTables.ts b/src/helm-frontend/src/services/getGroupsTables.ts
index 2b03ae5cfd1..2907e137a8d 100644
--- a/src/helm-frontend/src/services/getGroupsTables.ts
+++ b/src/helm-frontend/src/services/getGroupsTables.ts
@@ -1,11 +1,9 @@
import type GroupsTable from "@/types/GroupsTable";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
-import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
+import getVersionBaseUrl from "@/utils/getVersionBaseUrl";
export function getGroupsTablesJsonUrl(): string {
- return getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/groups.json`,
- );
+ return getBenchmarkEndpoint(`${getVersionBaseUrl()}/groups.json`);
}
export default async function getGroupsTables(
diff --git a/src/helm-frontend/src/services/getInstances.ts b/src/helm-frontend/src/services/getInstances.ts
index 3c5b02503b9..35abb89893d 100644
--- a/src/helm-frontend/src/services/getInstances.ts
+++ b/src/helm-frontend/src/services/getInstances.ts
@@ -5,11 +5,14 @@ import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
export default async function getInstancesByRunName(
runName: string,
signal: AbortSignal,
+ suite?: string,
): Promise<Instance[]> {
try {
const instances = await fetch(
getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/instances.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/instances.json`,
),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getRunSpecByName.ts b/src/helm-frontend/src/services/getRunSpecByName.ts
index ec87d6dda2f..2133193a719 100644
--- a/src/helm-frontend/src/services/getRunSpecByName.ts
+++ b/src/helm-frontend/src/services/getRunSpecByName.ts
@@ -2,9 +2,11 @@ import type RunSpec from "@/types/RunSpec";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
-export function getRunSpecByNameUrl(runName: string): string {
+export function getRunSpecByNameUrl(runName: string, suite?: string): string {
return getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/run_spec.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/run_spec.json`,
);
}
export default async function getRunSpecByName(
diff --git a/src/helm-frontend/src/services/getRunSpecs.ts b/src/helm-frontend/src/services/getRunSpecs.ts
index 5fd3674ba00..8b5df8d47d0 100644
--- a/src/helm-frontend/src/services/getRunSpecs.ts
+++ b/src/helm-frontend/src/services/getRunSpecs.ts
@@ -1,15 +1,13 @@
import RunSpec from "@/types/RunSpec";
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
-import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
+import getVersionBaseUrl from "@/utils/getVersionBaseUrl";
export default async function getRunSpecs(
signal: AbortSignal,
): Promise<RunSpec[]> {
try {
const runSpecs = await fetch(
- getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/run_specs.json`,
- ),
+ getBenchmarkEndpoint(`${getVersionBaseUrl()}/run_specs.json`),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getRunsToRunSuites.ts b/src/helm-frontend/src/services/getRunsToRunSuites.ts
new file mode 100644
index 00000000000..4e353d73971
--- /dev/null
+++ b/src/helm-frontend/src/services/getRunsToRunSuites.ts
@@ -0,0 +1,20 @@
+import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
+import getBenchmarkRelease from "@/utils/getBenchmarkRelease";
+
+export default async function getRunsToRunSuites(
+ signal: AbortSignal,
+): Promise<Record<string, string>> {
+ try {
+ const runsToRunSuites = await fetch(
+ getBenchmarkEndpoint(
+ `/benchmark_output/releases/${getBenchmarkRelease()}/runs_to_run_suites.json`,
+ ),
+ { signal },
+ );
+
+ return (await runsToRunSuites.json()) as Record<string, string>;
+ } catch (error) {
+ console.log(error);
+ return {};
+ }
+}
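Note: `runs_to_run_suites.json` is expected to map each run name to the suite that produced it, which is why the response is typed `Record<string, string>`. An illustrative sketch of the shape (made-up run and suite names, not taken from a real release):

// Hypothetical contents for illustration only.
const exampleRunsToRunSuites: Record<string, string> = {
  "mmlu:subject=anatomy,model=openai_text-davinci-003": "v0.2.2",
  "boolq:model=openai_text-davinci-003": "v0.2.3",
};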
diff --git a/src/helm-frontend/src/services/getScenarioByName.ts b/src/helm-frontend/src/services/getScenarioByName.ts
index c2a8cd9aa8f..2bcf3331bc9 100644
--- a/src/helm-frontend/src/services/getScenarioByName.ts
+++ b/src/helm-frontend/src/services/getScenarioByName.ts
@@ -5,11 +5,14 @@ import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
export default async function getScenarioByName(
scenarioName: string,
signal: AbortSignal,
+ suite?: string,
): Promise<Scenario> {
try {
const scenario = await fetch(
getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${scenarioName}/scenario.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${scenarioName}/scenario.json`,
),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getScenarioStateByName.ts b/src/helm-frontend/src/services/getScenarioStateByName.ts
index 75444730564..c1ea75cc37f 100644
--- a/src/helm-frontend/src/services/getScenarioStateByName.ts
+++ b/src/helm-frontend/src/services/getScenarioStateByName.ts
@@ -1,8 +1,13 @@
import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
-export function getScenarioStateByNameUrl(runName: string): string {
+export function getScenarioStateByNameUrl(
+ runName: string,
+ suite?: string,
+): string {
return getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/scenario_state.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/scenario_state.json`,
);
}
diff --git a/src/helm-frontend/src/services/getSchema.ts b/src/helm-frontend/src/services/getSchema.ts
index bb6e403cb55..b6b056c2ba9 100644
--- a/src/helm-frontend/src/services/getSchema.ts
+++ b/src/helm-frontend/src/services/getSchema.ts
@@ -1,11 +1,15 @@
import { parse } from "yaml";
import type Schema from "@/types/Schema";
+import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
+import getBenchmarkRelease from "@/utils/getBenchmarkRelease";
export default async function getSchema(signal: AbortSignal): Promise<Schema> {
try {
const resp = await fetch(
- "https://crfm.stanford.edu/helm/latest/schema.yaml",
+ `https://crfm.stanford.edu/helm/${
+ getBenchmarkRelease() || getBenchmarkSuite()
+ }/schema.yaml`,
{ signal },
);
const data = await resp.text();
diff --git a/src/helm-frontend/src/services/getStatsByName.ts b/src/helm-frontend/src/services/getStatsByName.ts
index d7dd8a7a5bf..8b54aaae21b 100644
--- a/src/helm-frontend/src/services/getStatsByName.ts
+++ b/src/helm-frontend/src/services/getStatsByName.ts
@@ -5,11 +5,14 @@ import getBenchmarkSuite from "@/utils/getBenchmarkSuite";
export default async function getStatsByName(
runName: string,
signal: AbortSignal,
+ suite?: string,
): Promise<Stat[]> {
try {
const stats = await fetch(
getBenchmarkEndpoint(
- `/benchmark_output/runs/${getBenchmarkSuite()}/${runName}/stats.json`,
+ `/benchmark_output/runs/${
+ suite || getBenchmarkSuite()
+ }/${runName}/stats.json`,
),
{ signal },
);
diff --git a/src/helm-frontend/src/services/getSuiteForRun.ts b/src/helm-frontend/src/services/getSuiteForRun.ts
new file mode 100644
index 00000000000..d35c656194e
--- /dev/null
+++ b/src/helm-frontend/src/services/getSuiteForRun.ts
@@ -0,0 +1,9 @@
+import getBenchmarkRelease from "@/utils/getBenchmarkRelease";
+
+export default function getSuiteForRun(
+ runNameToSuite: Record<string, string>,
+ runName: string,
+) {
+ const suite = getBenchmarkRelease() ? runNameToSuite[runName] : window.SUITE;
+ return suite;
+}
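getSuiteForRun only consults the mapping when a release is configured; in single-suite mode it keeps returning window.SUITE, so existing deployments behave as before. A hedged sketch of how a caller could compose it with the services above (the function signatures match the diff; the composition itself is illustrative, the real wiring lives in the page components):

import getRunsToRunSuites from "@/services/getRunsToRunSuites";
import getSuiteForRun from "@/services/getSuiteForRun";
import getDisplayPredictionsByName from "@/services/getDisplayPredictionsByName";

async function loadPredictions(runName: string, signal: AbortSignal) {
  // Release mode: resolve the owning suite from the mapping; otherwise fall back to window.SUITE.
  const runNameToSuite = await getRunsToRunSuites(signal);
  const suite = getSuiteForRun(runNameToSuite, runName);
  return getDisplayPredictionsByName(runName, signal, suite);
}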
diff --git a/src/helm-frontend/src/types/global.d.ts b/src/helm-frontend/src/types/global.d.ts
new file mode 100644
index 00000000000..697db247501
--- /dev/null
+++ b/src/helm-frontend/src/types/global.d.ts
@@ -0,0 +1,5 @@
+interface Window {
+ RELEASE: string;
+ SUITE: string;
+ BENCHMARK_OUTPUT_BASE_URL: string;
+}
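These globals replace the Vite build-time environment variables (see the getBenchmarkEndpoint and getBenchmarkSuite changes below), so one build can be pointed at different benchmark outputs at runtime. How they get set is not part of this diff; a hypothetical example, for illustration only:

// Assumed to run before the app bundle loads, e.g. from a small config script; all values are placeholders.
window.BENCHMARK_OUTPUT_BASE_URL = "https://example.org/helm-benchmark-output";
window.RELEASE = "v0.4.0"; // leave empty ("") to fall back to single-suite mode
window.SUITE = "";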
diff --git a/src/helm-frontend/src/utils/getBenchmarkEndpoint.ts b/src/helm-frontend/src/utils/getBenchmarkEndpoint.ts
index 3ee821961d3..a4469a0353b 100644
--- a/src/helm-frontend/src/utils/getBenchmarkEndpoint.ts
+++ b/src/helm-frontend/src/utils/getBenchmarkEndpoint.ts
@@ -1,6 +1,6 @@
export default function getBenchmarkEndpoint(path: string): string {
- return `${import.meta.env.VITE_HELM_BENCHMARKS_ENDPOINT.replace(
- /\/$/,
+ return `${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/, "")}/${path.replace(
+ /^\//,
"",
- )}/${path.replace(/^\//, "")}`;
+ )}`;
}
diff --git a/src/helm-frontend/src/utils/getBenchmarkRelease.ts b/src/helm-frontend/src/utils/getBenchmarkRelease.ts
new file mode 100644
index 00000000000..7cf2b9afee9
--- /dev/null
+++ b/src/helm-frontend/src/utils/getBenchmarkRelease.ts
@@ -0,0 +1,3 @@
+export default function getBenchmarkRelease(): string {
+ return String(window.RELEASE);
+}
diff --git a/src/helm-frontend/src/utils/getBenchmarkSuite.ts b/src/helm-frontend/src/utils/getBenchmarkSuite.ts
index 5088457bbc0..10b312b8fec 100644
--- a/src/helm-frontend/src/utils/getBenchmarkSuite.ts
+++ b/src/helm-frontend/src/utils/getBenchmarkSuite.ts
@@ -1,3 +1,3 @@
export default function getBenchmarkSuite(): string {
- return String(import.meta.env.VITE_HELM_BENCHMARKS_SUITE);
+ return String(window.SUITE);
}
diff --git a/src/helm-frontend/src/utils/getVersionBaseUrl.ts b/src/helm-frontend/src/utils/getVersionBaseUrl.ts
new file mode 100644
index 00000000000..7859190afc8
--- /dev/null
+++ b/src/helm-frontend/src/utils/getVersionBaseUrl.ts
@@ -0,0 +1,7 @@
+export default function getVersionBaseUrl(): string {
+ if (window.RELEASE) {
+ return `/benchmark_output/releases/${window.RELEASE}`;
+ } else {
+ return `/benchmark_output/runs/${window.SUITE}`;
+ }
+}
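getVersionBaseUrl centralizes the split between release-level artifacts (groups, run specs, metadata) and per-run files: the former resolve under /benchmark_output/releases/<release> when a release is configured, while the per-run fetches above keep building /benchmark_output/runs/<suite> paths. A small sketch of the resulting URLs (placeholder release and suite values):

import getBenchmarkEndpoint from "@/utils/getBenchmarkEndpoint";
import getVersionBaseUrl from "@/utils/getVersionBaseUrl";

// With window.RELEASE = "v0.4.0" (placeholder): .../benchmark_output/releases/v0.4.0/run_specs.json
// With RELEASE unset and window.SUITE = "v1" (placeholder): .../benchmark_output/runs/v1/run_specs.json
const runSpecsUrl = getBenchmarkEndpoint(`${getVersionBaseUrl()}/run_specs.json`);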
diff --git a/src/helm-frontend/tailwind.config.js b/src/helm-frontend/tailwind.config.js
index de184169978..bb8f95c03bf 100644
--- a/src/helm-frontend/tailwind.config.js
+++ b/src/helm-frontend/tailwind.config.js
@@ -1,48 +1,48 @@
/** @type {import('tailwindcss').Config} */
/* eslint-disable max-len */
module.exports = {
- content: [
- "./index.html",
- "./src/**/*.{js,ts,jsx,tsx}",
- "./node_modules/@tremor/**/*.{js,ts,jsx,tsx}",
- ],
- theme: {
- transparent: "transparent",
- current: "currentColor",
- extend: {
- colors: {
- // light mode
- tremor: {
- brand: {
- faint: "#eff6ff", // blue-50
- muted: "#bfdbfe", // blue-200
- subtle: "#60a5fa", // blue-400
- DEFAULT: "#3b82f6", // blue-500
- emphasis: "#1d4ed8", // blue-700
- inverted: "#ffffff", // white
- },
- background: {
- muted: "#f9fafb", // gray-50
- subtle: "#f3f4f6", // gray-100
- DEFAULT: "#ffffff", // white
- emphasis: "#374151", // gray-700
- },
- border: {
- DEFAULT: "#e5e7eb", // gray-200
- },
- ring: {
- DEFAULT: "#e5e7eb", // gray-200
- },
- content: {
- subtle: "#9ca3af", // gray-400
- DEFAULT: "#6b7280", // gray-500
- emphasis: "#374151", // gray-700
- strong: "#111827", // gray-900
- inverted: "#ffffff", // white
- },
- },
- // dark mode
- /*
+ content: [
+ "./index.html",
+ "./src/**/*.{js,ts,jsx,tsx}",
+ "./node_modules/@tremor/**/*.{js,ts,jsx,tsx}",
+ ],
+ theme: {
+ transparent: "transparent",
+ current: "currentColor",
+ extend: {
+ colors: {
+ // light mode
+ tremor: {
+ brand: {
+ faint: "#eff6ff", // blue-50
+ muted: "#bfdbfe", // blue-200
+ subtle: "#60a5fa", // blue-400
+ DEFAULT: "#3b82f6", // blue-500
+ emphasis: "#1d4ed8", // blue-700
+ inverted: "#ffffff", // white
+ },
+ background: {
+ muted: "#f9fafb", // gray-50
+ subtle: "#f3f4f6", // gray-100
+ DEFAULT: "#ffffff", // white
+ emphasis: "#374151", // gray-700
+ },
+ border: {
+ DEFAULT: "#e5e7eb", // gray-200
+ },
+ ring: {
+ DEFAULT: "#e5e7eb", // gray-200
+ },
+ content: {
+ subtle: "#9ca3af", // gray-400
+ DEFAULT: "#6b7280", // gray-500
+ emphasis: "#374151", // gray-700
+ strong: "#111827", // gray-900
+ inverted: "#ffffff", // white
+ },
+ },
+ // dark mode
+ /*
"dark-tremor": {
brand: {
faint: "#0B1229", // custom
@@ -73,65 +73,65 @@ module.exports = {
},
},
*/
- },
- boxShadow: {
- // light
- "tremor-input": "0 1px 2px 0 rgb(0 0 0 / 0.05)",
- "tremor-card":
- "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",
- "tremor-dropdown":
- "0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
- // dark
- "dark-tremor-input": "0 1px 2px 0 rgb(0 0 0 / 0.05)",
- "dark-tremor-card":
- "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",
- "dark-tremor-dropdown":
- "0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
- },
- borderRadius: {
- "tremor-small": "0.375rem",
- "tremor-default": "0.5rem",
- "tremor-full": "9999px",
- },
- fontSize: {
- "tremor-label": ["0.75rem"],
- "tremor-default": ["0.875rem", { lineHeight: "1.25rem" }],
- "tremor-title": ["1.125rem", { lineHeight: "1.75rem" }],
- "tremor-metric": ["1.875rem", { lineHeight: "2.25rem" }],
- },
- },
- },
- safelist: [
- {
- pattern:
- /^(bg-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- variants: ["hover", "ui-selected"],
- },
- {
- pattern:
- /^(text-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- variants: ["hover", "ui-selected"],
- },
- {
- pattern:
- /^(border-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- variants: ["hover", "ui-selected"],
- },
- {
- pattern:
- /^(ring-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- },
- {
- pattern:
- /^(stroke-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- },
- {
- pattern:
- /^(fill-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
- },
- ],
- plugins: [require("@headlessui/tailwindcss"), require("daisyui")],
- daisyui: {
- themes: ["corporate", "business"],
- },
+ },
+ boxShadow: {
+ // light
+ "tremor-input": "0 1px 2px 0 rgb(0 0 0 / 0.05)",
+ "tremor-card":
+ "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",
+ "tremor-dropdown":
+ "0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
+ // dark
+ "dark-tremor-input": "0 1px 2px 0 rgb(0 0 0 / 0.05)",
+ "dark-tremor-card":
+ "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)",
+ "dark-tremor-dropdown":
+ "0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1)",
+ },
+ borderRadius: {
+ "tremor-small": "0.375rem",
+ "tremor-default": "0.5rem",
+ "tremor-full": "9999px",
+ },
+ fontSize: {
+ "tremor-label": ["0.75rem"],
+ "tremor-default": ["0.875rem", { lineHeight: "1.25rem" }],
+ "tremor-title": ["1.125rem", { lineHeight: "1.75rem" }],
+ "tremor-metric": ["1.875rem", { lineHeight: "2.25rem" }],
+ },
+ },
+ },
+ safelist: [
+ {
+ pattern:
+ /^(bg-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ variants: ["hover", "ui-selected"],
+ },
+ {
+ pattern:
+ /^(text-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ variants: ["hover", "ui-selected"],
+ },
+ {
+ pattern:
+ /^(border-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ variants: ["hover", "ui-selected"],
+ },
+ {
+ pattern:
+ /^(ring-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ },
+ {
+ pattern:
+ /^(stroke-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ },
+ {
+ pattern:
+ /^(fill-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/,
+ },
+ ],
+ plugins: [require("@headlessui/tailwindcss"), require("daisyui")],
+ daisyui: {
+ themes: ["corporate", "business"],
+ },
};
diff --git a/src/helm-frontend/tsconfig.json b/src/helm-frontend/tsconfig.json
index 7f44c96e282..5df59518c12 100644
--- a/src/helm-frontend/tsconfig.json
+++ b/src/helm-frontend/tsconfig.json
@@ -1,30 +1,30 @@
{
- "compilerOptions": {
- "target": "ES2020",
- "useDefineForClassFields": true,
- "lib": ["ES2020", "DOM", "DOM.Iterable"],
- "module": "ESNext",
- "skipLibCheck": true,
+ "compilerOptions": {
+ "target": "ES2020",
+ "useDefineForClassFields": true,
+ "lib": ["ES2020", "DOM", "DOM.Iterable"],
+ "module": "ESNext",
+ "skipLibCheck": true,
- "moduleResolution": "bundler",
- "allowImportingTsExtensions": true,
- "resolveJsonModule": true,
- "isolatedModules": true,
- "noEmit": true,
- "jsx": "react-jsx",
+ "moduleResolution": "bundler",
+ "allowImportingTsExtensions": true,
+ "resolveJsonModule": true,
+ "isolatedModules": true,
+ "noEmit": true,
+ "jsx": "react-jsx",
- "strict": true,
- "noUnusedLocals": true,
- "noUnusedParameters": true,
- "noFallthroughCasesInSwitch": true,
+ "strict": true,
+ "noUnusedLocals": true,
+ "noUnusedParameters": true,
+ "noFallthroughCasesInSwitch": true,
- "baseUrl": "src",
- "paths": {
- "@/*": ["*"]
- },
+ "baseUrl": "src",
+ "paths": {
+ "@/*": ["*"]
+ },
- "types": ["vitest/globals", "vite/client"]
- },
- "include": ["src"],
- "references": [{ "path": "./tsconfig.node.json" }]
+ "types": ["vitest/globals", "vite/client"]
+ },
+ "include": ["src"],
+ "references": [{ "path": "./tsconfig.node.json" }]
}
diff --git a/src/helm-frontend/vite.config.ts b/src/helm-frontend/vite.config.ts
index 0782743722e..7c89bb4daa7 100644
--- a/src/helm-frontend/vite.config.ts
+++ b/src/helm-frontend/vite.config.ts
@@ -17,5 +17,4 @@ export default defineConfig({
build: {
outDir: `${__dirname}/../helm/benchmark/static_build`,
},
- base: "/helm/",
});
diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py
index 36bd147c07d..82648c7be6e 100644
--- a/src/helm/benchmark/adaptation/adapter_spec.py
+++ b/src/helm/benchmark/adaptation/adapter_spec.py
@@ -73,7 +73,11 @@ class AdapterSpec:
# Decoding parameters (inherited by `Request`)
- # Model to make the request to (need to fill in)
+ # Model deployment to make the request to (need to fill in)
+ model_deployment: str = ""
+
+ # DEPRECATED: old model field, kept for backward compatibility
+ # TODO: Remove this once we do not wish to support backward compatibility anymore.
model: str = ""
# Temperature to use
diff --git a/src/helm/benchmark/adaptation/adapters/adapter.py b/src/helm/benchmark/adaptation/adapters/adapter.py
index 3dd65132863..bbf56a626d3 100644
--- a/src/helm/benchmark/adaptation/adapters/adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/adapter.py
@@ -1,15 +1,12 @@
from abc import ABC, abstractmethod
from typing import List
-import numpy as np
-
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.scenario_state import ScenarioState
-from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
+from helm.benchmark.scenarios.scenario import Instance
from helm.benchmark.window_services.tokenizer_service import TokenizerService
from helm.benchmark.window_services.window_service import WindowService
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
-from helm.common.hierarchical_logger import hlog
class Adapter(ABC):
@@ -21,7 +18,7 @@ class Adapter(ABC):
def __init__(self, adapter_spec: AdapterSpec, tokenizer_service: TokenizerService):
self.adapter_spec: AdapterSpec = adapter_spec
self.window_service: WindowService = WindowServiceFactory.get_window_service(
- adapter_spec.model, tokenizer_service
+ adapter_spec.model_deployment, tokenizer_service
)
@abstractmethod
@@ -31,38 +28,3 @@ def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
list of corresponding `RequestState`s.
"""
pass
-
- def get_run_instances(self, instances: List[Instance]) -> List[Instance]:
- """
- Get the instances necessary for this run:
- Train instances (split=train): keep all (if any) for in-context learning
- Eval instances (split=valid or test): keep at most `max_eval_instances` specified in `AdapterSpec` by sampling
- Return the resulting train and eval instances.
- """
- all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
-
- all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
- if (
- self.adapter_spec.max_eval_instances is not None
- and len(all_eval_instances) > self.adapter_spec.max_eval_instances
- ):
- # Pick the first `self.adapter_spec.max_eval_instances`.
- # The random sampling includes instances monotonically.
- np.random.seed(0)
- selected_eval_instances = list(
- np.random.choice(
- all_eval_instances, # type: ignore
- self.adapter_spec.max_eval_instances,
- replace=False,
- )
- )
- else:
- selected_eval_instances = all_eval_instances
-
- hlog(
- f"{len(instances)} instances, "
- f"{len(all_train_instances)} train instances, "
- f"{len(selected_eval_instances)}/{len(all_eval_instances)} eval instances"
- )
-
- return all_train_instances + selected_eval_instances
diff --git a/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py b/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py
index 75d8b622f59..339a220788b 100644
--- a/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py
@@ -50,6 +50,7 @@ def generate_requests(
)
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt.text,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
diff --git a/src/helm/benchmark/adaptation/adapters/generation_adapter.py b/src/helm/benchmark/adaptation/adapters/generation_adapter.py
index ec251ce20a2..c4945852653 100644
--- a/src/helm/benchmark/adaptation/adapters/generation_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/generation_adapter.py
@@ -39,6 +39,7 @@ def generate_requests(
)
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt.text,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
diff --git a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
index cc18c761196..87e51a9b212 100644
--- a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py
@@ -34,9 +34,15 @@ def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
# Pick out evaluation instances. This includes both valid and test splits.
eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
hlog(f"{len(eval_instances)} eval instances")
-
+ # Since at least 2023-01-01, this adapter was using `instances` instead of `eval_instances`
+ # https://github.com/stanford-crfm/helm/commit/ac9892f7449418d32ab55843702db312b58003ed#diff-69871182494f0d9f4bc6aeea76e99c13edf0213e2c123432a63cd2024d66ffcaR39
+ # This assert is intended to identify run specs (if any) that had been producing incorrect results.
+ assert len(eval_instances) == len(instances), (
+ "Non-evaluation instances were passed to LanguageModelingAdapter, but LanguageModelingAdapter "
+ + "expects evaluation instances only. Please open a GitHub issue with your RunSpec."
+ )
all_request_states: List[RequestState] = flatten_list(
- parallel_map(self._generate_requests, instances, parallelism)
+ parallel_map(self._generate_requests, eval_instances, parallelism)
)
hlog(f"{len(all_request_states)} requests")
@@ -114,6 +120,7 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
)
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt_text,
num_completions=1,
temperature=0,
@@ -162,6 +169,7 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt_text,
num_completions=1,
temperature=0,
diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py b/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py
index aa8b6c9d204..a5126373502 100644
--- a/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py
@@ -29,6 +29,7 @@ def generate_requests(
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
multimodal_prompt=prompt.multimedia_object,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py b/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py
index 152bffc34db..18fbe8508f4 100644
--- a/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py
@@ -27,6 +27,7 @@ def generate_requests(
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
multimodal_prompt=prompt.multimedia_object,
num_completions=self.adapter_spec.num_outputs,
temperature=self.adapter_spec.temperature,
diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py b/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
index 18796dbebcd..4b9d3801bf0 100644
--- a/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
@@ -22,6 +22,7 @@ def teardown_method(self, _):
def test_construct_prompt(self):
adapter_spec: AdapterSpec = AdapterSpec(
model="simple/model1",
+ model_deployment="simple/model1",
method=ADAPT_GENERATION_MULTIMODAL,
global_prefix="[START]",
instructions="Please answer the following question about the images.",
@@ -91,6 +92,7 @@ def test_construct_prompt(self):
def test_construct_prompt_multi_label(self):
adapter_spec: AdapterSpec = AdapterSpec(
model="simple/model1",
+ model_deployment="simple/model1",
method=ADAPT_GENERATION_MULTIMODAL,
global_prefix="[START]",
instructions="Please answer the following question about the images.",
@@ -171,6 +173,7 @@ def test_construct_prompt_idefics_instruct_example(self):
"""
adapter_spec: AdapterSpec = AdapterSpec(
model="simple/model1",
+ model_deployment="simple/model1",
method=ADAPT_GENERATION_MULTIMODAL,
input_prefix="User: ",
input_suffix="",
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
index 5cf4e4d9410..08e8569b0be 100644
--- a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py
@@ -55,6 +55,7 @@ def generate_requests(
)
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt.text,
num_completions=1,
top_k_per_token=self.adapter_spec.num_outputs,
diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py
index 31429cc2529..d9a3d79fa41 100644
--- a/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py
@@ -41,6 +41,7 @@ def construct_request_state(
) -> RequestState:
request = Request(
model=self.adapter_spec.model,
+ model_deployment=self.adapter_spec.model_deployment,
prompt=prompt.text,
num_completions=1,
temperature=0,
diff --git a/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py b/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py
index 7d327b8dd90..d2791ed532f 100644
--- a/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py
@@ -33,6 +33,7 @@ def test_adapt(self):
def test_construct_prompt(self):
adapter_spec = AdapterSpec(
model="openai/davinci",
+ model_deployment="openai/davinci",
method=ADAPT_GENERATION,
input_prefix="",
input_suffix="",
@@ -59,7 +60,12 @@ def test_construct_prompt(self):
def test_construct_prompt_with_truncation(self):
adapter_spec = AdapterSpec(
- model="openai/davinci", method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=100
+ model="openai/davinci",
+ model_deployment="openai/davinci",
+ method=ADAPT_GENERATION,
+ input_prefix="",
+ output_prefix="",
+ max_tokens=100,
)
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
correct_reference = Reference(Output(text=""), tags=[CORRECT_TAG])
@@ -80,7 +86,9 @@ def test_construct_prompt_with_truncation(self):
assert prompt_text.count("eval") == 1948
def test_sample_examples_without_references(self):
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=1)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=1
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
all_train_instances = [
Instance(Input(text="prompt1"), references=[]),
@@ -92,7 +100,9 @@ def test_sample_examples_without_references(self):
assert len(examples) == 1
def test_sample_examples_open_ended_generation(self):
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=3)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
all_train_instances: List[Instance] = [
@@ -106,7 +116,9 @@ def test_sample_examples_open_ended_generation(self):
assert seed0_examples != seed1_examples, "Examples should differ when changing the seed"
def test_sample_examples_open_ended_generation_stress(self):
- adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=5)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=5
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
all_train_instances: List[Instance] = [
@@ -146,7 +158,11 @@ def test_sample_examples_open_ended_generation_stress(self):
def test_multiple_correct_reference(self):
adapter_spec = AdapterSpec(
- method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, sample_train=False
+ method=ADAPT_GENERATION,
+ model="openai/ada",
+ model_deployment="openai/ada",
+ max_train_instances=2,
+ sample_train=False,
)
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
train_instances = [
@@ -191,7 +207,12 @@ def test_multiple_correct_reference(self):
def test_multiple_correct_reference_multi_label(self):
adapter_spec = AdapterSpec(
- method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, multi_label=True, sample_train=False
+ method=ADAPT_GENERATION,
+ model="openai/ada",
+ model_deployment="openai/ada",
+ max_train_instances=2,
+ multi_label=True,
+ sample_train=False,
)
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
train_instances = [
diff --git a/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py b/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py
index c963513a932..588dfe6b1f8 100644
--- a/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py
@@ -1,5 +1,6 @@
# mypy: check_untyped_defs = False
from typing import List
+from helm.benchmark.window_services.gpt2_window_service import GPT2WindowService
from helm.common.tokenization_request import TokenizationToken
from helm.benchmark.adaptation.request_state import RequestState
@@ -7,7 +8,19 @@
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from .adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
from .test_adapter import TestAdapter
-from helm.benchmark.scenarios.scenario import Instance, Input, Reference
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
+
+
+class MockGPT2Window(GPT2WindowService):
+ """Utility for overriding properties of a GPT2WindowService for test purposes."""
+
+ def __init__(self, service, *, max_sequence_length):
+ super().__init__(service)
+ self._max_sequence_length = max_sequence_length
+
+ @property
+ def max_sequence_length(self) -> int:
+ return self._max_sequence_length
class TestLanguageModelingAdapter(TestAdapter):
@@ -16,6 +29,7 @@ def test_construct_language_modeling_prompt(self):
method=ADAPT_LANGUAGE_MODELING,
input_prefix="",
model="openai/davinci",
+ model_deployment="openai/davinci",
output_prefix="",
max_tokens=0,
)
@@ -39,6 +53,7 @@ def test_fits_tokens_within_context_window(self):
method=ADAPT_LANGUAGE_MODELING,
input_prefix="",
model="openai/curie",
+ model_deployment="openai/curie",
output_prefix="",
max_tokens=0,
)
@@ -70,6 +85,7 @@ def test_prompt_truncated(self):
method=ADAPT_LANGUAGE_MODELING,
input_prefix="",
model="anthropic/claude-v1.3",
+ model_deployment="anthropic/claude-v1.3",
output_prefix="",
max_tokens=0,
)
@@ -81,6 +97,7 @@ def test_prompt_truncated(self):
instance: Instance = Instance(
input=input_text,
references=[reference],
+ split=TEST_SPLIT,
)
# Ensure the adapter returns the correct prompt
request_states: List[RequestState] = adapter.adapt([instance], parallelism=1).request_states
@@ -93,6 +110,7 @@ def test_prompt_truncated(self):
instance_long: Instance = Instance(
input=input_text_long,
references=[reference],
+ split=TEST_SPLIT,
)
request_states_long: List[RequestState] = adapter.adapt([instance_long], parallelism=1).request_states
request_long: Request = request_states_long[0].request
@@ -105,6 +123,7 @@ def test_prompt_truncated(self):
method=ADAPT_LANGUAGE_MODELING,
input_prefix="",
model="anthropic/claude-v1.3",
+ model_deployment="anthropic/claude-v1.3",
output_prefix="",
max_tokens=2000,
)
@@ -125,3 +144,31 @@ def test_prompt_truncated(self):
num_tokens_2 = len(adapter_2.window_service.encode(request_long_2.prompt).token_values)
assert num_tokens_2 == adapter.window_service.max_sequence_and_generated_tokens_length - 2000
assert request_long_2.max_tokens == 2000
+
+ # TODO(#1969) Determine if this behavior is actually desirable.
+ def test_prompt_wrapping(self):
+ input_tokens = 25
+ max_sequence_length = 10
+ adapter_spec = AdapterSpec(
+ method=ADAPT_LANGUAGE_MODELING,
+ input_prefix="",
+ model="openai/code-davinci-002",
+ model_deployment="openai/code-davinci-002",
+ output_prefix="",
+ max_tokens=0,
+ )
+ adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
+ # Monkey patch the window service to have really short max sequences.
+ adapter.window_service = MockGPT2Window(self.tokenizer_service, max_sequence_length=max_sequence_length)
+ input_text = Input(text=" ".join(str(i) for i in range(input_tokens)))
+ instance = Instance(input=input_text, references=[], split=TEST_SPLIT)
+
+ # Generate the requests
+ request_states: List[RequestState] = adapter.adapt([instance], parallelism=1).request_states
+ # A smaller window service creates more requests
+ assert len(request_states) == 3
+ assert request_states[0].request.prompt == "<|endoftext|>0 1 2 3 4 5 6 7 8 9"
+ # Only the first prompt includes the prefix_token
+ assert request_states[1].request.prompt == " 9 10 11 12 13 14 15 16 17 18 19"
+ # The last prompt includes as many conditioning_tokens as will fit
+ assert request_states[2].request.prompt == " 14 15 16 17 18 19 20 21 22 23 24"
diff --git a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
index ab4ef43d9b9..06cb1dec6cf 100644
--- a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
+++ b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py
@@ -7,7 +7,9 @@
class TestMultipleChoiceJointAdapter(TestAdapter):
def test_sample_examples(self):
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=4
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
all_train_instances = [
Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]),
@@ -27,13 +29,20 @@ def test_sample_examples(self):
assert examples[3].input.text == "say yes3"
def test_sample_examples_no_train_instances(self):
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=2)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=2
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
examples = adapter.sample_examples(all_train_instances=[], seed=0)
assert len(examples) == 0
def test_sample_examples_greater_max_train_instances(self):
- adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10)
+ adapter_spec = AdapterSpec(
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
+ model="openai/ada",
+ model_deployment="openai/ada",
+ max_train_instances=10,
+ )
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
all_train_instances = [
Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]),
@@ -46,7 +55,11 @@ def test_sample_examples_greater_max_train_instances(self):
def test_multiple_correct_reference(self):
adapter_spec = AdapterSpec(
- method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10, sample_train=False
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
+ model="openai/ada",
+ model_deployment="openai/ada",
+ max_train_instances=10,
+ sample_train=False,
)
adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
train_instances = [
@@ -102,6 +115,7 @@ def test_multiple_correct_reference_multi_label(self):
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
model="openai/ada",
+ model_deployment="openai/ada",
max_train_instances=10,
multi_label=True,
sample_train=False,
diff --git a/src/helm/benchmark/augmentations/cleva_perturbation.py b/src/helm/benchmark/augmentations/cleva_perturbation.py
index b65562d8b08..54927d864e0 100644
--- a/src/helm/benchmark/augmentations/cleva_perturbation.py
+++ b/src/helm/benchmark/augmentations/cleva_perturbation.py
@@ -10,13 +10,13 @@
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.benchmark.scenarios.scenario import Input, Instance, Reference, Output
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import Perturbation, TextPerturbation
############################################################
-class ChineseTyposPerturbation(Perturbation):
+class ChineseTyposPerturbation(TextPerturbation):
"""
Chinese typos. For implementation details, see
https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/chinese_butter_fingers_perturbation
@@ -271,7 +271,7 @@ def retrieve_from_database(
return chars_with_similar_pinyin
-class ChineseSynonymPerturbation(Perturbation):
+class ChineseSynonymPerturbation(TextPerturbation):
"""
Chinese synonyms. For implementation details, see
https://github.com/GEM-benchmark/NL-Augmenter/blob/main/nlaugmenter/transformations/chinese_antonym_synonym_substitution
@@ -340,7 +340,7 @@ def sample_word(self, sample_list: List[str], rng: Random) -> str:
return sample_list[index]
-class CLEVAMildMixPerturbation(Perturbation):
+class CLEVAMildMixPerturbation(TextPerturbation):
"""
CLEVA robustness perturbation that composes several perturbations.
"""
@@ -370,7 +370,7 @@ def perturb(self, text: str, rng: Random) -> str:
############################################################
-class ChineseGenderPerturbation(Perturbation):
+class ChineseGenderPerturbation(TextPerturbation):
"""Individual fairness perturbation for Chinese gender terms and pronouns."""
name: str = "chinese_gender"
@@ -601,13 +601,6 @@ def get_substitute_name(self, token: str, rng: Random) -> Optional[str]:
name = rng.choice(list(options))
return name
- def perturb(self, text: str, rng: Random) -> str:
- """
- Perturbing the text is handled in `perturb_with_persistency` to ensure that perturbed names
- in `Instance`s and `Reference`s match.
- """
- raise NotImplementedError("Should never be called")
-
def perturb_with_persistency(
self, text: str, rng: Random, name_substitution_mapping: Dict[str, str], skipped_tokens: Set[str]
) -> str:
@@ -686,7 +679,7 @@ def word_segment_and_pos_tagging(text: str) -> Tuple[List[str], List[str]]:
return tokens, tags
-class SimplifiedToTraditionalPerturbation(Perturbation):
+class SimplifiedToTraditionalPerturbation(TextPerturbation):
"""Individual fairness perturbation for Chinese simplified to Chinese traditional."""
name: str = "simplified_to_traditional"
@@ -713,7 +706,7 @@ def perturb(self, text: str, rng: Random) -> str:
return perturbed_text
-class MandarinToCantonesePerturbation(Perturbation):
+class MandarinToCantonesePerturbation(TextPerturbation):
"""
Individual fairness perturbation for Mandarin to Cantonese translation.
The implementation is inspired by https://justyy.com/tools/chinese-converter/
diff --git a/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py b/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
index 4107efc8083..dc4b6172f24 100644
--- a/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
+++ b/src/helm/benchmark/augmentations/contraction_expansion_perturbation.py
@@ -5,7 +5,7 @@
from random import Random
from helm.common.general import match_case
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
@@ -92,7 +92,7 @@
# The implementations below are based on
# https://github.com/GEM-benchmark/NL-Augmenter/blob/main/transformations/contraction_expansions/transformation.py
-class ContractionPerturbation(Perturbation):
+class ContractionPerturbation(TextPerturbation):
"""
Contractions.
Replaces each expansion with its contracted version.
@@ -132,7 +132,7 @@ def cont(possible):
return self.reverse_contraction_pattern.sub(cont, text)
-class ExpansionPerturbation(Perturbation):
+class ExpansionPerturbation(TextPerturbation):
"""
Expansions.
Replaces each contraction with its expanded version.
diff --git a/src/helm/benchmark/augmentations/contrast_sets_perturbation.py b/src/helm/benchmark/augmentations/contrast_sets_perturbation.py
index 18bf6b9c541..e7d8b4db690 100644
--- a/src/helm/benchmark/augmentations/contrast_sets_perturbation.py
+++ b/src/helm/benchmark/augmentations/contrast_sets_perturbation.py
@@ -81,6 +81,3 @@ def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
references=perturbed_references,
perturbation=description,
)
-
- def perturb(self, text: str, rng: Random) -> str: # we need this since parent method is abstract
- raise NotImplementedError("Should never be called since apply() was overridden")
diff --git a/src/helm/benchmark/augmentations/dialect_perturbation.py b/src/helm/benchmark/augmentations/dialect_perturbation.py
index 5da74113dc1..440cf799356 100644
--- a/src/helm/benchmark/augmentations/dialect_perturbation.py
+++ b/src/helm/benchmark/augmentations/dialect_perturbation.py
@@ -8,10 +8,10 @@
from helm.common.general import match_case, ensure_file_downloaded
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
-class DialectPerturbation(Perturbation):
+class DialectPerturbation(TextPerturbation):
"""Individual fairness perturbation for dialect."""
""" Short unique identifier of the perturbation (e.g., extra_space) """
diff --git a/src/helm/benchmark/augmentations/extra_space_perturbation.py b/src/helm/benchmark/augmentations/extra_space_perturbation.py
index 814ac44f677..69eaec1f6f3 100644
--- a/src/helm/benchmark/augmentations/extra_space_perturbation.py
+++ b/src/helm/benchmark/augmentations/extra_space_perturbation.py
@@ -1,11 +1,11 @@
from dataclasses import dataclass
from random import Random
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
-class ExtraSpacePerturbation(Perturbation):
+class ExtraSpacePerturbation(TextPerturbation):
"""
A toy perturbation that replaces existing spaces in the text with
`num_spaces` number of spaces.
diff --git a/src/helm/benchmark/augmentations/filler_words_perturbation.py b/src/helm/benchmark/augmentations/filler_words_perturbation.py
index a566cc37d28..40b433cd95f 100644
--- a/src/helm/benchmark/augmentations/filler_words_perturbation.py
+++ b/src/helm/benchmark/augmentations/filler_words_perturbation.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
from random import Random
@@ -31,7 +31,7 @@
FILL_PHRASE = ["uhm", "umm", "ahh", "err", "actually", "obviously", "naturally", "like", "you know"]
-class FillerWordsPerturbation(Perturbation):
+class FillerWordsPerturbation(TextPerturbation):
"""
Randomly inserts filler words and phrases in the sentence.
Perturbation example:
diff --git a/src/helm/benchmark/augmentations/gender_perturbation.py b/src/helm/benchmark/augmentations/gender_perturbation.py
index 4b3c6dc7619..df53ce875b5 100644
--- a/src/helm/benchmark/augmentations/gender_perturbation.py
+++ b/src/helm/benchmark/augmentations/gender_perturbation.py
@@ -6,7 +6,7 @@
from helm.common.general import match_case
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
""" Gender term mappings """
@@ -62,7 +62,7 @@
]
-class GenderPerturbation(Perturbation):
+class GenderPerturbation(TextPerturbation):
"""Individual fairness perturbation for gender terms and pronouns."""
""" Short unique identifier of the perturbation (e.g., extra_space) """
diff --git a/src/helm/benchmark/augmentations/lowercase_perturbation.py b/src/helm/benchmark/augmentations/lowercase_perturbation.py
index 8f2c537cae0..e67d71dab3c 100644
--- a/src/helm/benchmark/augmentations/lowercase_perturbation.py
+++ b/src/helm/benchmark/augmentations/lowercase_perturbation.py
@@ -1,10 +1,10 @@
from random import Random
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
-class LowerCasePerturbation(Perturbation):
+class LowerCasePerturbation(TextPerturbation):
"""
Simple perturbation turning input and references into lowercase.
"""
diff --git a/src/helm/benchmark/augmentations/mild_mix_perturbation.py b/src/helm/benchmark/augmentations/mild_mix_perturbation.py
index 1bfae4ecf5d..fa682976b86 100644
--- a/src/helm/benchmark/augmentations/mild_mix_perturbation.py
+++ b/src/helm/benchmark/augmentations/mild_mix_perturbation.py
@@ -1,14 +1,14 @@
from random import Random
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .lowercase_perturbation import LowerCasePerturbation
from .contraction_expansion_perturbation import ContractionPerturbation
from .space_perturbation import SpacePerturbation
from .misspelling_perturbation import MisspellingPerturbation
-class MildMixPerturbation(Perturbation):
+class MildMixPerturbation(TextPerturbation):
"""
Canonical robustness perturbation that composes several perturbations.
These perturbations are chosen to be reasonable.
diff --git a/src/helm/benchmark/augmentations/misspelling_perturbation.py b/src/helm/benchmark/augmentations/misspelling_perturbation.py
index 37d7d9a22cc..41e8029cd15 100644
--- a/src/helm/benchmark/augmentations/misspelling_perturbation.py
+++ b/src/helm/benchmark/augmentations/misspelling_perturbation.py
@@ -6,13 +6,13 @@
from typing import Dict, List
from helm.common.general import match_case
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
# The implementation below is based on the following list of common misspellings:
# https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines
-class MisspellingPerturbation(Perturbation):
+class MisspellingPerturbation(TextPerturbation):
"""
Replaces words randomly with common misspellings, from a list of common misspellings.
diff --git a/src/helm/benchmark/augmentations/person_name_perturbation.py b/src/helm/benchmark/augmentations/person_name_perturbation.py
index 33a11d21b6c..91736d1115a 100644
--- a/src/helm/benchmark/augmentations/person_name_perturbation.py
+++ b/src/helm/benchmark/augmentations/person_name_perturbation.py
@@ -263,13 +263,6 @@ def get_substitute_name(self, token: str, rng: Random) -> Optional[str]:
name = rng.choice(list(options))
return name
- def perturb(self, text: str, rng: Random) -> str:
- """
- Perturbing the text is handled in `perturb_with_persistency` to ensure that perturbed names
- in `Instance`s and `Reference`s match.
- """
- raise NotImplementedError("Should never be called")
-
def perturb_with_persistency(
self, text: str, rng: Random, name_substitution_mapping: Dict[str, str], skipped_tokens: Set[str]
) -> str:
diff --git a/src/helm/benchmark/augmentations/perturbation.py b/src/helm/benchmark/augmentations/perturbation.py
index 5eb24481d47..646926d85ae 100644
--- a/src/helm/benchmark/augmentations/perturbation.py
+++ b/src/helm/benchmark/augmentations/perturbation.py
@@ -27,17 +27,24 @@ def get_rng(self, instance: Instance, seed: Optional[int] = None) -> Random:
# If seed exists, use it as part of the random seed
return Random(instance.id if seed is None else str(seed) + instance.id)
+ @abstractmethod
+ def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
+ """Generate a modified instance from the input instance."""
+ pass
+
+
+class TextPerturbation(Perturbation, ABC):
def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
"""
- Generates a new Instance by perturbing the input, tagging the Instance and perturbing the References,
- if should_perturb_references is true. Initializes a random number generator based on instance_id that gets
- passed to perturb and perturb_references.
+ Generates a new Instance by applying `perturb` to the input and (if requested) the references.
+ Initializes a random number generator based on instance_id that gets
+ passed to perturb.
"""
rng: Random = self.get_rng(instance, seed)
references: List[Reference] = instance.references
if self.should_perturb_references:
- references = [self.perturb_reference(reference, rng) for reference in references]
+ references = [self._perturb_reference(reference, rng) for reference in references]
description = replace(self.description, seed=seed)
@@ -50,7 +57,7 @@ def apply(self, instance: Instance, seed: Optional[int] = None) -> Instance:
perturbation=description,
)
- def perturb_reference(self, reference: Reference, rng: Random) -> Reference:
+ def _perturb_reference(self, reference: Reference, rng: Random) -> Reference:
"""Generates a new Reference by perturbing the output and tagging the Reference."""
return replace(reference, output=Output(text=self.perturb(reference.output.text, rng)), tags=reference.tags)
diff --git a/src/helm/benchmark/augmentations/space_perturbation.py b/src/helm/benchmark/augmentations/space_perturbation.py
index fd08f3ba991..e1ddbf3cb4f 100644
--- a/src/helm/benchmark/augmentations/space_perturbation.py
+++ b/src/helm/benchmark/augmentations/space_perturbation.py
@@ -2,11 +2,11 @@
from random import Random
import re
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
from .perturbation_description import PerturbationDescription
-class SpacePerturbation(Perturbation):
+class SpacePerturbation(TextPerturbation):
"""
A simple perturbation that replaces existing spaces with 0-max_spaces spaces (thus potentially merging words).
"""
diff --git a/src/helm/benchmark/augmentations/synonym_perturbation.py b/src/helm/benchmark/augmentations/synonym_perturbation.py
index f8c54d334e7..81a7def7c3d 100644
--- a/src/helm/benchmark/augmentations/synonym_perturbation.py
+++ b/src/helm/benchmark/augmentations/synonym_perturbation.py
@@ -11,10 +11,10 @@
from helm.common.general import match_case, ensure_file_downloaded
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
-class SynonymPerturbation(Perturbation):
+class SynonymPerturbation(TextPerturbation):
"""
Synonyms. For implementation details, see
https://github.com/GEM-benchmark/NL-Augmenter/blob/main/nlaugmenter/transformations/synonym_substitution/transformation.py
diff --git a/src/helm/benchmark/augmentations/typos_perturbation.py b/src/helm/benchmark/augmentations/typos_perturbation.py
index 0df062817fa..4b273dfbf9b 100644
--- a/src/helm/benchmark/augmentations/typos_perturbation.py
+++ b/src/helm/benchmark/augmentations/typos_perturbation.py
@@ -2,10 +2,10 @@
from random import Random
from .perturbation_description import PerturbationDescription
-from .perturbation import Perturbation
+from .perturbation import TextPerturbation
-class TyposPerturbation(Perturbation):
+class TyposPerturbation(TextPerturbation):
"""
Typos. For implementation details, see
https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/butter_fingers_perturbation
diff --git a/src/helm/benchmark/config_registry.py b/src/helm/benchmark/config_registry.py
new file mode 100644
index 00000000000..0fab062949c
--- /dev/null
+++ b/src/helm/benchmark/config_registry.py
@@ -0,0 +1,14 @@
+from helm.benchmark.model_deployment_registry import register_deployments_if_not_already_registered
+from helm.benchmark.model_metadata_registry import register_metadatas_if_not_already_registered
+from helm.benchmark.tokenizer_config_registry import register_tokenizers_if_not_already_registered
+
+HELM_REGISTERED: bool = False
+
+
+def register_helm_configurations():
+ global HELM_REGISTERED
+ if not HELM_REGISTERED:
+ register_metadatas_if_not_already_registered()
+ register_tokenizers_if_not_already_registered()
+ register_deployments_if_not_already_registered()
+ HELM_REGISTERED = True
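The new module guards registration behind the module-level HELM_REGISTERED flag so that every entry point can call it defensively. A hypothetical usage sketch (not part of the diff):

    # Callers invoke register_helm_configurations() before touching the registries;
    # repeated calls are cheap because the module-level flag short-circuits.
    from helm.benchmark.config_registry import register_helm_configurations
    from helm.benchmark.model_metadata_registry import get_all_models


    def main() -> None:
        register_helm_configurations()  # idempotent: safe to call from every entry point
        print(get_all_models())


    if __name__ == "__main__":
        main()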
diff --git a/src/helm/benchmark/huggingface_registration.py b/src/helm/benchmark/huggingface_registration.py
index ff444fd4fda..bc833ddea2e 100644
--- a/src/helm/benchmark/huggingface_registration.py
+++ b/src/helm/benchmark/huggingface_registration.py
@@ -1,5 +1,6 @@
import os
from typing import Optional
+from datetime import date
from helm.benchmark.model_deployment_registry import (
ClientSpec,
@@ -7,7 +8,15 @@
WindowServiceSpec,
register_model_deployment,
)
+from helm.benchmark.model_metadata_registry import (
+ get_model_metadata,
+ ModelMetadata,
+ register_model_metadata,
+ TEXT_MODEL_TAG,
+ FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
+)
from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config
+from helm.common.hierarchical_logger import hlog
def register_huggingface_model(
@@ -30,6 +39,27 @@ def register_huggingface_model(
args=object_spec_args,
),
)
+
+ # We check if the model is already registered because we don't want to
+ # overwrite the model metadata if it's already registered.
+ # If it's not registered, we register it, as otherwise an error would be thrown
+ # when we try to register the model deployment.
+ try:
+ _ = get_model_metadata(model_name=helm_model_name)
+ except ValueError:
+ register_model_metadata(
+ ModelMetadata(
+ name=helm_model_name,
+ creator_organization_name="Unknown",
+ display_name=helm_model_name,
+ description=helm_model_name,
+ access="open",
+ release_date=date.today(),
+ tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
+ )
+ )
+ hlog(f"Registered default metadata for model {helm_model_name}")
+
register_model_deployment(model_deployment)
tokenizer_config = TokenizerConfig(
name=helm_model_name,
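The comment block above describes a register-if-missing pattern: probe the metadata registry and only fill in defaults when the model is unknown, so user-provided metadata is never overwritten. A sketch of the same pattern factored into a standalone helper (the helper name is hypothetical; the field values mirror the defaults used in the diff):

    from datetime import date

    from helm.benchmark.model_metadata_registry import (
        ModelMetadata,
        get_model_metadata,
        register_model_metadata,
        TEXT_MODEL_TAG,
        FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
    )


    def ensure_model_metadata(model_name: str) -> ModelMetadata:
        """Hypothetical helper: return existing metadata, registering a stub if absent."""
        try:
            return get_model_metadata(model_name)
        except ValueError:
            stub = ModelMetadata(
                name=model_name,
                creator_organization_name="Unknown",
                display_name=model_name,
                description=model_name,
                access="open",
                release_date=date.today(),
                tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
            )
            register_model_metadata(stub)
            return stub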
diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py
index c9a107628cc..aa0eb8ae949 100644
--- a/src/helm/benchmark/metrics/basic_metrics.py
+++ b/src/helm/benchmark/metrics/basic_metrics.py
@@ -60,8 +60,8 @@ def compute_estimated_time_from_prompt_size_and_num_output_tokens(
num_output_tokens: int,
) -> Optional[float]:
estimated_runtime: Optional[float]
- if request_state.request.model in inference_runtimes_dict:
- inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model]
+ if request_state.request.model_deployment in inference_runtimes_dict:
+ inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model_deployment]
runtime_per_output_token: float = inference_runtimes_dict_for_model["runtime_per_output_token"]
raw_runtimes_for_prompt_tokens: Dict[str, float] = inference_runtimes_dict_for_model[
"runtime_for_prompt_tokens"
@@ -583,7 +583,9 @@ def compute_efficiency_metrics(
# Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec`
# and calculate the number of tokens in the prompt.
tokenizer_service: TokenizerService = metric_service
- window_service: WindowService = WindowServiceFactory.get_window_service(adapter_spec.model, tokenizer_service)
+ window_service: WindowService = WindowServiceFactory.get_window_service(
+ adapter_spec.model_deployment, tokenizer_service
+ )
prompt: str = request_state.request.prompt
num_prompt_tokens: int = window_service.get_num_tokens(prompt)
@@ -618,14 +620,16 @@ def compute_efficiency_metrics(
# Compute efficiency metrics for training.
training_co2_cost: Optional[float]
- if request_state.request.model in self.training_efficiency_dict["carbon"]:
- training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model]["value"]
+ if request_state.request.model_deployment in self.training_efficiency_dict["carbon"]:
+ training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model_deployment]["value"]
else:
training_co2_cost = None
training_energy_cost: Optional[float]
- if request_state.request.model in self.training_efficiency_dict["energy"]:
- training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model]["value"]
+ if request_state.request.model_deployment in self.training_efficiency_dict["energy"]:
+ training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model_deployment][
+ "value"
+ ]
else:
training_energy_cost = None
@@ -799,7 +803,9 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind
num_choices = len(references)
tokenizer_service: TokenizerService = metric_service
- window_service: WindowService = WindowServiceFactory.get_window_service(adapter_spec.model, tokenizer_service)
+ window_service: WindowService = WindowServiceFactory.get_window_service(
+ adapter_spec.model_deployment, tokenizer_service
+ )
reference_stats: Dict[ReferenceKey, ReferenceStat] = {}
for request_state in reference_request_states:
assert request_state.reference_index is not None and request_state.request_mode is not None
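The first hunk in basic_metrics.py switches the efficiency lookups from request.model to request.model_deployment, since profiled runtimes depend on the serving endpoint rather than the underlying model. A rough sketch of that lookup under stated assumptions (the dictionary contents, helper name, and the "smallest covering prompt size" rule are illustrative, not taken from the source):

    from typing import Dict, Optional

    # Assumed shape: per-deployment entries with a per-output-token cost and
    # prompt-size -> runtime samples keyed by string token counts.
    inference_runtimes_dict: Dict[str, Dict] = {
        "openai/text-davinci-002": {
            "runtime_per_output_token": 0.02,
            "runtime_for_prompt_tokens": {"512": 0.9, "1024": 1.4},
        }
    }


    def estimated_runtime(deployment: str, num_prompt_tokens: int, num_output_tokens: int) -> Optional[float]:
        """Hypothetical helper: prompt overhead plus per-token decode cost."""
        entry = inference_runtimes_dict.get(deployment)
        if entry is None:
            return None
        prompt_sizes = sorted(int(k) for k in entry["runtime_for_prompt_tokens"])
        covering = next((s for s in prompt_sizes if num_prompt_tokens <= s), prompt_sizes[-1])
        return entry["runtime_for_prompt_tokens"][str(covering)] + num_output_tokens * entry["runtime_per_output_token"]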
diff --git a/src/helm/benchmark/metrics/code_metrics.py b/src/helm/benchmark/metrics/code_metrics.py
index 6ed8f084750..01bcabcaa00 100644
--- a/src/helm/benchmark/metrics/code_metrics.py
+++ b/src/helm/benchmark/metrics/code_metrics.py
@@ -106,12 +106,13 @@ def evaluate_generation(
hlog(f"After second join thread count: {threading.active_count()}. exitcode: {p.exitcode}")
assert not p.is_alive(), "The code process was still alive even after calling kill."
- if len(shared_list) == 0:
+ if len(shared_list) > 0:
+ scores = shared_list[0]
+ else:
# Remark: ideally should consider all tests that failed;
# use the average number of tests here for simplicity
avg_number_tests = 21
- shared_list = [[-1] * avg_number_tests] # type: ignore
- scores = shared_list[0]
+ scores = [-1] * avg_number_tests
scores = _convert_scores(scores) # Convert list of bool/int to list of ints.
this_score = metric_fn(scores)
diff --git a/src/helm/benchmark/metrics/disinformation_metrics.py b/src/helm/benchmark/metrics/disinformation_metrics.py
index fd8d180bee9..2025d06eb7e 100644
--- a/src/helm/benchmark/metrics/disinformation_metrics.py
+++ b/src/helm/benchmark/metrics/disinformation_metrics.py
@@ -86,7 +86,7 @@ def _compute_wedging_human_eval(
results: List[Stat] = []
instance_first_line = request_state.instance.input.text.splitlines()[0]
human_evaluations = _fetch_human_evaluation_results(eval_cache_path, WEDGING_HUMAN_EVAL_FILE)
- model_results = human_evaluations.get(adapter_spec.model)
+ model_results = human_evaluations.get(adapter_spec.model_deployment)
if not model_results:
# Trying to evaluate a model we don't have annotations for
@@ -125,7 +125,7 @@ def _compute_reiteration_human_eval(
"""
results: List[Stat] = []
human_evaluations = _fetch_human_evaluation_results(eval_cache_path, REITERATION_HUMAN_EVAL_FILE)
- model_results = human_evaluations.get(adapter_spec.model)
+ model_results = human_evaluations.get(adapter_spec.model_deployment)
if not model_results:
# Trying to evaluate a model we don't have annotations for
return results
diff --git a/src/helm/benchmark/metrics/dry_run_metrics.py b/src/helm/benchmark/metrics/dry_run_metrics.py
index 1f2618b0dd0..4fe2126630e 100644
--- a/src/helm/benchmark/metrics/dry_run_metrics.py
+++ b/src/helm/benchmark/metrics/dry_run_metrics.py
@@ -38,7 +38,9 @@ def process(self, request_state: RequestState) -> List[Stat]:
stats.append(Stat(MetricName("max_num_completion_tokens")).add(request.num_completions * request.max_tokens))
# Get number of tokens in the prompt
- tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model, self.metric_service)
+ tokenizer: WindowService = WindowServiceFactory.get_window_service(
+ request.model_deployment, self.metric_service
+ )
num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt)
stats.append(Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens))
diff --git a/src/helm/benchmark/metrics/ranking_metrics.py b/src/helm/benchmark/metrics/ranking_metrics.py
index 204a4a8fbd5..b7c823eca5d 100644
--- a/src/helm/benchmark/metrics/ranking_metrics.py
+++ b/src/helm/benchmark/metrics/ranking_metrics.py
@@ -7,7 +7,7 @@
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
from helm.common.request import RequestResult
-from helm.common.general import binarize_dict
+from helm.common.general import assert_present, binarize_dict
from .metric import Metric
from .metric_name import MetricName
from .metric_service import MetricService
@@ -205,14 +205,13 @@ def get_run_relevances(self, ranking_objs: List[RankingObject], rank_limit: Opti
relevance dictionary, which contains the ground truth relevance
values for each document.
"""
- assert all([r.model_relevance is not None for r in ranking_objs])
if rank_limit:
return {
- self.get_query_string(r.reference_index): r.model_relevance # type: ignore
+ self.get_query_string(r.reference_index): assert_present(r.model_relevance)
for r in ranking_objs
if r.rank and r.rank <= rank_limit
}
- return {self.get_query_string(r.reference_index): r.model_relevance for r in ranking_objs} # type: ignore
+ return {self.get_query_string(r.reference_index): assert_present(r.model_relevance) for r in ranking_objs}
def get_true_relevances(self, ranking_objects: List[RankingObject]) -> Dict[str, int]:
"""Get the true relevance dictionary."""
@@ -372,7 +371,7 @@ def evaluate_references(
# len(ranking_objects) minus its relevance.
stats += [
Stat(MetricName(f"ref{r.reference_index}_rank")).add(
- len(ranking_objects) - r.model_relevance # type: ignore
+ len(ranking_objects) - assert_present(r.model_relevance)
)
for r in ranking_objects
]
diff --git a/src/helm/benchmark/metrics/summarization_metrics.py b/src/helm/benchmark/metrics/summarization_metrics.py
index de90249dbd2..3a61ae77413 100644
--- a/src/helm/benchmark/metrics/summarization_metrics.py
+++ b/src/helm/benchmark/metrics/summarization_metrics.py
@@ -52,7 +52,7 @@ def __init__(self, task: str, device: str = "cpu"):
# avoid triggering a bug in DataStatsMetric that raises
# `NameError: name 'stderr' is not defined`
if not spacy.util.is_package("en_core_web_sm"):
- spacy.cli.download("en_core_web_sm") # type: ignore
+ spacy.cli.download("en_core_web_sm")
try:
from summ_eval.data_stats_metric import DataStatsMetric
@@ -181,9 +181,9 @@ def evaluate_generation(
self.humaneval = self._load_humaneval(eval_cache_path)
# get human evaluation scores if they exist
- model_name = adapter_spec.model.replace("/", "_")
+ deployment = adapter_spec.model_deployment.replace("/", "_")
for metric_name in ["faithfulness", "relevance", "coherence"]:
- val = self.humaneval[(metric_name, model_name, request_state.instance.id, pred)]
+ val = self.humaneval[(metric_name, deployment, request_state.instance.id, pred)]
result.append(Stat(MetricName(f"HumanEval-{metric_name}")).add(float(val)))
except KeyError:
pass
@@ -195,8 +195,8 @@ def evaluate_generation(
if self.qa_fact_eval is None:
self._load_qafacteval(eval_cache_path)
assert self.qa_fact_eval is not None
- model_name = adapter_spec.model.replace("/", "_")
- val = self.qa_fact_eval[model_name][(request_state.instance.id, pred)]
+ deployment = adapter_spec.model_deployment.replace("/", "_")
+ val = self.qa_fact_eval[deployment][(request_state.instance.id, pred)]
result.append(Stat(MetricName("QAFactEval")).add(float(val)))
except KeyError:
pass
diff --git a/src/helm/benchmark/metrics/test_classification_metrics.py b/src/helm/benchmark/metrics/test_classification_metrics.py
index f5f3d23ff18..d15b4b9fef7 100644
--- a/src/helm/benchmark/metrics/test_classification_metrics.py
+++ b/src/helm/benchmark/metrics/test_classification_metrics.py
@@ -26,7 +26,7 @@ def _request_state(prediction: str, options: List[_Option]):
request_mode=None,
train_trial_index=0,
output_mapping=None,
- request=Request(),
+ request=Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002"),
result=RequestResult(
success=True, embedding=[], completions=[Sequence(text=prediction, logprob=0.0, tokens=[])], cached=False
),
diff --git a/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
index 297dcf60736..9d6444fcbe9 100644
--- a/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
+++ b/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py
@@ -39,5 +39,5 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in
"""
Estimate the number of tokens for a given request based on the organization.
"""
- token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_organization)
+ token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_host)
return token_cost_estimator.estimate_tokens(request, metric_service)
diff --git a/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
index 7880686adee..1bd22893061 100644
--- a/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
+++ b/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py
@@ -17,6 +17,8 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in
"""
total_estimated_tokens: int = request.num_completions * request.max_tokens
if request.echo_prompt:
- window_service: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service)
+ window_service: WindowService = WindowServiceFactory.get_window_service(
+ request.model_deployment, metric_service
+ )
total_estimated_tokens += window_service.get_num_tokens(request.prompt)
return GooseAITokenCounter.account_for_base_tokens(total_estimated_tokens)
diff --git a/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
index d52807ab781..429075fe949 100644
--- a/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
+++ b/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py
@@ -15,7 +15,7 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in
Add num_tokens(prompt) if Request.echo_prompt is True.
"""
- tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service)
+ tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model_deployment, metric_service)
num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt)
total_estimated_tokens: int = num_prompt_tokens + request.num_completions * request.max_tokens
diff --git a/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
index 2964beeefbf..2d202413f66 100644
--- a/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
+++ b/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py
@@ -10,6 +10,8 @@ def setup_method(self, method):
def test_estimate_tokens(self):
request = Request(
+ model="openai/text-davinci-002",
+ model_deployment="openai/text-davinci-002",
prompt="The Center for Research on Foundation Models (CRFM) is "
"an interdisciplinary initiative born out of the Stanford "
"Institute for Human-Centered Artificial Intelligence (HAI) "
diff --git a/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
index b647222e673..e4f07463e92 100644
--- a/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
+++ b/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py
@@ -36,13 +36,21 @@ def setup_method(self, method):
self._mock_metric_service.tokenize = MagicMock(return_value=tokenization_request_result)
def test_estimate_tokens(self):
- request = Request(prompt=TestOpenAITokenCostEstimator.TEST_PROMPT, num_completions=3, max_tokens=100)
+ request = Request(
+ model="openai/text-davinci-002",
+ model_deployment="openai/text-davinci-002",
+ prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
+ num_completions=3,
+ max_tokens=100,
+ )
# Prompt + max number of tokens from completions = 51 + 3 * 100
assert self._token_cost_estimator.estimate_tokens(request, self._mock_metric_service) == 51 + 3 * 100
def test_estimate_tokens_with_echo_prompt(self):
request = Request(
+ model="openai/text-davinci-002",
+ model_deployment="openai/text-davinci-002",
prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
echo_prompt=True,
num_completions=1,
diff --git a/src/helm/benchmark/model_deployment_registry.py b/src/helm/benchmark/model_deployment_registry.py
index fe3a386987f..c3f9d361472 100644
--- a/src/helm/benchmark/model_deployment_registry.py
+++ b/src/helm/benchmark/model_deployment_registry.py
@@ -1,16 +1,18 @@
import os
from typing import Dict, Optional, List
from dataclasses import dataclass
+import importlib_resources as resources
import cattrs
import yaml
from helm.common.hierarchical_logger import hlog
from helm.common.object_spec import ObjectSpec
-from helm.proxy.models import ALL_MODELS, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, MODEL_NAME_TO_MODEL, TEXT_MODEL_TAG, Model
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, CONFIG_PACKAGE
-MODEL_DEPLOYMENTS_FILE = "model_deployments.yaml"
+MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
+DEPLOYMENTS_REGISTERED: bool = False
class ClientSpec(ObjectSpec):
@@ -23,65 +25,96 @@ class WindowServiceSpec(ObjectSpec):
@dataclass(frozen=True)
class ModelDeployment:
- """A model deployment is an accessible instance of this model (e.g. a hosted endpoint).
-
- A model can have multiple model deployments."""
+ """
+ A model deployment is an accessible instance of this model (e.g., a hosted endpoint).
+ A model can have multiple model deployments.
+ """
name: str
- """Name of the model deployment."""
+ """Name of the model deployment. Usually formatted as "/".
+ Example: "huggingface/t5-11b"."""
client_spec: ClientSpec
"""Specification for instantiating the client for this model deployment."""
model_name: Optional[str] = None
- """Name of the model that this model deployment is for.
-
- If unset, defaults to the the same value as `name`."""
+ """Name of the model that this model deployment is for. Refers to the field "name" in the Model class.
+ If unset, defaults to the same value as `name`."""
tokenizer_name: Optional[str] = None
- """Tokenizer for this model deployment.
-
- If unset, auto-inferred by the WindowService."""
+ """Tokenizer for this model deployment. If unset, auto-inferred by the WindowService."""
window_service_spec: Optional[WindowServiceSpec] = None
- """Specification for instantiating the window service for this model deployment"""
+ """Specification for instantiating the window service for this model deployment."""
max_sequence_length: Optional[int] = None
"""Maximum sequence length for this model deployment."""
max_request_length: Optional[int] = None
"""Maximum request length for this model deployment.
-
If unset, defaults to the same value as max_sequence_length."""
+ max_sequence_and_generated_tokens_length: Optional[int] = None
+ """The max length of the model input and output tokens.
+ Some models (like Anthropic/Claude and Megatron) have a specific limit on sequence length + max_tokens.
+ If unset, defaults to INT_MAX (i.e., no limit)."""
+
+ deprecated: bool = False
+ """Whether this model deployment is deprecated."""
+
+ @property
+ def host_organization(self) -> str:
+ """
+ Extracts the host organization from the model deployment name.
+ Example: "huggingface" from "huggingface/t5-11b"
+ This can be different from the creator organization (for example "together")
+ """
+ return self.name.split("/")[0]
+
+ @property
+ def engine(self) -> str:
+ """
+ Extracts the model engine from the model deployment name.
+ Example: 'ai21/j1-jumbo' => 'j1-jumbo'
+ """
+ return self.name.split("/")[1]
+
+ def __post_init__(self):
+ if not self.model_name:
+ object.__setattr__(self, "model_name", self.name)
+
@dataclass(frozen=True)
class ModelDeployments:
model_deployments: List[ModelDeployment]
-_name_to_model_deployment: Dict[str, ModelDeployment] = {}
+ALL_MODEL_DEPLOYMENTS: List[ModelDeployment] = []
+DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: Dict[str, ModelDeployment] = {
+ deployment.name: deployment for deployment in ALL_MODEL_DEPLOYMENTS
+}
+# ===================== REGISTRATION FUNCTIONS ==================== #
def register_model_deployment(model_deployment: ModelDeployment) -> None:
- hlog(f"Registered model deployment {model_deployment.name}")
- _name_to_model_deployment[model_deployment.name] = model_deployment
-
- # Auto-register a model with this name if none exists
- model_name = model_deployment.model_name or model_deployment.name
- if model_name not in MODEL_NAME_TO_MODEL:
- model = Model(
- group="unknown",
- name=model_name,
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- )
- MODEL_NAME_TO_MODEL[model_name] = model
- ALL_MODELS.append(model)
- hlog(f"Registered default metadata for model {model_name}")
+ # hlog(f"Registered model deployment {model_deployment.name}")
+ DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_deployment.name] = model_deployment
+ ALL_MODEL_DEPLOYMENTS.append(model_deployment)
+
+ model_name: str = model_deployment.model_name or model_deployment.name
+
+ try:
+ model_metadata: ModelMetadata = get_model_metadata(model_name)
+ deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
+ if model_deployment.name not in deployment_names:
+ if model_metadata.deployment_names is None:
+ model_metadata.deployment_names = []
+ model_metadata.deployment_names.append(model_deployment.name)
+ except ValueError:
+ raise ValueError(f"Model deployment {model_deployment.name} has no corresponding model metadata")
def register_model_deployments_from_path(path: str) -> None:
- global _name_to_model_deployment
hlog(f"Reading model deployments from {path}...")
with open(path, "r") as f:
raw = yaml.safe_load(f)
@@ -90,12 +123,63 @@ def register_model_deployments_from_path(path: str) -> None:
register_model_deployment(model_deployment)
-def maybe_register_model_deployments_from_base_path(base_path: str) -> None:
- """Register model deployments from prod_env/model_deployments.yaml"""
- path = os.path.join(base_path, MODEL_DEPLOYMENTS_FILE)
+def maybe_register_model_deployments_from_base_path(path: str) -> None:
+ """Register model deployments from yaml file if the path exists."""
if os.path.exists(path):
register_model_deployments_from_path(path)
-def get_model_deployment(name: str) -> Optional[ModelDeployment]:
- return _name_to_model_deployment.get(name)
+# ===================== UTIL FUNCTIONS ==================== #
+def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeployment:
+ register_deployments_if_not_already_registered()
+ if name not in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
+ raise ValueError(f"Model deployment {name} not found")
+ deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
+ if deployment.deprecated and warn_deprecated:
+ hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+ return deployment
+
+
+def get_model_deployments_by_host_organization(host_organization: str) -> List[str]:
+ """
+ Gets models by host organization.
+ Example: together => ["together/bloom", "together/t0pp", ...]
+ """
+ register_deployments_if_not_already_registered()
+ return [
+ deployment.name for deployment in ALL_MODEL_DEPLOYMENTS if deployment.host_organization == host_organization
+ ]
+
+
+def get_model_deployment_host_organization(name: str) -> str:
+ """
+ Extracts the host organization from the model deployment name.
+ Example: "huggingface/t5-11b" => "huggingface"
+ """
+ deployment: ModelDeployment = get_model_deployment(name)
+ return deployment.host_organization
+
+
+def get_metadata_for_deployment(deployment_name: str) -> ModelMetadata:
+ """
+ Given a deployment name, returns the corresponding model metadata.
+ """
+ deployment: ModelDeployment = get_model_deployment(deployment_name)
+ return get_model_metadata(deployment.model_name or deployment.name)
+
+
+def get_model_names_with_tokenizer(tokenizer_name: str) -> List[str]:
+ """Get all the name of the models with tokenizer `tokenizer_name`."""
+ register_deployments_if_not_already_registered()
+ deployments: List[ModelDeployment] = [
+ deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.tokenizer_name == tokenizer_name
+ ]
+ return [deployment.model_name or deployment.name for deployment in deployments]
+
+
+def register_deployments_if_not_already_registered() -> None:
+ global DEPLOYMENTS_REGISTERED
+ if not DEPLOYMENTS_REGISTERED:
+ path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_DEPLOYMENTS_FILE)
+ maybe_register_model_deployments_from_base_path(path)
+ DEPLOYMENTS_REGISTERED = True
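The reworked deployment registry resolves lookups lazily, triggering registration from the packaged model_deployments.yaml on first use. A hypothetical usage sketch (the deployment names are illustrative and must exist in the registry for the calls to succeed):

    from helm.benchmark.model_deployment_registry import (
        ModelDeployment,
        get_model_deployment,
        get_metadata_for_deployment,
        get_model_deployments_by_host_organization,
    )

    deployment: ModelDeployment = get_model_deployment("huggingface/gpt2", warn_deprecated=True)
    print(deployment.host_organization, deployment.engine)          # e.g. "huggingface", "gpt2"
    print(get_metadata_for_deployment("huggingface/gpt2").name)     # model name backing this deployment
    print(get_model_deployments_by_host_organization("together"))   # all deployments hosted by "together"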
diff --git a/src/helm/benchmark/model_metadata_registry.py b/src/helm/benchmark/model_metadata_registry.py
index e95c8a520b1..335c75c5b4b 100644
--- a/src/helm/benchmark/model_metadata_registry.py
+++ b/src/helm/benchmark/model_metadata_registry.py
@@ -1,52 +1,129 @@
import os
-from typing import Optional, List
+from typing import Dict, Optional, List
from dataclasses import dataclass, field
from datetime import date
+import importlib_resources as resources
import dacite
import yaml
-from helm.proxy.models import ALL_MODELS, MODEL_NAME_TO_MODEL, Model
+# Different modalities
+TEXT_MODEL_TAG: str = "TEXT_MODEL_TAG"
+IMAGE_MODEL_TAG: str = "IMAGE_MODEL_TAG"
+CODE_MODEL_TAG: str = "CODE_MODEL_TAG"
+EMBEDDING_MODEL_TAG: str = "EMBEDDING_MODEL_TAG"
-MODEL_METADATA_FILE = "model_metadata.yaml"
+# Some model APIs have limited functionalities
+FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "FULL_FUNCTIONALITY_TEXT_MODEL_TAG"
+LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG"
+# ChatML format
+CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
-@dataclass(frozen=True)
+# OpenAI Chat format
+OPENAI_CHATGPT_MODEL_TAG: str = "openai_chatgpt"
+
+# For Anthropic models
+ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
+ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
+
+# Models which emit garbage tokens when temperature=0.
+BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
+
+# Models that are used for ablations and fine-grained analyses.
+# These models are selected specifically because of their low marginal cost to evaluate.
+ABLATION_MODEL_TAG: str = "ABLATION_MODEL_TAG"
+
+# Some models (e.g., T5) have stripped newlines.
+# So we cannot use \n as a stop sequence for these models.
+NO_NEWLINES_TAG: str = "NO_NEWLINES_TAG"
+
+# Some models (e.g., UL2) require a prefix (e.g., [NLG]) in the
+# prompts to indicate the mode before doing inference.
+NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
+
+# Some models can follow instructions.
+INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
+
+# For Vision-language models (VLMs)
+VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
+
+
+CONFIG_PACKAGE = "helm.config"
+MODEL_METADATA_FILE: str = "model_metadata.yaml"
+METADATAS_REGISTERED: bool = False
+
+
+# Frozen is set to false as the model_deployment_registry.py file
+# might populate the deployment_names field.
+@dataclass(frozen=False)
class ModelMetadata:
name: str
- """Name of the model e.g. "meta/llama-2"."""
+ """Name of the model group (e.g., "openai/davinci"). This is the name of the model,
+ not the name of the deployment.
+ Usually formatted as "<creator_organization>/<engine>". Example: "ai21/j1-jumbo"."""
- creator_organization: Optional[str] = None
- """Organization that originally created the model (e.g. "meta")."""
+ creator_organization_name: str
+ """Name of the organization that created the model."""
- access: Optional[str] = None
- """How this model is available (e.g., limited).
+ display_name: str
+ """Name that is going to be displayed to the user (on the website, etc.)."""
- If there are multiple deployments, this should be the most permissive access across
- all deployments."""
+ description: str
+ """Description of the model, to be displayed on the website."""
- todo: bool = False
- """Whether we have yet to evaluate this model."""
+ access: str
+ """Description of the access level of the model. Should be one of the following:
+ - "open": the model is open-source and can be downloaded from the internet.
+ - "closed": not accessible
+ - "limited": accessible with an API key.
+ If there are multiple deployments, this should be the most permissive access across all deployments."""
- release_date: Optional[date] = None
- """When the model was released."""
+ release_date: date
+ """Release date of the model."""
- num_parameters: Optional[int] = None
- """The number of model parameters.
+ tags: List[str] = field(default_factory=list)
+ """Tags corresponding to the properties of the model."""
+ num_parameters: Optional[int] = None
+ """Number of parameters in the model.
This should be a string as the number of parameters is usually a round number (175B),
but we set it as an int for plotting purposes."""
- tags: List[str] = field(default_factory=list)
- """"""
+ deployment_names: Optional[List[str]] = None
+ """List of the model deployments for this model. Should at least contain one model deployment.
+ Refers to the field "name" in the ModelDeployment class. Defaults to a single model deployment
+ with the same name as the model."""
+
+ @property
+ def creator_organization(self) -> str:
+ """
+ Extracts the creator organization from the model name.
+ Example: 'ai21/j1-jumbo' => 'ai21'
+ This can be different from the hosting organization.
+ """
+ return self.name.split("/")[0]
+
+ @property
+ def engine(self) -> str:
+ """
+ Extracts the model engine from the model name.
+ Example: 'ai21/j1-jumbo' => 'j1-jumbo'
+ """
+ return self.name.split("/")[1]
@dataclass(frozen=True)
class ModelMetadataList:
- models: List[ModelMetadata]
+ models: List[ModelMetadata] = field(default_factory=list)
+ALL_MODELS_METADATA: List[ModelMetadata] = []
+MODEL_NAME_TO_MODEL_METADATA: Dict[str, ModelMetadata] = {model.name: model for model in ALL_MODELS_METADATA}
+
+
+# ===================== REGISTRATION FUNCTIONS ==================== #
def register_model_metadata_from_path(path: str) -> None:
"""Register model configurations from the given path."""
with open(path, "r") as f:
@@ -55,17 +132,77 @@ def register_model_metadata_from_path(path: str) -> None:
# serialization format for dates
model_metadata_list = dacite.from_dict(ModelMetadataList, raw)
for model_metadata in model_metadata_list.models:
- model = Model(
- group="none", # TODO: Group should be part of model deployment, not model
- name=model_metadata.name,
- tags=model_metadata.tags,
- )
- MODEL_NAME_TO_MODEL[model_metadata.name] = model
- ALL_MODELS.append(model)
-
-
-def maybe_register_model_metadata_from_base_path(base_path: str) -> None:
- """Register model metadata from prod_env/model_metadata.yaml"""
- path = os.path.join(base_path, MODEL_METADATA_FILE)
+ register_model_metadata(model_metadata)
+
+
+def register_model_metadata(model_metadata: ModelMetadata) -> None:
+ """Register a single model configuration."""
+ # hlog(f"Registered model metadata {model_metadata.name}")
+ ALL_MODELS_METADATA.append(model_metadata)
+ MODEL_NAME_TO_MODEL_METADATA[model_metadata.name] = model_metadata
+
+
+def maybe_register_model_metadata_from_base_path(path: str) -> None:
+ """Register model metadata from yaml file if the path exists."""
if os.path.exists(path):
register_model_metadata_from_path(path)
+
+
+# ===================== UTIL FUNCTIONS ==================== #
+def get_model_metadata(model_name: str) -> ModelMetadata:
+ """Get the `Model` given the name."""
+ register_metadatas_if_not_already_registered()
+ if model_name not in MODEL_NAME_TO_MODEL_METADATA:
+ raise ValueError(f"No model with name: {model_name}")
+
+ return MODEL_NAME_TO_MODEL_METADATA[model_name]
+
+
+def get_model_creator_organization(model_name: str) -> str:
+ """Get the model's group given the name."""
+ model: ModelMetadata = get_model_metadata(model_name)
+ return model.creator_organization
+
+
+def get_all_models() -> List[str]:
+ """Get all model names."""
+ register_metadatas_if_not_already_registered()
+ return list(MODEL_NAME_TO_MODEL_METADATA.keys())
+
+
+def get_models_by_creator_organization(organization: str) -> List[str]:
+ """
+ Gets models by creator organization.
+ Example: ai21 => ai21/j1-jumbo, ai21/j1-grande, ai21/j1-large.
+ """
+ register_metadatas_if_not_already_registered()
+ return [model.name for model in ALL_MODELS_METADATA if model.creator_organization == organization]
+
+
+def get_model_names_with_tag(tag: str) -> List[str]:
+ """Get all the name of the models with tag `tag`."""
+ register_metadatas_if_not_already_registered()
+ return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
+
+
+def get_all_text_models() -> List[str]:
+ """Get all text model names."""
+ return get_model_names_with_tag(TEXT_MODEL_TAG)
+
+
+def get_all_code_models() -> List[str]:
+ """Get all code model names."""
+ return get_model_names_with_tag(CODE_MODEL_TAG)
+
+
+def get_all_instruction_following_models() -> List[str]:
+ """Get all instruction-following model names."""
+ return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
+
+
+def register_metadatas_if_not_already_registered() -> None:
+ global METADATAS_REGISTERED
+ if not METADATAS_REGISTERED:
+ path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_METADATA_FILE)
+ maybe_register_model_metadata_from_base_path(path)
+ METADATAS_REGISTERED = True
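Like the deployment registry, the metadata registry registers lazily from the packaged model_metadata.yaml, and the tag helpers filter by model name. A hypothetical sketch of registering and querying a custom entry (the model name and field values are illustrative):

    from datetime import date

    from helm.benchmark.model_metadata_registry import (
        ModelMetadata,
        register_model_metadata,
        get_model_metadata,
        get_all_text_models,
        TEXT_MODEL_TAG,
        LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
    )

    register_model_metadata(
        ModelMetadata(
            name="example-org/example-model",
            creator_organization_name="Example Org",
            display_name="Example Model",
            description="Illustrative model metadata entry.",
            access="limited",
            release_date=date(2023, 1, 1),
            tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
        )
    )
    print(get_model_metadata("example-org/example-model").creator_organization)  # "example-org"
    print("example-org/example-model" in get_all_text_models())                  # True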
diff --git a/src/helm/benchmark/presentation/contamination.py b/src/helm/benchmark/presentation/contamination.py
index dd6832d0a9d..0e876bc42d9 100644
--- a/src/helm/benchmark/presentation/contamination.py
+++ b/src/helm/benchmark/presentation/contamination.py
@@ -2,10 +2,10 @@
from typing import List, Optional
import dacite
import importlib_resources as resources
-import yaml # type: ignore
+import yaml
from helm.common.hierarchical_logger import htrack, hlog
-from helm.proxy.models import MODEL_NAME_TO_MODEL
+from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
from helm.benchmark.presentation.schema import Schema
@@ -70,7 +70,7 @@ def validate_contamination(contamination: Contamination, schema: Schema):
"""Make sure models and groups in contamination are defined according to `schema`."""
for point in contamination.points:
for model in point.models:
- if model not in MODEL_NAME_TO_MODEL:
+ if model not in MODEL_NAME_TO_MODEL_METADATA:
hlog(f"WARNING: model {model} not defined in schema")
for group in point.groups:
if group not in schema.name_to_run_group:
diff --git a/src/helm/benchmark/presentation/run_display.py b/src/helm/benchmark/presentation/run_display.py
index fce259add11..7f6b3fd03d3 100644
--- a/src/helm/benchmark/presentation/run_display.py
+++ b/src/helm/benchmark/presentation/run_display.py
@@ -76,16 +76,14 @@ class DisplayRequest:
most relevant request e.g. the request for the chosen choice for multiple choice questions."""
-def _read_scenario_state(run_path: str) -> ScenarioState:
- scenario_state_path: str = os.path.join(run_path, "scenario_state.json")
+def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
if not os.path.exists(scenario_state_path):
raise ValueError(f"Could not load ScenarioState from {scenario_state_path}")
with open(scenario_state_path) as f:
return from_json(f.read(), ScenarioState)
-def _read_per_instance_stats(run_path: str) -> List[PerInstanceStats]:
- per_instance_stats_path: str = os.path.join(run_path, "per_instance_stats.json")
+def _read_per_instance_stats(per_instance_stats_path: str) -> List[PerInstanceStats]:
if not os.path.exists(per_instance_stats_path):
raise ValueError(f"Could not load PerInstanceStats from {per_instance_stats_path}")
with open(per_instance_stats_path) as f:
@@ -168,16 +166,35 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
display_predictions_file_path = os.path.join(run_path, _DISPLAY_PREDICTIONS_JSON_FILE_NAME)
display_requests_file_path = os.path.join(run_path, _DISPLAY_REQUESTS_JSON_FILE_NAME)
+ scenario_state_path = os.path.join(run_path, "scenario_state.json")
+ per_instance_stats_path = os.path.join(run_path, "per_instance_stats.json")
+
if (
skip_completed
and os.path.exists(instances_file_path)
and os.path.exists(display_predictions_file_path)
and os.path.exists(display_requests_file_path)
):
- hlog(f"Skipping writing display JSON for run {run_spec.name} because all output display JSON files exist.")
+ hlog(
+ f"Skipping writing display JSON for run {run_spec.name} "
+ "because all output display JSON files already exist."
+ )
+ return
+ elif not os.path.exists(scenario_state_path):
+ hlog(
+ f"Skipping writing display JSON for run {run_spec.name} because "
+ f"the scenario state JSON file does not exist at {scenario_state_path}"
+ )
return
- scenario_state = _read_scenario_state(run_path)
- per_instance_stats = _read_per_instance_stats(run_path)
+ elif not os.path.exists(per_instance_stats_path):
+ hlog(
+ f"Skipping writing display JSON for run {run_spec.name} because "
+ f"the per instance stats JSON file does not exist at {per_instance_stats_path}"
+ )
+ return
+
+ scenario_state = _read_scenario_state(scenario_state_path)
+ per_instance_stats = _read_per_instance_stats(per_instance_stats_path)
metric_names = _get_metric_names_for_groups(run_spec.groups, schema)
diff --git a/src/helm/benchmark/presentation/run_specs.conf b/src/helm/benchmark/presentation/run_specs.conf
index a1009c29e35..5b17b35deea 100644
--- a/src/helm/benchmark/presentation/run_specs.conf
+++ b/src/helm/benchmark/presentation/run_specs.conf
@@ -483,36 +483,36 @@ entries: [
{description: "lsat_qa:model=text_code,task=assignment", priority: 3}
{description: "lsat_qa:model=text_code,task=miscellaneous", priority: 3}
- {description: "lextreme:subset=brazilian_court_decisions_judgment,model=all", priority: 5}
- {description: "lextreme:subset=brazilian_court_decisions_unanimity,model=all", priority: 5}
- {description: "lextreme:subset=german_argument_mining,model=all", priority: 5}
- {description: "lextreme:subset=greek_legal_code_chapter,model=all", priority: 5}
- {description: "lextreme:subset=greek_legal_code_subject,model=all", priority: 5}
- {description: "lextreme:subset=greek_legal_code_volume,model=all", priority: 5}
- {description: "lextreme:subset=swiss_judgment_prediction,model=all", priority: 5}
- {description: "lextreme:subset=online_terms_of_service_unfairness_levels,model=all", priority: 5}
- {description: "lextreme:subset=online_terms_of_service_clause_topics,model=all", priority: 5}
- {description: "lextreme:subset=covid19_emergency_event,model=all", priority: 5}
- {description: "lextreme:subset=multi_eurlex_level_1,model=all", priority: 5}
- {description: "lextreme:subset=multi_eurlex_level_2,model=all", priority: 5}
- {description: "lextreme:subset=multi_eurlex_level_3,model=all", priority: 5}
- {description: "lextreme:subset=greek_legal_ner,model=all", priority: 5}
- {description: "lextreme:subset=legalnero,model=all", priority: 5}
- {description: "lextreme:subset=lener_br,model=all", priority: 5}
- {description: "lextreme:subset=mapa_coarse,model=all", priority: 5}
- {description: "lextreme:subset=mapa_fine,model=all", priority: 5}
-
- {description: "lex_glue:subset=ecthr_a,model=all", priority: 3}
- {description: "lex_glue:subset=ecthr_b,model=all", priority: 3}
- {description: "lex_glue:subset=scotus,model=all", priority: 3}
- {description: "lex_glue:subset=eurlex,model=all", priority: 3}
- {description: "lex_glue:subset=ledgar,model=all", priority: 3}
- {description: "lex_glue:subset=unfair_tos,model=all", priority: 3}
- {description: "lex_glue:subset=case_hold,model=all", priority: 3}
-
- {description: "billsum_legal_summarization:model=all", priority: 3},
- {description: "multilexsum_legal_summarization:model=all", priority: 3},
- {description: "eurlexsum_legal_summarization:model=all", priority: 3},
+ {description: "lextreme:subset=brazilian_court_decisions_judgment,model=text", priority: 5}
+ {description: "lextreme:subset=brazilian_court_decisions_unanimity,model=text", priority: 5}
+ {description: "lextreme:subset=german_argument_mining,model=text", priority: 5}
+ {description: "lextreme:subset=greek_legal_code_chapter,model=text", priority: 5}
+ {description: "lextreme:subset=greek_legal_code_subject,model=text", priority: 5}
+ {description: "lextreme:subset=greek_legal_code_volume,model=text", priority: 5}
+ {description: "lextreme:subset=swiss_judgment_prediction,model=text", priority: 5}
+ {description: "lextreme:subset=online_terms_of_service_unfairness_levels,model=text", priority: 5}
+ {description: "lextreme:subset=online_terms_of_service_clause_topics,model=text", priority: 5}
+ {description: "lextreme:subset=covid19_emergency_event,model=text", priority: 5}
+ {description: "lextreme:subset=multi_eurlex_level_1,model=text", priority: 5}
+ {description: "lextreme:subset=multi_eurlex_level_2,model=text", priority: 5}
+ {description: "lextreme:subset=multi_eurlex_level_3,model=text", priority: 5}
+ {description: "lextreme:subset=greek_legal_ner,model=text", priority: 5}
+ {description: "lextreme:subset=legalnero,model=text", priority: 5}
+ {description: "lextreme:subset=lener_br,model=text", priority: 5}
+ {description: "lextreme:subset=mapa_coarse,model=text", priority: 5}
+ {description: "lextreme:subset=mapa_fine,model=text", priority: 5}
+
+ {description: "lex_glue:subset=ecthr_a,model=text", priority: 3}
+ {description: "lex_glue:subset=ecthr_b,model=text", priority: 3}
+ {description: "lex_glue:subset=scotus,model=text", priority: 3}
+ {description: "lex_glue:subset=eurlex,model=text", priority: 3}
+ {description: "lex_glue:subset=ledgar,model=text", priority: 3}
+ {description: "lex_glue:subset=unfair_tos,model=text", priority: 3}
+ {description: "lex_glue:subset=case_hold,model=text", priority: 3}
+
+ {description: "billsum_legal_summarization:model=text", priority: 3},
+ {description: "multilexsum_legal_summarization:model=text", priority: 3},
+ {description: "eurlexsum_legal_summarization:model=text", priority: 3},
# MedQA
{description: "med_qa:model=biomedical", priority: 2}
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_10.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_10.conf
new file mode 100644
index 00000000000..15116e38512
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_10.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,max_eval_instances=10", priority: 1}
+ {description: "narrative_qa:model=text,max_eval_instances=10", priority: 2}
+ {description: "quac:model=text,max_eval_instances=10", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,max_eval_instances=10", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,max_eval_instances=10", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,max_eval_instances=10", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,max_eval_instances=10", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,max_eval_instances=10", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,max_eval_instances=2", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,max_eval_instances=2", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,max_eval_instances=2", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,max_eval_instances=2", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,max_eval_instances=2", priority: 2}
+
+ ##### Information Retrieval #####
+ # Scenarios: MS Marco (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,track=regular,valid_topk=30,max_eval_instances=10", priority: 2}
+ {description: "msmarco:model=full_functionality_text,track=trec,valid_topk=30,max_eval_instances=10", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu,max_eval_instances=10", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu,max_eval_instances=10", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,max_eval_instances=10", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=banking_77,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=overruling,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,max_eval_instances=1", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,max_eval_instances=1", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,max_eval_instances=1", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,max_eval_instances=2", priority: 2}
+
+]
\ No newline at end of file
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_100.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_100.conf
new file mode 100644
index 00000000000..2cfd32102dd
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_100.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,max_eval_instances=100", priority: 1}
+ {description: "narrative_qa:model=text,max_eval_instances=100", priority: 2}
+ {description: "quac:model=text,max_eval_instances=100", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,max_eval_instances=100", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,max_eval_instances=100", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,max_eval_instances=100", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,max_eval_instances=100", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,max_eval_instances=100", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,max_eval_instances=20", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,max_eval_instances=20", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,max_eval_instances=20", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,max_eval_instances=20", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,max_eval_instances=20", priority: 2}
+
+ ##### Information Retrieval #####
+ # Scenarios: MS Marco (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,track=regular,valid_topk=30,max_eval_instances=100", priority: 2}
+ {description: "msmarco:model=full_functionality_text,track=trec,valid_topk=30,max_eval_instances=100", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu,max_eval_instances=100", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu,max_eval_instances=100", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,max_eval_instances=100", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=banking_77,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=overruling,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,max_eval_instances=9", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,max_eval_instances=10", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,max_eval_instances=11", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,max_eval_instances=11", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,max_eval_instances=12", priority: 2}
+
+]
\ No newline at end of file
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_1000.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_1000.conf
new file mode 100644
index 00000000000..fd2cac58a07
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_1000.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,max_eval_instances=1000", priority: 1}
+ {description: "narrative_qa:model=text,max_eval_instances=1000", priority: 2}
+ {description: "quac:model=text,max_eval_instances=1000", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,max_eval_instances=1000", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,max_eval_instances=1000", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,max_eval_instances=1000", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,max_eval_instances=1000", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,max_eval_instances=1000", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,max_eval_instances=200", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,max_eval_instances=200", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,max_eval_instances=200", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,max_eval_instances=200", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,max_eval_instances=200", priority: 2}
+
+ ##### Information Retrieval #####
+ # Scenarios: MS Marco (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,track=regular,valid_topk=30,max_eval_instances=1000", priority: 2}
+ {description: "msmarco:model=full_functionality_text,track=trec,valid_topk=30,max_eval_instances=1000", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu,max_eval_instances=1000", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu,max_eval_instances=1000", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,max_eval_instances=1000", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=banking_77,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=overruling,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,max_eval_instances=90", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,max_eval_instances=100", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,max_eval_instances=110", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,max_eval_instances=110", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,max_eval_instances=120", priority: 2}
+
+]
\ No newline at end of file
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_20.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_20.conf
new file mode 100644
index 00000000000..c9e4c2e14fa
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_20.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,max_eval_instances=20", priority: 1}
+ {description: "narrative_qa:model=text,max_eval_instances=20", priority: 2}
+ {description: "quac:model=text,max_eval_instances=20", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,max_eval_instances=20", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,max_eval_instances=20", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,max_eval_instances=20", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,max_eval_instances=20", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,max_eval_instances=20", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,max_eval_instances=4", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,max_eval_instances=4", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,max_eval_instances=4", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,max_eval_instances=4", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,max_eval_instances=4", priority: 2}
+
+ ##### Information Retrieval #####
+  # Scenarios: MS MARCO (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,track=regular,valid_topk=30,max_eval_instances=20", priority: 2}
+ {description: "msmarco:model=full_functionality_text,track=trec,valid_topk=30,max_eval_instances=20", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu,max_eval_instances=20", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu,max_eval_instances=20", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,max_eval_instances=20", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=banking_77,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=overruling,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,max_eval_instances=2", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,max_eval_instances=1", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,max_eval_instances=1", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,max_eval_instances=2", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,max_eval_instances=2", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,max_eval_instances=3", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,max_eval_instances=3", priority: 2}
+
+]
\ No newline at end of file
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_50.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_50.conf
new file mode 100644
index 00000000000..faf8e0de4e6
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_50.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,max_eval_instances=50", priority: 1}
+ {description: "narrative_qa:model=text,max_eval_instances=50", priority: 2}
+ {description: "quac:model=text,max_eval_instances=50", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,max_eval_instances=50", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,max_eval_instances=50", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,max_eval_instances=50", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,max_eval_instances=50", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,max_eval_instances=50", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,max_eval_instances=10", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,max_eval_instances=10", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,max_eval_instances=10", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,max_eval_instances=10", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,max_eval_instances=10", priority: 2}
+
+ ##### Information Retrieval #####
+  # Scenarios: MS MARCO (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,track=regular,valid_topk=30,max_eval_instances=50", priority: 2}
+ {description: "msmarco:model=full_functionality_text,track=trec,valid_topk=30,max_eval_instances=50", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu,max_eval_instances=50", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu,max_eval_instances=50", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,max_eval_instances=50", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=banking_77,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=overruling,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,max_eval_instances=4", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,max_eval_instances=5", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,max_eval_instances=5", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,max_eval_instances=4", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,max_eval_instances=4", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,max_eval_instances=4", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,max_eval_instances=4", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,max_eval_instances=4", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,max_eval_instances=5", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,max_eval_instances=5", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,max_eval_instances=5", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,max_eval_instances=5", priority: 2}
+
+]
\ No newline at end of file
diff --git a/src/helm/benchmark/presentation/run_specs_core_scenarios_all.conf b/src/helm/benchmark/presentation/run_specs_core_scenarios_all.conf
new file mode 100644
index 00000000000..dbb0e9b85ab
--- /dev/null
+++ b/src/helm/benchmark/presentation/run_specs_core_scenarios_all.conf
@@ -0,0 +1,77 @@
+# Main `RunSpec`s for benchmarking the core scenarios.
+
+entries: [
+
+ ## Reading comprehension
+
+ {description: "boolq:model=text,data_augmentation=canonical", priority: 1}
+ {description: "narrative_qa:model=text,data_augmentation=canonical", priority: 2}
+ {description: "quac:model=text,data_augmentation=canonical", priority: 1}
+
+ ## Reading comprehension and closedbook QA variants
+
+ {description: "natural_qa:model=text,mode=openbook_longans,data_augmentation=canonical", priority: 1}
+ {description: "natural_qa:model=text,mode=closedbook,data_augmentation=canonical", priority: 1}
+
+ ## Closed-book QA with multiple choice
+
+ # Adaptation method is set to ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED and echo=True
+ {description: "commonsense:model=full_functionality_text,dataset=hellaswag,method=multiple_choice_separate_original,data_augmentation=canonical", priority: 1}
+ {description: "commonsense:model=full_functionality_text,dataset=openbookqa,method=multiple_choice_separate_calibrated,data_augmentation=canonical", priority: 2}
+ {description: "truthful_qa:model=text,task=mc_single,data_augmentation=canonical", priority: 1}
+
+ {description: "mmlu:model=text,subject=abstract_algebra,data_augmentation=canonical", priority: 2}
+ {description: "mmlu:model=text,subject=college_chemistry,data_augmentation=canonical", priority: 2}
+ {description: "mmlu:model=text,subject=computer_security,data_augmentation=canonical", priority: 2}
+ {description: "mmlu:model=text,subject=econometrics,data_augmentation=canonical", priority: 2}
+ {description: "mmlu:model=text,subject=us_foreign_policy,data_augmentation=canonical", priority: 2}
+
+ ##### Information Retrieval #####
+  # Scenarios: MS MARCO (Regular), MS MARCO (TREC)
+
+ {description: "msmarco:model=full_functionality_text,data_augmentation=canonical,track=regular,valid_topk=30", priority: 2}
+ {description: "msmarco:model=full_functionality_text,data_augmentation=canonical,track=trec,valid_topk=30", priority: 1}
+
+ ##### Summarization #####
+ # Scenarios: XSUM, CNN/DM
+
+ {description: "summarization_cnndm:model=text,temperature=0.3,device=cpu", priority: 1}
+ {description: "summarization_xsum_sampled:model=text,temperature=0.3,device=cpu", priority: 1}
+
+
+ ##### Sentiment Analysis #####
+ # Scenarios: IMDB
+
+ {description: "imdb:model=text,data_augmentation=canonical", priority: 1}
+
+
+ ##### (Miscellaneous) Text Classification #####
+ # Scenarios: RAFT
+
+ {description: "raft:subset=ade_corpus_v2,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=banking_77,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=neurips_impact_statement_risks,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=one_stop_english,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=overruling,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=semiconductor_org_types,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=tweet_eval_hate,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=twitter_complaints,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=systematic_review_inclusion,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=tai_safety_research,model=text,data_augmentation=canonical", priority: 2}
+ {description: "raft:subset=terms_of_service,model=text,data_augmentation=canonical", priority: 2}
+
+
+ ##### Toxicity Detection #####
+ # Scenarios: CivilComments
+
+ {description: "civil_comments:model=text,demographic=all,data_augmentation=canonical", priority: 1}
+ {description: "civil_comments:model=text,demographic=male,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=female,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=LGBTQ,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=christian,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=muslim,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=other_religions,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=black,data_augmentation=canonical", priority: 2}
+ {description: "civil_comments:model=text,demographic=white,data_augmentation=canonical", priority: 2}
+
+]
\ No newline at end of file
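
The five new `run_specs_core_scenarios_*.conf` files differ only in their `max_eval_instances` budgets; each entry's description string is parsed into one or more `RunSpec`s. As a quick illustration, a minimal sketch of loading one of these files programmatically, mirroring the parametrized test added in `test_run_entry.py` further down (the import locations are assumed from elsewhere in this diff, and the priority filter is illustrative):

# Sketch: expand one of the new conf files into RunSpecs. Not part of the patch itself.
from helm.common.object_spec import parse_object_spec
from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.presentation.run_entry import read_run_entries
from helm.benchmark.run_specs import construct_run_specs

register_helm_configurations()  # registers model metadata and deployments first
run_entries = read_run_entries(["src/helm/benchmark/presentation/run_specs_core_scenarios_50.conf"])
for entry in run_entries.entries:
    if entry.priority > 2:  # illustrative priority cutoff; all entries above are priority 1 or 2
        continue
    for run_spec in construct_run_specs(parse_object_spec(entry.description)):
        print(run_spec.name)
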
diff --git a/src/helm/benchmark/presentation/run_specs_vhelm.conf b/src/helm/benchmark/presentation/run_specs_vhelm.conf
index a0b8df7a036..1ae33e43631 100644
--- a/src/helm/benchmark/presentation/run_specs_vhelm.conf
+++ b/src/helm/benchmark/presentation/run_specs_vhelm.conf
@@ -4,4 +4,5 @@ entries: [
################################################# Main experiments #################################################
{description: "vqa:model=vlm", priority: 1, groups: ["vqa_base"]}
+ {description: "viz_wiz:model=vlm", priority: 1, groups: ["vqa_base"]}
]
diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index c1208ee45b5..3a0b7877b8c 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -3,7 +3,7 @@
from typing import List, Optional, Dict
import dacite
import mako.template
-import yaml # type: ignore
+import yaml
import importlib_resources as resources
from helm.common.general import hlog
@@ -207,9 +207,12 @@ class RunGroup(Field):
# Which adapter_spec fields we should preserve when displaying methods for this group
# When we are constructing a table where the rows are methods, what constitutes a "method" is given by the set of
- # adapter keys. By default, this should just be "model" (e.g., BLOOM), where details like "num_train_instances" are
- # "marginalized out". However, for ablations, we want to include both "model" and "num_train_instances".
- adapter_keys_shown: List[str] = field(default_factory=lambda: ["model"])
+ # adapter keys. By default, this should just be "model_deployment" (e.g., BLOOM), where details like
+ # "num_train_instances" are "marginalized out". However, for ablations, we want to include both "model_deployment"
+ # and "num_train_instances".
+    # NOTE: "model" is kept for backward compatibility reasons.
+ # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
+ adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])
@dataclass
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index ce5cf3cd1ed..4d873338015 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -57,7 +57,10 @@
CONTAMINATION_STYLES,
CONTAMINATION_LEVEL_STRONG,
)
+from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.presentation.run_display import write_run_display_json
+from helm.benchmark.model_deployment_registry import get_metadata_for_deployment
+from helm.benchmark.model_metadata_registry import ModelMetadata
OVERLAP_N_COUNT = 13
@@ -165,7 +168,7 @@ def get_coarse_adapter_spec(
# Create a new adapter_spec, keeping only the model and the keys in adapter_keys_shown
adapter_spec_kwargs = {key: adapter_spec.__dict__[key] for key in adapter_keys_shown}
- return AdapterSpec(**adapter_spec_kwargs) # type: ignore
+ return AdapterSpec(**adapter_spec_kwargs)
def get_method_display_name(model_display_name: Optional[str], info: Dict[str, Any]) -> str:
@@ -178,6 +181,8 @@ def get_method_display_name(model_display_name: Optional[str], info: Dict[str, A
info = dict(info)
if "model" in info:
del info["model"]
+ if "model_deployment" in info:
+ del info["model_deployment"]
return (model_display_name or "???") + (f" [{dict_to_str(info)}]" if len(info) > 0 else "")
@@ -363,13 +368,8 @@ def read_runs_for_suite(self, suite, run_suite_path):
hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
continue
run_path: str = os.path.join(run_suite_path, run_dir_name)
- self.runs.append(self.read_run(run_path))
-
- # For each group (e.g., natural_qa), map
- # (i) scenario spec (e.g., subject=philosophy) [optional] and
- # (ii) adapter spec (e.g., model = openai/davinci)
- # to list of runs
- for run in self.runs:
+ run = self.read_run(run_path)
+ self.runs.append(run)
if run.run_spec.name in self.runs_to_run_suites:
hlog(
f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
@@ -377,6 +377,12 @@ def read_runs_for_suite(self, suite, run_suite_path):
)
self.runs_to_run_suites[run.run_spec.name] = suite
+ def group_runs(self):
+ # For each group (e.g., natural_qa), map
+ # (i) scenario spec (e.g., subject=philosophy) [optional] and
+ # (ii) adapter spec (e.g., model = openai/davinci)
+ # to list of runs
+ for run in self.runs:
scenario_spec = run.run_spec.scenario_spec
adapter_spec = run.run_spec.adapter_spec
for group_name in run.run_spec.groups:
@@ -564,12 +570,12 @@ def write_cost_report(self):
# TODO: move to write_executive_summary()
models_to_costs: Dict[str, Dict[str]] = defaultdict(lambda: defaultdict(int))
for run in self.runs:
- model: str = run.run_spec.adapter_spec.model
+ deployment: str = run.run_spec.adapter_spec.model_deployment
for stat in run.stats:
stat_name = stat.name.name
if stat_name in Summarizer.COST_REPORT_FIELDS and not stat.name.split:
- models_to_costs[model][stat_name] += stat.sum
+ models_to_costs[deployment][stat_name] += stat.sum
# Do a second pass to add up the total number of tokens
for costs in models_to_costs.values():
@@ -660,7 +666,7 @@ def get_cell(stats: List[Stat], compute_mean: bool = False, compute_sum: bool =
for subgroup in self.expand_subgroups(group):
for adapter_spec, runs in self.group_adapter_to_runs[subgroup.name].items():
filtered_runs = self.filter_runs_by_visibility(runs, subgroup)
- models.add(adapter_spec.model)
+ models.add(adapter_spec.model_deployment)
methods.add(adapter_spec.method)
for run in filtered_runs:
num_instances.extend(get_all_stats_by_name(run.stats, "num_instances"))
@@ -869,33 +875,28 @@ def run_spec_names_to_url(run_spec_names: List[str]) -> str:
model_order = [model.name for model in self.schema.models]
def _adapter_spec_sort_key(spec):
- index = model_order.index(spec.model) if spec.model in model_order else -1
- return (index, spec.model)
+ index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
+ return (index, spec.model_deployment)
adapter_specs = list(sorted(adapter_specs, key=_adapter_spec_sort_key))
# Pull out only the keys of the method adapter_spec that is needed to
# uniquely identify the method.
- infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model"])
+ infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model_deployment", "model"])
assert len(adapter_specs) == len(infos), [adapter_specs, infos]
# Populate the contents of the table
rows = []
for adapter_spec, info in zip(adapter_specs, infos):
- model_name: str = adapter_spec.model
-
- # Get the model display name from the schema.
- # Fall back to using the model name as the model display name if the model is not
- # defined in the schema.
- model_display_name = (
- self.schema.name_to_model[model_name].display_name
- if model_name in self.schema.name_to_model
- else model_name
+ deployment: str = (
+ adapter_spec.model_deployment if len(adapter_spec.model_deployment) > 0 else adapter_spec.model
)
+ model_metadata: ModelMetadata = get_metadata_for_deployment(deployment)
+ model_name: str = model_metadata.name
runs = adapter_to_runs[adapter_spec]
- display_name = get_method_display_name(model_display_name, info)
+ display_name = get_method_display_name(model_metadata.display_name, info)
# Link to all the runs under this model
if link_to_runs:
@@ -1254,6 +1255,7 @@ def symlink_latest(self) -> None:
def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
"""Run the entire summarization pipeline pipeline."""
self.read_runs()
+ self.group_runs()
self.check_metrics_defined()
self.write_run_display_json(skip_completed)
@@ -1335,6 +1337,8 @@ def main():
else:
raise ValueError("Exactly one of --release or --suite must be specified.")
+ register_helm_configurations()
+
# Output JSON files summarizing the benchmark results which will be loaded in the web interface
summarizer = Summarizer(
release=release,
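
With `summarize.py` now keyed on `adapter_spec.model_deployment`, display names are resolved through the deployment registry rather than the schema's model list. A hedged sketch of that lookup in isolation (the deployment name is illustrative, taken from a docstring example later in this diff):

# Sketch of the deployment -> model metadata lookup that summarize.py now relies on.
# "together/llama-7b" is an illustrative deployment name, not part of the patch.
from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.model_deployment_registry import get_metadata_for_deployment
from helm.benchmark.model_metadata_registry import ModelMetadata

register_helm_configurations()
metadata: ModelMetadata = get_metadata_for_deployment("together/llama-7b")
print(metadata.name, metadata.display_name)  # canonical model name and its display name
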
diff --git a/src/helm/benchmark/presentation/test_run_entry.py b/src/helm/benchmark/presentation/test_run_entry.py
index 68d33424d81..86a3b53afc2 100644
--- a/src/helm/benchmark/presentation/test_run_entry.py
+++ b/src/helm/benchmark/presentation/test_run_entry.py
@@ -1,4 +1,5 @@
import os
+import pytest
from helm.common.object_spec import parse_object_spec
from helm.benchmark.presentation.run_entry import read_run_entries
@@ -6,11 +7,16 @@
from helm.benchmark import vlm_run_specs # noqa
-def test_read_all_specs():
- """Read all the run entries and make sure they parse and we can instantiate them."""
+def list_fnames():
base_path = os.path.dirname(__file__)
- for fname in os.listdir(base_path):
- if fname.endswith(".conf"):
- run_entries = read_run_entries([os.path.join(base_path, fname)])
- for entry in run_entries.entries:
- construct_run_specs(parse_object_spec(entry.description))
+ return [os.path.join(base_path, fname) for fname in os.listdir(base_path) if fname.endswith(".conf")]
+
+
+class TestRunEntry:
+ """Read all the run entries and make sure they parse and we can instantiate them."""
+
+ @pytest.mark.parametrize("fname", list_fnames())
+ def test_read_all_specs(self, fname: str):
+ run_entries = read_run_entries([fname])
+ for entry in run_entries.entries:
+ construct_run_specs(parse_object_spec(entry.description))
diff --git a/src/helm/benchmark/run.py b/src/helm/benchmark/run.py
index 9e358df850e..cdf280d4364 100644
--- a/src/helm/benchmark/run.py
+++ b/src/helm/benchmark/run.py
@@ -10,11 +10,11 @@
from helm.common.hierarchical_logger import hlog, htrack, htrack_block
from helm.common.authentication import Authentication
from helm.common.object_spec import parse_object_spec, get_class_by_name
-from helm.proxy.clients.remote_model_registry import check_and_register_remote_model
from helm.proxy.services.remote_service import create_authentication, add_service_args
from helm.benchmark.model_metadata_registry import register_model_metadata_from_path
from helm.benchmark.model_deployment_registry import register_model_deployments_from_path
+from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark import vlm_run_specs # noqa
from .executor import ExecutionSpec
@@ -39,7 +39,7 @@ def run_entries_to_run_specs(
for run_spec in construct_run_specs(parse_object_spec(entry.description)):
# Filter by models
- if models_to_run and run_spec.adapter_spec.model not in models_to_run:
+ if models_to_run and run_spec.adapter_spec.model_deployment not in models_to_run:
continue
# Filter by groups
@@ -239,13 +239,6 @@ def main():
default=[],
help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
)
- parser.add_argument(
- "--enable-remote-models",
- nargs="+",
- default=[],
- help="Experimental: Enable remote service models that are not available on the client. "
- "The client will use RemoteWindowService for windowing.",
- )
parser.add_argument(
"--runner-class-name",
type=str,
@@ -277,9 +270,6 @@ def main():
for model_deployment_paths in args.model_deployment_paths:
register_model_deployments_from_path(model_deployment_paths)
- if args.server_url and args.enable_remote_models:
- check_and_register_remote_model(args.server_url, args.enable_remote_models)
-
run_entries: List[RunEntry] = []
if args.conf_paths:
run_entries.extend(read_run_entries(args.conf_paths).entries)
@@ -288,6 +278,8 @@ def main():
[RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
)
+ register_helm_configurations()
+
run_specs = run_entries_to_run_specs(
run_entries=run_entries,
max_eval_instances=args.max_eval_instances,
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
index a664b2893ca..d7ada38f0ab 100644
--- a/src/helm/benchmark/run_expander.py
+++ b/src/helm/benchmark/run_expander.py
@@ -3,7 +3,7 @@
from dataclasses import replace
from typing import Any, List, Dict, Optional, Tuple, Type
-from helm.proxy.models import (
+from helm.benchmark.model_metadata_registry import (
get_all_instruction_following_models,
get_all_code_models,
get_all_models,
@@ -11,16 +11,10 @@
get_model_names_with_tag,
FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- AI21_TOKENIZER_TAG,
- COHERE_TOKENIZER_TAG,
- OPT_TOKENIZER_TAG,
- GPTJ_TOKENIZER_TAG,
- GPTNEO_TOKENIZER_TAG,
- GPT4_TOKENIZER_TAG,
ABLATION_MODEL_TAG,
VISION_LANGUAGE_MODEL_TAG,
)
+from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
from .runner import RunSpec
from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
from .augmentations.perturbation import PerturbationSpec
@@ -355,10 +349,6 @@ def values_dict(self):
"code": get_all_code_models(),
"instruction_following": get_all_instruction_following_models(),
"limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
- "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
- "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
- "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
- "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
"summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
"biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
"interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
@@ -388,6 +378,13 @@ def values_dict(self):
return values_dict
+class ModelDeploymentRunExpander(ReplaceValueRunExpander):
+ """For overriding model deployment"""
+
+ name = "model_deployment"
+ values_dict: Dict[str, List[Any]] = {}
+
+
############################################################
@@ -880,18 +877,18 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
"huggingface/santacoder": ["bigcode/santacoder"],
"huggingface/starcoder": ["bigcode/starcoder"],
}
- model_tags_and_tokenizers = [
- (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
- (AI21_TOKENIZER_TAG, "ai21/j1"),
- (COHERE_TOKENIZER_TAG, "cohere/cohere"),
- (OPT_TOKENIZER_TAG, "meta/opt"),
- (GPTJ_TOKENIZER_TAG, "eleutherai/gptj"),
- (GPT4_TOKENIZER_TAG, "openai/cl100k_base"),
- (GPTNEO_TOKENIZER_TAG, "eleutherai/gptneox"),
+ list_tokenizers = [
+ "huggingface/gpt2",
+ "ai21/j1",
+ "cohere/cohere",
+ "meta/opt",
+ "eleutherai/gptj",
+ "openai/cl100k_base",
+ "eleutherai/gptneox",
]
- for model_tag, tokenizer in model_tags_and_tokenizers:
- for model in get_model_names_with_tag(model_tag):
- model_to_tokenizer_mapping[model] = [tokenizer]
+ for tokenizer_name in list_tokenizers:
+ for model in get_model_names_with_tokenizer(tokenizer_name):
+ model_to_tokenizer_mapping[model] = [tokenizer_name]
# tokenizer=default will map to using the right tokenizer for a given model.
values_dict = {"default": model_to_tokenizer_mapping}
@@ -907,10 +904,10 @@ def __init__(self, value):
self.all_values = [value]
def expand(self, run_spec: RunSpec) -> List[RunSpec]:
- # Find right tokenizer given model.
+ # Find right tokenizer given model deployment name.
if isinstance(self.all_values, dict):
- model: str = run_spec.adapter_spec.model
- self.values = self.all_values[model] if model in self.all_values else []
+ deployment: str = run_spec.adapter_spec.model_deployment
+ self.values = self.all_values[deployment] if deployment in self.all_values else []
else:
self.values = self.all_values
return super().expand(run_spec)
@@ -1114,6 +1111,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
MaxEvalInstancesRunExpander,
NumOutputsRunExpander,
ModelRunExpander,
+ ModelDeploymentRunExpander,
DataAugmentationRunExpander,
TokenizerRunExpander,
NumPromptTokensRunExpander,
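
The new `ModelDeploymentRunExpander` is a `ReplaceValueRunExpander` with an empty `values_dict`, so a run description can pin an explicit deployment instead of relying on the default resolution. A minimal sketch (the scenario and deployment names are illustrative, not taken from the patch):

# Sketch: overriding the deployment directly in a run description via the new expander.
from helm.common.object_spec import parse_object_spec
from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.run_specs import construct_run_specs

register_helm_configurations()
for run_spec in construct_run_specs(parse_object_spec("boolq:model_deployment=together/llama-7b")):
    # The model field is inferred from the deployment (see run_specs.py below).
    print(run_spec.adapter_spec.model, run_spec.adapter_spec.model_deployment)
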
diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py
index 76f6ae50292..69bdd636b13 100644
--- a/src/helm/benchmark/run_specs.py
+++ b/src/helm/benchmark/run_specs.py
@@ -1,8 +1,10 @@
+import dataclasses
import importlib
import itertools
from functools import partial
from typing import Any, Callable, List, Dict, Optional, Set, TypeVar
+from helm.benchmark.model_deployment_registry import ALL_MODEL_DEPLOYMENTS, DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT
from helm.common.hierarchical_logger import hlog, htrack
from helm.common.object_spec import ObjectSpec
from helm.benchmark.adaptation.adapters.adapter_factory import (
@@ -47,10 +49,15 @@
TaskType,
get_lextreme_task_type,
)
-from helm.proxy.models import (
+from helm.benchmark.model_deployment_registry import (
+ ModelDeployment,
+ get_model_deployment,
+)
+from helm.benchmark.model_metadata_registry import (
+ ModelMetadata,
+ get_model_metadata,
ANTHROPIC_CLAUDE_1_MODEL_TAG,
ANTHROPIC_CLAUDE_2_MODEL_TAG,
- get_model,
NO_NEWLINES_TAG,
NLG_PREFIX_TAG,
CHATML_MODEL_TAG,
@@ -456,6 +463,7 @@ def get_adapter_spec1() -> AdapterSpec:
num_outputs=3,
num_train_trials=3,
model="simple/model1",
+ model_deployment="simple/model1",
temperature=1,
stop_sequences=["."],
)
@@ -1213,7 +1221,8 @@ def get_numeracy_spec(
) -> RunSpec:
from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
- run_solver: bool = True if run_solver == "True" else False # type: ignore
+ run_solver_bool: bool = True if run_solver == "True" else False
+ del run_solver
random_seed = int(seed)
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
@@ -1253,7 +1262,7 @@ def get_numeracy_spec(
name=f"numeracy:relation_type={relation_type},mode={mode}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
- metric_specs=get_numeracy_metric_specs(run_solver), # type: ignore
+ metric_specs=get_numeracy_metric_specs(run_solver_bool),
groups=["numeracy"],
)
@@ -1265,21 +1274,25 @@ def get_math_spec(
use_official_examples: str = "False",
use_chain_of_thought: str = "False",
) -> RunSpec:
- use_official_examples: bool = use_official_examples == "True" # type: ignore
- use_chain_of_thought: bool = use_chain_of_thought == "True" # type: ignore
- if use_chain_of_thought:
- assert not use_official_examples, "Cannot use official examples when use_chain_of_thought is True."
+ # Convert to bools and remove the str versions
+ use_official_examples_bool: bool = use_official_examples == "True"
+ use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
+ del use_official_examples
+ del use_chain_of_thought
+
+ if use_chain_of_thought_bool:
+ assert not use_official_examples_bool, "Cannot use official examples when use_chain_of_thought is True."
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.math_scenario.MATHScenario",
args={
"subject": subject,
"level": level,
- "use_official_examples": use_official_examples,
- "use_chain_of_thought": use_chain_of_thought,
+ "use_official_examples": use_official_examples_bool,
+ "use_chain_of_thought": use_chain_of_thought_bool,
},
)
- if use_chain_of_thought: # Include the solution in the output as per https://arxiv.org/abs/2201.11903
+ if use_chain_of_thought_bool: # Include the solution in the output as per https://arxiv.org/abs/2201.11903
output_prefix = "Answer: " # Don't include LaTeX '$' delimiters
output_suffix = "\n"
instance_prefix = "###\n" # Don't include LaTeX '$' delimiters
@@ -1311,10 +1324,10 @@ def get_math_spec(
return RunSpec(
name=f"math:subject={subject},level={level},"
- f"use_official_examples={use_official_examples},use_chain_of_thought={use_chain_of_thought}",
+ f"use_official_examples={use_official_examples_bool},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
- metric_specs=get_math_metric_specs(use_chain_of_thought) + get_generative_harms_metric_specs(), # type: ignore
+ metric_specs=get_math_metric_specs(use_chain_of_thought_bool) + get_generative_harms_metric_specs(),
groups=groups,
)
@@ -1925,7 +1938,6 @@ def get_metric_specs(big_bench_metrics: List[str]) -> List[MetricSpec]:
# "metrics" is a required field. The default values were populated using the link above.
adapter_spec = AdapterSpec(
method=get_adaptation_method(big_bench_task["metrics"]),
- model="openai/text-curie-001", # Can override with the `ModelRunExpander`.
max_train_instances=5, # Can override with the `MaxTrainInstancesRunExpander`.
num_outputs=1, # Can override with the `NumOutputsRunExpander`.
# From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
@@ -2837,6 +2849,80 @@ def get_decodingtrust_toxicity_prompts_spec(subject) -> RunSpec:
############################################################
+def get_default_model_deployment_for_model(
+ model_name: str, warn_arg_deprecated: bool = False, ignore_deprecated: bool = False
+) -> Optional[str]:
+ """Returns a valid model deployment name corresponding to the given model arg.
+ This is used as a backwards compatibility layer for model names that are now moved to model deployments.
+ Example: "anthropic/claude-v1.3" => "anthropic/claude-v1.3"
+ Example: "meta/llama-7b" => "together/llama-7b"
+
+ The process to find a model deployment name is as follows:
+ 1. If there is a model deployment with the same name as the model arg, use it.
+ 2. If there is at least one deployment for the model, use the first one that is available.
+ 3. If there are no deployments for the model, returns None.
+
+ This function will also try to find a model deployment name that is not deprecated.
+ If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated).
+ If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
+
+ If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
+ as the model arg. This is to remind the user that the model name is deprecated and should be replaced with
+ the model deployment name (in their config).
+
+ Args:
+        model_name: The model name to convert to a model deployment name.
+ warn_arg_deprecated: Whether to print a warning if the model deployment name is not the same as the model arg.
+ ignore_deprecated: Whether to return None if the model deployment is deprecated.
+ """
+
+ # If there is a model deployment with the same name as the model arg, use it.
+ if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
+ deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
+ if deployment.deprecated and ignore_deprecated:
+ if warn_arg_deprecated:
+ hlog(f"WARNING: Model deployment {model_name} is deprecated")
+ return None
+ return deployment.name
+
+ # If there is at least one deployment for the model, use the first one that is available.
+ available_deployments: List[ModelDeployment] = [
+ deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
+ ]
+ if len(available_deployments) > 0:
+ available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
+ if warn_arg_deprecated:
+ hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+ hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
+
+ # Additionally, if there is a non-deprecated deployment, use it.
+ non_deprecated_deployments: List[ModelDeployment] = [
+ deployment for deployment in available_deployments if not deployment.deprecated
+ ]
+ if len(non_deprecated_deployments) > 0:
+ chosen_deployment = non_deprecated_deployments[0]
+        # There are no non-deprecated deployments, so there are two options:
+        # 1. If ignore_deprecated is True, return None (no usable model deployment is available).
+        # 2. Otherwise, return the first deployment (even if it is deprecated).
+ elif ignore_deprecated:
+ return None
+ else:
+ chosen_deployment = available_deployments[0]
+ if warn_arg_deprecated:
+ hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+ if warn_arg_deprecated:
+ hlog(
+ f"Choosing {chosen_deployment.name} (the first one) as "
+ f"the default model deployment for model {model_name}"
+ )
+ hlog("If you want to use a different model deployment, please specify it explicitly.")
+ return chosen_deployment.name
+
+ # Some models are added but have no deployments yet.
+ # In this case, we return None.
+ return None
+
+
def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
"""
Takes a specification (name, args) and returns a list of `RunSpec`s.
@@ -2862,13 +2948,41 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
]
def alter_run_spec(run_spec: RunSpec) -> RunSpec:
- try:
- model = get_model(run_spec.adapter_spec.model)
- except ValueError:
- # Models registered from configs cannot have expanders applied to them,
- # because the models will not have been registered yet at this point.
- # TODO: Figure out a cleaner way to deal with this.
- return run_spec
+ if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+ raise ValueError("At least one of model_deployment and model must be specified")
+ elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
+ # Infer model from model deployment
+ default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
+ if not default_model_name:
+ default_model_name = run_spec.adapter_spec.model_deployment
+ run_spec = dataclasses.replace(
+ run_spec,
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
+ )
+ elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+ # Infer model deployment from model
+ default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
+ if not default_model_deployment:
+ raise ValueError(
+ f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
+ )
+ run_spec = dataclasses.replace(
+ run_spec,
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
+ )
+
+ # Both model and model_deployment should now be filled
+ assert run_spec.adapter_spec.model_deployment
+ assert run_spec.adapter_spec.model
+
+ model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
+ deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
+ if run_spec.adapter_spec.model != deployment.model_name:
+ raise ValueError(
+ f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
+ f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
+ f"for a different model '{deployment.model_name}'"
+ )
# For models that strip newlines, when we're generating, we need to set
# the delimiter to be '###' so we stop properly.
if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
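
The resolution above is what backfills `model_deployment` for legacy run specs that only set `model`. A small sketch exercising it with the two examples from the function's own docstring (it assumes configurations are registered first, as `run.py` and `summarize.py` now do):

# Sketch: legacy model names resolved to deployment names, per the docstring examples above.
from helm.benchmark.config_registry import register_helm_configurations
from helm.benchmark.run_specs import get_default_model_deployment_for_model

register_helm_configurations()
print(get_default_model_deployment_for_model("anthropic/claude-v1.3"))  # -> "anthropic/claude-v1.3"
print(get_default_model_deployment_for_model("meta/llama-7b"))          # -> "together/llama-7b"
print(get_default_model_deployment_for_model("not/a-model"))            # -> None (no deployments found)
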
diff --git a/src/helm/benchmark/runner.py b/src/helm/benchmark/runner.py
index 52677631ace..21c1a62c6f8 100644
--- a/src/helm/benchmark/runner.py
+++ b/src/helm/benchmark/runner.py
@@ -8,6 +8,7 @@
import dataclasses
from dataclasses import dataclass, field
from typing import Any, Dict, List
+import numpy as np
from tqdm import tqdm
@@ -15,7 +16,15 @@
from helm.common.hierarchical_logger import hlog, htrack_block
from helm.common.cache import cache_stats
from .augmentations.data_augmenter import DataAugmenterSpec
-from .scenarios.scenario import Scenario, ScenarioSpec, create_scenario, Instance, with_instance_ids
+from .scenarios.scenario import (
+ EVAL_SPLITS,
+ TRAIN_SPLIT,
+ Scenario,
+ ScenarioSpec,
+ create_scenario,
+ Instance,
+ with_instance_ids,
+)
from .adaptation.adapters.adapter import Adapter
from .adaptation.adapters.adapter_factory import AdapterFactory
from .adaptation.scenario_state import ScenarioState
@@ -103,6 +112,38 @@ def remove_per_instance_stats_nans(per_instance_stats_list: List[PerInstanceStat
return result
+def downsample_eval_instances(instances: List[Instance], max_eval_instances: int) -> List[Instance]:
+ """
+ Get the instances necessary for this run:
+ Train instances (split=train): keep all (if any) for in-context learning
+ Eval instances (split=valid or test): keep at most `max_eval_instances` specified in `AdapterSpec` by sampling
+ Return the resulting train and eval instances.
+ """
+ all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
+
+ all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
+ if len(all_eval_instances) > max_eval_instances:
+ # The random sampling includes instances monotonically.
+ np.random.seed(0)
+ selected_eval_instances = list(
+ np.random.choice(
+ all_eval_instances, # type: ignore
+ max_eval_instances,
+ replace=False,
+ )
+ )
+ else:
+ selected_eval_instances = all_eval_instances
+
+ hlog(
+ f"{len(instances)} instances, "
+ f"{len(all_train_instances)} train instances, "
+ f"{len(selected_eval_instances)}/{len(all_eval_instances)} eval instances"
+ )
+
+ return all_train_instances + selected_eval_instances
+
+
class Runner:
"""
The main entry point for running the entire benchmark. Mostly just
@@ -145,11 +186,13 @@ def __init__(
self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
ensure_directory_exists(self.eval_cache_path)
- def _is_run_completed(self, run_spec: RunSpec):
+ def _get_run_path(self, run_spec: RunSpec) -> str:
+ return os.path.join(self.runs_path, run_spec.name)
+
+ def _is_run_completed(self, run_path: str):
"""Return whether the run was previously completed.
A run is completed if all of the expected output files exist."""
- run_path: str = os.path.join(self.runs_path, run_spec.name)
if not os.path.isdir(run_path):
return False
output_paths = [
@@ -182,6 +225,12 @@ def run_all(self, run_specs: List[RunSpec]):
raise RunnerError(f"Failed runs: [{failed_runs_str}]")
def run_one(self, run_spec: RunSpec):
+ run_path: str = self._get_run_path(run_spec)
+ if self.skip_completed_runs and self._is_run_completed(run_path):
+ hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+ return
+ ensure_directory_exists(run_path)
+
# Load the scenario
scenario: Scenario = create_scenario(run_spec.scenario_spec)
@@ -195,18 +244,6 @@ def run_one(self, run_spec: RunSpec):
input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")
- run_path: str = os.path.join(self.runs_path, run_spec.name)
- ensure_directory_exists(run_path)
-
- if self.skip_completed_runs and self._is_run_completed(run_spec):
- # If scenario_state.json exists, assume that all other output files exist
- # because scenario_state.json is the last output file to be written.
- hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
- return
-
- # Fetch and initialize the Adapter based on the `AdapterSpec`.
- adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
-
instances: List[Instance]
if self.skip_instances:
instances = []
@@ -233,7 +270,9 @@ def run_one(self, run_spec: RunSpec):
instances = with_instance_ids(instances)
# Get the instances necessary for this run.
- instances = adapter.get_run_instances(instances)
+ max_eval_instances = run_spec.adapter_spec.max_eval_instances
+ if max_eval_instances is not None:
+ instances = downsample_eval_instances(instances, max_eval_instances)
# Data preprocessing
instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
@@ -241,6 +280,7 @@ def run_one(self, run_spec: RunSpec):
)
# Adapt (convert to requests)
+ adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
scenario_state: ScenarioState = adapter.adapt(instances, self.executor.execution_spec.parallelism)
# Execute (fill up results)
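
Eval-instance downsampling now lives in the runner and is driven by `adapter_spec.max_eval_instances`, so the adapter no longer needs the full instance list up front. A minimal sketch of the helper's behavior on synthetic instances (the instance texts are illustrative):

# Sketch: train instances are kept, eval instances are sampled down deterministically (seed 0).
from helm.benchmark.runner import downsample_eval_instances
from helm.benchmark.scenarios.scenario import Instance, Input, TRAIN_SPLIT, TEST_SPLIT

instances = [Instance(Input(text=f"train {i}"), references=[], split=TRAIN_SPLIT) for i in range(3)]
instances += [Instance(Input(text=f"test {i}"), references=[], split=TEST_SPLIT) for i in range(10)]

selected = downsample_eval_instances(instances, max_eval_instances=5)
assert sum(1 for inst in selected if inst.split == TRAIN_SPLIT) == 3  # all train instances kept
assert sum(1 for inst in selected if inst.split == TEST_SPLIT) == 5   # eval instances capped at 5
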
diff --git a/src/helm/benchmark/scenarios/cleva_scenario.py b/src/helm/benchmark/scenarios/cleva_scenario.py
index eb8ed80462f..c63a1e523a0 100644
--- a/src/helm/benchmark/scenarios/cleva_scenario.py
+++ b/src/helm/benchmark/scenarios/cleva_scenario.py
@@ -10,7 +10,12 @@
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
ADAPT_GENERATION,
)
-from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.general import (
+ assert_is_str,
+ assert_is_str_list,
+ ensure_file_downloaded,
+ ensure_directory_exists,
+)
from helm.common.hierarchical_logger import hlog
from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
from .code_scenario import CodeReference, CodeInstance
@@ -69,26 +74,17 @@ def transform(self, data: Dict[str, RawData], templates: Dict[str, Optional[Temp
"""Convert a data point in CLEVA format to a HELM instance according to a given CLEVA prompt template."""
transformed_data = self._apply_all(copy.deepcopy(data), templates)
- prompt: str = transformed_data["input"] # type: ignore
- assert isinstance(prompt, str)
+ prompt = assert_is_str(transformed_data["input"])
if "choices" in transformed_data:
# This is a multiple-choice task
- choices: List[str] = transformed_data["choices"] # type: ignore
- # Gurantee `choices` must be `List[str]`
- assert isinstance(choices, list)
- for c in choices:
- assert isinstance(c, str)
+ choices = assert_is_str_list(transformed_data["choices"])
references: List[Reference] = [
Reference(Output(text=text), tags=[CORRECT_TAG] if idx in transformed_data["label"] else [])
for idx, text in enumerate(choices)
]
else:
# This is a generation task
- correct_answer: List[str] = transformed_data["label"] # type: ignore
- # Gurantee `label` must be `List[str]`
- assert isinstance(correct_answer, list)
- for a in correct_answer:
- assert isinstance(a, str)
+ correct_answer = assert_is_str_list(transformed_data["label"])
references = [Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in correct_answer]
instance = Instance(
@@ -109,15 +105,12 @@ def transform_code(
to a HELM CodeInstance according to a given CLEVA prompt template.
"""
- assert isinstance(templates["input"], str)
- data["prompt"] = templates["input"].format(**data)
- assert isinstance(data["prompt"], str)
- assert isinstance(data["canonical_solution"], str)
+ data["prompt"] = assert_is_str(templates["input"]).format(**data)
instance = CodeInstance(
- input=Input(text=data["prompt"]),
+ input=Input(text=assert_is_str(data["prompt"])),
references=[
CodeReference(
- output=Output(text=data["canonical_solution"]),
+ output=Output(text=assert_is_str(data["canonical_solution"])),
test_cases=data,
tags=[CORRECT_TAG],
)
@@ -211,27 +204,18 @@ def _apply_all(self, data: Dict[str, RawData], templates: Dict[str, Optional[Tem
transformed_data[k] = self._apply(data[k], template, **data)
# We then merge all other fields into the `input`
- assert isinstance(templates["input"], str), "The input field of a template should be a string"
- data["input"] = templates["input"].format(**transformed_data)
+ data["input"] = assert_is_str(templates["input"]).format(**transformed_data)
if "choices" in data:
# We take the corresponding choices and apply the `label` template
# Note: we do not allow `label` template to access other fields in multi-choice tasks
# Overwrite `choices` to the actual continuations
- choices: List[str] = data["choices"] # type: ignore
- # Gurantee `choices` must be `List[str]`
- assert isinstance(choices, list)
- for c in choices:
- assert isinstance(c, str)
+ choices = assert_is_str_list(data["choices"])
data["choices"] = [self._apply(c, templates.get("label", None), label=c) for c in choices]
else:
# For generation tasks, we allow it to access to other stringified fields
kwargs = transformed_data
del kwargs["label"]
- labels: List[str] = data["label"] # type: ignore
- # Gurantee `label` must be `List[str]`
- assert isinstance(labels, list)
- for label in labels:
- assert isinstance(label, str)
+ labels = assert_is_str_list(data["label"])
data["label"] = [self._apply(x, templates.get("label", None), **kwargs, label=x) for x in labels]
return data
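
`assert_is_str` and `assert_is_str_list` come from `helm.common.general` and are not shown in this diff; based on their call sites above, they presumably look roughly like the following sketch (an assumption, not the actual implementation):

# Presumed shape of the helpers used above; an assumption based on call sites, not the patched code.
from typing import Any, List


def assert_is_str(val: Any) -> str:
    assert isinstance(val, str), f"Expected a string, got: {val}"
    return val


def assert_is_str_list(val: Any) -> List[str]:
    assert isinstance(val, list), f"Expected a list, got: {val}"
    for item in val:
        assert isinstance(item, str), f"Expected a string, got: {item}"
    return val
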
diff --git a/src/helm/benchmark/scenarios/code_scenario.py b/src/helm/benchmark/scenarios/code_scenario.py
index d63fdf7b5f8..7324404ed04 100644
--- a/src/helm/benchmark/scenarios/code_scenario.py
+++ b/src/helm/benchmark/scenarios/code_scenario.py
@@ -139,8 +139,8 @@ def _read_and_preprocess_apps(target_path: str) -> List[CodeInstance]:
# only if the version of Python has a default limit.
#
# See: https://docs.python.org/3/library/stdtypes.html#int-max-str-digits
- if hasattr(sys, "set_int_max_str_digits"): # type: ignore
- sys.set_int_max_str_digits(100000) # type: ignore
+ if hasattr(sys, "set_int_max_str_digits"):
+ sys.set_int_max_str_digits(100000)
SINGLE_STR_LIMIT = 150000 # From original codebase.
diff --git a/src/helm/benchmark/scenarios/entity_matching_scenario.py b/src/helm/benchmark/scenarios/entity_matching_scenario.py
index 3e3ae5d7e54..2e309070850 100644
--- a/src/helm/benchmark/scenarios/entity_matching_scenario.py
+++ b/src/helm/benchmark/scenarios/entity_matching_scenario.py
@@ -92,7 +92,7 @@ def read_blocked_pairs(
num_neg_classes: int = sum(merged["label"] == 0)
assert num_pos_classes < num_neg_classes
sample_fn = lambda x: x.sample(num_pos_classes)
- merged = merged.groupby("label", group_keys=False).apply(sample_fn) # type: ignore
+ merged = merged.groupby("label", group_keys=False).apply(sample_fn)
return merged
def serialize_row(self, row: pd.core.series.Series, column_map: Dict[str, str]) -> str:
diff --git a/src/helm/benchmark/scenarios/numeracy_scenario.py b/src/helm/benchmark/scenarios/numeracy_scenario.py
index 8b60b58f84b..7205ae4bdf1 100644
--- a/src/helm/benchmark/scenarios/numeracy_scenario.py
+++ b/src/helm/benchmark/scenarios/numeracy_scenario.py
@@ -3,7 +3,7 @@
from dataclasses import dataclass, field
from itertools import combinations_with_replacement, product
import math
-from math import comb # type: ignore
+from math import comb
import numpy as np
import numpy.typing as npt
import random
@@ -544,7 +544,7 @@ def get_numeracy_adapter_spec(
"max_eval_instances": max_eval_instances,
"num_outputs": 1,
"num_train_trials": 1,
- "model": "openai/davinci",
+ "model_deployment": "openai/davinci",
"temperature": 0,
"stop_sequences": ["\n"],
"max_tokens": 20,
diff --git a/src/helm/benchmark/scenarios/scenario.py b/src/helm/benchmark/scenarios/scenario.py
index 76f413af7dd..4d36bb1ab47 100644
--- a/src/helm/benchmark/scenarios/scenario.py
+++ b/src/helm/benchmark/scenarios/scenario.py
@@ -177,7 +177,7 @@ def render_lines(self) -> List[str]:
# TODO(#1212): Scenario should not be a dataclass.
-@dataclass # type: ignore
+@dataclass
class Scenario(ABC):
"""
A scenario represents a (task, data distribution).
diff --git a/src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py b/src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py
new file mode 100644
index 00000000000..bccf8c50b78
--- /dev/null
+++ b/src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py
@@ -0,0 +1,108 @@
+from typing import Dict, List, Set
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+ CORRECT_TAG,
+ TRAIN_SPLIT,
+ VALID_SPLIT,
+ Instance,
+ Input,
+ Output,
+ Reference,
+ Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class VizWizScenario(Scenario):
+ """
+ VizWiz is a real-world visual question answering dataset consisting of questions
+ asked by people who are blind. It originates from a natural visual question answering
+ setting where blind people each took an image and recorded a spoken question about it,
+ together with 10 crowdsourced answers per visual question.
+
+ Version as of January 1, 2020:
+
+ - 20,523 training image/question pairs
+ - 205,230 training answer/answer confidence pairs
+ - 4,319 validation image/question pairs
+ - 43,190 validation answer/answer confidence pairs
+
+ where answer confidences are one of {"yes", "maybe", "no"}.
+
+ Answers are publicly shared for the train and validation splits and hidden for the test split.
+
+ Paper: https://arxiv.org/abs/1802.08218
+ Website: https://vizwiz.org/tasks-and-datasets/vqa
+ """
+
+ # Annotations are not available for the test set
+ ANNOTATIONS_URL: str = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip"
+ SPLIT_TO_ANNOTATIONS_FILE: Dict[str, str] = {
+ TRAIN_SPLIT: "train.json",
+ VALID_SPLIT: "val.json",
+ }
+
+ SPLIT_TO_IMAGES: Dict[str, str] = {
+ TRAIN_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip",
+ VALID_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip",
+ }
+
+ name = "viz_wiz"
+ description = (
+ "Real-world VQA dataset consisting of questions asked by "
+ "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
+ )
+ tags = ["vision-language", "visual question answering"]
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ # Download the questions and annotations
+ annotations_path: str = os.path.join(output_path, "annotations")
+ ensure_directory_exists(annotations_path)
+ ensure_file_downloaded(
+ source_url=self.ANNOTATIONS_URL,
+ target_path=annotations_path,
+ unpack=True,
+ unpack_type="unzip",
+ )
+
+ instances: List[Instance] = []
+ for split in [TRAIN_SPLIT, VALID_SPLIT]:
+ # Download the images for the split
+ images_path: str = os.path.join(output_path, split)
+ ensure_file_downloaded(
+ source_url=self.SPLIT_TO_IMAGES[split],
+ target_path=images_path,
+ unpack=True,
+ unpack_type="unzip",
+ )
+
+ annotations_split_path: str = os.path.join(annotations_path, self.SPLIT_TO_ANNOTATIONS_FILE[split])
+ with open(annotations_split_path) as f:
+ for image_annotation in json.load(f):
+ image_path: str = os.path.join(images_path, image_annotation["image"])
+ assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+ content: List[MediaObject] = [
+ MediaObject(location=image_path, content_type="image/jpeg"),
+ MediaObject(text=image_annotation["question"], content_type="text/plain"),
+ ]
+ deduped_answers: Set[str] = {
+ answer_json["answer"]
+ for answer_json in image_annotation["answers"]
+ if answer_json["answer_confidence"] == "yes"
+ }
+
+ instances.append(
+ Instance(
+ Input(multimedia_content=MultimediaObject(content)),
+ references=[
+ Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in deduped_answers
+ ],
+ split=split,
+ )
+ )
+
+ return instances
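For readers unfamiliar with the VizWiz annotation format consumed by get_instances above, here is an editor's sketch of a single entry and the answer-deduplication step; all field values are invented for illustration:

image_annotation = {
    "image": "VizWiz_train_00000000.jpg",
    "question": "What is in this can?",
    "answers": [
        {"answer": "soda", "answer_confidence": "yes"},
        {"answer": "soda", "answer_confidence": "yes"},
        {"answer": "unanswerable", "answer_confidence": "maybe"},
    ],
}
# Only answers given with "yes" confidence become references, and the set
# comprehension collapses duplicates, so this entry yields one correct reference.
deduped_answers = {
    a["answer"] for a in image_annotation["answers"] if a["answer_confidence"] == "yes"
}
assert deduped_answers == {"soda"}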
diff --git a/src/helm/benchmark/slurm_runner.py b/src/helm/benchmark/slurm_runner.py
index 84edde35b33..09cde008b52 100644
--- a/src/helm/benchmark/slurm_runner.py
+++ b/src/helm/benchmark/slurm_runner.py
@@ -131,7 +131,7 @@ def run_all(self, run_specs: List[RunSpec]):
# When running with multiple models, sorting by RunSpec.name is a heuristic that tries to
# spread out the load evenly across multiple models, in order to avoid overloading any single model.
for run_spec in sorted(run_specs, key=lambda run_spec: run_spec.name):
- if self.skip_completed_runs and self._is_run_completed(run_spec):
+ if self.skip_completed_runs and self._is_run_completed(self._get_run_path(run_spec)):
skipped_run_specs.append(run_spec)
else:
queued_run_specs.append(run_spec)
diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema.yaml
index e8f345f559a..0edf2eff69c 100644
--- a/src/helm/benchmark/static/schema.yaml
+++ b/src/helm/benchmark/static/schema.yaml
@@ -76,7 +76,7 @@ models:
access: limited
num_parameters: 70000000000
release_date: 2022-01-01
-
+
# TODO: Remove Once we have configurable model names
- name: neurips/local
display_name: Local service
@@ -160,6 +160,32 @@ models:
creator_organization: BigCode
access: open
+ # Hugging Face
+ - name: huggingface/gpt2
+ display_name: GPT-2 (124M)
+ description: GPT-2 is a transformer model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data), with an automatic process to generate inputs and labels from those texts.
+ creator_organization: OpenAI
+ access: open
+ num_parameters: 124000000
+ - name: huggingface/gpt2-medium
+ display_name: GPT-2 Medium (355M)
+ description: GPT-2 Medium is the 355M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. It was pretrained on English text using a causal language modeling (CLM) objective.
+ creator_organization: OpenAI
+ access: open
+ num_parameters: 355000000
+ - name: huggingface/gpt2-large
+ display_name: GPT-2 Large (774M)
+ description: GPT-2 Large is the 774M parameter version of GPT-2, a transformer-based language model created and released by OpenAI. It was pretrained on English text using a causal language modeling (CLM) objective.
+ creator_organization: OpenAI
+ access: open
+ num_parameters: 774000000
+ - name: huggingface/gpt2-xl
+ display_name: GPT-2 XL (1.5B)
+ description: GPT-2 XL is the 1.5B parameter version of GPT-2, a transformer-based language model created and released by OpenAI. It was pretrained on English text using a causal language modeling (CLM) objective.
+ creator_organization: OpenAI
+ access: open
+ num_parameters: 1500000000
+
# Cerebras Systems
- name: together/cerebras-gpt-6.7b
display_name: Cerebras GPT (6.7B)
@@ -961,8 +987,12 @@ adapter:
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
+ - name: sample_train
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
- name: model
- description: Name of the language model (<creator_organization>/<model name>) to send requests to.
+ description: DEPRECATED. Name of the language model (<creator_organization>/<model name>) to send requests to.
+ - name: model_deployment
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
@@ -971,6 +1001,8 @@ adapter:
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
+ - name: multi_label
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any one of the correct references.
############################################################
metrics:
@@ -1059,6 +1091,7 @@ metrics:
short_display_name: PEM
description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
lower_is_better: false
+
- name: exact_match@5
display_name: Exact match @5
short_display_name: EM@5
@@ -1069,6 +1102,17 @@ metrics:
short_display_name: EM@5
description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
lower_is_better: false
+ - name: prefix_exact_match@5
+ display_name: Prefix exact match @5
+ short_display_name: PEM@5
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
+ lower_is_better: false
+ - name: quasi_prefix_exact_match@5
+ display_name: Prefix quasi-exact match @5
+ short_display_name: PEM@5
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+ lower_is_better: false
+
- name: logprob
display_name: Log probability
short_display_name: Logprob
@@ -1979,6 +2023,7 @@ run_groups:
- synthetic_efficiency
adapter_keys_shown:
- model
+ - model_deployment
- max_tokens
- name: calibration
@@ -2024,6 +2069,7 @@ run_groups:
- civil_comments
adapter_keys_shown:
- model
+ - model_deployment
- max_train_instances
subgroup_metric_groups_hidden:
- robustness
@@ -2045,6 +2091,7 @@ run_groups:
- bbq
adapter_keys_shown:
- model
+ - model_deployment
- method
- name: ablation_prompts
@@ -2059,6 +2106,7 @@ run_groups:
- civil_comments
adapter_keys_shown:
- model
+ - model_deployment
- instructions
- input_prefix
- input_suffix
@@ -3064,6 +3112,7 @@ run_groups:
main_split: test
adapter_keys_shown:
- model
+ - model_deployment
- max_tokens
taxonomy:
task: "?"
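To make the new adapter field concrete, here is a minimal sketch (editor's illustration, mirroring the numeracy adapter-spec change earlier in this patch) of adapter keyword arguments that use model_deployment instead of the deprecated model key:

# "model" is kept only for backward compatibility; new specs should name the
# deployment to send requests to via "model_deployment".
adapter_spec_kwargs = {
    "model_deployment": "openai/davinci",
    "temperature": 0,
    "num_outputs": 1,
    "max_tokens": 20,
    "stop_sequences": ["\n"],
}
print(adapter_spec_kwargs["model_deployment"])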
diff --git a/src/helm/benchmark/test_model_properties.py b/src/helm/benchmark/test_model_properties.py
new file mode 100644
index 00000000000..64cf3e0db9d
--- /dev/null
+++ b/src/helm/benchmark/test_model_properties.py
@@ -0,0 +1,1465 @@
+"""Temporary test for preserving invariants during the model / tokenizer / window service refactor.
+
+Delete this after the refactor is done."""
+
+import pytest
+from tempfile import TemporaryDirectory
+from typing import Any
+from helm.benchmark.config_registry import register_helm_configurations
+from helm.benchmark.model_deployment_registry import (
+ ClientSpec,
+ ModelDeployment,
+ WindowServiceSpec,
+ ALL_MODEL_DEPLOYMENTS,
+)
+from helm.benchmark.model_metadata_registry import ModelMetadata
+from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec
+from helm.benchmark.window_services.test_utils import get_tokenizer_service
+
+from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
+from helm.proxy.clients.auto_client import AutoClient
+from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
+from collections import defaultdict
+
+
+_BUILT_IN_TOKENIZER_CONFIGS = [
+ TokenizerConfig(
+ name="neurips/local",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="ai21/j1",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer"),
+ end_of_text_token=" ",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="AlephAlpha/luminous-base",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="AlephAlpha/luminous-extended",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="AlephAlpha/luminous-supreme",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="huggingface/gpt2",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="anthropic/claude",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="bigscience/bloom",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="bigscience/T0pp",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="cohere/cohere",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"),
+ end_of_text_token="",
+ prefix_token=":",
+ ),
+ TokenizerConfig(
+ name="EleutherAI/gpt-j-6B",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="EleutherAI/gpt-neox-20b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="hf-internal-testing/llama-tokenizer",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="meta-llama/Llama-2-7b-hf",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="mistralai/Mistral-7B-v0.1",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="tiiuae/falcon-7b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="bigcode/santacoder",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="bigcode/starcoder",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="google/t5-11b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="google/flan-t5-xxl",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="google/ul2",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="facebook/opt-66b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="openai/cl100k_base",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="TsinghuaKEG/ice",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.ice_tokenizer.ICETokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="Yandex/yalm",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="lightningai/lit-gpt",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.lit_gpt_tokenizer.LitGPTTokenizer", args={}),
+ end_of_text_token="<|endoftext|>",
+ prefix_token="<|endoftext|>",
+ ),
+ TokenizerConfig(
+ name="HuggingFaceM4/idefics-9b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="HuggingFaceM4/idefics-9b-instruct",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="HuggingFaceM4/idefics-80b",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+ TokenizerConfig(
+ name="HuggingFaceM4/idefics-80b-instruct",
+ tokenizer_spec=TokenizerSpec(class_name="helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"),
+ end_of_text_token="",
+ prefix_token="",
+ ),
+]
+
+
+_BUILT_IN_MODEL_DEPLOYMENTS = [
+ ModelDeployment(
+ name="neurips/local",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.http_model_client.HTTPModelClient"),
+ tokenizer_name="neurips/local",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="ai21/j1-jumbo",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="ai21/j1-grande",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="ai21/j1-grande-v2-beta",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="ai21/j1-large",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="ai21/j2-jumbo",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_ai21_window_service.AI21Jurassic2JumboWindowService",
+ args={},
+ ),
+ max_sequence_length=6000,
+ ),
+ ModelDeployment(
+ name="ai21/j2-grande",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="ai21/j2-large",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.ai21_client.AI21Client"),
+ tokenizer_name="ai21/j1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ ),
+ max_sequence_length=2047,
+ ),
+ ModelDeployment(
+ name="AlephAlpha/luminous-base",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
+ tokenizer_name="AlephAlpha/luminous-base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.luminous_window_service.LuminousBaseWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="AlephAlpha/luminous-extended",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
+ tokenizer_name="AlephAlpha/luminous-extended",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.luminous_window_service.LuminousExtendedWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="AlephAlpha/luminous-supreme",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"),
+ tokenizer_name="AlephAlpha/luminous-supreme",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.luminous_window_service.LuminousSupremeWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="anthropic/stanford-online-all-v4-s3",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicLegacyClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService"
+ ),
+ max_sequence_length=8192,
+ ),
+ ModelDeployment(
+ name="anthropic/claude-2.0",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
+ tokenizer_name="anthropic/claude",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ ),
+ max_sequence_length=8000,
+ max_sequence_and_generated_tokens_length=9016,
+ ),
+ ModelDeployment(
+ name="anthropic/claude-v1.3",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
+ tokenizer_name="anthropic/claude",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ ),
+ max_sequence_length=8000,
+ max_sequence_and_generated_tokens_length=9016,
+ ),
+ ModelDeployment(
+ name="anthropic/claude-instant-v1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"),
+ tokenizer_name="anthropic/claude",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ ),
+ max_sequence_length=8000,
+ max_sequence_and_generated_tokens_length=9016,
+ ),
+ ModelDeployment(
+ name="together/bloom",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="bigscience/bloom",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.bloom_window_service.BloomWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/t0pp",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="bigscience/T0pp",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.t0pp_window_service.T0ppWindowService"
+ ),
+ max_sequence_length=1024,
+ ),
+ ModelDeployment(
+ name="cohere/xlarge-20220609",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/xlarge-20221108",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/large-20220720",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/medium-20220720",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/medium-20221108",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/small-20220720",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="cohere/command-medium-beta",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ ),
+ max_sequence_length=2019,
+ max_request_length=2020,
+ ),
+ ModelDeployment(
+ name="cohere/command-xlarge-beta",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ ),
+ max_sequence_length=2019,
+ max_request_length=2020,
+ ),
+ ModelDeployment(
+ name="cohere/command",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ ),
+ max_sequence_length=2019,
+ max_request_length=2020,
+ ),
+ ModelDeployment(
+ name="cohere/command-light",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"),
+ tokenizer_name="cohere/cohere",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ ),
+ max_sequence_length=2019,
+ max_request_length=2020,
+ ),
+ ModelDeployment(
+ name="together/gpt-j-6b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-j-6B",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/gpt-neox-20b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/pythia-1b-v0",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/pythia-2.8b-v0",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/pythia-6.9b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/pythia-12b-v0",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/llama-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/llama-13b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/llama-30b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/llama-65b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/llama-2-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="meta-llama/Llama-2-7b-hf",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ ),
+ max_sequence_length=4096,
+ ),
+ ModelDeployment(
+ name="together/llama-2-13b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="meta-llama/Llama-2-7b-hf",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ ),
+ max_sequence_length=4096,
+ ),
+ ModelDeployment(
+ name="together/llama-2-70b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="meta-llama/Llama-2-7b-hf",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ ),
+ max_sequence_length=4096,
+ ),
+ ModelDeployment(
+ name="together/alpaca-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/vicuna-7b-v1.3",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/vicuna-13b-v1.3",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="hf-internal-testing/llama-tokenizer",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/mistral-7b-v0.1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="mistralai/Mistral-7B-v0.1",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=4095,
+ ),
+ ModelDeployment(
+ name="together/mpt-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/mpt-instruct-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/mpt-30b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/mpt-instruct-30b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/falcon-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="tiiuae/falcon-7b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/falcon-7b-instruct",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="tiiuae/falcon-7b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/falcon-40b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="tiiuae/falcon-7b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="together/falcon-40b-instruct",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="tiiuae/falcon-7b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="gooseai/gpt-neo-20b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.goose_ai_client.GooseAIClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="gooseai/gpt-j-6b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.goose_ai_client.GooseAIClient"),
+ tokenizer_name="EleutherAI/gpt-j-6B",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="huggingface/gpt2",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ ),
+ max_sequence_length=1024,
+ max_request_length=1025,
+ ),
+ ModelDeployment(
+ name="huggingface/gpt-j-6b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
+ tokenizer_name="EleutherAI/gpt-j-6B",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="huggingface/santacoder",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
+ tokenizer_name="bigcode/santacoder",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.santacoder_window_service.SantaCoderWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="huggingface/starcoder",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.huggingface_client.HuggingFaceClient"),
+ tokenizer_name="bigcode/starcoder",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.starcoder_window_service.StarCoderWindowService"
+ ),
+ max_sequence_length=8192,
+ ),
+ ModelDeployment(
+ name="together/t5-11b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="google/t5-11b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.t511b_window_service.T511bWindowService"
+ ),
+ max_sequence_length=511,
+ ),
+ ModelDeployment(
+ name="together/flan-t5-xxl",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="google/flan-t5-xxl",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.flan_t5_window_service.FlanT5WindowService"
+ ),
+ max_sequence_length=511,
+ ),
+ ModelDeployment(
+ name="together/ul2",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="google/ul2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ul2_window_service.UL2WindowService"
+ ),
+ max_sequence_length=511,
+ ),
+ ModelDeployment(
+ name="together/h3-2.7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ ),
+ max_sequence_length=1024,
+ max_request_length=1025,
+ ),
+ ModelDeployment(
+ name="together/opt-175b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="facebook/opt-66b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/opt-66b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="facebook/opt-66b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/opt-6.7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="facebook/opt-66b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/opt-1.3b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="facebook/opt-66b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="microsoft/TNLGv2_530B",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.microsoft_client.MicrosoftClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="microsoft/TNLGv2_7B",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.microsoft_client.MicrosoftClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+ ),
+ max_sequence_length=2047,
+ max_request_length=2048,
+ ),
+ ModelDeployment(
+ name="openai/davinci",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/curie",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/babbage",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/ada",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-davinci-003",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ ),
+ max_sequence_length=4000,
+ max_request_length=4001,
+ ),
+ ModelDeployment(
+ name="openai/text-davinci-002",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ ),
+ max_sequence_length=4000,
+ max_request_length=4001,
+ ),
+ ModelDeployment(
+ name="openai/text-davinci-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-curie-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-babbage-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-ada-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/code-davinci-002",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ ),
+ max_sequence_length=4000,
+ max_request_length=4001,
+ ),
+ ModelDeployment(
+ name="openai/code-davinci-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/code-cushman-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/gpt-4-0314",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+ ),
+ max_sequence_length=8192,
+ max_request_length=8193,
+ ),
+ ModelDeployment(
+ name="openai/gpt-4-32k-0314",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+ ),
+ max_sequence_length=32768,
+ max_request_length=32769,
+ ),
+ ModelDeployment(
+ name="openai/gpt-4-0613",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+ ),
+ max_sequence_length=8192,
+ max_request_length=8193,
+ ),
+ ModelDeployment(
+ name="openai/gpt-4-32k-0613",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+ ),
+ max_sequence_length=32768,
+ max_request_length=32769,
+ ),
+ ModelDeployment(
+ name="openai/gpt-3.5-turbo-0301",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+ ),
+ max_sequence_length=4000,
+ max_request_length=4001,
+ ),
+ ModelDeployment(
+ name="openai/gpt-3.5-turbo-0613",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+ ),
+ max_sequence_length=4000,
+ max_request_length=4001,
+ ),
+ ModelDeployment(
+ name="openai/gpt-3.5-turbo-16k-0613",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="openai/cl100k_base",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.wider_openai_window_service.GPTTurbo16KWindowService"
+ ),
+ max_sequence_length=16000,
+ max_request_length=16001,
+ ),
+ ModelDeployment(
+ name="openai/text-similarity-davinci-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-similarity-curie-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-similarity-babbage-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-similarity-ada-001",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="openai/text-embedding-ada-002",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.openai_client.OpenAIClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/gpt-jt-6b-v1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-j-6B",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/gpt-neoxt-chat-base-20b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/redpajama-incite-base-3b-v1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/redpajama-incite-instruct-3b-v1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/redpajama-incite-base-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/redpajama-incite-instruct-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/glm",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="TsinghuaKEG/ice",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.ice_window_service.ICEWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="writer/palmyra-base",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ ),
+ max_sequence_length=2048,
+ max_sequence_and_generated_tokens_length=2048,
+ ),
+ ModelDeployment(
+ name="writer/palmyra-large",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ ),
+ max_sequence_length=2048,
+ max_sequence_and_generated_tokens_length=2048,
+ ),
+ ModelDeployment(
+ name="writer/palmyra-instruct-30",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ ),
+ max_sequence_length=2048,
+ max_sequence_and_generated_tokens_length=2048,
+ ),
+ ModelDeployment(
+ name="writer/palmyra-e",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ ),
+ max_sequence_length=2048,
+ max_sequence_and_generated_tokens_length=2048,
+ ),
+ ModelDeployment(
+ name="writer/silk-road",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+ ),
+ max_sequence_length=8192,
+ max_sequence_and_generated_tokens_length=8192,
+ ),
+ ModelDeployment(
+ name="writer/palmyra-x",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.palmyra_client.PalmyraClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+ ),
+ max_sequence_length=8192,
+ max_sequence_and_generated_tokens_length=8192,
+ ),
+ ModelDeployment(
+ name="together/yalm",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="Yandex/yalm",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.yalm_window_service.YaLMWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="google/palm",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.google_client.GoogleClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="nvidia/megatron-gpt2",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.megatron_client.MegatronClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.megatron_window_service.MegatronWindowService"
+ ),
+ max_sequence_length=1024,
+ ),
+ ModelDeployment(
+ name="together/dolly-v2-3b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/dolly-v2-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/dolly-v2-12b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+ ModelDeployment(
+ name="together/stablelm-base-alpha-3b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+ ),
+ max_sequence_length=4096,
+ max_request_length=4097,
+ ),
+ ModelDeployment(
+ name="together/stablelm-base-alpha-7b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"),
+ tokenizer_name="EleutherAI/gpt-neox-20b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+ ),
+ max_sequence_length=4096,
+ max_request_length=4097,
+ ),
+ ModelDeployment(
+ name="lightningai/lit-gpt",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.lit_gpt_client.LitGPTClient", args={}),
+ model_name=None,
+ tokenizer_name="lightningai/lit-gpt",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService", args={}
+ ),
+ max_sequence_length=2048,
+ max_request_length=None,
+ max_sequence_and_generated_tokens_length=None,
+ ),
+ ModelDeployment(
+ name="HuggingFaceM4/idefics-9b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
+ tokenizer_name="HuggingFaceM4/idefics-9b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="HuggingFaceM4/idefics-9b-instruct",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
+ tokenizer_name="HuggingFaceM4/idefics-9b-instruct",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="HuggingFaceM4/idefics-80b",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
+ tokenizer_name="HuggingFaceM4/idefics-80b",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="HuggingFaceM4/idefics-80b-instruct",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"),
+ tokenizer_name="HuggingFaceM4/idefics-80b-instruct",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ ),
+ max_sequence_length=2048,
+ ),
+ ModelDeployment(
+ name="simple/model1",
+ client_spec=ClientSpec(class_name="helm.proxy.clients.simple_client.SimpleClient"),
+ tokenizer_name="huggingface/gpt2",
+ window_service_spec=WindowServiceSpec(
+ class_name="helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ ),
+ max_sequence_length=2048,
+ max_request_length=2049,
+ ),
+]
+
+
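+# Window services report this value for max_sequence_and_generated_tokens_length when no explicit limit is set.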
+_INT_MAX: int = 2**31 - 1
+
+
+def _full_class_name(obj: Any) -> str:
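+    """Return the fully qualified class name of obj, e.g. "helm.proxy.clients.simple_client.SimpleClient"."""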
+ return f"{obj.__class__.__module__}.{obj.__class__.__name__}"
+
+
+# HACK: This looks like it should be done in a setup_class()
+# for the test below, but pytest evaluates the parametrize arguments
+# before running setup_class(). ALL_MODEL_DEPLOYMENTS would therefore
+# be empty and no tests would run, so we register the configurations here.
+register_helm_configurations()
+
+
+class TestModelProperties:
+ @pytest.mark.parametrize("model", ALL_MODEL_DEPLOYMENTS)
+    def test_models_has_window_service(self, model: ModelDeployment):
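+        # Empty credentials and cache paths are sufficient here: the test only inspects class names and window parameters.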
+ auto_client = AutoClient(defaultdict(str), "", "")
+ auto_tokenizer = AutoTokenizer(defaultdict(str), "", "")
+ model_deployments = {
+ model_deployment.name: model_deployment for model_deployment in _BUILT_IN_MODEL_DEPLOYMENTS
+ }
+ tokenizer_configs = {
+ tokenizer_config.name: tokenizer_config for tokenizer_config in _BUILT_IN_TOKENIZER_CONFIGS
+ }
+ with TemporaryDirectory() as tmpdir:
+ tokenizer_service = get_tokenizer_service(tmpdir)
+ # Can't test lit-gpt client because it requires manual dependencies
+ if "lit-gpt" in model.name:
+ return
+
+ # Can't test Llama 2 because it requires Hugging Face credentials
+ if "llama-2-" in model.name:
+ return
+
+ deployment_name: str = model.name
+ client = auto_client._get_client(deployment_name)
+ window_service = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service)
+ tokenizer_name = window_service.tokenizer_name
+ tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name)
+
+ client_class_name = _full_class_name(client)
+ tokenizer_class_name = _full_class_name(tokenizer)
+ window_service_class_name = _full_class_name(window_service)
+
+ prefix_token = window_service.prefix_token
+ end_of_text_token = window_service.end_of_text_token
+
+ max_sequence_length = window_service.max_sequence_length
+ max_request_length = (
+ window_service.max_request_length
+ if window_service.max_request_length != window_service.max_sequence_length
+ else None
+ )
+ max_sequence_and_generated_tokens_length = (
+ window_service.max_sequence_and_generated_tokens_length
+ if window_service.max_sequence_and_generated_tokens_length != _INT_MAX
+ else None
+ )
+
+ model_deployment = ModelDeployment(
+ name=model.name,
+ client_spec=ClientSpec(class_name=client_class_name),
+ tokenizer_name=tokenizer_name,
+ window_service_spec=WindowServiceSpec(class_name=window_service_class_name),
+ max_sequence_length=max_sequence_length,
+ max_request_length=max_request_length,
+ max_sequence_and_generated_tokens_length=max_sequence_and_generated_tokens_length,
+ )
+ tokenizer_config = TokenizerConfig(
+ name=tokenizer_name,
+ tokenizer_spec=TokenizerSpec(class_name=tokenizer_class_name),
+ end_of_text_token=end_of_text_token,
+ prefix_token=prefix_token,
+ )
+ # NOTE: To generate the _BUILT_IN_MODEL_DEPLOYMENT and _BUILT_IN_TOKENIZER_CONFIGS lists above,
+ # print tokenizer_config and model_deployment here.
+
+ assert model_deployments[model.name] == model_deployment
+ # PalmyraWindowService overrides the huggingface/gpt2 tokenizer with different special tokens,
+ # so there are currently two tokenizers named huggingface/gpt2
+ # TODO: Give PalmyraWindowService's tokenizer a different name e.g. writer/palmyra
+ if tokenizer_name != "huggingface/gpt2":
+ assert tokenizer_configs[tokenizer_name] == tokenizer_config
+
+ def test_num_models_available(self):
+ assert len(ALL_MODEL_DEPLOYMENTS) == 119
diff --git a/src/helm/benchmark/tokenizer_config_registry.py b/src/helm/benchmark/tokenizer_config_registry.py
index dda06f384ac..732cd38bd1e 100644
--- a/src/helm/benchmark/tokenizer_config_registry.py
+++ b/src/helm/benchmark/tokenizer_config_registry.py
@@ -1,15 +1,18 @@
import os
from typing import Dict, Optional, List
from dataclasses import dataclass
+import importlib_resources as resources
import cattrs
import yaml
from helm.common.hierarchical_logger import hlog
from helm.common.object_spec import ObjectSpec
+from helm.benchmark.model_metadata_registry import CONFIG_PACKAGE
-TOKENIEZR_CONFIGS_FILE = "tokenizer_configs.yaml"
+TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml"
+TOKENIZERS_REGISTERED: bool = False
class TokenizerSpec(ObjectSpec):
@@ -26,7 +29,11 @@ class TokenizerConfig:
tokenizer_spec: TokenizerSpec
"""Specification for instantiating the client for this tokenizer."""
- # TODO: Add `end_of_text_token`` and `prefix_token``
+ end_of_text_token: Optional[str] = None
+ """The end of text token."""
+
+ prefix_token: Optional[str] = None
+ """The prefix token."""
@dataclass(frozen=True)
@@ -34,11 +41,13 @@ class TokenizerConfigs:
tokenizer_configs: List[TokenizerConfig]
-_name_to_tokenizer_config: Dict[str, TokenizerConfig] = {}
+ALL_TOKENIZER_CONFIGS: List[TokenizerConfig] = []
+TOKENIZER_NAME_TO_CONFIG: Dict[str, TokenizerConfig] = {config.name: config for config in ALL_TOKENIZER_CONFIGS}
def register_tokenizer_config(tokenizer_config: TokenizerConfig) -> None:
- _name_to_tokenizer_config[tokenizer_config.name] = tokenizer_config
+ ALL_TOKENIZER_CONFIGS.append(tokenizer_config)
+ TOKENIZER_NAME_TO_CONFIG[tokenizer_config.name] = tokenizer_config
def register_tokenizer_configs_from_path(path: str) -> None:
@@ -50,11 +59,20 @@ def register_tokenizer_configs_from_path(path: str) -> None:
register_tokenizer_config(tokenizer_config)
-def maybe_register_tokenizer_configs_from_base_path(base_path: str) -> None:
- path = os.path.join(base_path, TOKENIEZR_CONFIGS_FILE)
+def maybe_register_tokenizer_configs_from_base_path(path: str) -> None:
+ """Register tokenizer configs from yaml file if the path exists."""
if os.path.exists(path):
register_tokenizer_configs_from_path(path)
def get_tokenizer_config(name: str) -> Optional[TokenizerConfig]:
- return _name_to_tokenizer_config.get(name)
+ register_tokenizers_if_not_already_registered()
+ return TOKENIZER_NAME_TO_CONFIG.get(name)
+
+
+def register_tokenizers_if_not_already_registered() -> None:
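+    """Register the packaged tokenizer configs (tokenizer_configs.yaml) lazily, at most once."""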
+ global TOKENIZERS_REGISTERED
+ if not TOKENIZERS_REGISTERED:
+ path: str = resources.files(CONFIG_PACKAGE).joinpath(TOKENIZER_CONFIGS_FILE)
+ maybe_register_tokenizer_configs_from_base_path(path)
+ TOKENIZERS_REGISTERED = True
diff --git a/src/helm/benchmark/vlm_run_specs.py b/src/helm/benchmark/vlm_run_specs.py
index 6f226a36584..034aa33aace 100644
--- a/src/helm/benchmark/vlm_run_specs.py
+++ b/src/helm/benchmark/vlm_run_specs.py
@@ -18,7 +18,6 @@ def get_vlm_generation_adapter_spec(
input_suffix: str = "",
output_prefix: str = "",
output_suffix: str = "",
- max_train_instances: int = 0,
max_tokens: int = 100,
stop_sequences: Optional[List[str]] = None,
) -> AdapterSpec:
@@ -31,7 +30,8 @@ def get_vlm_generation_adapter_spec(
output_prefix=output_prefix,
output_suffix=output_suffix,
instance_prefix="\n",
- max_train_instances=max_train_instances,
+ # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
+ max_train_instances=0,
num_outputs=1,
max_tokens=max_tokens,
stop_sequences=stop_sequences if stop_sequences is not None else [],
@@ -43,6 +43,33 @@ def get_vlm_generation_adapter_spec(
# VHELM run specs
+@run_spec_function("viz_wiz")
+def get_viz_wiz_spec() -> RunSpec:
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
+ )
+
+ # TODO: finalize the adapter spec parameters once we add more models
+ adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
+ input_prefix="User: ",
+ input_suffix="",
+ output_prefix="\nAssistant: ",
+ output_suffix="",
+ stop_sequences=[""],
+ )
+
+ metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+ run_spec_name: str = "viz_wiz"
+ return RunSpec(
+ name=run_spec_name,
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ metric_specs=metric_specs,
+ groups=[run_spec_name],
+ )
+
+
@run_spec_function("vqa")
def get_vqa_spec() -> RunSpec:
scenario_spec = ScenarioSpec(
@@ -55,7 +82,6 @@ def get_vqa_spec() -> RunSpec:
input_suffix="",
output_prefix="\nAssistant: ",
output_suffix="",
- max_train_instances=3,
stop_sequences=[""],
)
diff --git a/src/helm/benchmark/window_services/default_window_service.py b/src/helm/benchmark/window_services/default_window_service.py
index a643d2fcdd2..eddc056183f 100644
--- a/src/helm/benchmark/window_services/default_window_service.py
+++ b/src/helm/benchmark/window_services/default_window_service.py
@@ -10,11 +10,19 @@ def __init__(
tokenizer_name: str,
max_sequence_length: int,
max_request_length: Optional[int] = None,
+ end_of_text_token: Optional[str] = None,
+ prefix_token: Optional[str] = None,
):
super().__init__(service)
self._tokenizer_name = tokenizer_name
self._max_sequence_length = max_sequence_length
- self._max_request_length = max_request_length
+ self._max_request_length = max_request_length or max_sequence_length
+ self._end_of_text_token = end_of_text_token or ""
+ self._prefix_token = prefix_token or ""
+
+ @property
+ def tokenizer_name(self) -> str:
+ return self._tokenizer_name
@property
def max_sequence_length(self) -> int:
@@ -22,18 +30,12 @@ def max_sequence_length(self) -> int:
@property
def max_request_length(self) -> int:
- return self._max_request_length or self._max_sequence_length
+ return self._max_request_length
@property
def end_of_text_token(self) -> str:
- # TODO: Support this
- return ""
-
- @property
- def tokenizer_name(self) -> str:
- return self._tokenizer_name
+ return self._end_of_text_token
@property
def prefix_token(self) -> str:
- # TODO: Support this
- return ""
+ return self._prefix_token
diff --git a/src/helm/benchmark/window_services/http_model_window_service.py b/src/helm/benchmark/window_services/http_model_window_service.py
index dac3bb70fbb..d84308b370a 100644
--- a/src/helm/benchmark/window_services/http_model_window_service.py
+++ b/src/helm/benchmark/window_services/http_model_window_service.py
@@ -3,7 +3,7 @@
# TODO: Remove Once we have configurable model names since this hardcodes the tokenizer name
-class HTTPModelWindowServce(LocalWindowService):
+class HTTPModelWindowService(LocalWindowService):
def __init__(self, service: TokenizerService):
super().__init__(service)
diff --git a/src/helm/benchmark/window_services/huggingface_window_service.py b/src/helm/benchmark/window_services/huggingface_window_service.py
index d128c8b8974..3bb545e677f 100644
--- a/src/helm/benchmark/window_services/huggingface_window_service.py
+++ b/src/helm/benchmark/window_services/huggingface_window_service.py
@@ -12,7 +12,9 @@ def __init__(
pretrained_model_name_or_path: Optional[str] = None,
revision: Optional[str] = None,
max_sequence_length: Optional[int] = None,
- max_reqeust_length: Optional[int] = None,
+ max_request_length: Optional[int] = None,
+ end_of_text_token: Optional[str] = None,
+ prefix_token: Optional[str] = None,
):
super().__init__(service)
self._tokenizer_name = tokenizer_name
@@ -21,17 +23,21 @@ def __init__(
pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
revision=revision,
)
- self._prefix_token = tokenizer.bos_token
- self._end_of_text_token = tokenizer.eos_token
- # Override max_sequence_length if provided as an argument.
- # Otherwise, auto-infer max_sequence_length from the Hugging Face tokenizer.
+ # Override max_sequence_length, max_request_length, end_of_text_token
+ # and prefix_token if provided as an argument.
+ # Otherwise, auto-infer them from the Hugging Face tokenizer.
+ #
# Note that many Hugging Face tokenizers have incorrect sequence lengths,
# so it is recommended to set this manually.
- if max_sequence_length:
- self._max_sequence_length = max_sequence_length
- else:
- self._max_sequence_length = tokenizer.model_max_length
- self._max_request_length = max_reqeust_length
+ self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
+ self._max_request_length = max_request_length or self._max_sequence_length
+ self._end_of_text_token = end_of_text_token or tokenizer.eos_token or ""
+ self._prefix_token = prefix_token or tokenizer.bos_token or ""
+
+ @property
+ def tokenizer_name(self) -> str:
+ """Name of the tokenizer to use when sending a request."""
+ return self._tokenizer_name
@property
def max_sequence_length(self) -> int:
@@ -41,18 +47,13 @@ def max_sequence_length(self) -> int:
@property
def max_request_length(self) -> int:
"""Return the max request length of this tokenizer."""
- return self._max_request_length or self._max_sequence_length
+ return self._max_request_length
@property
def end_of_text_token(self) -> str:
"""The end of text token."""
return self._end_of_text_token
- @property
- def tokenizer_name(self) -> str:
- """Name of the tokenizer to use when sending a request."""
- return self._tokenizer_name
-
@property
def prefix_token(self) -> str:
"""The prefix token."""
diff --git a/src/helm/benchmark/window_services/lit_gpt_window_service.py b/src/helm/benchmark/window_services/lit_gpt_window_service.py
index 5deddd6a004..4d670a38e68 100644
--- a/src/helm/benchmark/window_services/lit_gpt_window_service.py
+++ b/src/helm/benchmark/window_services/lit_gpt_window_service.py
@@ -2,7 +2,7 @@
from .tokenizer_service import TokenizerService
-class LitGPTWindowServce(LocalWindowService):
+class LitGPTWindowService(LocalWindowService):
def __init__(self, service: TokenizerService):
super().__init__(service)
diff --git a/src/helm/benchmark/window_services/llama_window_service.py b/src/helm/benchmark/window_services/llama_window_service.py
index 7c54e3b03a2..586ce0d9702 100644
--- a/src/helm/benchmark/window_services/llama_window_service.py
+++ b/src/helm/benchmark/window_services/llama_window_service.py
@@ -21,8 +21,4 @@ class Llama2WindowService(HuggingFaceWindowService):
# meta-llama/Llama-2-70b-hf is not a local folder and is not a valid model identifier listed on
# 'https://huggingface.co/models'
def __init__(self, service: TokenizerService):
- super().__init__(service, "meta-llama/Llama-2-7b-hf")
-
- @property
- def max_sequence_length(self) -> int:
- return 4096
+ super().__init__(service, "meta-llama/Llama-2-7b-hf", max_sequence_length=4096)
diff --git a/src/helm/benchmark/window_services/remote_window_service.py b/src/helm/benchmark/window_services/remote_window_service.py
deleted file mode 100644
index e2a498265d5..00000000000
--- a/src/helm/benchmark/window_services/remote_window_service.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Dict, Type
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class RemoteWindowService(LocalWindowService):
- def __init__(self, service: TokenizerService, model_name: str):
- super().__init__(service)
- self.model_name = model_name
- info = self.service.get_info(model_name)
- self._tokenizer_name = info.tokenizer_name
- self._max_sequence_length = info.max_sequence_length
- self._max_request_length = info.max_request_length
- self._end_of_text_token = info.end_of_text_token
- self._prefix_token = info.prefix_token
-
- @property
- def max_sequence_length(self) -> int:
- return self._max_sequence_length
-
- @property
- def max_request_length(self) -> int:
- return self._max_request_length
-
- @property
- def end_of_text_token(self) -> str:
- return self._end_of_text_token
-
- @property
- def prefix_token(self) -> str:
- return self._prefix_token
-
- @property
- def tokenizer_name(self) -> str:
- """Name of the tokenizer to use when sending a request."""
- return self._tokenizer_name
-
-
-# If the windowing logic is different from the base LocalWindowService,
-# please add the specific implementation for the model and add it to the following dict.
-remote_window_services: Dict[str, Type[RemoteWindowService]] = {}
-
-
-def get_remote_window_service(service: TokenizerService, model_name: str):
- if model_name in remote_window_services:
- return remote_window_services[model_name](service, model_name)
- else:
- return RemoteWindowService(service, model_name)
diff --git a/src/helm/benchmark/window_services/window_service_factory.py b/src/helm/benchmark/window_services/window_service_factory.py
index a1abbaeba39..e15bf720167 100644
--- a/src/helm/benchmark/window_services/window_service_factory.py
+++ b/src/helm/benchmark/window_services/window_service_factory.py
@@ -1,41 +1,20 @@
-from helm.benchmark.model_deployment_registry import WindowServiceSpec, get_model_deployment
-from helm.proxy.models import (
- get_model,
- get_model_names_with_tag,
- Model,
- AI21_WIDER_CONTEXT_WINDOW_TAG,
- AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG,
- WIDER_CONTEXT_WINDOW_TAG,
- GPT_TURBO_CONTEXT_WINDOW_TAG,
- GPT_TURBO_16K_CONTEXT_WINDOW_TAG,
- GPT4_CONTEXT_WINDOW_TAG,
- GPT4_32K_CONTEXT_WINDOW_TAG,
-)
+from typing import Optional
-from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService
-from helm.benchmark.window_services.gpt2_window_service import GPT2WindowService
-from helm.benchmark.window_services.remote_window_service import get_remote_window_service
+from helm.benchmark.model_deployment_registry import ModelDeployment, WindowServiceSpec, get_model_deployment
+from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
from helm.benchmark.window_services.window_service import WindowService
from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from helm.proxy.clients.remote_model_registry import get_remote_model
from helm.common.object_spec import create_object, inject_object_spec_args
class WindowServiceFactory:
@staticmethod
- def get_window_service(model_name: str, service: TokenizerService) -> WindowService:
+ def get_window_service(model_deployment_name: str, service: TokenizerService) -> WindowService:
"""
Returns a `WindowService` given the name of the model.
Make sure this function returns instantaneously on repeated calls.
"""
- model: Model = get_model(model_name)
- organization: str = model.organization
- engine: str = model.engine
-
- window_service: WindowService
-
- # TODO: Migrate all window services to use use model deployments
- model_deployment = get_model_deployment(model_name)
+ model_deployment: Optional[ModelDeployment] = get_model_deployment(model_deployment_name)
if model_deployment:
# If the model deployment specifies a WindowServiceSpec, instantiate it.
window_service_spec: WindowServiceSpec
@@ -45,6 +24,16 @@ def get_window_service(model_name: str, service: TokenizerService) -> WindowServ
window_service_spec = WindowServiceSpec(
class_name="helm.benchmark.window_services.default_window_service.DefaultWindowService", args={}
)
+
+ # If provided, look up special tokens from TokenizerConfig.
+ end_of_text_token: Optional[str] = None
+ prefix_token: Optional[str] = None
+ if model_deployment.tokenizer_name:
+ tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(model_deployment.tokenizer_name)
+ if tokenizer_config:
+ end_of_text_token = tokenizer_config.end_of_text_token
+ prefix_token = tokenizer_config.prefix_token
+
# Perform dependency injection to fill in remaining arguments.
# Dependency injection is needed here for these reasons:
#
@@ -59,248 +48,10 @@ def get_window_service(model_name: str, service: TokenizerService) -> WindowServ
"tokenizer_name": model_deployment.tokenizer_name,
"max_sequence_length": model_deployment.max_sequence_length,
"max_request_length": model_deployment.max_request_length,
+ "end_of_text_token": end_of_text_token,
+ "prefix_token": prefix_token,
},
)
- window_service = create_object(window_service_spec)
- elif get_remote_model(model_name):
- window_service = get_remote_window_service(service, model_name)
- elif organization == "neurips":
- from helm.benchmark.window_services.http_model_window_service import HTTPModelWindowServce
-
- window_service = HTTPModelWindowServce(service)
- elif organization == "openai":
- from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
- from helm.benchmark.window_services.wider_openai_window_service import (
- WiderOpenAIWindowService,
- GPTTurboWindowService,
- GPTTurbo16KWindowService,
- GPT4WindowService,
- GPT432KWindowService,
- )
-
- if model_name in get_model_names_with_tag(GPT4_CONTEXT_WINDOW_TAG):
- window_service = GPT4WindowService(service)
- elif model_name in get_model_names_with_tag(GPT4_32K_CONTEXT_WINDOW_TAG):
- window_service = GPT432KWindowService(service)
- if model_name in get_model_names_with_tag(GPT_TURBO_CONTEXT_WINDOW_TAG):
- window_service = GPTTurboWindowService(service)
- elif model_name in get_model_names_with_tag(GPT_TURBO_16K_CONTEXT_WINDOW_TAG):
- window_service = GPTTurbo16KWindowService(service)
- elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG):
- window_service = WiderOpenAIWindowService(service)
- else:
- window_service = OpenAIWindowService(service)
- # For the Google models, we approximate with the OpenAIWindowService
- elif organization == "simple" or organization == "google":
- from helm.benchmark.window_services.openai_window_service import OpenAIWindowService
-
- window_service = OpenAIWindowService(service)
- elif organization == "AlephAlpha":
- from helm.benchmark.window_services.luminous_window_service import (
- LuminousBaseWindowService,
- LuminousExtendedWindowService,
- LuminousSupremeWindowService,
- LuminousWorldWindowService,
- )
-
- if engine == "luminous-base":
- window_service = LuminousBaseWindowService(service)
- elif engine == "luminous-extended":
- window_service = LuminousExtendedWindowService(service)
- elif engine == "luminous-supreme":
- window_service = LuminousSupremeWindowService(service)
- elif engine == "luminous-world":
- window_service = LuminousWorldWindowService(service)
- else:
- raise ValueError(f"Unhandled Aleph Alpha model: {engine}")
- elif organization == "microsoft":
- from helm.benchmark.window_services.mt_nlg_window_service import MTNLGWindowService
-
- window_service = MTNLGWindowService(service)
- elif organization == "anthropic":
- from helm.benchmark.window_services.anthropic_window_service import (
- AnthropicWindowService,
- LegacyAnthropicWindowService,
- )
-
- if engine == "stanford-online-all-v4-s3":
- window_service = LegacyAnthropicWindowService(service)
- else:
- window_service = AnthropicWindowService(service)
- elif organization == "writer":
- from helm.benchmark.window_services.palmyra_window_service import (
- PalmyraWindowService,
- LongerPalmyraWindowService,
- )
-
- if engine in ["palmyra-base", "palmyra-large", "palmyra-instruct-30", "palmyra-e"]:
- window_service = PalmyraWindowService(service)
- elif engine in ["palmyra-x", "silk-road"]:
- window_service = LongerPalmyraWindowService(service)
- else:
- raise ValueError(f"Unhandled Writer model: {engine}")
- elif engine == "santacoder":
- from helm.benchmark.window_services.santacoder_window_service import SantaCoderWindowService
-
- window_service = SantaCoderWindowService(service)
- elif engine == "starcoder":
- from helm.benchmark.window_services.starcoder_window_service import StarCoderWindowService
-
- window_service = StarCoderWindowService(service)
- elif model_name == "huggingface/gpt2":
- window_service = GPT2WindowService(service)
- elif model_name == "together/bloom":
- from helm.benchmark.window_services.bloom_window_service import BloomWindowService
-
- window_service = BloomWindowService(service)
- elif model_name == "together/glm":
- # From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on
- # icetk---a unified multimodal tokenizer for images, Chinese, and English."
- from helm.benchmark.window_services.ice_window_service import ICEWindowService
-
- window_service = ICEWindowService(service)
- elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "together/gpt-jt-6b-v1", "gooseai/gpt-j-6b"]:
- from helm.benchmark.window_services.gptj_window_service import GPTJWindowService
-
- window_service = GPTJWindowService(service)
- elif model_name in [
- "together/gpt-neox-20b",
- "gooseai/gpt-neo-20b",
- "together/gpt-neoxt-chat-base-20b",
- "together/redpajama-incite-base-3b-v1",
- "together/redpajama-incite-instruct-3b-v1",
- "together/redpajama-incite-base-7b",
- "together/redpajama-incite-instruct-7b",
- # Pythia uses the same tokenizer as GPT-NeoX-20B.
- # See: https://huggingface.co/EleutherAI/pythia-6.9b#training-procedure
- "eleutherai/pythia-1b-v0",
- "eleutherai/pythia-2.8b-v0",
- "eleutherai/pythia-6.9b",
- "eleutherai/pythia-12b-v0",
- # MPT-7B model was trained with the EleutherAI/gpt-neox-20b tokenizer
- # See: https://huggingface.co/mosaicml/mpt-7b
- "mosaicml/mpt-7b",
- "mosaicml/mpt-instruct-7b",
- "mosaicml/mpt-30b",
- "mosaicml/mpt-instruct-30b",
- # Dolly models are based on Pythia.
- # See: https://github.com/databrickslabs/dolly
- "databricks/dolly-v2-3b",
- "databricks/dolly-v2-7b",
- "databricks/dolly-v2-12b",
- ]:
- from helm.benchmark.window_services.gptneox_window_service import GPTNeoXWindowService
-
- window_service = GPTNeoXWindowService(service)
- elif model_name in [
- "tiiuae/falcon-7b",
- "tiiuae/falcon-7b-instruct",
- "tiiuae/falcon-40b",
- "tiiuae/falcon-40b-instruct",
- ]:
- window_service = HuggingFaceWindowService(service=service, tokenizer_name="tiiuae/falcon-7b")
- elif model_name in [
- "stabilityai/stablelm-base-alpha-3b",
- "stabilityai/stablelm-base-alpha-7b",
- ]:
- from helm.benchmark.window_services.gptneox_window_service import StableLMAlphaWindowService
-
- window_service = StableLMAlphaWindowService(service)
- elif model_name == "together/h3-2.7b":
- window_service = GPT2WindowService(service)
- elif model_name in [
- "together/opt-1.3b",
- "together/opt-6.7b",
- "together/opt-66b",
- "together/opt-175b",
- ]:
- from helm.benchmark.window_services.opt_window_service import OPTWindowService
-
- window_service = OPTWindowService(service)
- elif model_name == "together/t0pp":
- from helm.benchmark.window_services.t0pp_window_service import T0ppWindowService
-
- window_service = T0ppWindowService(service)
- elif model_name == "together/t5-11b":
- from helm.benchmark.window_services.t511b_window_service import T511bWindowService
-
- window_service = T511bWindowService(service)
- elif model_name == "together/flan-t5-xxl":
- from helm.benchmark.window_services.flan_t5_window_service import FlanT5WindowService
-
- window_service = FlanT5WindowService(service)
- elif model_name == "together/ul2":
- from helm.benchmark.window_services.ul2_window_service import UL2WindowService
-
- window_service = UL2WindowService(service)
- elif model_name == "together/yalm":
- from helm.benchmark.window_services.yalm_window_service import YaLMWindowService
-
- window_service = YaLMWindowService(service)
- elif model_name == "nvidia/megatron-gpt2":
- from helm.benchmark.window_services.megatron_window_service import MegatronWindowService
-
- window_service = MegatronWindowService(service)
- elif model_name in [
- "lmsys/vicuna-7b-v1.3",
- "lmsys/vicuna-13b-v1.3",
- "meta/llama-7b",
- "meta/llama-13b",
- "meta/llama-30b",
- "meta/llama-65b",
- "stanford/alpaca-7b",
- ]:
- from helm.benchmark.window_services.llama_window_service import LlamaWindowService
-
- window_service = LlamaWindowService(service)
- elif model_name in [
- "meta/llama-2-7b",
- "meta/llama-2-13b",
- "meta/llama-2-70b",
- ]:
- from helm.benchmark.window_services.llama_window_service import Llama2WindowService
-
- window_service = Llama2WindowService(service)
- elif organization == "cohere":
- from helm.benchmark.window_services.cohere_window_service import (
- CohereWindowService,
- CohereCommandWindowService,
- )
-
- if "command" in engine:
- window_service = CohereCommandWindowService(service)
- else:
- window_service = CohereWindowService(service)
- elif organization == "ai21":
- from helm.benchmark.window_services.wider_ai21_window_service import (
- WiderAI21WindowService,
- AI21Jurassic2JumboWindowService,
- )
- from helm.benchmark.window_services.ai21_window_service import AI21WindowService
-
- if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG):
- window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
- if model_name in get_model_names_with_tag(AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG):
- window_service = AI21Jurassic2JumboWindowService(
- service=service, gpt2_window_service=GPT2WindowService(service)
- )
- else:
- window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service))
-
- elif organization == "lightningai":
- from helm.benchmark.window_services.lit_gpt_window_service import LitGPTWindowServce
-
- window_service = LitGPTWindowServce(service)
- elif organization == "mistralai":
- window_service = HuggingFaceWindowService(service, tokenizer_name="mistralai/Mistral-7B-v0.1")
- elif model_name in [
- "HuggingFaceM4/idefics-9b",
- "HuggingFaceM4/idefics-9b-instruct",
- "HuggingFaceM4/idefics-80b",
- "HuggingFaceM4/idefics-80b-instruct",
- ]:
- window_service = HuggingFaceWindowService(service, model_name)
- else:
- raise ValueError(f"Unhandled model name: {model_name}")
+ return create_object(window_service_spec)
- return window_service
+ raise ValueError(f"Unhandled model deployment name: {model_deployment_name}")
diff --git a/src/helm/common/cache_utils.py b/src/helm/common/cache_utils.py
new file mode 100644
index 00000000000..0a2fa743eac
--- /dev/null
+++ b/src/helm/common/cache_utils.py
@@ -0,0 +1,14 @@
+"""Functions used for caching."""
+
+import os
+
+from helm.common.cache import CacheConfig, MongoCacheConfig, SqliteCacheConfig
+
+
+def build_cache_config(cache_path: str, mongo_uri: str, organization: str) -> CacheConfig:
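+    """Return a MongoCacheConfig when mongo_uri is set, otherwise a per-organization SQLite cache under cache_path."""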
+ if mongo_uri:
+ return MongoCacheConfig(mongo_uri, collection_name=organization)
+
+ client_cache_path: str = os.path.join(cache_path, f"{organization}.sqlite")
+ # TODO: Allow setting CacheConfig.follower_cache_path from a command line flag.
+ return SqliteCacheConfig(client_cache_path)
diff --git a/src/helm/common/credentials_utils.py b/src/helm/common/credentials_utils.py
new file mode 100644
index 00000000000..ba50db48d04
--- /dev/null
+++ b/src/helm/common/credentials_utils.py
@@ -0,0 +1,28 @@
+"""Functions used for credentials."""
+
+from typing import Any, Mapping, Optional
+
+from helm.common.hierarchical_logger import hlog
+
+
+def provide_api_key(
+ credentials: Mapping[str, Any], host_organization: str, model: Optional[str] = None
+) -> Optional[str]:
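+    """Return the API key for host_organization (or, as a fallback, for model under "deployments").
+    Looks up '<host_organization>ApiKey' first, then credentials["deployments"][model];
+    returns None (with a warning) if neither is present."""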
+ api_key_name = host_organization + "ApiKey"
+ if api_key_name in credentials:
+ hlog(f"Using host_organization api key defined in credentials.conf: {api_key_name}")
+ return credentials[api_key_name]
+ if "deployments" not in credentials:
+ hlog(
+ "WARNING: Could not find key 'deployments' in credentials.conf, "
+ f"therefore the API key {api_key_name} should be specified."
+ )
+ return None
+ deployment_api_keys = credentials["deployments"]
+ if model is None:
+ hlog(f"WARNING: Could not find key '{host_organization}' in credentials.conf and no model provided")
+ return None
+ if model not in deployment_api_keys:
+ hlog(f"WARNING: Could not find key '{model}' under key 'deployments' in credentials.conf")
+ return None
+ return deployment_api_keys[model]
diff --git a/src/helm/common/general.py b/src/helm/common/general.py
index 9961be1e393..5940b434630 100644
--- a/src/helm/common/general.py
+++ b/src/helm/common/general.py
@@ -7,7 +7,7 @@
import uuid
import zstandard
from typing import Any, Callable, Dict, List, Optional, TypeVar
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pyhocon
@@ -214,20 +214,14 @@ def match_case(source_word: str, target_word: str) -> str:
OutT = TypeVar("OutT")
-def parallel_map(
- process: Callable[[InT], OutT], items: List[InT], parallelism: int, multiprocessing: bool = False
-) -> List[OutT]:
+def parallel_map(process: Callable[[InT], OutT], items: List[InT], parallelism: int) -> List[OutT]:
"""
A wrapper for applying `process` to all `items`.
"""
- units = "processes" if multiprocessing else "threads"
- with htrack_block(f"Parallelizing computation on {len(items)} items over {parallelism} {units}"):
+ with htrack_block(f"Parallelizing computation on {len(items)} items over {parallelism} threads"):
results: List
if parallelism == 1:
results = list(tqdm(map(process, items), total=len(items), disable=None))
- elif multiprocessing:
- with ProcessPoolExecutor(max_workers=parallelism) as executor:
- results = list(tqdm(executor.map(process, items), total=len(items), disable=None))
else:
with ThreadPoolExecutor(max_workers=parallelism) as executor:
results = list(tqdm(executor.map(process, items), total=len(items), disable=None))
@@ -320,3 +314,20 @@ def safe_symlink(src: str, dest: str) -> None:
def is_url(location: str) -> bool:
"""Return True if `location` is a url. False otherwise."""
return urllib.parse.urlparse(location).scheme in ["http", "https"]
+
+
+def assert_is_str(val: Any) -> str:
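+    """Assert that val is a str and return it (narrowing the static type)."""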
+ assert isinstance(val, str)
+ return val
+
+
+def assert_is_str_list(val: Any) -> List[str]:
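+    """Assert that val is a list of str and return it (narrowing the static type)."""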
+ assert isinstance(val, list)
+ for v in val:
+ assert isinstance(v, str)
+ return val
+
+
+def assert_present(val: Optional[InT]) -> InT:
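+    """Assert that val is not None and return it (narrowing away the Optional)."""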
+ assert val is not None
+ return val
diff --git a/src/helm/common/object_spec.py b/src/helm/common/object_spec.py
index 8fab4489604..5669daeb33e 100644
--- a/src/helm/common/object_spec.py
+++ b/src/helm/common/object_spec.py
@@ -1,6 +1,6 @@
import importlib
import dataclasses
-from dataclasses import dataclass
+from dataclasses import dataclass, field
import inspect
from typing import Any, Callable, Dict, Optional, Tuple, Hashable, Type, TypeVar
@@ -13,7 +13,7 @@ class ObjectSpec:
class_name: str
# Arguments used to construct the scenario
- args: Dict[str, Any]
+ args: Dict[str, Any] = field(default_factory=dict)
def __hash__(self):
def get_arg_value(key: str) -> Any:
diff --git a/src/helm/common/request.py b/src/helm/common/request.py
index 6ca89fc0cb6..4acefd3690d 100644
--- a/src/helm/common/request.py
+++ b/src/helm/common/request.py
@@ -3,7 +3,6 @@
from typing import Any, Callable, Dict, List, Optional
from helm.common.media_object import MultimediaObject
-from helm.proxy.models import Model, get_model
from .general import indent_lines, format_text
@@ -15,8 +14,13 @@ class Request:
various APIs (e.g., GPT-3, Jurassic).
"""
- model: str = "openai/text-davinci-002"
- """Which model to query"""
+ model_deployment: str = ""
+ """Which model deployment to query -> Determines the Client.
+ Refers to a deployment in the model deployment registry."""
+
+ model: str = ""
+ """Which model to use -> Determines the Engine.
+ Refers to a model metadata in the model registry."""
embedding: bool = False
"""Whether to query embedding instead of text response"""
@@ -65,16 +69,23 @@ class Request:
"""Multimodal prompt with media objects interleaved (e.g., text, video, image, text, ...)"""
@property
- def model_organization(self) -> str:
- """Example: 'openai/davinci' => 'openai'"""
- model: Model = get_model(self.model)
- return model.organization
+ def model_host(self) -> str:
+ """Returns the model host (referring to the deployment).
+ Not to be confused with the model creator organization (referring to the model).
+ Example: 'openai/davinci' => 'openai'
+ 'together/bloom' => 'together'"""
+ return self.model_deployment.split("/")[0]
@property
def model_engine(self) -> str:
- """Example: 'openai/davinci' => 'davinci'"""
- model: Model = get_model(self.model)
- return model.engine
+ """Returns the model engine (referring to the model).
+ This is often the same as self.model_deploymentl.split("/")[1], but not always.
+ For example, one model could be served on several servers (each with a different model_deployment)
+ In that case we would have for example:
+ 'aws/bloom-1', 'aws/bloom-2', 'aws/bloom-3' => 'bloom'
+ This is why we need to keep track of the model engine with the model metadata.
+ Example: 'openai/davinci' => 'davinci'"""
+ return self.model.split("/")[1]
@dataclass(frozen=True)
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml
new file mode 100644
index 00000000000..f7699818a82
--- /dev/null
+++ b/src/helm/config/model_deployments.yaml
@@ -0,0 +1,1567 @@
+# This file defines all the model deployments that are supported by the Helm API.
+# Some models have several deployments, each with different parameters.
+
+# If you want to add a new deployment, you can technically do it here,
+# but we recommend doing it in private/model_deployments.yaml instead.
+
+model_deployments:
+
+ - name: simple/model1
+ model_name: simple/model1
+ tokenizer_name: simple/model1
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.simple_client.SimpleClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ # AI21 Labs
+
+ # J1 models are Deprecated by AI21 Labs
+ # API returns: Detail: Jurassic J1 models are deprecated
+ - name: ai21/j1-jumbo
+ deprecated: true
+ model_name: ai21/j1-jumbo
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j1-large
+ deprecated: true
+ model_name: ai21/j1-large
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j1-grande
+ deprecated: true
+ model_name: ai21/j1-grande
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j1-grande-v2-beta
+ deprecated: true
+ model_name: ai21/j1-grande-v2-beta
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j2-jumbo
+ model_name: ai21/j2-jumbo
+ tokenizer_name: ai21/j1
+ max_sequence_length: 6000
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_ai21_window_service.AI21Jurassic2JumboWindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j2-large
+ model_name: ai21/j2-large
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ - name: ai21/j2-grande
+ model_name: ai21/j2-grande
+ tokenizer_name: ai21/j1
+ max_sequence_length: 2047
+ client_spec:
+ class_name: "helm.proxy.clients.ai21_client.AI21Client"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService"
+ args:
+ gpt2_window_service:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+
+
+ # Aleph Alpha
+ - name: AlephAlpha/luminous-base
+ model_name: AlephAlpha/luminous-base
+ tokenizer_name: AlephAlpha/luminous-base
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.luminous_window_service.LuminousBaseWindowService"
+ args: {}
+
+ - name: AlephAlpha/luminous-extended
+ model_name: AlephAlpha/luminous-extended
+ tokenizer_name: AlephAlpha/luminous-extended
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.luminous_window_service.LuminousExtendedWindowService"
+ args: {}
+
+ - name: AlephAlpha/luminous-supreme
+ model_name: AlephAlpha/luminous-supreme
+ tokenizer_name: AlephAlpha/luminous-supreme
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.luminous_window_service.LuminousSupremeWindowService"
+ args: {}
+
+ # TODO: Add luminous-world once it is released.
+
+
+
+ # Anthropic
+ - name: anthropic/claude-v1.3
+ model_name: anthropic/claude-v1.3
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 8000
+ max_sequence_and_generated_tokens_length: 9016
+ client_spec:
+ class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ args: {}
+
+ - name: anthropic/claude-instant-v1
+ model_name: anthropic/claude-instant-v1
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 8000
+ max_sequence_and_generated_tokens_length: 9016
+ client_spec:
+ class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ args: {}
+
+ - name: anthropic/claude-2.0
+ model_name: anthropic/claude-2.0
+ tokenizer_name: anthropic/claude
+ max_sequence_length: 8000
+ max_sequence_and_generated_tokens_length: 9016
+ client_spec:
+ class_name: "helm.proxy.clients.anthropic_client.AnthropicClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService"
+ args: {}
+
+ - name: anthropic/stanford-online-all-v4-s3
+ deprecated: true # Closed model, not accessible via API
+ model_name: anthropic/stanford-online-all-v4-s3
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 8192
+ client_spec:
+ class_name: "helm.proxy.clients.anthropic_client.AnthropicLegacyClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService"
+ args: {}
+
+
+
+ # Cohere
+ - name: cohere/xlarge-20220609
+ model_name: cohere/xlarge-20220609
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/large-20220720
+ model_name: cohere/large-20220720
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/medium-20220720
+ model_name: cohere/medium-20220720
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/small-20220720
+ model_name: cohere/small-20220720
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/xlarge-20221108
+ model_name: cohere/xlarge-20221108
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/medium-20221108
+ model_name: cohere/medium-20221108
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService"
+ args: {}
+
+ - name: cohere/command-medium-beta
+ model_name: cohere/command-medium-beta
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2019
+ max_request_length: 2020
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ args: {}
+
+ - name: cohere/command-xlarge-beta
+ model_name: cohere/command-xlarge-beta
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2019
+ max_request_length: 2020
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ args: {}
+
+ - name: cohere/command
+ model_name: cohere/command
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2019 # TODO: verify this
+ max_request_length: 2020 # TODO: verify this
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ args: {}
+
+ - name: cohere/command-light
+ model_name: cohere/command-light
+ tokenizer_name: cohere/cohere
+ max_sequence_length: 2019 # TODO: verify this
+ max_request_length: 2020 # TODO: verify this
+ client_spec:
+ class_name: "helm.proxy.clients.cohere_client.CohereClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService"
+ args: {}
+
+
+
+ # Gooseai
+
+ ## EleutherAI
+ - name: gooseai/gpt-neo-20b
+ model_name: eleutherai/gpt-neox-20b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: gooseai/gpt-j-6b
+ model_name: eleutherai/gpt-j-6b
+ tokenizer_name: EleutherAI/gpt-j-6B
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ args: {}
+
+
+
+ # HuggingFace
+
+ ## Bigcode
+ - name: huggingface/santacoder
+ model_name: bigcode/santacoder
+ tokenizer_name: bigcode/santacoder
+ client_spec:
+ class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.santacoder_window_service.SantaCoderWindowService"
+ args: {}
+
+ - name: huggingface/starcoder
+ model_name: bigcode/starcoder
+ tokenizer_name: bigcode/starcoder
+ client_spec:
+ class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.starcoder_window_service.StarCoderWindowService"
+ args: {}
+
+ ## EleutherAI
+ - name: huggingface/gpt-j-6b
+ model_name: eleutherai/gpt-j-6b
+ tokenizer_name: EleutherAI/gpt-j-6B
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ args: {}
+
+ ## OpenAI
+ - name: huggingface/gpt2
+ model_name: openai/gpt2
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 1024
+ max_request_length: 1025
+ client_spec:
+ class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+
+
+ # HuggingFaceM4
+ - name: HuggingFaceM4/idefics-9b
+ model_name: huggingface/idefics-9b
+ tokenizer_name: HuggingFaceM4/idefics-9b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: HuggingFaceM4/idefics-9b-instruct
+ model_name: huggingface/idefics-9b-instruct
+ tokenizer_name: HuggingFaceM4/idefics-9b-instruct
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: HuggingFaceM4/idefics-80b
+ model_name: huggingface/idefics-80b
+ tokenizer_name: HuggingFaceM4/idefics-80b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: HuggingFaceM4/idefics-80b-instruct
+ model_name: huggingface/idefics-80b-instruct
+ tokenizer_name: HuggingFaceM4/idefics-80b-instruct
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+
+
+  # Lightning AI
+ - name: lightningai/lit-gpt
+ model_name: lightningai/lit-gpt
+ tokenizer_name: lightningai/lit-gpt
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.lit_gpt_client.LitGPTClient"
+ args:
+ checkpoint_dir: "" # Path to the checkpoint directory
+ precision: bf16-true
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService"
+ args: {}
+
+
+
+ # Microsoft
+ - name: microsoft/TNLGv2_530B
+ model_name: microsoft/TNLGv2_530B
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+ args: {}
+
+ - name: microsoft/TNLGv2_7B
+ model_name: microsoft/TNLGv2_7B
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2047
+ max_request_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService"
+ args: {}
+
+
+
+ # Neurips
+ - name: neurips/local
+ model_name: neurips/local
+ tokenizer_name: neurips/local
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.http_model_client.HTTPModelClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService"
+ args: {}
+
+
+
+ # Nvidia
+ - name: nvidia/megatron-gpt2
+ model_name: nvidia/megatron-gpt2
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 1024
+ client_spec:
+ class_name: "helm.proxy.clients.megatron_client.MegatronClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.megatron_window_service.MegatronWindowService"
+ args: {}
+
+
+
+ # OpenAI
+
+  ## GPT-3 Models
+ # The list of models can be found here: https://beta.openai.com/docs/engines/gpt-3
+ # DEPRECATED: Announced on July 06 2023 that these models will be shut down on January 04 2024.
+
+ - name: openai/davinci
+ deprecated: true
+ model_name: openai/davinci
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/curie
+ deprecated: true
+ model_name: openai/curie
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/babbage
+ deprecated: true
+ model_name: openai/babbage
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/ada
+ deprecated: true
+ model_name: openai/ada
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-davinci-003
+ deprecated: true
+ model_name: openai/text-davinci-003
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 4000
+ max_request_length: 4001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ args: {}
+
+ - name: openai/text-davinci-002
+ deprecated: true
+ model_name: openai/text-davinci-002
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 4000
+ max_request_length: 4001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ args: {}
+
+ - name: openai/text-davinci-001
+ deprecated: true
+ model_name: openai/text-davinci-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-curie-001
+ deprecated: true
+ model_name: openai/text-curie-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-babbage-001
+ deprecated: true
+ model_name: openai/text-babbage-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-ada-001
+ deprecated: true
+ model_name: openai/text-ada-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+
+ ## GPT 3.5 Turbo Models
+ # ChatGPT: https://openai.com/blog/chatgpt
+
+ # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
+ # sequence length is smaller at 4087 with one user input message and one assistant
+ # output message because ChatGPT uses special tokens for message roles and boundaries.
+ # We use a rounded-down sequence length of 4000 to account for these special tokens.
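+  # (Arithmetic: 4096 claimed - 4087 empirically usable = 9 tokens consumed by the special
+  # role/boundary tokens; rounding down to 4000 leaves extra headroom for them.)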
+ - name: openai/gpt-3.5-turbo-0301
+ model_name: openai/gpt-3.5-turbo-0301
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 4000
+ max_request_length: 4001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+ args: {}
+
+ # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
+ # sequence length is smaller at 4087 with one user input message and one assistant
+ # output message because ChatGPT uses special tokens for message roles and boundaries.
+ # We use a rounded-down sequence length of 4000 to account for these special tokens.
+ - name: openai/gpt-3.5-turbo-0613
+ model_name: openai/gpt-3.5-turbo-0613
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 4000
+ max_request_length: 4001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService"
+ args: {}
+
+ # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained
+ # in the openai/gpt-3.5-turbo-0613 comment
+ - name: openai/gpt-3.5-turbo-16k-0613
+ model_name: openai/gpt-3.5-turbo-16k-0613
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 16000
+ max_request_length: 16001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurbo16KWindowService"
+ args: {}
+
+
+ ## GPT 4 Models
+
+ - name: openai/gpt-4-0314
+ model_name: openai/gpt-4-0314
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 8192
+ max_request_length: 8193
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+ args: {}
+
+ - name: openai/gpt-4-32k-0314
+ model_name: openai/gpt-4-32k-0314
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 32768
+ max_request_length: 32769
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+ args: {}
+
+ - name: openai/gpt-4-0613
+ model_name: openai/gpt-4-0613
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 8192
+ max_request_length: 8193
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService"
+ args: {}
+
+ - name: openai/gpt-4-32k-0613
+ model_name: openai/gpt-4-32k-0613
+ tokenizer_name: openai/cl100k_base
+ max_sequence_length: 32768
+ max_request_length: 32769
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService"
+ args: {}
+
+
+ ## Codex Models
+ # DEPRECATED: Codex models have been shut down on March 23 2023.
+
+ - name: openai/code-davinci-002
+ deprecated: true
+ model_name: openai/code-davinci-002
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 4000
+ max_request_length: 4001
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService"
+ args: {}
+
+ - name: openai/code-davinci-001
+ deprecated: true
+ model_name: openai/code-davinci-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/code-cushman-001
+ deprecated: true
+ model_name: openai/code-cushman-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+
+ ## Text Similarity Models
+ # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
+ # The number of parameters is guessed based on the number of parameters of the
+ # corresponding GPT-3 model.
+ # DEPRECATED: Announced on July 06 2023 that first generation embeddings models
+ # will be shut down on January 04 2024.
+
+ - name: openai/text-similarity-davinci-001
+ deprecated: true
+ model_name: openai/text-similarity-davinci-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-similarity-curie-001
+ deprecated: true
+ model_name: openai/text-similarity-curie-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-similarity-babbage-001
+ deprecated: true
+ model_name: openai/text-similarity-babbage-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ - name: openai/text-similarity-ada-001
+ deprecated: true
+ model_name: openai/text-similarity-ada-001
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+ # As of 2023-11-07, text-embedding-ada-002 is not deprecated:
+ # "We recommend using text-embedding-ada-002 for nearly all use cases."
+ # Source: https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
+ - name: openai/text-embedding-ada-002
+ model_name: openai/text-embedding-ada-002
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.openai_client.OpenAIClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService"
+ args: {}
+
+
+
+ # Together
+  # The list of models served by Together changes often; to check the latest list, visit:
+ # https://docs.together.ai/docs/inference-models
+  # You can also use the playground to verify that the live models are working:
+ # https://api.together.xyz/playground
+
+ ## BigScience
+ - name: together/bloom
+ deprecated: true # Removed from together
+ model_name: bigscience/bloom
+ tokenizer_name: bigscience/bloom
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.bloom_window_service.BloomWindowService"
+ args: {}
+
+ - name: together/t0pp
+ deprecated: true # Removed from together
+ model_name: bigscience/t0pp
+ tokenizer_name: bigscience/T0pp
+ max_sequence_length: 1024
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.t0pp_window_service.T0ppWindowService"
+ args: {}
+
+ ## Databricks
+ - name: together/dolly-v2-3b
+ model_name: databricks/dolly-v2-3b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/dolly-v2-7b
+ model_name: databricks/dolly-v2-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/dolly-v2-12b
+ model_name: databricks/dolly-v2-12b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ ## EleutherAI
+ - name: together/gpt-j-6b
+ deprecated: true # Removed from together
+ model_name: eleutherai/gpt-j-6b
+ tokenizer_name: EleutherAI/gpt-j-6B
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ args: {}
+
+ - name: together/gpt-neox-20b
+ deprecated: true # Removed from together
+ model_name: eleutherai/gpt-neox-20b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/pythia-1b-v0
+ model_name: eleutherai/pythia-1b-v0
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/pythia-2.8b-v0
+ model_name: eleutherai/pythia-2.8b-v0
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/pythia-6.9b
+ model_name: eleutherai/pythia-6.9b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/pythia-12b-v0
+ model_name: eleutherai/pythia-12b-v0
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ ## Google
+ - name: together/t5-11b
+ deprecated: true # Removed from together
+ model_name: google/t5-11b
+ tokenizer_name: google/t5-11b
+ max_sequence_length: 511
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.t511b_window_service.T511bWindowService"
+ args: {}
+
+ - name: together/flan-t5-xxl
+ deprecated: true # Removed from together
+ model_name: google/flan-t5-xxl
+ tokenizer_name: google/flan-t5-xxl
+ max_sequence_length: 511
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.flan_t5_window_service.FlanT5WindowService"
+ args: {}
+
+ - name: together/ul2
+ deprecated: true # Removed from together
+ model_name: google/ul2
+ tokenizer_name: google/ul2
+ max_sequence_length: 511
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ul2_window_service.UL2WindowService"
+ args: {}
+
+ ## HazyResearch
+ - name: together/h3-2.7b
+    deprecated: true # Not available on Together yet
+ model_name: hazyresearch/h3-2.7b
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 1024
+ max_request_length: 1025
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService"
+ args: {}
+
+ ## LMSYS
+ # TODO: might be deprecated. Needs to be checked.
+  # Together officially supports Vicuna 1.5; it is unclear whether 1.3 is still supported.
+ - name: together/vicuna-7b-v1.3
+ model_name: lmsys/vicuna-7b-v1.3
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ - name: together/vicuna-13b-v1.3
+ model_name: lmsys/vicuna-13b-v1.3
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ ## Meta
+ - name: together/llama-7b
+ model_name: meta/llama-7b
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ - name: together/llama-13b
+ model_name: meta/llama-13b
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ - name: together/llama-30b
+ model_name: meta/llama-30b
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ - name: together/llama-65b
+ model_name: meta/llama-65b
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ - name: together/llama-2-7b
+ model_name: meta/llama-2-7b
+ tokenizer_name: meta-llama/Llama-2-7b-hf
+ max_sequence_length: 4096
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ args: {}
+
+ - name: together/llama-2-13b
+ model_name: meta/llama-2-13b
+ tokenizer_name: meta-llama/Llama-2-7b-hf
+ max_sequence_length: 4096
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ args: {}
+
+ - name: together/llama-2-70b
+ model_name: meta/llama-2-70b
+ tokenizer_name: meta-llama/Llama-2-7b-hf
+ max_sequence_length: 4096
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService"
+ args: {}
+
+ - name: together/opt-175b
+ deprecated: true # Not available on Together yet
+ model_name: meta/opt-175b
+ tokenizer_name: facebook/opt-66b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ args: {}
+
+ - name: together/opt-66b
+ deprecated: true # Not available on Together yet
+ model_name: meta/opt-66b
+ tokenizer_name: facebook/opt-66b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ args: {}
+
+ - name: together/opt-6.7b
+ deprecated: true # Not available on Together yet
+ model_name: meta/opt-6.7b
+ tokenizer_name: facebook/opt-66b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ args: {}
+
+ - name: together/opt-1.3b
+ deprecated: true # Not available on Together yet
+ model_name: meta/opt-1.3b
+ tokenizer_name: facebook/opt-66b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService"
+ args: {}
+
+ ## MistralAI
+ - name: together/mistral-7b-v0.1
+ model_name: mistralai/mistral-7b-v0.1
+ tokenizer_name: mistralai/Mistral-7B-v0.1
+ max_sequence_length: 4095
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ ## MosaicML
+ - name: together/mpt-7b
+ deprecated: true # Not available on Together yet
+ model_name: mosaicml/mpt-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/mpt-instruct-7b
+ deprecated: true # Not available on Together yet
+ model_name: mosaicml/mpt-instruct-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/mpt-30b
+ model_name: mosaicml/mpt-30b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/mpt-instruct-30b
+ model_name: mosaicml/mpt-instruct-30b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ ## StabilityAI
+ - name: together/stablelm-base-alpha-3b
+ deprecated: true # Removed from together
+ model_name: stabilityai/stablelm-base-alpha-3b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 4096
+ max_request_length: 4097
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+ args: {}
+
+ - name: together/stablelm-base-alpha-7b
+ deprecated: true # Removed from together
+ model_name: stabilityai/stablelm-base-alpha-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 4096
+ max_request_length: 4097
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService"
+ args: {}
+
+ ## Stanford
+ - name: together/alpaca-7b
+ model_name: stanford/alpaca-7b
+ tokenizer_name: hf-internal-testing/llama-tokenizer
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService"
+ args: {}
+
+ ## Tiiuae
+ - name: together/falcon-7b
+ model_name: tiiuae/falcon-7b
+ tokenizer_name: tiiuae/falcon-7b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: together/falcon-7b-instruct
+ model_name: tiiuae/falcon-7b-instruct
+ tokenizer_name: tiiuae/falcon-7b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: together/falcon-40b
+ model_name: tiiuae/falcon-40b
+ tokenizer_name: tiiuae/falcon-7b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ - name: together/falcon-40b-instruct
+ model_name: tiiuae/falcon-40b-instruct
+ tokenizer_name: tiiuae/falcon-7b
+ max_sequence_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+ args: {}
+
+ ## Together
+ # These are models fine-tuned by Together (and not simply hosted by Together).
+ - name: together/gpt-jt-6b-v1
+ model_name: together/gpt-jt-6b-v1
+ tokenizer_name: EleutherAI/gpt-j-6B
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService"
+ args: {}
+
+ - name: together/gpt-neoxt-chat-base-20b
+ model_name: together/gpt-neoxt-chat-base-20b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/redpajama-incite-base-3b-v1
+ model_name: together/redpajama-incite-base-3b-v1
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/redpajama-incite-instruct-3b-v1
+ model_name: together/redpajama-incite-instruct-3b-v1
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/redpajama-incite-base-7b
+ model_name: together/redpajama-incite-base-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ - name: together/redpajama-incite-instruct-7b
+ model_name: together/redpajama-incite-instruct-7b
+ tokenizer_name: EleutherAI/gpt-neox-20b
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService"
+ args: {}
+
+ ## Tsinghua
+ - name: together/glm
+ deprecated: true # Not available on Together yet
+ model_name: tsinghua/glm
+ tokenizer_name: TsinghuaKEG/ice
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.ice_window_service.ICEWindowService"
+ args: {}
+
+ ## Yandex
+ - name: together/yalm
+ deprecated: true # Not available on Together yet
+ model_name: yandex/yalm
+ tokenizer_name: Yandex/yalm
+ max_sequence_length: 2048
+ max_request_length: 2049
+ client_spec:
+ class_name: "helm.proxy.clients.together_client.TogetherClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.yalm_window_service.YaLMWindowService"
+ args: {}
+
+
+
+ # Writer
+ - name: writer/palmyra-base
+ model_name: writer/palmyra-base
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_sequence_and_generated_tokens_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ args: {}
+
+ - name: writer/palmyra-large
+ model_name: writer/palmyra-large
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_sequence_and_generated_tokens_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ args: {}
+
+ - name: writer/palmyra-instruct-30
+ model_name: writer/palmyra-instruct-30
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_sequence_and_generated_tokens_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ args: {}
+
+ - name: writer/palmyra-e
+ model_name: writer/palmyra-e
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 2048
+ max_sequence_and_generated_tokens_length: 2048
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService"
+ args: {}
+
+ - name: writer/silk-road
+ model_name: writer/silk-road
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 8192
+ max_sequence_and_generated_tokens_length: 8192
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+ args: {}
+
+ - name: writer/palmyra-x
+ model_name: writer/palmyra-x
+ tokenizer_name: huggingface/gpt2
+ max_sequence_length: 8192
+ max_sequence_and_generated_tokens_length: 8192
+ client_spec:
+ class_name: "helm.proxy.clients.palmyra_client.PalmyraClient"
+ args: {}
+ window_service_spec:
+ class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService"
+ args: {}
\ No newline at end of file
diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
new file mode 100644
index 00000000000..e9c097ea165
--- /dev/null
+++ b/src/helm/config/model_metadata.yaml
@@ -0,0 +1,1351 @@
+# This file defines all the models officially supported by the Helm API.
+# The model names here should match the model names in model_deployments.yaml.
+
+# If you want to add a new model, you can technically do it here, but we recommend
+# adding it to private/model_metadata.yaml instead.
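+# For illustration, a minimal entry there could look like the sketch below
+# (hypothetical model; it uses the same fields as the entries in this file):
+#
+#  - name: your-organization/your-model
+#    display_name: Your Model (7B)
+#    description: One-line description of the model.
+#    creator_organization_name: Your Organization
+#    access: open
+#    num_parameters: 7000000000
+#    release_date: 2024-01-01
+#    tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]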
+
+models:
+
+ - name: simple/model1
+ display_name: Simple Model 1
+ description: This is a test model.
+ creator_organization_name: Helm
+ access: open
+ release_date: 2023-01-01
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ # AI21 Labs
+ - name: ai21/j1-jumbo # DEPRECATED
+ display_name: J1-Jumbo v1 (178B)
+ description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 178000000000
+ release_date: 2021-08-11
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j1-large # DEPRECATED
+ display_name: J1-Large v1 (7.5B)
+ description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 7500000000
+ release_date: 2021-08-11
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j1-grande # DEPRECATED
+ display_name: J1-Grande v1 (17B)
+ description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)).
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 17000000000
+ release_date: 2022-05-03
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j1-grande-v2-beta # DEPRECATED
+ display_name: J1-Grande v2 beta (17B)
+ description: Jurassic-1 Grande v2 beta (17B parameters)
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 17000000000
+ release_date: 2022-10-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j2-jumbo
+ display_name: Jurassic-2 Jumbo (178B)
+ description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 178000000000
+ release_date: 2023-03-09
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j2-large
+ display_name: Jurassic-2 Large (7.5B)
+ description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 7500000000
+ release_date: 2023-03-09
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: ai21/j2-grande
+ display_name: Jurassic-2 Grande (17B)
+ description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2))
+ creator_organization_name: AI21 Labs
+ access: limited
+ num_parameters: 17000000000
+ release_date: 2023-03-09
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ # TODO(1524): Change AI21 model names
+ # - j2-jumbo -> j2-ultra
+ # - j2-grande -> j2-mid
+ # - j2-large -> j2-light
+
+
+
+ # Aleph Alpha
+ # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
+ # TODO: add Luminous World when it's released
+ - name: AlephAlpha/luminous-base
+ display_name: Luminous Base (13B)
+ description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+ creator_organization_name: Aleph Alpha
+ access: limited
+ num_parameters: 13000000000
+ # TODO: get exact release date
+ release_date: 2022-01-01
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: AlephAlpha/luminous-extended
+ display_name: Luminous Extended (30B)
+ description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+ creator_organization_name: Aleph Alpha
+ access: limited
+ num_parameters: 30000000000
+ release_date: 2022-01-01
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: AlephAlpha/luminous-supreme
+ display_name: Luminous Supreme (70B)
+ description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+ creator_organization_name: Aleph Alpha
+ access: limited
+ num_parameters: 70000000000
+ release_date: 2022-01-01
+ # Does not support echo.
+ # TODO: images will be supported in the near future. Add IMAGE_MODEL_TAG.
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ # TODO: Uncomment when luminous-world is released.
+ # - name: AlephAlpha/luminous-world # Not released yet.
+ # display_name: Luminous World (178B)
+ # description: Luminous World (178B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/))
+ # creator_organization_name: Aleph Alpha
+ # access: limited
+ # num_parameters: TBD
+ # release_date: TBD
+ # # Does not support echo.
+ # tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Anthropic
+ - name: anthropic/claude-v1.3
+ display_name: Anthropic Claude v1.3
+ description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
+ creator_organization_name: Anthropic
+ access: limited
+ num_parameters: 52000000000
+ release_date: 2023-03-17
+ tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: anthropic/claude-instant-v1
+ display_name: Anthropic Claude Instant V1
+ description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)).
+ creator_organization_name: Anthropic
+ access: limited
+ release_date: 2023-03-17
+ tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: anthropic/claude-2.0
+ display_name: Anthropic Claude 2.0
+ description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf))
+ creator_organization_name: Anthropic
+ access: limited
+ release_date: 2023-07-11
+ tags: [ANTHROPIC_CLAUDE_2_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ # DEPRECATED: Please do not use.
+ - name: anthropic/stanford-online-all-v4-s3
+ display_name: Anthropic-LM v4-s3 (52B)
+ description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf).
+ creator_organization_name: Anthropic
+ access: closed
+ num_parameters: 52000000000
+ release_date: 2021-12-01
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+
+
+ # Berkeley
+ - name: berkeley/koala-13b # NOT SUPPORTED
+ display_name: Koala (13B)
+ description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. ([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/))
+ creator_organization_name: UC Berkeley
+ access: open
+ num_parameters: 13000000000
+    release_date: 2023-04-03
+ tags: [] # TODO: add tags
+
+
+
+ # BigScience
+ - name: bigscience/bloom
+ display_name: BLOOM (176B)
+ description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)).
+ creator_organization_name: BigScience
+ access: open
+ num_parameters: 176000000000
+ release_date: 2022-06-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+ - name: bigscience/bloomz # NOT SUPPORTED
+ display_name: BLOOMZ (176B)
+ description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)).
+ creator_organization_name: BigScience
+ access: open
+ num_parameters: 176000000000
+ release_date: 2022-11-03
+ tags: [] # TODO: add tags
+
+ - name: bigscience/t0pp
+ display_name: T0pp (11B)
+ description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)).
+ creator_organization_name: BigScience
+ access: open
+ num_parameters: 11000000000
+ release_date: 2021-10-15
+ # Does not support echo.
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG]
+
+
+
+ # BigCode
+ - name: bigcode/santacoder
+ display_name: SantaCoder (1.1B)
+ description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)).
+ creator_organization_name: BigCode
+ access: open
+ num_parameters: 1100000000
+ release_date: 2023-01-09 # ArXiv submission date
+ tags: [CODE_MODEL_TAG]
+
+ - name: bigcode/starcoder
+ display_name: StarCoder (15.5B)
+ description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)).
+ creator_organization_name: BigCode
+ access: open
+ num_parameters: 15500000000
+ release_date: 2023-05-09 # ArXiv submission date
+ tags: [CODE_MODEL_TAG]
+
+
+
+ # Cerebras Systems
+ - name: cerebras/cerebras-gpt-6.7b # NOT SUPPORTED
+ display_name: Cerebras GPT (6.7B)
+ description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
+ creator_organization_name: Cerebras
+ access: limited
+ num_parameters: 6700000000
+ release_date: 2023-04-06
+ tags: [] # TODO: add tags
+
+ - name: cerebras/cerebras-gpt-13b # NOT SUPPORTED
+ display_name: Cerebras GPT (13B)
+ description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf))
+ creator_organization_name: Cerebras
+ access: limited
+ num_parameters: 13000000000
+ release_date: 2023-04-06
+ tags: [] # TODO: add tags
+
+
+
+ # Cohere
+ # Model versioning and the possible versions are not documented here:
+ # https://docs.cohere.ai/generate-reference#model-optional.
+ # So, instead, we got the names of the models from the Cohere Playground.
+ #
+ # Note that their tokenizer and model were trained on English text and
+ # they do not have a dedicated decode API endpoint, so the adaptation
+ # step for language modeling fails for certain Scenarios:
+ # the_pile:subset=ArXiv
+ # the_pile:subset=Github
+ # the_pile:subset=PubMed Central
+
+ # TODO: Consider renaming to new model names.
+ - name: cohere/xlarge-20220609
+ display_name: Cohere xlarge v20220609 (52.4B)
+ description: Cohere xlarge v20220609 (52.4B parameters)
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 52400000000
+ release_date: 2022-06-09
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/large-20220720 # DEPRECATED
+ display_name: Cohere large v20220720 (13.1B)
+ description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022.
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 13100000000
+ release_date: 2022-07-20
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/medium-20220720
+ display_name: Cohere medium v20220720 (6.1B)
+ description: Cohere medium v20220720 (6.1B parameters)
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 6100000000
+ release_date: 2022-07-20
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/small-20220720 # DEPRECATED
+ display_name: Cohere small v20220720 (410M)
+ description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022.
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 410000000
+ release_date: 2022-07-20
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/xlarge-20221108
+ display_name: Cohere xlarge v20221108 (52.4B)
+ description: Cohere xlarge v20221108 (52.4B parameters)
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 52400000000
+ release_date: 2022-11-08
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/medium-20221108 # DEPRECATED
+ display_name: Cohere medium v20221108 (6.1B)
+ description: Cohere medium v20221108 (6.1B parameters)
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 6100000000
+ release_date: 2022-11-08
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: cohere/command-medium-beta # DEPRECATED
+ display_name: Cohere Command beta (6.1B)
+    description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well to instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 6100000000
+ release_date: 2022-11-08
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: cohere/command-xlarge-beta # DEPRECATED
+ display_name: Cohere Command beta (52.4B)
+    description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well to instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)).
+ creator_organization_name: Cohere
+ access: limited
+ num_parameters: 52400000000
+ release_date: 2022-11-08
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ # TODO: Fill in the details.
+ - name: cohere/command
+ display_name: Cohere Command TODO
+ description: Cohere Command TODO
+ creator_organization_name: Cohere
+ access: limited
+ release_date: 2022-11-08 # TODO
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: cohere/command-light
+ display_name: Cohere Command TODO
+ description: Cohere Command TODO
+ creator_organization_name: Cohere
+ access: limited
+ release_date: 2022-11-08 # TODO
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+
+ # Databricks
+ - name: databricks/dolly-v2-3b
+ display_name: Dolly V2 (3B)
+    description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-2.8b.
+ creator_organization_name: Databricks
+ access: open
+ num_parameters: 2517652480
+ release_date: 2023-04-12
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: databricks/dolly-v2-7b
+ display_name: Dolly V2 (7B)
+    description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-6.9b.
+ creator_organization_name: Databricks
+ access: open
+ num_parameters: 6444163072
+ release_date: 2023-04-12
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: databricks/dolly-v2-12b
+ display_name: Dolly V2 (12B)
+ description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b.
+ creator_organization_name: Databricks
+ access: open
+ num_parameters: 11327027200
+ release_date: 2023-04-12
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # DeepMind
+ - name: deepmind/gopher # NOT SUPPORTED
+ display_name: Gopher (280B)
+ description: Gopher (280B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)).
+ creator_organization_name: DeepMind
+ access: closed
+ num_parameters: 280000000000
+ release_date: 2021-12-08
+ tags: [] # TODO: add tags
+
+ - name: deepmind/chinchilla # NOT SUPPORTED
+ display_name: Chinchilla (70B)
+ description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)).
+ creator_organization_name: DeepMind
+ access: closed
+ num_parameters: 70000000000
+ release_date: 2022-03-31
+ tags: [] # TODO: add tags
+
+
+
+ # EleutherAI
+ - name: eleutherai/gpt-j-6b # Served by GooseAi, HuggingFace and Together.
+ display_name: GPT-J (6B)
+ description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)).
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 6000000000
+ release_date: 2021-06-04
+ # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together).
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG]
+
+ - name: eleutherai/gpt-neox-20b # Served by GooseAi and Together.
+ display_name: GPT-NeoX (20B)
+ description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)).
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 20000000000
+ release_date: 2022-02-02
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+ - name: eleutherai/pythia-1b-v0
+ display_name: Pythia (1B)
+ description: Pythia (1B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 805736448
+ release_date: 2023-02-13
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: eleutherai/pythia-2.8b-v0
+ display_name: Pythia (2.8B)
+ description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 2517652480
+ release_date: 2023-02-13
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: eleutherai/pythia-6.9b
+ display_name: Pythia (6.9B)
+ description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 6444163072
+ release_date: 2023-02-13
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: eleutherai/pythia-12b-v0
+ display_name: Pythia (12B)
+ description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers.
+ creator_organization_name: EleutherAI
+ access: open
+ num_parameters: 11327027200
+ release_date: 2023-02-13
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Google
+ - name: google/t5-11b
+ display_name: T5 (11B)
+ description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)).
+ creator_organization_name: Google
+ access: open
+ num_parameters: 11000000000
+ release_date: 2019-10-23
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG]
+
+ - name: google/ul2
+ display_name: UL2 (20B)
+ description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)).
+ creator_organization_name: Google
+ access: open
+ num_parameters: 20000000000
+ release_date: 2022-05-10
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG, NLG_PREFIX_TAG]
+
+ - name: google/flan-t5-xxl
+ display_name: Flan-T5 (11B)
+ description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)).
+ creator_organization_name: Google
+ access: open
+ num_parameters: 11000000000
+ release_date: 2022-12-06 # Paper date
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: google/palm # NOT SUPPORTED
+ display_name: PaLM (540B)
+ description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)).
+ creator_organization_name: Google
+ access: closed
+ num_parameters: 540000000000
+    release_date: 2023-03-01 # First announced in April 2022, but remained private until then.
+ tags: [] # TODO: add tags
+
+
+
+ # HazyResearch
+ - name: hazyresearch/h3-2.7b
+ display_name: H3 (2.7B)
+ description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)).
+ creator_organization_name: HazyResearch
+ access: open
+ num_parameters: 2700000000
+ release_date: 2023-01-23
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # HuggingFace
+ - name: huggingface/idefics-9b
+ display_name: IDEFICS (9B)
+ description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+ creator_organization_name: HuggingFace
+ access: open
+ num_parameters: 9000000000
+ release_date: 2023-08-22
+ tags: [VISION_LANGUAGE_MODEL_TAG]
+
+ - name: huggingface/idefics-9b-instruct
+ display_name: IDEFICS instruct (9B)
+ description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+ creator_organization_name: HuggingFace
+ access: open
+ num_parameters: 9000000000
+ release_date: 2023-08-22
+ tags: [VISION_LANGUAGE_MODEL_TAG]
+
+ - name: huggingface/idefics-80b
+ display_name: IDEFICS (80B)
+ description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+ creator_organization_name: HuggingFace
+ access: open
+ num_parameters: 80000000000
+ release_date: 2023-08-22
+ tags: [VISION_LANGUAGE_MODEL_TAG]
+
+ - name: huggingface/idefics-80b-instruct
+ display_name: IDEFICS instruct (80B)
+ description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics))
+ creator_organization_name: HuggingFace
+ access: open
+ num_parameters: 80000000000
+ release_date: 2023-08-22
+ tags: [VISION_LANGUAGE_MODEL_TAG]
+
+
+
+ # Lightning AI
+ - name: lightningai/lit-gpt
+ display_name: Lit-GPT
+    description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models.
+ creator_organization_name: Lightning AI
+ access: open
+ release_date: 2023-04-04
+ tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # LMSYS
+ - name: lmsys/vicuna-7b-v1.3
+ display_name: Vicuna v1.3 (7B)
+ description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
+ creator_organization_name: LMSYS
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-06-22
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: lmsys/vicuna-13b-v1.3
+ display_name: Vicuna v1.3 (13B)
+ description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT.
+ creator_organization_name: LMSYS
+ access: open
+ num_parameters: 13000000000
+ release_date: 2023-06-22
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+
+ # Meta
+ - name: meta/opt-iml-175b # NOT SUPPORTED
+ display_name: OPT-IML (175B)
+ description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 175000000000
+ release_date: 2022-12-22
+ tags: [] # TODO: add tags
+
+ - name: meta/opt-iml-30b # NOT SUPPORTED
+ display_name: OPT-IML (30B)
+ description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 30000000000
+ release_date: 2022-12-22
+ tags: [] # TODO: add tags
+
+ - name: meta/opt-175b
+ display_name: OPT (175B)
+ description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 175000000000
+ release_date: 2022-05-02
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+ - name: meta/opt-66b
+ display_name: OPT (66B)
+ description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 66000000000
+ release_date: 2022-05-02
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
+
+ - name: meta/opt-6.7b
+ display_name: OPT (6.7B)
+ description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 6700000000
+ release_date: 2022-05-02
+ # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together).
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG]
+
+ - name: meta/opt-1.3b
+ display_name: OPT (1.3B)
+ description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 1300000000
+ release_date: 2022-05-02
+ # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together).
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG]
+
+ - name: meta/galactica-120b # NOT SUPPORTED
+ display_name: Galactica (120B)
+ description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 120000000000
+ release_date: 2022-11-15
+ tags: [] # TODO: add tags
+
+ - name: meta/galactica-30b # NOT SUPPORTED
+ display_name: Galactica (30B)
+ description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 30000000000
+ release_date: 2022-11-15
+ tags: [] # TODO: add tags
+
+ - name: meta/llama-7b
+ display_name: LLaMA (7B)
+ description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-02-24
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-13b
+ display_name: LLaMA (13B)
+ description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 13000000000
+ release_date: 2023-02-24
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-30b
+ display_name: LLaMA (30B)
+ description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 30000000000
+ release_date: 2023-02-24
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-65b
+ display_name: LLaMA (65B)
+ description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 65000000000
+ release_date: 2023-02-24
+ # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-2-7b
+ display_name: Llama 2 (7B)
+ description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-07-18
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-2-13b
+ display_name: Llama 2 (13B)
+ description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 13000000000
+ release_date: 2023-07-18
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: meta/llama-2-70b
+ display_name: Llama 2 (70B)
+ description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1.
+ creator_organization_name: Meta
+ access: open
+ num_parameters: 70000000000
+ release_date: 2023-07-18
+ # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Microsoft/NVIDIA
+ - name: microsoft/TNLGv2_530B
+ display_name: TNLG v2 (530B)
+ description: TNLG v2 (530B parameters) is an autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
+ creator_organization_name: Microsoft/NVIDIA
+ access: closed
+ num_parameters: 530000000000
+ release_date: 2022-01-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: microsoft/TNLGv2_7B
+ display_name: TNLG v2 (6.7B)
+ description: TNLG v2 (6.7B parameters) is an autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
+ creator_organization_name: Microsoft/NVIDIA
+ access: closed
+ num_parameters: 6700000000
+ release_date: 2022-01-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Mistral AI
+ - name: mistralai/mistral-7b-v0.1
+ display_name: Mistral v0.1 (7B)
+ description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA).
+ creator_organization_name: Mistral AI
+ access: open
+ num_parameters: 7300000000
+ release_date: 2023-09-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+
+ # MosaicML
+ - name: mosaicml/mpt-7b
+ display_name: MPT (7B)
+ description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 6700000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: mosaicml/mpt-7b-chat # NOT SUPPORTED
+ display_name: MPT-Chat (7B)
+ description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 6700000000
+ release_date: 2023-05-05
+ tags: [] # TODO: add tags
+
+ - name: mosaicml/mpt-instruct-7b
+ display_name: MPT-Instruct (7B)
+ description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 6700000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: mosaicml/mpt-30b
+ display_name: MPT (30B)
+ description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 30000000000
+ release_date: 2023-06-22
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: mosaicml/mpt-30b-chat # NOT SUPPORTED
+ display_name: MPT-Chat (30B)
+ description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 30000000000
+ release_date: 2023-06-22
+ tags: [] # TODO: add tags
+
+ - name: mosaicml/mpt-instruct-30b
+ display_name: MPT-Instruct (30B)
+ description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code.
+ creator_organization_name: MosaicML
+ access: open
+ num_parameters: 30000000000
+ release_date: 2023-06-22
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Neurips
+ - name: neurips/local
+ display_name: Neurips Local
+ description: Neurips Local
+ creator_organization_name: Neurips
+ access: open
+ release_date: 2023-06-01
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # NVIDIA
+ - name: nvidia/megatron-gpt2
+ display_name: Megatron GPT2
+ description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
+ creator_organization_name: NVIDIA
+ access: open
+ release_date: 2019-09-17 # paper date
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, BUGGY_TEMP_0_TAG]
+
+
+
+ # OpenAI
+
+ ## GPT 2 Models
+ # Not served by OpenAI, instead served by HuggingFace.
+
+ - name: openai/gpt2
+ display_name: GPT-2 (1.5B)
+ description: GPT-2 (1.5B parameters) is a transformer model trained on a large corpus of English text in a self-supervised fashion ([paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)).
+ creator_organization_name: OpenAI
+ access: open
+ num_parameters: 1500000000
+ release_date: 2019-02-14
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+ ## GPT 3 Models
+ # The list of models can be found here: https://beta.openai.com/docs/engines/gpt-3
+ # DEPRECATED: Announced on July 06 2023 that these models will be shut down on January 04 2024.
+
+ - name: openai/davinci # DEPRECATED
+ display_name: davinci (175B)
+ description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 175000000000
+ release_date: 2020-05-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/curie # DEPRECATED
+ display_name: curie (6.7B)
+ description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 6700000000
+ release_date: 2020-05-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/babbage # DEPRECATED
+ display_name: babbage (1.3B)
+ description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 1300000000
+ release_date: 2020-05-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/ada # DEPRECATED
+ display_name: ada (350M)
+ description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 350000000
+ release_date: 2020-05-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/text-davinci-003 # DEPRECATED
+ display_name: text-davinci-003
+ description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 175000000000
+ release_date: 2022-11-28
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ # TODO: text-davinci-002 supports insertion. Support insertion in our framework.
+ # https://github.com/stanford-crfm/benchmarking/issues/359
+ - name: openai/text-davinci-002 # DEPRECATED
+ display_name: text-davinci-002
+ description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 175000000000
+ release_date: 2022-01-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/text-davinci-001 # DEPRECATED
+ display_name: text-davinci-001
+ description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 175000000000
+ release_date: 2022-01-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/text-curie-001 # DEPRECATED
+ display_name: text-curie-001
+ description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 6700000000
+ release_date: 2022-01-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/text-babbage-001 # DEPRECATED
+ display_name: text-babbage-001
+ description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 1300000000
+ release_date: 2022-01-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: openai/text-ada-001 # DEPRECATED
+ display_name: text-ada-001
+ description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 350000000
+ release_date: 2022-01-27
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+ ## GPT 3.5 Turbo Models
+ # ChatGPT: https://openai.com/blog/chatgpt
+
+ - name: openai/gpt-3.5-turbo-0301
+ display_name: gpt-3.5-turbo-0301
+ description: Sibling model of text-davinci-003 that is optimized for chat but also works well for traditional completions tasks. Snapshot from 2023-03-01.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-03-01
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-3.5-turbo-0613
+ display_name: gpt-3.5-turbo-0613
+ description: Sibling model of text-davinci-003 that is optimized for chat but also works well for traditional completions tasks. Snapshot from 2023-06-13.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-06-13
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained
+ # in the openai/gpt-3.5-turbo-0613 comment
+ - name: openai/gpt-3.5-turbo-16k-0613
+ display_name: gpt-3.5-turbo-16k-0613
+ description: Sibling model of text-davinci-003 that is optimized for chat but also works well for traditional completions tasks. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-06-13
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+ ## GPT 4 Models
+
+ - name: openai/gpt-4-0314
+ display_name: gpt-4-0314
+ description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-03-14
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-4-32k-0314
+ display_name: gpt-4-32k-0314
+ description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-03-14
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-4-0613
+ display_name: gpt-4-0613
+ description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-06-13
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+ - name: openai/gpt-4-32k-0613
+ display_name: gpt-4-32k-0613
+ description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13.
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2023-06-13
+ tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+ ## Codex Models
+ # DEPRECATED: Codex models were shut down on March 23 2023.
+
+ - name: openai/code-davinci-002 # DEPRECATED
+ display_name: code-davinci-002
+ description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)).
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2021-07-01 # TODO: Find correct date (this is for v1)
+ tags: [CODE_MODEL_TAG]
+
+ - name: openai/code-davinci-001 # DEPRECATED
+ display_name: code-davinci-001
+ description: code-davinci-001 model
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2021-07-01 # Paper date
+ tags: [CODE_MODEL_TAG]
+
+ - name: openai/code-cushman-001 # DEPRECATED
+ display_name: code-cushman-001 (12B)
+ description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 12000000000
+ release_date: 2021-07-01 # Paper date
+ tags: [CODE_MODEL_TAG]
+
+
+ ## Text Similarity Models
+ # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
+ # The number of parameters is guessed based on the number of parameters of the
+ # corresponding GPT-3 model.
+ # DEPRECATED: Announced on July 06 2023 that first-generation embedding models
+ # will be shut down on January 04 2024.
+
+ - name: openai/text-similarity-davinci-001 # DEPRECATED
+ display_name: text-similarity-davinci-001
+ description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 175000000000
+ release_date: 2022-01-25 # Blog post date
+ tags: [TEXT_SIMILARITY_MODEL_TAG]
+
+ - name: openai/text-similarity-curie-001 # DEPRECATED
+ display_name: text-similarity-curie-001
+ description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 6700000000
+ release_date: 2022-01-25 # Blog post date
+ tags: [TEXT_SIMILARITY_MODEL_TAG]
+
+ - name: openai/text-similarity-babbage-001 # DEPRECATED
+ display_name: text-similarity-babbage-001
+ description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 1300000000
+ release_date: 2022-01-25 # Blog post date
+ tags: [TEXT_SIMILARITY_MODEL_TAG]
+
+ - name: openai/text-similarity-ada-001 # DEPRECATED
+ display_name: text-similarity-ada-001
+ description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)).
+ creator_organization_name: OpenAI
+ access: limited
+ num_parameters: 350000000
+ release_date: 2022-01-25 # Blog post date
+ tags: [TEXT_SIMILARITY_MODEL_TAG]
+
+ - name: openai/text-embedding-ada-002
+ display_name: text-embedding-ada-002
+ description: An improved embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/new-and-improved-embedding-model)).
+ creator_organization_name: OpenAI
+ access: limited
+ release_date: 2022-12-15 # Blog post date
+ tags: [TEXT_SIMILARITY_MODEL_TAG]
+
+
+
+ # Salesforce
+ - name: salesforce/codegen # NOT SUPPORTED
+ display_name: CodeGen (16B)
+ description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([paper](https://arxiv.org/pdf/2203.13474.pdf)).
+ creator_organization_name: Salesforce
+ access: open
+ num_parameters: 16000000000
+ release_date: 2022-03-25
+ tags: [] # TODO: add tags
+
+
+
+ # Stability AI
+ - name: stabilityai/stablelm-base-alpha-3b
+ display_name: StableLM-Base-Alpha (3B)
+ description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
+ creator_organization_name: Stability AI
+ access: open
+ num_parameters: 3000000000
+ release_date: 2023-04-20
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: stabilityai/stablelm-base-alpha-7b
+ display_name: StableLM-Base-Alpha (7B)
+ description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
+ creator_organization_name: Stability AI
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-04-20
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Stanford
+ - name: stanford/alpaca-7b
+ display_name: Alpaca (7B)
+ description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations.
+ creator_organization_name: Stanford
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-03-13
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+
+
+
+ # TII UAE
+ - name: tiiuae/falcon-7b
+ display_name: Falcon (7B)
+ description: Falcon-7B is a 7B-parameter causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
+ creator_organization_name: TII UAE
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-03-15
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: tiiuae/falcon-7b-instruct
+ display_name: Falcon-Instruct (7B)
+ description: Falcon-7B-Instruct is a 7B-parameter causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets.
+ creator_organization_name: TII UAE
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-03-15
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: tiiuae/falcon-40b
+ display_name: Falcon (40B)
+ description: Falcon-40B is a 40B-parameter causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora.
+ creator_organization_name: TII UAE
+ access: open
+ num_parameters: 40000000000
+ release_date: 2023-05-25
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: tiiuae/falcon-40b-instruct
+ display_name: Falcon-Instruct (40B)
+ description: Falcon-40B-Instruct is a 40B-parameter causal decoder-only model built by TII based on Falcon-40B and finetuned on a mixture of chat/instruct datasets.
+ creator_organization_name: TII UAE
+ access: open
+ num_parameters: 40000000000
+ release_date: 2023-05-25
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Together
+ - name: together/gpt-jt-6b-v1
+ display_name: GPT-JT (6B)
+ description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)).
+ creator_organization_name: Together
+ access: open
+ num_parameters: 6700000000
+ release_date: 2022-11-29
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: together/gpt-neoxt-chat-base-20b
+ display_name: GPT-NeoXT-Chat-Base (20B)
+ description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 20000000000
+ release_date: 2023-03-08
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG]
+
+ - name: together/redpajama-incite-base-3b-v1
+ display_name: RedPajama-INCITE-Base-v1 (3B)
+ description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 3000000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: together/redpajama-incite-instruct-3b-v1
+ display_name: RedPajama-INCITE-Instruct-v1 (3B)
+ description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 3000000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: together/redpajama-incite-chat-3b-v1 # NOT SUPPORTED
+ display_name: RedPajama-INCITE-Chat-v1 (3B)
+ description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 3000000000
+ release_date: 2023-05-05
+ tags: [] # TODO: add tags
+
+ - name: together/redpajama-incite-base-7b
+ display_name: RedPajama-INCITE-Base (7B)
+ description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: together/redpajama-incite-instruct-7b
+ display_name: RedPajama-INCITE-Instruct (7B)
+ description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible.
+ creator_organization_name: Together
+ access: open
+ num_parameters: 7000000000
+ release_date: 2023-05-05
+ tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Tsinghua
+ - name: tsinghua/glm
+ display_name: GLM (130B)
+ description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)).
+ creator_organization_name: Tsinghua
+ access: open
+ num_parameters: 130000000000
+ release_date: 2022-08-04
+ # Inference with echo=True is not feasible -- in the prompt encoding phase, they use
+ # bidirectional attention and do not perform predictions on them.
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG]
+
+ - name: tsinghua/codegeex # NOT SUPPORTED
+ display_name: CodeGeeX (13B)
+ description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
+ creator_organization_name: Tsinghua
+ access: open
+ num_parameters: 13000000000
+ release_date: 2022-09-19
+ tags: [] # TODO: add tags
+
+
+
+ # Writer
+ - name: writer/palmyra-base
+ display_name: Palmyra Base (5B)
+ description: Palmyra Base (5B)
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 5000000000
+ release_date: 2022-10-13
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: writer/palmyra-large
+ display_name: Palmyra Large (20B)
+ description: Palmyra Large (20B)
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 20000000000
+ release_date: 2022-12-23
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: writer/palmyra-instruct-30
+ display_name: InstructPalmyra (30B)
+ description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans.
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 30000000000
+ release_date: 2023-02-16
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: writer/palmyra-e
+ display_name: Palmyra E (30B)
+ description: Palmyra E (30B)
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 30000000000
+ release_date: 2023-03-03
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: writer/silk-road
+ display_name: Silk Road (35B)
+ description: Silk Road (35B)
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 35000000000
+ release_date: 2023-04-13
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+ - name: writer/palmyra-x
+ display_name: Palmyra X (43B)
+ description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)).
+ creator_organization_name: Writer
+ access: limited
+ num_parameters: 43000000000
+ release_date: 2023-06-11
+ # Does not support echo
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+
+
+
+ # Yandex
+ - name: yandex/yalm
+ display_name: YaLM (100B)
+ description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)).
+ creator_organization_name: Yandex
+ access: open
+ num_parameters: 100000000000
+ release_date: 2022-06-23
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG]
\ No newline at end of file
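
For context on the model_metadata.yaml entries above, here is a minimal sketch of how a file with this schema could be consumed, assuming it is parsed with PyYAML and that the entries live under a top-level `models:` key (mirroring the `tokenizer_configs:` key in the next file). The helper functions are hypothetical illustrations, not part of the HELM codebase.

# Hypothetical sketch: load the metadata file and filter entries by tag.
# Assumes PyYAML and a top-level "models:" key; adjust if the real key differs.
from typing import Any, Dict, List

import yaml


def load_models(path: str = "src/helm/config/model_metadata.yaml") -> List[Dict[str, Any]]:
    with open(path) as f:
        return yaml.safe_load(f)["models"]


def models_with_tag(models: List[Dict[str, Any]], tag: str) -> List[str]:
    # Entries without a tags field (or with an empty list) are skipped.
    return [m["name"] for m in models if tag in m.get("tags", [])]


if __name__ == "__main__":
    print(models_with_tag(load_models(), "INSTRUCTION_FOLLOWING_MODEL_TAG"))
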
diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml
new file mode 100644
index 00000000000..c7c0d1446ec
--- /dev/null
+++ b/src/helm/config/tokenizer_configs.yaml
@@ -0,0 +1,202 @@
+tokenizer_configs:
+
+ - name: simple/model1
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.simple_tokenizer.SimpleTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # AI21
+ - name: ai21/j1
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer"
+ end_of_text_token: " "
+ prefix_token: ""
+
+ # AlephAlpha
+ - name: AlephAlpha/luminous-base
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: AlephAlpha/luminous-extended
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: AlephAlpha/luminous-supreme
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: AlephAlpha/luminous-world
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Anthropic
+ - name: anthropic/claude
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Bigcode
+ - name: bigcode/santacoder
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ - name: bigcode/starcoder
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Bigscience
+ - name: bigscience/bloom
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: bigscience/T0pp
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Cohere
+ - name: cohere/cohere
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer"
+ end_of_text_token: ""
+ prefix_token: ":"
+
+ # EleutherAI
+ - name: EleutherAI/gpt-j-6B
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+ - name: EleutherAI/gpt-neox-20b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Facebook
+ - name: facebook/opt-66b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Google
+ - name: google/t5-11b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: google/flan-t5-xxl
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: google/ul2
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Hf-internal-testing
+ - name: hf-internal-testing/llama-tokenizer
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # HuggingFaceM4
+ - name: HuggingFaceM4/idefics-9b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-9b-instruct
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-80b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+ - name: HuggingFaceM4/idefics-80b-instruct
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Huggingface
+ - name: huggingface/gpt2
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Lightning AI
+ - name: lightningai/lit-gpt
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.lit_gpt_tokenizer.LitGPTTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Meta-llama
+ - name: meta-llama/Llama-2-7b-hf
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Mistralai
+ - name: mistralai/Mistral-7B-v0.1
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Neurips
+ - name: neurips/local
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Openai
+ - name: openai/cl100k_base
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ # Tiiuae
+ - name: tiiuae/falcon-7b
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: ""
+
+ # TsinghuaKEG
+ - name: TsinghuaKEG/ice
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.ice_tokenizer.ICETokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ # Yandex
+ - name: Yandex/yalm
+ tokenizer_spec:
+ class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
\ No newline at end of file
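
Each tokenizer_spec above points at a fully qualified class name. The following is a minimal sketch of how such a dotted class_name can be resolved and instantiated at runtime using only the standard library; it illustrates the general pattern under stated assumptions and is not HELM's actual object-spec machinery.

# Hypothetical sketch: resolve a dotted class_name from tokenizer_configs.yaml into a class.
# Not HELM's create_object implementation; shown only to illustrate the pattern.
import importlib
from typing import Any


def resolve_class(class_name: str) -> type:
    module_name, _, attr = class_name.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr)


def build_tokenizer(class_name: str, **kwargs: Any) -> Any:
    # e.g. class_name = "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
    cls = resolve_class(class_name)
    return cls(**kwargs)
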
diff --git a/src/helm/proxy/clients/aleph_alpha_client.py b/src/helm/proxy/clients/aleph_alpha_client.py
index a988938ae33..ae7116cef2d 100644
--- a/src/helm/proxy/clients/aleph_alpha_client.py
+++ b/src/helm/proxy/clients/aleph_alpha_client.py
@@ -2,8 +2,6 @@
import requests
from typing import Any, Dict, List
-from aleph_alpha_client import Client as AlephAlphaPythonClient
-
from helm.common.cache import CacheConfig
from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
from helm.proxy.tokenizers.tokenizer import Tokenizer
@@ -16,7 +14,6 @@ class AlephAlphaClient(CachingClient):
def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig):
super().__init__(cache_config=cache_config, tokenizer=tokenizer)
self.api_key: str = api_key
- self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key)
def _send_request(self, endpoint: str, raw_request: Dict[str, Any]) -> Dict[str, Any]:
response = requests.request(
diff --git a/src/helm/proxy/clients/anthropic_client.py b/src/helm/proxy/clients/anthropic_client.py
index 0cfde926c58..6f6dd8f1c7d 100644
--- a/src/helm/proxy/clients/anthropic_client.py
+++ b/src/helm/proxy/clients/anthropic_client.py
@@ -249,7 +249,7 @@ def make_request(self, request: Request) -> RequestResult:
if request.embedding:
return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
# Validate the fields of `Request`
- if request.model != "anthropic/stanford-online-all-v4-s3":
+ if request.model_engine != "stanford-online-all-v4-s3":
raise ValueError(f"Invalid model: {request.model}")
if request.max_tokens > AnthropicLegacyClient.MAX_COMPLETION_LENGTH:
raise ValueError(
diff --git a/src/helm/proxy/clients/auto_client.py b/src/helm/proxy/clients/auto_client.py
index 84239fb4b05..04b1381331d 100644
--- a/src/helm/proxy/clients/auto_client.py
+++ b/src/helm/proxy/clients/auto_client.py
@@ -1,13 +1,13 @@
import os
from dataclasses import replace
-from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional
from retrying import Attempt, RetryError
-from helm.benchmark.model_deployment_registry import get_model_deployment
-from helm.benchmark.tokenizer_config_registry import get_tokenizer_config
-from helm.common.cache import CacheConfig, MongoCacheConfig, SqliteCacheConfig
+from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment
+from helm.common.cache_utils import build_cache_config
+from helm.common.credentials_utils import provide_api_key
+from helm.common.cache import CacheConfig
from helm.common.hierarchical_logger import hlog
from helm.common.object_spec import create_object, inject_object_spec_args
from helm.common.request import Request, RequestResult
@@ -21,10 +21,9 @@
from helm.proxy.critique.critique_client import CritiqueClient
from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.proxy.retry import NonRetriableException, retry_request
-from helm.proxy.tokenizers.tokenizer import Tokenizer
+from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
-from .http_model_client import HTTPModelClient
if TYPE_CHECKING:
import helm.proxy.clients.huggingface_client
@@ -35,18 +34,14 @@ class AuthenticationError(NonRetriableException):
class AutoClient(Client):
- """Automatically dispatch to the proper `Client` based on the organization.
-
- The modules for each client are lazily imported when the respective client is created.
- This greatly speeds up the import time of this module, and allows the client modules to
- use optional dependencies."""
+ """Automatically dispatch to the proper `Client` based on the model deployment name."""
def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: str = ""):
+ self._auto_tokenizer = AutoTokenizer(credentials, cache_path, mongo_uri)
self.credentials = credentials
self.cache_path = cache_path
self.mongo_uri = mongo_uri
self.clients: Dict[str, Client] = {}
- self.tokenizers: Dict[str, Tokenizer] = {}
# self._huggingface_client is lazily instantiated by get_huggingface_client()
self._huggingface_client: Optional["helm.proxy.clients.huggingface_client.HuggingFaceClient"] = None
# self._critique_client is lazily instantiated by get_critique_client()
@@ -54,185 +49,54 @@ def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: s
hlog(f"AutoClient: cache_path = {cache_path}")
hlog(f"AutoClient: mongo_uri = {mongo_uri}")
- def _build_cache_config(self, organization: str) -> CacheConfig:
- if self.mongo_uri:
- return MongoCacheConfig(self.mongo_uri, collection_name=organization)
+ def _get_client(self, model_deployment_name: str) -> Client:
+ """Return a client based on the model, creating it if necessary."""
+ # First try to find the client in the cache
+ client: Optional[Client] = self.clients.get(model_deployment_name)
+ if client is not None:
+ return client
+
+ # Otherwise, create the client
+ model_deployment: ModelDeployment = get_model_deployment(model_deployment_name)
+ if model_deployment:
+ # Perform dependency injection to fill in remaining arguments.
+ # Dependency injection is needed here for these reasons:
+ #
+ # 1. Different clients have different parameters. Dependency injection provides arguments
+ # that match the parameters of the client.
+ # 2. Some arguments, such as the tokenizer, are not static data objects that can be
+ # in the users configuration file. Instead, they have to be constructed dynamically at
+ # runtime.
+ # 3. The providers must be lazily-evaluated, because eager evaluation can result in an
+ # exception. For instance, some clients do not require an API key, so trying to fetch
+ # the API key from configuration eagerly will result in an exception because the user
+ # will not have configured an API key.
+
+ # Prepare a cache
+ host_organization: str = model_deployment.host_organization
+ cache_config: CacheConfig = build_cache_config(self.cache_path, self.mongo_uri, host_organization)
+
+ client_spec = inject_object_spec_args(
+ model_deployment.client_spec,
+ constant_bindings={"cache_config": cache_config},
+ provider_bindings={
+ "api_key": lambda: provide_api_key(self.credentials, host_organization, model_deployment_name),
+ "tokenizer": lambda: self._auto_tokenizer._get_tokenizer(
+ tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
+ ),
+ "org_id": lambda: self.credentials.get(
+ host_organization + "OrgId", None
+ ), # OpenAI, GooseAI, Microsoft
+ "lock_file_path": lambda: os.path.join(self.cache_path, f"{host_organization}.lock"), # Microsoft
+ },
+ )
+ client = create_object(client_spec)
+ else:
+ raise ValueError(f"Could not find client for model deployment: {model_deployment_name}")
- client_cache_path: str = os.path.join(self.cache_path, f"{organization}.sqlite")
- # TODO: Allow setting CacheConfig.follower_cache_path from a command line flag.
- return SqliteCacheConfig(client_cache_path)
+ # Cache the client
+ self.clients[model_deployment_name] = client
- def _get_client(self, model: str) -> Client:
- """Return a client based on the model, creating it if necessary."""
- client: Optional[Client] = self.clients.get(model)
-
- if client is None:
- organization: str = model.split("/")[0]
- cache_config: CacheConfig = self._build_cache_config(organization)
- tokenizer: Tokenizer = self._get_tokenizer(organization)
-
- # TODO: Migrate all clients to use model deployments
- model_deployment = get_model_deployment(model)
- if model_deployment:
-
- def provide_api_key():
- if "deployments" not in self.credentials:
- raise AuthenticationError("Could not find key 'deployments' in credentials.conf")
- deployment_api_keys = self.credentials["deployments"]
- if model not in deployment_api_keys:
- raise AuthenticationError(
- f"Could not find key '{model}' under key 'deployments' in credentials.conf"
- )
- return deployment_api_keys[model]
-
- # Perform dependency injection to fill in remaining arguments.
- # Dependency injection is needed here for these reasons:
- #
- # 1. Different clients have different parameters. Dependency injection provides arguments
- # that match the parameters of the client.
- # 2. Some arguments, such as the tokenizer, are not static data objects that can be
- # in the users configuration file. Instead, they have to be constructed dynamically at
- # runtime.
- # 3. The providers must be lazily-evaluated, because eager evaluation can result in an
- # exception. For instance, some clients do not require an API key, so trying to fetch
- # the API key from configuration eagerly will result in an exception because the user
- # will not have configured an API key.
- client_spec = inject_object_spec_args(
- model_deployment.client_spec,
- constant_bindings={"cache_config": cache_config},
- provider_bindings={"api_key": provide_api_key},
- )
- client = create_object(client_spec)
- elif organization == "neurips":
- client = HTTPModelClient(tokenizer=tokenizer, cache_config=cache_config)
- elif organization == "openai":
- from helm.proxy.clients.openai_client import OpenAIClient
-
- org_id = self.credentials.get("openaiOrgId", None)
- api_key = self.credentials.get("openaiApiKey", None)
- client = OpenAIClient(
- tokenizer=tokenizer,
- cache_config=cache_config,
- api_key=api_key,
- org_id=org_id,
- )
- elif organization == "AlephAlpha":
- from helm.proxy.clients.aleph_alpha_client import AlephAlphaClient
-
- client = AlephAlphaClient(
- tokenizer=tokenizer,
- api_key=self.credentials["alephAlphaKey"],
- cache_config=cache_config,
- )
- elif organization == "ai21":
- from helm.proxy.clients.ai21_client import AI21Client
-
- client = AI21Client(
- tokenizer=tokenizer,
- api_key=self.credentials["ai21ApiKey"],
- cache_config=cache_config,
- )
- elif organization == "cohere":
- from helm.proxy.clients.cohere_client import CohereClient
-
- client = CohereClient(
- tokenizer=tokenizer,
- api_key=self.credentials["cohereApiKey"],
- cache_config=cache_config,
- )
- elif organization == "gooseai":
- from helm.proxy.clients.goose_ai_client import GooseAIClient
-
- org_id = self.credentials.get("gooseaiOrgId", None)
- client = GooseAIClient(
- tokenizer=tokenizer,
- api_key=self.credentials["gooseaiApiKey"],
- cache_config=cache_config,
- org_id=org_id,
- )
- elif organization == "huggingface":
- from helm.proxy.clients.huggingface_client import HuggingFaceClient
-
- client = HuggingFaceClient(tokenizer=tokenizer, cache_config=cache_config)
- elif organization == "anthropic":
- from helm.proxy.clients.anthropic_client import AnthropicClient
-
- client = AnthropicClient(
- api_key=self.credentials.get("anthropicApiKey", None),
- tokenizer=tokenizer,
- cache_config=cache_config,
- )
- elif organization == "microsoft":
- from helm.proxy.clients.microsoft_client import MicrosoftClient
-
- org_id = self.credentials.get("microsoftOrgId", None)
- lock_file_path: str = os.path.join(self.cache_path, f"{organization}.lock")
- client = MicrosoftClient(
- api_key=self.credentials.get("microsoftApiKey", None),
- tokenizer=tokenizer,
- lock_file_path=lock_file_path,
- cache_config=cache_config,
- org_id=org_id,
- )
- elif organization == "google":
- from helm.proxy.clients.google_client import GoogleClient
-
- client = GoogleClient(
- tokenizer=tokenizer,
- cache_config=cache_config,
- )
- elif organization in [
- "together",
- "databricks",
- "eleutherai",
- "lmsys",
- "meta",
- "mistralai",
- "mosaicml",
- "stabilityai",
- "stanford",
- "tiiuae",
- ]:
- from helm.proxy.clients.together_client import TogetherClient
-
- client = TogetherClient(
- api_key=self.credentials.get("togetherApiKey", None),
- tokenizer=tokenizer,
- cache_config=cache_config,
- )
- elif organization == "simple":
- from helm.proxy.clients.simple_client import SimpleClient
-
- client = SimpleClient(tokenizer=tokenizer, cache_config=cache_config)
- elif organization == "writer":
- from helm.proxy.clients.palmyra_client import PalmyraClient
-
- client = PalmyraClient(
- api_key=self.credentials["writerApiKey"],
- tokenizer=tokenizer,
- cache_config=cache_config,
- )
- elif organization == "nvidia":
- from helm.proxy.clients.megatron_client import MegatronClient
-
- client = MegatronClient(tokenizer=tokenizer, cache_config=cache_config)
-
- elif organization == "lightningai":
- from helm.proxy.clients.lit_gpt_client import LitGPTClient
-
- client = LitGPTClient(
- tokenizer=tokenizer,
- cache_config=cache_config,
- checkpoint_dir=Path(os.environ.get("LIT_GPT_CHECKPOINT_DIR", "")),
- precision=os.environ.get("LIT_GPT_PRECISION", "bf16-true"),
- )
- elif organization == "HuggingFaceM4":
- from helm.proxy.clients.vision_language.idefics_client import IDEFICSClient
-
- client = IDEFICSClient(tokenizer=tokenizer, cache_config=cache_config)
- else:
- raise ValueError(f"Could not find client for model: {model}")
- self.clients[model] = client
return client
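
The provider_bindings above are deliberately callables: a provider runs only if the selected client's constructor actually declares the corresponding parameter, so missing credentials never raise for clients that do not need them. A minimal sketch of that lazy-injection idea, using a hypothetical inject_args helper rather than HELM's inject_object_spec_args:

# Hypothetical sketch of lazy provider bindings. A provider is called only when the
# target constructor has a parameter with that name, which is why eager evaluation
# (and its possible exceptions) is avoided.
import inspect
from typing import Any, Callable, Dict


def inject_args(cls: type, constants: Dict[str, Any], providers: Dict[str, Callable[[], Any]]) -> Any:
    params = inspect.signature(cls.__init__).parameters
    kwargs = {name: value for name, value in constants.items() if name in params}
    for name, provider in providers.items():
        if name in params:
            kwargs[name] = provider()  # evaluated lazily, only when needed
    return cls(**kwargs)
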
def make_request(self, request: Request) -> RequestResult:
@@ -246,155 +110,36 @@ def make_request(self, request: Request) -> RequestResult:
def make_request_with_retry(client: Client, request: Request) -> RequestResult:
return client.make_request(request)
- client: Client = self._get_client(request.model)
+ client: Client = self._get_client(request.model_deployment)
try:
return make_request_with_retry(client=client, request=request)
except RetryError as e:
last_attempt: Attempt = e.last_attempt
retry_error: str = (
- f"Failed to make request to {request.model} after retrying {last_attempt.attempt_number} times"
+ f"Failed to make request to {request.model_deployment} after retrying "
+ f"{last_attempt.attempt_number} times"
)
hlog(retry_error)
# Notify our user that we failed to make the request even after retrying.
return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
- def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer:
- # First try to find the tokenizer in the cache
- tokenizer: Optional[Tokenizer] = self.tokenizers.get(tokenizer_name)
- if tokenizer is not None:
- return tokenizer
-
- # Otherwise, create the tokenizer
- organization: str = tokenizer_name.split("/")[0]
- cache_config: CacheConfig = self._build_cache_config(organization)
-
- # TODO: Migrate all clients to use tokenizer configs
- tokenizer_config = get_tokenizer_config(tokenizer_name)
- if tokenizer_config:
- tokenizer_spec = inject_object_spec_args(
- tokenizer_config.tokenizer_spec, constant_bindings={"cache_config": cache_config}
- )
- return create_object(tokenizer_spec)
- elif organization in [
- "gooseai",
- "huggingface",
- "microsoft",
- "google",
- "writer", # Palmyra
- "nvidia",
- "EleutherAI",
- "facebook",
- "meta-llama",
- "hf-internal-testing",
- "mistralai",
- "HuggingFaceM4",
- # Together
- "together",
- "databricks",
- "eleutherai",
- "lmsys",
- "meta",
- "mosaicml",
- "stabilityai",
- "stanford",
- "tiiuae",
- "bigcode",
- "bigscience",
- ]:
- from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
-
- tokenizer = HuggingFaceTokenizer(cache_config=cache_config)
- elif organization == "neurips":
- from helm.proxy.tokenizers.http_model_tokenizer import HTTPModelTokenizer
-
- tokenizer = HTTPModelTokenizer(cache_config=cache_config)
- elif organization == "openai":
- from helm.proxy.tokenizers.tiktoken_tokenizer import TiktokenTokenizer
-
- tokenizer = TiktokenTokenizer(cache_config=cache_config)
- elif organization == "AlephAlpha":
- from helm.proxy.tokenizers.aleph_alpha_tokenizer import AlephAlphaTokenizer
-
- tokenizer = AlephAlphaTokenizer(api_key=self.credentials["alephAlphaKey"], cache_config=cache_config)
- elif organization == "ai21":
- from helm.proxy.tokenizers.ai21_tokenizer import AI21Tokenizer
-
- tokenizer = AI21Tokenizer(api_key=self.credentials["ai21ApiKey"], cache_config=cache_config)
- elif organization == "cohere":
- from helm.proxy.tokenizers.cohere_tokenizer import CohereTokenizer
-
- tokenizer = CohereTokenizer(api_key=self.credentials["cohereApiKey"], cache_config=cache_config)
- elif organization == "anthropic":
- from helm.proxy.tokenizers.anthropic_tokenizer import AnthropicTokenizer
-
- tokenizer = AnthropicTokenizer(cache_config=cache_config)
- elif organization == "simple":
- from helm.proxy.tokenizers.simple_tokenizer import SimpleTokenizer
-
- tokenizer = SimpleTokenizer()
- elif organization == "lightningai":
- from helm.proxy.tokenizers.lit_gpt_tokenizer import LitGPTTokenizer
-
- tokenizer = LitGPTTokenizer(
- cache_config=cache_config,
- checkpoint_dir=Path(os.environ.get("LIT_GPT_CHECKPOINT_DIR", "")),
- )
- elif organization == "TsinghuaKEG":
- from helm.proxy.tokenizers.ice_tokenizer import ICETokenizer
-
- tokenizer = ICETokenizer(cache_config=cache_config)
- elif organization == "Yandex":
- from helm.proxy.tokenizers.yalm_tokenizer import YaLMTokenizer
-
- tokenizer = YaLMTokenizer(cache_config=cache_config)
-
- if tokenizer is None:
- raise ValueError(f"Could not find tokenizer for model: {tokenizer_name}")
-
- # Cache the tokenizer
- self.tokenizers[tokenizer_name] = tokenizer
-
- return tokenizer
-
+ # TODO: remove this method after a few weeks (2023-11-09)
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
- """Tokenizes based on the name of the tokenizer (e.g., huggingface/gpt2)."""
-
- def tokenize_with_retry(tokenizer: Tokenizer, request: TokenizationRequest) -> TokenizationRequestResult:
- return tokenizer.tokenize(request)
-
- tokenizer: Tokenizer = self._get_tokenizer(request.tokenizer)
-
- try:
- return tokenize_with_retry(tokenizer=tokenizer, request=request)
- except RetryError as e:
- last_attempt: Attempt = e.last_attempt
- retry_error: str = f"Failed to tokenize after retrying {last_attempt.attempt_number} times"
- hlog(retry_error)
- return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
+ raise NotImplementedError(
+ "AutoClient.tokenize() is not supported anymore." "Use AutoTokenizer.tokenize() instead."
+ )
+ # TODO: remove this method after a few weeks (2023-11-09)
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
- """Decodes based on the the name of the tokenizer (e.g., huggingface/gpt2)."""
-
- def decode_with_retry(tokenizer: Tokenizer, request: DecodeRequest) -> DecodeRequestResult:
- return tokenizer.decode(request)
-
- tokenizer: Tokenizer = self._get_tokenizer(request.tokenizer)
-
- try:
- return decode_with_retry(tokenizer=tokenizer, request=request)
- except RetryError as e:
- last_attempt: Attempt = e.last_attempt
- retry_error: str = f"Failed to decode after retrying {last_attempt.attempt_number} times"
- hlog(retry_error)
- return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
+ raise NotImplementedError("AutoClient.decode() is not supported anymore." "Use AutoTokenizer.decode() instead.")
def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
"""Get the toxicity classifier client. We currently only support Perspective API."""
from helm.proxy.clients.perspective_api_client import PerspectiveAPIClient
- cache_config: CacheConfig = self._build_cache_config("perspectiveapi")
+ cache_config: CacheConfig = build_cache_config(self.cache_path, self.mongo_uri, "perspectiveapi")
return PerspectiveAPIClient(self.credentials.get("perspectiveApiKey", ""), cache_config)
def get_critique_client(self) -> CritiqueClient:
@@ -420,7 +165,9 @@ def get_critique_client(self) -> CritiqueClient:
surgeai_credentials = self.credentials.get("surgeaiApiKey")
if not surgeai_credentials:
raise ValueError("surgeaiApiKey credentials are required for SurgeAICritiqueClient")
- self._critique_client = SurgeAICritiqueClient(surgeai_credentials, self._build_cache_config("surgeai"))
+ self._critique_client = SurgeAICritiqueClient(
+ surgeai_credentials, build_cache_config(self.cache_path, self.mongo_uri, "surgeai")
+ )
elif critique_type == "model":
from helm.proxy.critique.model_critique_client import ModelCritiqueClient
@@ -439,7 +186,7 @@ def get_critique_client(self) -> CritiqueClient:
if not scale_credentials:
raise ValueError("scaleApiKey is required for ScaleCritiqueClient")
self._critique_client = ScaleCritiqueClient(
- scale_credentials, self._build_cache_config("scale"), scale_project
+ scale_credentials, build_cache_config(self.cache_path, self.mongo_uri, "scale"), scale_project
)
else:
raise ValueError(
@@ -455,7 +202,7 @@ def get_huggingface_client(self) -> "helm.proxy.clients.huggingface_client.Huggi
if self._huggingface_client:
assert isinstance(self._huggingface_client, HuggingFaceClient)
return self._huggingface_client
- cache_config = self._build_cache_config("huggingface")
+ cache_config = build_cache_config(self.cache_path, self.mongo_uri, "huggingface")
tokenizer = HuggingFaceTokenizer(cache_config)
self._huggingface_client = HuggingFaceClient(tokenizer=tokenizer, cache_config=cache_config)
return self._huggingface_client
diff --git a/src/helm/proxy/clients/cohere_client.py b/src/helm/proxy/clients/cohere_client.py
index c92fcc4330f..2d626b99150 100644
--- a/src/helm/proxy/clients/cohere_client.py
+++ b/src/helm/proxy/clients/cohere_client.py
@@ -11,7 +11,7 @@
Sequence,
Token,
)
-from helm.proxy.models import get_models_by_organization
+from helm.benchmark.model_deployment_registry import get_model_deployments_by_host_organization
from helm.proxy.tokenizers.tokenizer import Tokenizer
from .client import CachingClient, truncate_sequence
from .cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION
@@ -45,7 +45,7 @@ def make_request(self, request: Request) -> RequestResult:
assert request.max_tokens > 0, "max_tokens can only be 0 if echo_prompt=True"
# model: "Currently available models are small, medium, large, xlarge"
- assert request.model in get_models_by_organization("cohere")
+ assert request.model_deployment in get_model_deployments_by_host_organization("cohere")
# temperature: "min value of 0.0, max value of 5.0"
assert 0.0 <= request.temperature <= 5.0, f"Invalid temperature: {request.temperature}. Valid range: [0,5]"
# num_generations: "min value of 1, max value of 5"
diff --git a/src/helm/proxy/clients/huggingface_client.py b/src/helm/proxy/clients/huggingface_client.py
index 498a810d608..b4e25987e6d 100644
--- a/src/helm/proxy/clients/huggingface_client.py
+++ b/src/helm/proxy/clients/huggingface_client.py
@@ -203,9 +203,9 @@ def make_request(self, request: Request) -> RequestResult:
if self._pretrained_model_name_or_path:
pretrained_model_name_or_path = self._pretrained_model_name_or_path
else:
- pretrained_model_name_or_path = resolve_alias(request.model)
+ pretrained_model_name_or_path = resolve_alias(request.model_deployment)
huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server(
- helm_model_name=request.model,
+ helm_model_name=request.model_deployment,
pretrained_model_name_or_path=pretrained_model_name_or_path,
revision=self._revision,
)
diff --git a/src/helm/proxy/clients/palmyra_client.py b/src/helm/proxy/clients/palmyra_client.py
index 46a17e961ec..550b4b56984 100644
--- a/src/helm/proxy/clients/palmyra_client.py
+++ b/src/helm/proxy/clients/palmyra_client.py
@@ -100,7 +100,10 @@ def do_it():
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
if _is_content_moderation_failure(response):
- hlog(f"WARNING: Returning empty request for {request.model} due to content moderation filter")
+ hlog(
+ f"WARNING: Returning empty request for {request.model_deployment} "
+ "due to content moderation filter"
+ )
return RequestResult(
success=False,
cached=False,
diff --git a/src/helm/proxy/clients/remote_model_registry.py b/src/helm/proxy/clients/remote_model_registry.py
deleted file mode 100644
index 1dae2999469..00000000000
--- a/src/helm/proxy/clients/remote_model_registry.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import Dict, List, Optional
-
-from helm.proxy.models import Model
-from helm.proxy.services.remote_service import RemoteService
-
-
-_remote_model_registry: Dict[str, Model] = {}
-
-
-def get_remote_model(model_name: str) -> Optional[Model]:
- """Returns a Model for the model_name."""
- return _remote_model_registry.get(model_name)
-
-
-def check_and_register_remote_model(server_url: str, model_names: List[str]):
- try:
- service = RemoteService(server_url)
- info = service.get_general_info()
- models = {}
- for model in info.all_models:
- models[model.name] = model
- for model_name in model_names:
- if model_name in models:
- _remote_model_registry[model_name] = models[model_name]
- else:
- raise RuntimeError(f"remote service not contain {model_name}")
- except Exception as e:
- raise RuntimeError(f"check and register remote service error: {e}")
diff --git a/src/helm/proxy/clients/test_auto_client.py b/src/helm/proxy/clients/test_auto_client.py
index 6fffdf35ba0..98c13b6870d 100644
--- a/src/helm/proxy/clients/test_auto_client.py
+++ b/src/helm/proxy/clients/test_auto_client.py
@@ -27,6 +27,7 @@ def make_request_and_check_result(self, request, expected_result):
def test_make_request_databricks(self):
request = Request(
model="databricks/dolly-v2-3b",
+ model_deployment="together/dolly-v2-3b",
prompt="Elephants are one of the most",
temperature=0.0,
max_tokens=10,
@@ -69,6 +70,7 @@ def test_make_request_databricks(self):
)
request = Request(
model="databricks/dolly-v2-3b",
+ model_deployment="together/dolly-v2-3b",
prompt="Elephants are one of the most",
temperature=0.0,
max_tokens=10,
diff --git a/src/helm/proxy/clients/test_client.py b/src/helm/proxy/clients/test_client.py
index 8ca194de198..256282d835b 100644
--- a/src/helm/proxy/clients/test_client.py
+++ b/src/helm/proxy/clients/test_client.py
@@ -19,13 +19,31 @@ def truncate_sequence_helper(tokens: List[str], request: Request, expected_token
def test_truncate_sequence():
# echo_prompt = True, nothing gets truncated
- truncate_sequence_helper(["a", "b", "c"], Request(prompt="abc", echo_prompt=True), ["a", "b", "c"])
+ truncate_sequence_helper(
+ ["a", "b", "c"],
+ Request(
+ model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", prompt="abc", echo_prompt=True
+ ),
+ ["a", "b", "c"],
+ )
# Nothing gets truncated
- truncate_sequence_helper(["hello", " world"], Request(stop_sequences=["#"]), ["hello", " world"])
+ truncate_sequence_helper(
+ ["hello", " world"],
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["#"]),
+ ["hello", " world"],
+ )
# Truncate using stop sequences
- truncate_sequence_helper(["hello", " world", "\n", "what"], Request(stop_sequences=["\n"]), ["hello", " world"])
+ truncate_sequence_helper(
+ ["hello", " world", "\n", "what"],
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["\n"]),
+ ["hello", " world"],
+ )
# Truncate using max tokens
- truncate_sequence_helper(["a", "b", "c"], Request(max_tokens=2), ["a", "b"])
+ truncate_sequence_helper(
+ ["a", "b", "c"],
+ Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", max_tokens=2),
+ ["a", "b"],
+ )
diff --git a/src/helm/proxy/clients/test_huggingface_client.py b/src/helm/proxy/clients/test_huggingface_client.py
index f5c59f2d8f3..09efeca3b27 100644
--- a/src/helm/proxy/clients/test_huggingface_client.py
+++ b/src/helm/proxy/clients/test_huggingface_client.py
@@ -29,30 +29,30 @@ def teardown_method(self, method):
def test_tokenize(self):
request = TokenizationRequest(text="I am a computer scientist.")
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.client.tokenizer.tokenize(request)
assert not result.cached, "First time making the tokenize request. Result should not be cached"
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.client.tokenizer.tokenize(request)
assert result.cached, "Result should be cached"
assert result.raw_tokens == ["I", " am", " a", " computer", " scientist", "."]
def test_encode(self):
request = TokenizationRequest(text="I am a computer scientist.", encode=True, truncation=True, max_length=1)
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.client.tokenizer.tokenize(request)
assert not result.cached, "First time making the tokenize request. Result should not be cached"
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.client.tokenizer.tokenize(request)
assert result.cached, "Result should be cached"
assert result.raw_tokens == [40]
request = TokenizationRequest(text="I am a computer scientist.", encode=True, truncation=True, max_length=1024)
- result = self.client.tokenize(request)
+ result = self.client.tokenizer.tokenize(request)
assert not result.cached, "First time making this particular request. Result should not be cached"
assert result.raw_tokens == [40, 716, 257, 3644, 11444, 13]
def test_decode(self):
request = DecodeRequest(tokens=[40, 716, 257, 3644, 11444, 13])
- result: DecodeRequestResult = self.client.decode(request)
+ result: DecodeRequestResult = self.client.tokenizer.decode(request)
assert not result.cached, "First time making the decode request. Result should not be cached"
- result: DecodeRequestResult = self.client.decode(request)
+ result: DecodeRequestResult = self.client.tokenizer.decode(request)
assert result.cached, "Result should be cached"
assert result.text == "I am a computer scientist."
@@ -60,7 +60,8 @@ def test_gpt2(self):
prompt: str = "I am a computer scientist."
result: RequestResult = self.client.make_request(
Request(
- model="huggingface/gpt2",
+ model="openai/gpt2",
+ model_deployment="huggingface/gpt2",
prompt=prompt,
num_completions=3,
top_k_per_token=5,
@@ -77,7 +78,8 @@ def test_gpt2(self):
def test_gptj_6b(self):
result: RequestResult = self.client.make_request(
Request(
- model="huggingface/gpt-j-6b",
+ model="eleutherai/gpt-j-6b",
+ model_deployment="huggingface/gpt-j-6b",
prompt="I am a computer scientist.",
num_completions=3,
top_k_per_token=5,
diff --git a/src/helm/proxy/clients/test_together_client.py b/src/helm/proxy/clients/test_together_client.py
index 59eebab9b9d..312fed545e0 100644
--- a/src/helm/proxy/clients/test_together_client.py
+++ b/src/helm/proxy/clients/test_together_client.py
@@ -27,6 +27,7 @@ def teardown_method(self, method):
(
Request(
model="together/redpajama-incite-base-3b-v1",
+ model_deployment="together/redpajama-incite-base-3b-v1",
),
{
"best_of": 1,
@@ -45,6 +46,7 @@ def teardown_method(self, method):
(
Request(
model="meta/llama-7b",
+ model_deployment="together/llama-7b",
prompt="I am a computer scientist.",
temperature=0,
num_completions=4,
@@ -71,6 +73,7 @@ def teardown_method(self, method):
(
Request(
model="stanford/alpaca-7b",
+ model_deployment="together/alpaca-7b",
stop_sequences=["\n"],
),
{
@@ -95,4 +98,4 @@ def test_convert_to_raw_request(self, test_input, expected):
def test_api_key_error(self):
with pytest.raises(TogetherClientError):
- self.client.make_request(Request(model="together/bloom"))
+ self.client.make_request(Request(model="bigscience/bloom", model_deployment="together/bloom"))
diff --git a/src/helm/proxy/clients/together_client.py b/src/helm/proxy/clients/together_client.py
index ad3365af5a9..24cb5b6a387 100644
--- a/src/helm/proxy/clients/together_client.py
+++ b/src/helm/proxy/clients/together_client.py
@@ -16,6 +16,10 @@
"h3-2.7b": "h3-2.7b-h3",
"opt-1.3b": "opt-1.3b-ft-tp1",
"opt-6.7b": "opt-6.7b-ft-tp1",
+ "mpt-7b": "togethercomputer/mpt-7b",
+ "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct",
+ "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b",
+ "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b",
# Production models
"redpajama-incite-base-3b-v1": "togethercomputer/RedPajama-INCITE-Base-3B-v1",
"redpajama-incite-instruct-3b-v1": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
@@ -29,6 +33,8 @@
"falcon-7b-instruct": "togethercomputer/falcon-7b-instruct",
"falcon-40b": "togethercomputer/falcon-40b",
"falcon-40b-instruct": "togethercomputer/falcon-40b-instruct",
+ "gpt-jt-6b-v1": "togethercomputer/GPT-JT-6B-v1",
+ "gpt-neoxt-chat-base-20b": "togethercomputer/GPT-NeoXT-Chat-Base-20B",
"llama-7b": "huggyllama/llama-7b",
"llama-13b": "huggyllama/llama-13b",
"llama-30b": "huggyllama/llama-30b",
@@ -37,16 +43,12 @@
"llama-2-13b": "togethercomputer/llama-2-13b",
"llama-2-70b": "togethercomputer/llama-2-70b",
"mistral-7b-v0.1": "mistralai/Mistral-7B-v0.1",
- "mpt-7b": "togethercomputer/mpt-7b",
- "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct",
"mpt-30b": "togethercomputer/mpt-30b",
"mpt-instruct-30b": "togethercomputer/mpt-30b-instruct",
"pythia-1b-v0": "EleutherAI/pythia-1b-v0",
"pythia-2.8b-v0": "EleutherAI/pythia-2.8b-v0",
"pythia-6.9b": "EleutherAI/pythia-6.9b",
"pythia-12b-v0": "EleutherAI/pythia-12b-v0",
- "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b",
- "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b",
"vicuna-7b-v1.3": "lmsys/vicuna-7b-v1.3",
"vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3",
}
@@ -55,7 +57,7 @@
HELM users use a shorter model name (e.g. together/flan-t5-xxl)
whereas the Together client sends and caches requests using
a longer model name that is suffixed with the implementation framework
-(e.g. flan-t5-xxl-hf). This allows trackcing exactly which
+(e.g. flan-t5-xxl-hf). This allows tracking exactly which
implementation was used in the cached results, since some results may
be different depending on the implementation (e.g. efficiency metrics).
This also allows future migration of results in the case of changes of
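Editor's note: the aliasing described in the docstring above amounts to a plain dictionary lookup with a pass-through default. A minimal sketch follows; the helper name is illustrative (not the client's actual identifier), and the two entries mirror mappings visible in this hunk and docstring.

    # Sketch only: short HELM engine name -> Together model string with framework suffix.
    MODEL_ALIASES = {
        "flan-t5-xxl": "flan-t5-xxl-hf",
        "llama-7b": "huggyllama/llama-7b",
    }

    def resolve_together_model(engine: str) -> str:
        # Fall back to the short HELM engine name when no alias is registered.
        return MODEL_ALIASES.get(engine, engine)

    assert resolve_together_model("llama-7b") == "huggyllama/llama-7b"
    assert resolve_together_model("unknown-model") == "unknown-model"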
diff --git a/src/helm/proxy/clients/vision_language/idefics_client.py b/src/helm/proxy/clients/vision_language/idefics_client.py
index 90d290667cd..38aa6d93ab8 100644
--- a/src/helm/proxy/clients/vision_language/idefics_client.py
+++ b/src/helm/proxy/clients/vision_language/idefics_client.py
@@ -78,10 +78,10 @@ def _get_model(self, checkpoint: str) -> LoadedIDEFICSModelProcessor:
return loaded_model_processor
def make_request(self, request: Request) -> RequestResult:
- assert request.model in _models, f"Not a valid model for this client: {request.model}"
+ assert request.model_deployment in _models, f"Not a valid model for this client: {request.model_deployment}"
assert request.multimodal_prompt is not None, "Multimodal prompt is required"
- loaded_model_processor: LoadedIDEFICSModelProcessor = self._get_model(request.model)
+ loaded_model_processor: LoadedIDEFICSModelProcessor = self._get_model(request.model_deployment)
model = loaded_model_processor.model
processor = loaded_model_processor.processor
diff --git a/src/helm/proxy/critique/model_critique_client.py b/src/helm/proxy/critique/model_critique_client.py
index 7c4caaeca65..f26b79897da 100644
--- a/src/helm/proxy/critique/model_critique_client.py
+++ b/src/helm/proxy/critique/model_critique_client.py
@@ -2,6 +2,7 @@
import string
import dataclasses
+from helm.benchmark.run_specs import get_default_model_deployment_for_model
from helm.common.critique_request import (
CritiqueRequest,
CritiqueRequestResult,
@@ -26,6 +27,10 @@ class ModelCritiqueClient(CritiqueClient):
def __init__(self, client: Client, model_name):
self._client = client
self._model_name = model_name
+ self._model_deployment_name = (
+ get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
+ or self._model_name
+ )
def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
for key, value in fields.items():
@@ -75,6 +80,7 @@ def _task_to_requests(self, task: CritiqueTaskTemplate, fields: Dict[str, str])
request = Request(
model=self._model_name,
+ model_deployment=self._model_deployment_name,
prompt=prompt,
max_tokens=max_tokens,
echo_prompt=False,
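Editor's note: the `_model_deployment_name` assignment added above follows a simple pattern: ask the deployment registry for a default and fall back to the model name itself. A minimal sketch under that assumption, with a stand-in for `get_default_model_deployment_for_model` (the real call takes extra keyword arguments):

    from typing import Callable, Optional

    def pick_deployment(model_name: str, lookup: Callable[[str], Optional[str]]) -> str:
        deployment = lookup(model_name)
        # Fall back to the model name when no default deployment is registered.
        return deployment or model_name

    assert pick_deployment("databricks/dolly-v2-3b", lambda m: "together/dolly-v2-3b") == "together/dolly-v2-3b"
    assert pick_deployment("simple/model1", lambda m: None) == "simple/model1"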
diff --git a/src/helm/proxy/example_queries.py b/src/helm/proxy/example_queries.py
index 10009f8c249..bad6b6fc39b 100644
--- a/src/helm/proxy/example_queries.py
+++ b/src/helm/proxy/example_queries.py
@@ -63,13 +63,13 @@ def dedent(text: str) -> str:
"""
temperature: 0
stop_sequences: [.]
- model: ${model} # Try out multiple models
+ model_deployment: ${model_deployment} # Try out multiple models
"""
),
environments=dedent(
"""
occupation: [mathematician, lawyer, doctor]
- model: [openai/davinci, ai21/j1-jumbo]
+ model_deployment: [openai/davinci, ai21/j1-jumbo]
"""
),
),
@@ -88,12 +88,12 @@ def dedent(text: str) -> str:
temperature: 0.5
stop_sequences: [\\n]
num_completions: 5
- model: ${model} # Try out GPT-3 and Jurassic
+ model_deployment: ${model_deployment} # Try out GPT-3 and Jurassic
"""
),
environments=dedent(
"""
- model: [openai/davinci, ai21/j1-jumbo]
+ model_deployment: [openai/davinci, ai21/j1-jumbo]
"""
),
),
@@ -122,12 +122,12 @@ def dedent(text: str) -> str:
temperature: 0
max_tokens: 1
top_k_per_token: 4
- model: ${model} # Try out GPT-3 and Jurassic
+ model_deployment: ${model_deployment} # Try out GPT-3 and Jurassic
"""
),
environments=dedent(
"""
- model: [openai/davinci, ai21/j1-jumbo]
+ model_deployment: [openai/davinci, ai21/j1-jumbo]
"""
),
),
@@ -135,7 +135,7 @@ def dedent(text: str) -> str:
prompt="Takes two vectors a and b and returns their Euclidean distance",
settings=dedent(
"""
- model: openai/code-davinci-001 # Codex for code generation
+ model_deployment: openai/code-davinci-001 # Codex for code generation
"""
),
environments="",
@@ -144,14 +144,14 @@ def dedent(text: str) -> str:
prompt="The quick brown fox",
settings=dedent(
"""
- model: ${model}
+ model_deployment: ${model_deployment}
temperature: 0.3
stop_sequences: [\\n]
"""
),
environments=dedent(
"""
- model: [
+ model_deployment: [
"openai/davinci", "openai/text-davinci-002",
"openai/text-davinci-003", "ai21/j1-grande-v2-beta",
"together/gpt-j-6b", "together/gpt-jt-6b-v1",
diff --git a/src/helm/proxy/models.py b/src/helm/proxy/models.py
deleted file mode 100644
index 477ae710d97..00000000000
--- a/src/helm/proxy/models.py
+++ /dev/null
@@ -1,963 +0,0 @@
-from dataclasses import dataclass, field
-from typing import Dict, List
-
-# Different modalities
-TEXT_MODEL_TAG: str = "text"
-IMAGE_MODEL_TAG: str = "image"
-CODE_MODEL_TAG: str = "code"
-EMBEDDING_MODEL_TAG: str = "embedding"
-
-# Some model APIs have limited functionalities
-FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "full_functionality_text"
-LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "limited_functionality_text"
-
-# ChatML format
-CHATML_MODEL_TAG: str = "chatml"
-
-# OpenAI Chat format
-OPENAI_CHATGPT_MODEL_TAG: str = "openai_chatgpt"
-
-# For Anthropic models
-ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "claude_1"
-ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "claude_2"
-
-# For OpenAI models with wider context windows
-# TODO(#1455): Simplify context window tags.
-WIDER_CONTEXT_WINDOW_TAG: str = "openai_wider_context_window" # huggingface/gpt2 tokenizer, 4000 tokens
-GPT_TURBO_CONTEXT_WINDOW_TAG: str = "gpt_turbo_context_window" # cl100k_base tokenizer, 4000 tokens
-GPT_TURBO_16K_CONTEXT_WINDOW_TAG: str = "gpt_turbo_16k_context_window" # cl100k_base tokenizer, 8000 tokens
-GPT4_CONTEXT_WINDOW_TAG: str = "gpt4_context_window" # cl100k_base tokenizer, 8192 tokens
-GPT4_32K_CONTEXT_WINDOW_TAG: str = "gpt4_32k_context_window" # cl100k_base tokenizer, 32768 tokens
-
-# For AI21 Jurassic-2 models with wider context windows
-AI21_WIDER_CONTEXT_WINDOW_TAG: str = "ai21_wider_context_window"
-
-# For AI21 Jurassic-2 Jumbo
-# AI21 has recommended using a sequence length of 6000 tokens to avoid OOMs.
-AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG: str = "ai21_jurassic_2_jumbo_context_window" # 6000
-
-# To fetch models that use these tokenizers
-GPT2_TOKENIZER_TAG: str = "gpt2_tokenizer"
-AI21_TOKENIZER_TAG: str = "ai21_tokenizer"
-COHERE_TOKENIZER_TAG: str = "cohere_tokenizer"
-OPT_TOKENIZER_TAG: str = "opt_tokenizer"
-GPTJ_TOKENIZER_TAG: str = "gptj_tokenizer"
-GPT4_TOKENIZER_TAG: str = "gpt4_tokenizer"
-GPTNEO_TOKENIZER_TAG: str = "gptneo_tokenizer"
-
-# Models which emit garbage tokens when temperature=0.
-BUGGY_TEMP_0_TAG: str = "buggy_temp_0"
-
-# Models that are used for ablations and fine-grained analyses.
-# These models are selected specifically because of their low marginal cost to evaluate.
-ABLATION_MODEL_TAG: str = "ablation"
-
-# Some models (e.g., T5) have stripped newlines.
-# So we cannot use \n as a stop sequence for these models.
-NO_NEWLINES_TAG: str = "no_newlines"
-
-# Some models (e.g., UL2) require a prefix (e.g., [NLG]) in the
-# prompts to indicate the mode before doing inference.
-NLG_PREFIX_TAG: str = "nlg_prefix_tag"
-
-# Some models can follow instructions.
-INSTRUCTION_FOLLOWING_MODEL_TAG: str = "instruction_following"
-
-# For Vision-langauge models (VLMs)
-VISION_LANGUAGE_MODEL_TAG: str = "vision_language"
-
-
-@dataclass
-class Model:
- """
- Represents a model that we can make requests to. Conceptually, an instance
- of `Model` is tied more to the hosting implementation (where can we send
- requests) rather than the conceptual model. These are the same for closed
- models, but different for open-source models. Note: for all the metadata
- and documentation about the model itself, see `ModelField` in `schema.py`.
- """
-
- # Model group, used to determine quotas (e.g. "huggingface").
- # This group is only for user accounts, not benchmarking, and should probably
- # called something else.
- group: str
-
- # Name of the specific model (e.g. "huggingface/gpt-j-6b")
- # The name is / or
- # /
- # There is also `` (see `ModelField`).
- name: str
-
- # Tags corresponding to the properties of the model.
- tags: List[str] = field(default_factory=list)
-
- @property
- def organization(self) -> str:
- """
- Extracts the organization from the model name.
- Example: 'ai21/j1-jumbo' => 'ai21'
- """
- return self.name.split("/")[0]
-
- @property
- def engine(self) -> str:
- """
- Extracts the model engine from the model name.
- Example: 'ai21/j1-jumbo' => 'j1-jumbo'
- """
- return self.name.split("/")[1]
-
-
-# For the list of available models, see the following docs:
-# Note that schema.yaml has much of this information now.
-# Over time, we should add more information there.
-
-ALL_MODELS = [
- # Local Model
- Model(
- group="neurips",
- name="neurips/local",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- # AI21: https://studio.ai21.com/pricing
- Model(
- group="jurassic",
- name="ai21/j1-jumbo",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- # From AI21: "the new model is a mid-point in terms of size, cost and performance between Jumbo and Large.
- # We also implemented a few tweaks to its training process. Internal benchmarks suggest it can really
- # help the unit economics on your end compared to Jumbo, without compromising too much on quality."
- Model(
- group="jurassic",
- name="ai21/j1-grande",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- Model(
- group="jurassic",
- name="ai21/j1-grande-v2-beta",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- Model(
- group="jurassic",
- name="ai21/j1-large",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- # AI21 Jurassic-2 Models: https://www.ai21.com/blog/introducing-j2
- Model(
- group="jurassic",
- name="ai21/j2-jumbo",
- tags=[
- TEXT_MODEL_TAG,
- AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG,
- FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
- AI21_TOKENIZER_TAG,
- ],
- ),
- Model(
- group="jurassic",
- name="ai21/j2-grande",
- tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- Model(
- group="jurassic",
- name="ai21/j2-large",
- tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG],
- ),
- # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous
- Model(
- group="luminous",
- name="AlephAlpha/luminous-base",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="luminous",
- name="AlephAlpha/luminous-extended",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="luminous",
- name="AlephAlpha/luminous-supreme",
- # Does not support echo.
- # TODO: images will be supported in the near future. Add IMAGE_MODEL_TAG.
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # TODO: coming soon. Uncomment out the following when Luminous World is released.
- # Model(
- # group="luminous",
- # name="AlephAlpha/luminous-world",
- # tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- # ),
- # Anthropic
- Model(
- group="anthropic",
- name="anthropic/stanford-online-all-v4-s3",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG, ABLATION_MODEL_TAG],
- ),
- Model(
- group="anthropic",
- name="anthropic/claude-2.0",
- tags=[
- ANTHROPIC_CLAUDE_2_MODEL_TAG,
- TEXT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="anthropic",
- name="anthropic/claude-v1.3",
- tags=[
- ANTHROPIC_CLAUDE_1_MODEL_TAG,
- TEXT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- ABLATION_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="anthropic",
- name="anthropic/claude-instant-v1",
- tags=[
- ANTHROPIC_CLAUDE_1_MODEL_TAG,
- TEXT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- ABLATION_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- # BigScience
- Model(
- group="together",
- name="together/bloom",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG],
- ),
- Model(
- group="together",
- name="together/t0pp",
- # Does not support echo=True
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
- ),
- # Cohere models
- # Model versioning and the possible versions are not documented here:
- # https://docs.cohere.ai/generate-reference#model-optional.
- # So, instead, we got the names of the models from the Cohere Playground.
- #
- # Note that their tokenizer and model were trained on English text and
- # they do not have a dedicated decode API endpoint, so the adaptation
- # step for language modeling fails for certain Scenarios:
- # the_pile:subset=ArXiv
- # the_pile:subset=Github
- # the_pile:subset=PubMed Central
- Model(
- group="cohere",
- name="cohere/xlarge-20220609",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/xlarge-20221108",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/large-20220720",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/medium-20220720",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/medium-20221108",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/small-20220720",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/command-medium-beta",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- Model(
- group="cohere",
- name="cohere/command-xlarge-beta",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- # EleutherAI
- Model(
- group="together",
- name="together/gpt-j-6b",
- tags=[
- TEXT_MODEL_TAG,
- FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
- ABLATION_MODEL_TAG,
- GPTJ_TOKENIZER_TAG,
- BUGGY_TEMP_0_TAG,
- ],
- ),
- Model(
- group="together",
- name="together/gpt-neox-20b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, GPTNEO_TOKENIZER_TAG],
- ),
- Model(
- group="together",
- name="eleutherai/pythia-1b-v0",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="eleutherai/pythia-2.8b-v0",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="eleutherai/pythia-6.9b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="eleutherai/pythia-12b-v0",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # Meta
- Model(
- group="together",
- name="meta/llama-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-13b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-30b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-65b",
- # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-2-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-2-13b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="meta/llama-2-70b",
- # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # Stanford
- Model(
- group="together",
- name="stanford/alpaca-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- # LMSYS
- Model(
- group="together",
- name="lmsys/vicuna-7b-v1.3",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- Model(
- group="together",
- name="lmsys/vicuna-13b-v1.3",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- # Mistral AI
- Model(
- group="mistralai",
- name="mistralai/mistral-7b-v0.1",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG],
- ),
- # MosaicML
- Model(
- group="together",
- name="mosaicml/mpt-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="mosaicml/mpt-instruct-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="mosaicml/mpt-30b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="mosaicml/mpt-instruct-30b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # TII UAE
- Model(
- group="together",
- name="tiiuae/falcon-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="tiiuae/falcon-7b-instruct",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="tiiuae/falcon-40b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="tiiuae/falcon-40b-instruct",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # GooseAI supported models
- Model(
- group="gooseai",
- name="gooseai/gpt-neo-20b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTNEO_TOKENIZER_TAG],
- ),
- Model(
- group="gooseai",
- name="gooseai/gpt-j-6b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
- ),
- # HuggingFace
- Model(
- group="huggingface",
- name="huggingface/gpt2",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="huggingface",
- name="huggingface/gpt-j-6b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
- ),
- Model(
- group="huggingface",
- name="huggingface/santacoder",
- tags=[CODE_MODEL_TAG],
- ),
- Model(
- group="huggingface",
- name="huggingface/starcoder",
- tags=[CODE_MODEL_TAG],
- ),
- # Google
- Model(
- group="together",
- name="together/t5-11b",
- # Does not support echo=True
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
- ),
- Model(
- group="together",
- name="together/flan-t5-xxl",
- # Does not support echo=True
- tags=[
- TEXT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- ABLATION_MODEL_TAG,
- NO_NEWLINES_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="together",
- name="together/ul2",
- # Does not support echo=True
- tags=[
- TEXT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- ABLATION_MODEL_TAG,
- NO_NEWLINES_TAG,
- NLG_PREFIX_TAG,
- ],
- ),
- # H3 model
- Model(
- group="together",
- name="together/h3-2.7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- # OPT
- Model(
- group="together",
- name="together/opt-175b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, OPT_TOKENIZER_TAG],
- ),
- Model(
- group="together",
- name="together/opt-66b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, OPT_TOKENIZER_TAG],
- ),
- Model(
- group="together",
- name="together/opt-6.7b",
- tags=[
- TEXT_MODEL_TAG,
- FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
- ABLATION_MODEL_TAG,
- OPT_TOKENIZER_TAG,
- BUGGY_TEMP_0_TAG,
- ],
- ),
- Model(
- group="together",
- name="together/opt-1.3b",
- tags=[
- TEXT_MODEL_TAG,
- FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
- ABLATION_MODEL_TAG,
- OPT_TOKENIZER_TAG,
- BUGGY_TEMP_0_TAG,
- ],
- ),
- # Microsoft/NVIDIA
- Model(
- group="microsoft",
- name="microsoft/TNLGv2_530B",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="microsoft",
- name="microsoft/TNLGv2_7B",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- # OpenAI: https://beta.openai.com/docs/engines/gpt-3
- Model(
- group="gpt3",
- name="openai/davinci",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/curie",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/babbage",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/ada",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- # TODO: text-davinci-002 supports insertion. Support insertion in our framework.
- # https://github.com/stanford-crfm/benchmarking/issues/359
- Model(
- group="gpt3",
- name="openai/text-davinci-003",
- tags=[
- TEXT_MODEL_TAG,
- WIDER_CONTEXT_WINDOW_TAG,
- FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt3",
- name="openai/text-davinci-002",
- tags=[TEXT_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-davinci-001",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-curie-001",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-babbage-001",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-ada-001",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="codex",
- name="openai/code-davinci-002",
- tags=[CODE_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="codex",
- name="openai/code-davinci-001",
- tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- Model(
- group="codex",
- name="openai/code-cushman-001",
- tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG],
- ),
- # GPT-4
- Model(
- group="gpt4",
- name="openai/gpt-4-0314",
- tags=[
- TEXT_MODEL_TAG,
- GPT4_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt4",
- name="openai/gpt-4-32k-0314",
- tags=[
- TEXT_MODEL_TAG,
- GPT4_32K_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt4",
- name="openai/gpt-4-0613",
- tags=[
- TEXT_MODEL_TAG,
- GPT4_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt4",
- name="openai/gpt-4-32k-0613",
- tags=[
- TEXT_MODEL_TAG,
- GPT4_32K_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- # ChatGPT: https://openai.com/blog/chatgpt
- Model(
- group="gpt3",
- name="openai/gpt-3.5-turbo-0301",
- # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
- # sequence length is smaller at 4087 with one user input message and one assistant
- # output message because ChatGPT uses special tokens for message roles and boundaries.
- # We use a rounded-down sequence length of 4000 to account for these special tokens.
- tags=[
- TEXT_MODEL_TAG,
- GPT_TURBO_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt3",
- name="openai/gpt-3.5-turbo-0613",
- # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable
- # sequence length is smaller at 4087 with one user input message and one assistant
- # output message because ChatGPT uses special tokens for message roles and boundaries.
- # We use a rounded-down sequence length of 4000 to account for these special tokens.
- tags=[
- TEXT_MODEL_TAG,
- GPT_TURBO_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- Model(
- group="gpt3",
- name="openai/gpt-3.5-turbo-16k-0613",
- # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained
- # in the openai/gpt-3.5-turbo-0613 comment
- tags=[
- TEXT_MODEL_TAG,
- GPT_TURBO_16K_CONTEXT_WINDOW_TAG,
- GPT4_TOKENIZER_TAG,
- OPENAI_CHATGPT_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- ],
- ),
- # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings
- Model(
- group="gpt3",
- name="openai/text-similarity-davinci-001",
- tags=[EMBEDDING_MODEL_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-similarity-curie-001",
- tags=[EMBEDDING_MODEL_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-similarity-babbage-001",
- tags=[EMBEDDING_MODEL_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-similarity-ada-001",
- tags=[EMBEDDING_MODEL_TAG],
- ),
- Model(
- group="gpt3",
- name="openai/text-embedding-ada-002",
- tags=[EMBEDDING_MODEL_TAG],
- ),
- # Together
- Model(
- group="together",
- name="together/gpt-jt-6b-v1",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG],
- ),
- Model(
- group="together",
- name="together/gpt-neoxt-chat-base-20b",
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG, GPTNEO_TOKENIZER_TAG],
- ),
- Model(
- group="together",
- name="together/redpajama-incite-base-3b-v1",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="together/redpajama-incite-instruct-3b-v1",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="together/redpajama-incite-base-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="together/redpajama-incite-instruct-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # Tsinghua
- Model(
- group="together",
- name="together/glm",
- # Inference with echo=True is not feasible -- in the prompt encoding phase, they use
- # bidirectional attention and do not perform predictions on them.
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG],
- ),
- # Writer
- Model(
- group="palmyra",
- name="writer/palmyra-base",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/palmyra-large",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/palmyra-r",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/camel",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/palmyra-instruct-30",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/palmyra-e",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/silk-road",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="palmyra",
- name="writer/palmyra-x",
- # Does not support echo
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # Yandex
- Model(
- group="together",
- name="together/yalm",
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG],
- ),
- # Google
- Model(
- group="google",
- name="google/palm",
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # NVIDIA
- Model(
- group="nvidia",
- name="nvidia/megatron-gpt2",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG, BUGGY_TEMP_0_TAG],
- ),
- # Databricks
- Model(
- group="together",
- name="databricks/dolly-v2-3b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="databricks/dolly-v2-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="databricks/dolly-v2-12b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- # Stability AI
- Model(
- group="together",
- name="stabilityai/stablelm-base-alpha-3b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="together",
- name="stabilityai/stablelm-base-alpha-7b",
- tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG],
- ),
- Model(
- group="lightningai",
- name="lightningai/lit-gpt",
- tags=[
- TEXT_MODEL_TAG,
- INSTRUCTION_FOLLOWING_MODEL_TAG,
- LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
- GPT2_TOKENIZER_TAG,
- ],
- ),
- # Vision-language models (VLMs)
- Model(
- group="idefics",
- name="HuggingFaceM4/idefics-9b",
- tags=[VISION_LANGUAGE_MODEL_TAG],
- ),
- Model(
- group="idefics",
- name="HuggingFaceM4/idefics-9b-instruct",
- tags=[VISION_LANGUAGE_MODEL_TAG],
- ),
- Model(
- group="idefics",
- name="HuggingFaceM4/idefics-80b",
- tags=[VISION_LANGUAGE_MODEL_TAG],
- ),
- Model(
- group="idefics",
- name="HuggingFaceM4/idefics-80b-instruct",
- tags=[VISION_LANGUAGE_MODEL_TAG],
- ),
- # For debugging
- Model(
- group="simple",
- name="simple/model1",
- ),
-]
-
-MODEL_NAME_TO_MODEL: Dict[str, Model] = {model.name: model for model in ALL_MODELS}
-
-
-def get_model(model_name: str) -> Model:
- """Get the `Model` given the name."""
- if model_name not in MODEL_NAME_TO_MODEL:
- raise ValueError(f"No model with name: {model_name}")
-
- return MODEL_NAME_TO_MODEL[model_name]
-
-
-def get_model_group(model_name: str) -> str:
- """Get the model's group given the name."""
- model: Model = get_model(model_name)
- return model.group
-
-
-def get_all_models() -> List[str]:
- """Get all model names."""
- return list(MODEL_NAME_TO_MODEL.keys())
-
-
-def get_models_by_organization(organization: str) -> List[str]:
- """
- Gets models by organization e.g., ai21 => ai21/j1-jumbo, ai21/j1-grande, ai21-large.
- """
- return [model.name for model in ALL_MODELS if model.organization == organization]
-
-
-def get_model_names_with_tag(tag: str) -> List[str]:
- """Get all the name of the models with tag `tag`."""
- return [model.name for model in ALL_MODELS if tag in model.tags]
-
-
-def get_all_text_models() -> List[str]:
- """Get all text model names."""
- return get_model_names_with_tag(TEXT_MODEL_TAG)
-
-
-def get_all_code_models() -> List[str]:
- """Get all code model names."""
- return get_model_names_with_tag(CODE_MODEL_TAG)
-
-
-def get_all_instruction_following_models() -> List[str]:
- """Get all instruction-following model names."""
- return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
diff --git a/src/helm/proxy/services/server_service.py b/src/helm/proxy/services/server_service.py
index 1361859767e..a22ceb35b41 100644
--- a/src/helm/proxy/services/server_service.py
+++ b/src/helm/proxy/services/server_service.py
@@ -2,9 +2,6 @@
import signal
from typing import List, Optional
-from helm.benchmark.model_metadata_registry import maybe_register_model_metadata_from_base_path
-from helm.benchmark.model_deployment_registry import maybe_register_model_deployments_from_base_path
-from helm.benchmark.tokenizer_config_registry import maybe_register_tokenizer_configs_from_base_path
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
from helm.common.authentication import Authentication
from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
@@ -22,10 +19,12 @@
from helm.proxy.clients.auto_client import AutoClient
from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.proxy.example_queries import example_queries
-from helm.proxy.models import ALL_MODELS, get_model_group
+from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
+from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
from helm.proxy.query import Query, QueryResult
from helm.proxy.retry import retry_request
from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
+from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer
from .service import (
Service,
CACHE_DIR,
@@ -48,18 +47,15 @@ def __init__(self, base_path: str = "prod_env", root_mode=False, mongo_uri: str
ensure_directory_exists(cache_path)
accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
- maybe_register_model_metadata_from_base_path(base_path)
- maybe_register_model_deployments_from_base_path(base_path)
- maybe_register_tokenizer_configs_from_base_path(base_path)
-
self.client = AutoClient(credentials, cache_path, mongo_uri)
+ self.tokenizer = AutoTokenizer(credentials, cache_path, mongo_uri)
self.token_counter = AutoTokenCounter(self.client.get_huggingface_client())
self.accounts = Accounts(accounts_path, root_mode=root_mode)
# Lazily instantiated by get_toxicity_scores()
self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
def get_general_info(self) -> GeneralInfo:
- return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=ALL_MODELS)
+ return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=ALL_MODELS_METADATA)
def get_window_service_info(self, model_name) -> WindowServiceInfo:
# The import statement is placed here to avoid two problems, please refer to the link for details
@@ -95,9 +91,9 @@ def make_request(self, auth: Authentication, request: Request) -> RequestResult:
# https://github.com/stanford-crfm/benchmarking/issues/56
self.accounts.authenticate(auth)
- model_group: str = get_model_group(request.model)
+ host_organization: str = get_model_deployment_host_organization(request.model_deployment)
# Make sure we can use
- self.accounts.check_can_use(auth.api_key, model_group)
+ self.accounts.check_can_use(auth.api_key, host_organization)
# Use!
request_result: RequestResult = self.client.make_request(request)
@@ -106,19 +102,19 @@ def make_request(self, auth: Authentication, request: Request) -> RequestResult:
if not request_result.cached:
# Count the number of tokens used
count: int = self.token_counter.count_tokens(request, request_result.completions)
- self.accounts.use(auth.api_key, model_group, count)
+ self.accounts.use(auth.api_key, host_organization, count)
return request_result
def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
"""Tokenize via an API."""
self.accounts.authenticate(auth)
- return self.client.tokenize(request)
+ return self.tokenizer.tokenize(request)
def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
"""Decodes to text."""
self.accounts.authenticate(auth)
- return self.client.decode(request)
+ return self.tokenizer.decode(request)
def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
@retry_request
diff --git a/src/helm/proxy/services/service.py b/src/helm/proxy/services/service.py
index f169008ff2c..af3b500c09d 100644
--- a/src/helm/proxy/services/service.py
+++ b/src/helm/proxy/services/service.py
@@ -14,7 +14,7 @@
DecodeRequestResult,
)
from helm.common.request import Request, RequestResult
-from helm.proxy.models import Model
+from helm.benchmark.model_metadata_registry import ModelMetadata
from helm.proxy.query import Query, QueryResult
from helm.proxy.accounts import Authentication, Account
@@ -29,7 +29,7 @@
class GeneralInfo:
version: str
example_queries: List[Query]
- all_models: List[Model]
+ all_models: List[ModelMetadata]
def expand_environments(environments: Dict[str, List[str]]):
@@ -69,6 +69,8 @@ def synthesize_request(prompt: str, settings: str, environment: Dict[str, str])
request: Dict[str, Any] = {}
request["prompt"] = substitute_text(prompt, environment)
request.update(parse_hocon(substitute_text(settings, environment)))
+ if "model_deployment" not in request and "model" not in request:
+ request["model_deployment"] = "openai/text-davinci-002"
return Request(**request)
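Editor's note: a reduced sketch of the default added to `synthesize_request` above (the real function also substitutes the prompt and parses HOCON settings); the function name here is illustrative.

    def apply_default_deployment(request: dict) -> dict:
        # Only inject a default when the query specifies neither a model nor a deployment.
        if "model_deployment" not in request and "model" not in request:
            request["model_deployment"] = "openai/text-davinci-002"
        return request

    assert apply_default_deployment({"prompt": "hi"})["model_deployment"] == "openai/text-davinci-002"
    assert "model_deployment" not in apply_default_deployment({"model": "simple/model1"})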
diff --git a/src/helm/proxy/services/test_remote_service.py b/src/helm/proxy/services/test_remote_service.py
index 63a267d5608..e2f4306f7a3 100644
--- a/src/helm/proxy/services/test_remote_service.py
+++ b/src/helm/proxy/services/test_remote_service.py
@@ -85,7 +85,7 @@ def create_root_account() -> str:
@staticmethod
def query(url: str, auth: Authentication, prompt: str):
- request = Request(prompt=prompt, model="simple/model1")
+ request = Request(prompt=prompt, model="simple/model1", model_deployment="simple/model1")
response: RequestResult = RemoteService(base_url=url).make_request(auth, request)
response_text: str = response.completions[0].text
# With the toy model (simple/model1), we should expect the same response as the prompt
@@ -121,7 +121,7 @@ def teardown_class(cls):
shutil.rmtree(cls.base_path)
def test_make_request(self):
- request = Request(prompt="1 2 3", model="simple/model1")
+ request = Request(prompt="1 2 3", model="simple/model1", model_deployment="simple/model1")
response: RequestResult = self.service.make_request(self.auth, request)
assert response.success
@@ -132,7 +132,7 @@ def test_tokenize(self):
def test_make_request_plus_sign(self):
# Ensure + in prompt doesn't get replaced by a blank space
- request = Request(prompt="+", model="simple/model1")
+ request = Request(prompt="+", model="simple/model1", model_deployment="simple/model1")
response: RequestResult = self.service.make_request(self.auth, request)
assert response.completions[0].text == "+"
assert response.success
diff --git a/src/helm/proxy/services/test_service.py b/src/helm/proxy/services/test_service.py
index 1d3f2583f9b..6c0fd19b0ce 100644
--- a/src/helm/proxy/services/test_service.py
+++ b/src/helm/proxy/services/test_service.py
@@ -3,6 +3,7 @@
import shutil
import tempfile
+from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment
from helm.common.authentication import Authentication
from helm.common.request import Request
from helm.proxy.accounts import AuthenticationError, Accounts
@@ -34,7 +35,9 @@ def test_expand_query(self):
def test_make_request(self):
num_completions = 2
- request = Request(prompt="1 2 3", model="simple/model1", num_completions=num_completions)
+ request = Request(
+ prompt="1 2 3", model="simple/model1", model_deployment="simple/model1", num_completions=num_completions
+ )
result = self.service.make_request(self.auth, request)
assert len(result.completions) == num_completions
@@ -211,7 +214,7 @@ def helper_prod_test_service(request: Request, expected_text: str):
# Models that we want to test
-prod_models = ["openai/davinci", "ai21/j1-jumbo"]
+prod_model_deployments = ["openai/davinci", "ai21/j1-jumbo"]
# TODO: put a flag on this so that it's easy to use pytest to still run these slow tests
@@ -220,8 +223,17 @@ def helper_prod_test_service(request: Request, expected_text: str):
def test_prod_continue():
# Test that we're continuing
prompt = "Paris is the capital of"
- for model in prod_models:
- request = Request(prompt=prompt, model=model, max_tokens=1, num_completions=1, temperature=0)
+ for model_deployment_name in prod_model_deployments:
+ model_deployment: ModelDeployment = get_model_deployment(model_deployment_name)
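+ # A deployment may serve a model with a different name; fall back to the deployment name if model_name is unset.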
+ model_name: str = model_deployment.model_name or model_deployment.name
+ request = Request(
+ prompt=prompt,
+ model=model_name,
+ model_deployment=model_deployment_name,
+ max_tokens=1,
+ num_completions=1,
+ temperature=0,
+ )
helper_prod_test_service(request, " France")
@@ -229,6 +241,15 @@ def test_prod_continue():
def test_prod_echo():
# If we're echoing the prompt, make sure we're getting the same thing back
prompt = "I like pickles."
- for model in prod_models:
- request = Request(prompt=prompt, model=model, max_tokens=0, num_completions=1, echo_prompt=True)
+ for model_deployment_name in prod_model_deployments:
+ model_deployment: ModelDeployment = get_model_deployment(model_deployment_name)
+ model_name: str = model_deployment.model_name or model_deployment.name
+ request = Request(
+ prompt=prompt,
+ model=model_name,
+ model_deployment=model_deployment_name,
+ max_tokens=0,
+ num_completions=1,
+ echo_prompt=True,
+ )
helper_prod_test_service(request, prompt)
diff --git a/src/helm/proxy/static/index.js b/src/helm/proxy/static/index.js
index 26ad8b8416b..dc97d42ab1b 100644
--- a/src/helm/proxy/static/index.js
+++ b/src/helm/proxy/static/index.js
@@ -237,7 +237,7 @@ $(function () {
//
// get_num_bytes() and convert_tokens_to_text() in src/helm/benchmark/basic_metrics.py are adapted from this function.
const groups = [];
- for (let i = 0; i < tokens.length; ) {
+ for (let i = 0; i < tokens.length;) {
// Aggregate consecutive tokens while they're "bytes:..."
const group = { tokens: [] };
if (tokens[i].text.startsWith("bytes:")) {
diff --git a/src/helm/proxy/test_models.py b/src/helm/proxy/test_models.py
deleted file mode 100644
index c966815927c..00000000000
--- a/src/helm/proxy/test_models.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from .models import get_model, get_model_group, get_models_by_organization, get_all_code_models, Model
-
-
-def test_get_model():
- model: Model = get_model("ai21/j1-jumbo")
- assert model.organization == "ai21"
- assert model.engine == "j1-jumbo"
-
-
-def test_get_model_with_invalid_model_name():
- try:
- get_model("invalid/model")
- assert False, "Expected to throw ValueError"
- except ValueError:
- pass
-
-
-def test_get_model_group():
- assert get_model_group("openai/text-curie-001") == "gpt3"
-
-
-def test_get_models_by_organization():
- assert get_models_by_organization("simple") == ["simple/model1"]
-
-
-def test_all_code_models():
- assert "openai/code-davinci-002" in get_all_code_models()
diff --git a/src/helm/proxy/token_counters/auto_token_counter.py b/src/helm/proxy/token_counters/auto_token_counter.py
index 31d93d3d638..60604f2aa83 100644
--- a/src/helm/proxy/token_counters/auto_token_counter.py
+++ b/src/helm/proxy/token_counters/auto_token_counter.py
@@ -38,5 +38,5 @@ def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
"""
Counts tokens based on the organization.
"""
- token_counter: TokenCounter = self.get_token_counter(request.model_organization)
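+ # Pick the token counter for the organization hosting the model deployment rather than the model creator.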
+ token_counter: TokenCounter = self.get_token_counter(request.model_host)
return token_counter.count_tokens(request, completions)
diff --git a/src/helm/proxy/token_counters/openai_token_counter.py b/src/helm/proxy/token_counters/openai_token_counter.py
index e3083cea5cd..01ca7d35426 100644
--- a/src/helm/proxy/token_counters/openai_token_counter.py
+++ b/src/helm/proxy/token_counters/openai_token_counter.py
@@ -15,7 +15,7 @@ def count_tokens(self, request: Request, completions: List[Sequence]) -> int:
Counts the total number of tokens using the suggestion here:
https://community.openai.com/t/how-do-i-calculate-the-pricing-for-generation-of-text/11662/5
"""
- tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenize(
+ tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenizer.tokenize(
TokenizationRequest(request.prompt)
)
# Number of tokens in the prompt + number of tokens in all the completions
diff --git a/src/helm/proxy/token_counters/test_ai21_token_counter.py b/src/helm/proxy/token_counters/test_ai21_token_counter.py
index 9ae4541fcb2..026943dfc8e 100644
--- a/src/helm/proxy/token_counters/test_ai21_token_counter.py
+++ b/src/helm/proxy/token_counters/test_ai21_token_counter.py
@@ -10,11 +10,13 @@ def setup_method(self, method):
def test_count_tokens(self):
request = Request(
+ model="openai/text-davinci-002",
+ model_deployment="openai/text-davinci-002",
prompt="The Center for Research on Foundation Models (CRFM) is "
"an interdisciplinary initiative born out of the Stanford "
"Institute for Human-Centered Artificial Intelligence (HAI) "
"that aims to make fundamental advances in the study, development, "
- "and deployment of foundation models."
+ "and deployment of foundation models.",
)
completions: List[Sequence] = [
Sequence(
diff --git a/src/helm/proxy/token_counters/test_openai_token_counter.py b/src/helm/proxy/token_counters/test_openai_token_counter.py
index de9fcc3ef35..3f7bbfaebae 100644
--- a/src/helm/proxy/token_counters/test_openai_token_counter.py
+++ b/src/helm/proxy/token_counters/test_openai_token_counter.py
@@ -32,7 +32,11 @@ def teardown_method(self, method):
os.remove(self.cache_path)
def test_count_tokens(self):
- request = Request(prompt=TestOpenAITokenCounter.TEST_PROMPT)
+ request = Request(
+ model="openai/text-davinci-002",
+ model_deployment="openai/text-davinci-002",
+ prompt=TestOpenAITokenCounter.TEST_PROMPT,
+ )
completions: List[Sequence] = [
Sequence(
text=" The CRFM is dedicated to advancing our knowledge of the foundations of artificial intelligence "
diff --git a/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py b/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py
index a43c63b8414..313cc0a4be4 100644
--- a/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py
+++ b/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py
@@ -31,7 +31,7 @@ class AlephAlphaTokenizer(CachingTokenizer):
def __init__(self, api_key: str, cache_config: CacheConfig) -> None:
super().__init__(cache_config)
self.api_key: str = api_key
- self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key)
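+ # Only construct the Aleph Alpha client when an API key is provided; a missing key is reported later, when a tokenizer is actually requested.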
+ self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key) if api_key else None
self._tokenizer_name_to_tokenizer: Dict[str, InternalTokenizer] = {}
def _get_tokenizer(self, tokenizer_name: str) -> InternalTokenizer:
@@ -40,6 +40,8 @@ def _get_tokenizer(self, tokenizer_name: str) -> InternalTokenizer:
# Check if the tokenizer is cached
if tokenizer_name not in self._tokenizer_name_to_tokenizer:
+ if self._aleph_alpha_client is None:
+ raise ValueError("Aleph Alpha API key not set.")
self._tokenizer_name_to_tokenizer[tokenizer_name] = self._aleph_alpha_client.tokenizer(tokenizer_name)
hlog(f"Initialized tokenizer: {tokenizer_name}")
return self._tokenizer_name_to_tokenizer[tokenizer_name]
diff --git a/src/helm/proxy/tokenizers/auto_tokenizer.py b/src/helm/proxy/tokenizers/auto_tokenizer.py
new file mode 100644
index 00000000000..5722a7e97bf
--- /dev/null
+++ b/src/helm/proxy/tokenizers/auto_tokenizer.py
@@ -0,0 +1,89 @@
+from dataclasses import replace
+from typing import Any, Dict, Mapping, Optional
+
+from retrying import Attempt, RetryError
+
+from helm.benchmark.tokenizer_config_registry import get_tokenizer_config
+from helm.common.cache_utils import build_cache_config
+from helm.common.credentials_utils import provide_api_key
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hlog
+from helm.common.object_spec import create_object, inject_object_spec_args
+from helm.common.tokenization_request import (
+ DecodeRequest,
+ DecodeRequestResult,
+ TokenizationRequest,
+ TokenizationRequestResult,
+)
+from helm.proxy.tokenizers.tokenizer import Tokenizer
+
+
+class AutoTokenizer(Tokenizer):
+ """Automatically dispatch to the proper `Tokenizer` based on the tokenizer name."""
+
+ def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: str = ""):
+ self.credentials = credentials
+ self.cache_path = cache_path
+ self.mongo_uri = mongo_uri
+ self.tokenizers: Dict[str, Tokenizer] = {}
+ hlog(f"AutoTokenizer: cache_path = {cache_path}")
+ hlog(f"AutoTokenizer: mongo_uri = {mongo_uri}")
+
+ def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer:
+ # First try to find the tokenizer in the cache
+ tokenizer: Optional[Tokenizer] = self.tokenizers.get(tokenizer_name)
+ if tokenizer is not None:
+ return tokenizer
+
+ # Otherwise, create the tokenizer
+ organization: str = tokenizer_name.split("/")[0]
+ cache_config: CacheConfig = build_cache_config(self.cache_path, self.mongo_uri, organization)
+
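+ # Look up the tokenizer's spec in the registry and build it, injecting the cache config and a lazy API key provider.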
+ tokenizer_config = get_tokenizer_config(tokenizer_name)
+ if tokenizer_config:
+ tokenizer_spec = inject_object_spec_args(
+ tokenizer_config.tokenizer_spec,
+ constant_bindings={"cache_config": cache_config},
+ provider_bindings={
+ "api_key": lambda: provide_api_key(self.credentials, organization),
+ },
+ )
+ tokenizer = create_object(tokenizer_spec)
+
+ # Cache the tokenizer
+ assert isinstance(tokenizer, Tokenizer) # To make mypy happy
+ self.tokenizers[tokenizer_name] = tokenizer
+
+ return tokenizer
+
+ def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+ """Tokenizes based on the name of the tokenizer (e.g., huggingface/gpt2)."""
+
+ def tokenize_with_retry(tokenizer: Tokenizer, request: TokenizationRequest) -> TokenizationRequestResult:
+ return tokenizer.tokenize(request)
+
+ tokenizer: Tokenizer = self._get_tokenizer(request.tokenizer)
+
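+ # If tokenization still fails after all retries, log the failure and return the last result with the error recorded instead of raising.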
+ try:
+ return tokenize_with_retry(tokenizer=tokenizer, request=request)
+ except RetryError as e:
+ last_attempt: Attempt = e.last_attempt
+ retry_error: str = f"Failed to tokenize after retrying {last_attempt.attempt_number} times"
+ hlog(retry_error)
+ return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
+
+ def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+ """Decodes based on the the name of the tokenizer (e.g., huggingface/gpt2)."""
+
+ def decode_with_retry(tokenizer: Tokenizer, request: DecodeRequest) -> DecodeRequestResult:
+ return tokenizer.decode(request)
+
+ tokenizer: Tokenizer = self._get_tokenizer(request.tokenizer)
+
+ try:
+ return decode_with_retry(tokenizer=tokenizer, request=request)
+ except RetryError as e:
+ last_attempt: Attempt = e.last_attempt
+ retry_error: str = f"Failed to decode after retrying {last_attempt.attempt_number} times"
+ hlog(retry_error)
+ return replace(last_attempt.value, error=f"{retry_error}. Error: {last_attempt.value.error}")
diff --git a/src/helm/proxy/clients/test_anthropic_client.py b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
similarity index 71%
rename from src/helm/proxy/clients/test_anthropic_client.py
rename to src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
index d1a039ef07e..3556978b5ae 100644
--- a/src/helm/proxy/clients/test_anthropic_client.py
+++ b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py
@@ -10,11 +10,10 @@
TokenizationRequest,
TokenizationRequestResult,
)
-from helm.proxy.tokenizers.anthropic_tokenizer import AnthropicTokenizer
-from .anthropic_client import AnthropicClient
+from .anthropic_tokenizer import AnthropicTokenizer
-class TestAnthropicClient:
+class TestAnthropicTokenizer:
TEST_PROMPT: str = "I am a computer scientist."
TEST_ENCODED: List[int] = [45, 1413, 269, 6797, 22228, 18]
TEST_TOKENS: List[str] = ["I", " am", " a", " computer", " scientist", "."]
@@ -22,42 +21,39 @@ class TestAnthropicClient:
def setup_method(self, method):
cache_file = tempfile.NamedTemporaryFile(delete=False)
self.cache_path: str = cache_file.name
- self.client = AnthropicClient(
- tokenizer=AnthropicTokenizer(SqliteCacheConfig(self.cache_path)),
- cache_config=SqliteCacheConfig(self.cache_path),
- )
+ self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
def teardown_method(self, method):
os.remove(self.cache_path)
def test_tokenize(self):
request = TokenizationRequest(text=self.TEST_PROMPT)
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
assert not result.cached, "First time making the tokenize request. Result should not be cached"
assert result.raw_tokens == self.TEST_TOKENS
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
assert result.cached, "Result should be cached"
assert result.raw_tokens == self.TEST_TOKENS
def test_encode(self):
request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1)
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
assert not result.cached, "First time making the tokenize request. Result should not be cached"
assert result.raw_tokens == [self.TEST_ENCODED[0]]
- result: TokenizationRequestResult = self.client.tokenize(request)
+ result: TokenizationRequestResult = self.tokenizer.tokenize(request)
assert result.cached, "Result should be cached"
assert result.raw_tokens == [self.TEST_ENCODED[0]]
request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1024)
- result = self.client.tokenize(request)
+ result = self.tokenizer.tokenize(request)
assert not result.cached, "First time making this particular request. Result should not be cached"
assert result.raw_tokens == self.TEST_ENCODED
def test_decode(self):
request = DecodeRequest(tokens=self.TEST_ENCODED)
- result: DecodeRequestResult = self.client.decode(request)
+ result: DecodeRequestResult = self.tokenizer.decode(request)
assert not result.cached, "First time making the decode request. Result should not be cached"
assert result.text == self.TEST_PROMPT
- result: DecodeRequestResult = self.client.decode(request)
+ result: DecodeRequestResult = self.tokenizer.decode(request)
assert result.cached, "Result should be cached"
assert result.text == self.TEST_PROMPT