feat: evaluate refactor outcomes using LLM to decide whether a file edit should be accepted or discarded
zaripych committed Jan 21, 2024
1 parent 671d221 commit 6f05f09
Showing 78 changed files with 3,872 additions and 958 deletions.
56 changes: 56 additions & 0 deletions .changeset/witty-singers-sort.md
@@ -0,0 +1,56 @@
---
'refactor-bot': patch
---

feat: evaluate refactor outcomes using LLM to decide whether a file edit should
be accepted or discarded

This is a big change which adds extra steps to the refactor process. Every time
an LLM produces a file edit, we pass that edit through an evaluation algorithm
to assess whether it should be accepted or discarded. Previously, this decision
was based only on the presence or absence of eslint errors. This should make
the final result higher quality and more reliable.

The new behavior can be disabled by setting `evaluate: false` in the `goal.md`
file.
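
For example, the config section at the top of a `goal.md` that opts out of the
new evaluation step might look roughly like this (an illustrative sketch — the
other values are placeholders):

```yaml
# top section of goal.md (illustrative values)
budgetCents: 100
model: gpt-4-1106-preview
evaluate: false # skip the LLM-based evaluation of file edits
```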

In addition, this change adds a new CLI command for internal use which allows
us to compare the results of multiple refactor runs. This is useful for
benchmarking purposes.

To run the benchmark, use the following command:

```sh
pnpm benchmark --config .refactor-bot/benchmarks/test-benchmark.yaml
```

Where the config looks like this:

```yaml
refactorConfig:
    name: test-refactoring
    ref: 8f1a3da55caeee3df75853042e57978c45513f18
    budgetCents: 100
    model: gpt-4-1106-preview
    objective:
        Replace all usages of `readFile` from `fs/promises` module with
        `readFileSync` from `fs` module in
        `packages/refactor-bot/src/refactor/planTasks.ts`,
        `packages/refactor-bot/src/refactor/loadRefactors.ts` and
        `packages/refactor-bot/src/refactor/discoverDependencies.ts`.

numberOfRuns: 2

variants:
    - name: 'A'
      ids: # ids of refactor runs to reuse, to save money
          - VRixXEwC
          - k0FmgQjU
          - IpSOtP7d
          - xqydSrSU
    - name: 'B'
```
This will run multiple refactor runs for every variant and compare the results.
At this moment no statistical analysis is performed, as I'm not convinced we can
reach statistical significance with a number of runs that doesn't also make you
poor.
6 changes: 4 additions & 2 deletions .gitignore
@@ -2,7 +2,9 @@
 node_modules
 .tsc-out
 src/test.ts
-.refactor-bot/refactors/*/state
 .DS_Store
-.refactor-bot/prompts/playground.md
 dist
+.refactor-bot/benchmarks/state
+.refactor-bot/refactors/*/state
+.refactor-bot/prompts/playground.md
+.refactor-bot/playground-cache
22 changes: 22 additions & 0 deletions .refactor-bot/benchmarks/test-benchmark.yaml
@@ -0,0 +1,22 @@
refactorConfig:
    name: test-refactoring
    ref: 8f1a3da55caeee3df75853042e57978c45513f18
    budgetCents: 100
    model: gpt-4-1106-preview
    objective:
        Replace all usages of `readFile` from `fs/promises` module with
        `readFileSync` from `fs` module in
        `packages/refactor-bot/src/refactor/planTasks.ts`,
        `packages/refactor-bot/src/refactor/loadRefactors.ts` and
        `packages/refactor-bot/src/refactor/discoverDependencies.ts`.

numberOfRuns: 2

variants:
    - name: 'A'
      ids:
          - VRixXEwC
          - k0FmgQjU
          - IpSOtP7d
          - xqydSrSU
    - name: 'B'
4 changes: 2 additions & 2 deletions .refactor-bot/refactors/test-likely-failure/goal.md
@@ -2,8 +2,8 @@
 # This is to test a likely failure during refactor
 ref: 8f1a3da55caeee3df75853042e57978c45513f18
 budgetCents: 100
-model: gpt-3.5-turbo
+model: gpt-3.5-turbo-1106
 ```
 
 Replace all usages of `readFile` from `fs/promises` module with `readFileSync`
-from `fs` module in `packages/refactor-bot/src/pipeline/dependencies.ts`.
+from `fs` module in `packages/refactor-bot/src/cache/dependencies.ts`.
25 changes: 10 additions & 15 deletions README.md
@@ -80,20 +80,15 @@ Options:
 Performs a refactoring using Plan and Execute technique
 
 Options:
-  --help              Show help [boolean]
-  --version           Show version number [boolean]
-  --name              Name of the refactoring to run [string]
-  --id                Unique id of the refactoring that was previously run but d
-                      idn't finish to start from last successful point [string]
-  --save-to-cache     Whether to enable saving results to the cache, by default
-                      it's enabled [boolean] [default: true]
-  --enable-cache-for  Enable cache for specific steps only, can be useful if we
-                      want to disable cache for all other steps and replay them
-                      [array]
-  --costs             Whether to print the total costs of OpenAI requests, by de
-                      fault it's disabled [boolean] [default: false]
-  --performance       Whether to print performance metrics, by default it's disa
-                      bled [boolean] [default: false]
+  --help         Show help [boolean]
+  --version      Show version number [boolean]
+  --name         Name of the refactoring to run [string]
+  --id           Unique id of the refactoring that was previously run but didn't
+                 finish - use this to start from last successful step [string]
+  --costs        Whether to print the total costs of OpenAI requests, by default
+                 it's disabled [boolean] [default: false]
+  --performance  Whether to print performance metrics, by default it's disabled
+                 [boolean] [default: false]
```
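
A run that didn't finish can then presumably be resumed from its last successful
step like this (the name and id values are illustrative placeholders):

```sh
pnpm refactor-bot refactor --name replace-read-file --id VRixXEwC
```
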
At first it will create a file for you with description of the refactor. Open
@@ -183,7 +178,7 @@ further, but there is no way to measure how much we improved it. So the next
 step is to add a way to measure the quality of the refactor for benchmarking
 purposes.
 
-- [ ] ability to evaluate result of the refactor to benchmark the quality of
+- [x] ability to evaluate result of the refactor to benchmark the quality of
      the refactor, so we can assess how different changes affect the quality
      and performance of the refactor-bot
 - [ ] add Chain Of Thoughts to try to recover from failed attempts to fix
1 change: 1 addition & 0 deletions package.json
@@ -19,6 +19,7 @@
     },
     "type": "module",
     "scripts": {
+        "benchmark": "pnpm tsx packages/refactor-bot/src/benchmark/cli.ts",
         "refactor-bot-bundled": "node ./packages/refactor-bot/dist/bin/refactor-bot.mjs"
     },
     "dependencies": {},
102 changes: 102 additions & 0 deletions packages/refactor-bot/src/benchmark/benchmark.ts
@@ -0,0 +1,102 @@
import orderBy from 'lodash-es/orderBy';
import { join } from 'path';
import { from, lastValueFrom, mergeMap, toArray } from 'rxjs';

import { createCachedPipeline } from '../cache/state';
import type { CacheStateRef } from '../cache/types';
import { logger } from '../logger/logger';
import { randomText } from '../utils/randomText';
import { loadBenchmarkConfig } from './loadBenchmarkConfig';
import { reportBenchmarkSummary } from './reportBenchmarkSummary';
import { runVariant } from './runVariant';
import { summarizeRefactorResult } from './summarizeRefactorResult';

export async function benchmark(opts: {
    config: string;
    id?: string;
    // for debugging:
    saveToCache?: boolean;
    enableCacheFor?: string[];
    disableCacheFor?: string[];
}) {
    const config = await loadBenchmarkConfig(opts.config);

    const id = opts.id ?? randomText(8);

    const runVariantsAndCompare = async (
        input: typeof config,
        ctx?: CacheStateRef
    ) => {
        const variantAtATime = Math.max(
            1,
            input.maxConcurrentRefactors / input.variants.length
        );
        const maxConcurrentRefactorsPerVariant = Math.max(
            1,
            input.maxConcurrentRefactors / variantAtATime
        );

        logger.debug('Running refactors for multiple variants', {
            maxConcurrentRefactors: input.maxConcurrentRefactors,
            variantAtATime,
            maxConcurrentRefactorsPerVariant,
        });

        const results = await lastValueFrom(
            from(input.variants).pipe(
                mergeMap(async (variant) => {
                    const { resultFilePaths } = await runVariant(
                        {
                            id,
                            variant,
                            numberOfRuns: input.numberOfRuns,
                            evaluationConfig: input.evaluationConfig,
                            maxConcurrentRefactors:
                                maxConcurrentRefactorsPerVariant,
                            refactorConfig: {
                                ...input.refactorConfig,
                                ...config.refactorConfig,
                            },
                        },
                        ctx
                    );

                    return {
                        variant: variant.name,
                        resultFilePaths,
                    };
                }, variantAtATime),
                toArray()
            )
        );

        logger.debug('Finished running refactors for multiple variants', {
            results,
        });

        const summaries = await Promise.all(
            results.map(async (result) => ({
                variant: result.variant,
                summary: await summarizeRefactorResult({
                    resultFilePaths: result.resultFilePaths,
                }),
            }))
        );

        const orderedSummaries = orderBy(summaries, ['variant']);

        await reportBenchmarkSummary({
            summaries: orderedSummaries,
        });
    };

    const { execute } = createCachedPipeline({
        location: join(`.refactor-bot/benchmarks/state`, id),
        saveToCache: opts.saveToCache ?? true,
        enableCacheFor: opts.enableCacheFor,
        disableCacheFor: opts.disableCacheFor,
        pipeline: runVariantsAndCompare,
    });

    return await execute(config);
}
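
Below is a minimal sketch of invoking this pipeline directly (illustrative only
— the config path is the example from the changeset above, and in practice the
CLI is the intended entry point):

```ts
import { benchmark } from './benchmark';

// Runs all variants from the config and prints a comparison summary.
// Omitting `id` generates a random 8-character id, which also determines
// the cache directory under `.refactor-bot/benchmarks/state`.
await benchmark({
    config: '.refactor-bot/benchmarks/test-benchmark.yaml',
});
```
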
61 changes: 61 additions & 0 deletions packages/refactor-bot/src/benchmark/benchmarkConfig.ts
@@ -0,0 +1,61 @@
import { z } from 'zod';

import { modelsSchema } from '../chat-gpt/api';
import { ensureHasTwoElements } from '../utils/hasOne';

export const passthroughRefactorConfigSchema = z
    .object({
        name: z.string(),
        model: modelsSchema,
        objective: z.string(),
    })
    .passthrough();

const partialRefactorConfigSchema = z
    .object({
        model: modelsSchema.optional(),
        objective: z.string().optional(),
    })
    .passthrough();

export const appVariantSchema = z.object({
    name: z.string().regex(/^[a-z0-9-]+$/i),
    ref: z.string().optional(),
    repository: z.string().optional(),
    ids: z.array(z.string()).optional(),
    command: z
        .array(z.string())
        .nonempty()
        .default(['pnpm', 'refactor-bot', 'refactor']),
    refactorConfig: partialRefactorConfigSchema.optional(),
});

export const evaluationConfigSchema = z.object({
    model: modelsSchema,
    choices: z.number().default(3),
});

export const benchConfigSchema = z
    .object({
        variants: z
            .array(appVariantSchema)
            .transform((variants) => ensureHasTwoElements(variants)),
        refactorConfig: passthroughRefactorConfigSchema,
        evaluationConfig: evaluationConfigSchema.default({
            model: 'gpt-4-1106-preview',
            choices: 3,
        }),
        numberOfRuns: z.number().default(1),
        maxConcurrentRefactors: z.number().default(4),
    })
    .transform((input, ctx) => {
        const variants = new Set(input.variants.map((variant) => variant.name));
        if (variants.size !== input.variants.length) {
            ctx.addIssue({
                code: 'custom',
                message: 'Variants must have unique names',
            });
            return z.NEVER;
        }
        return input;
    });
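
As a rough illustration of how this schema behaves (a sketch, not part of the
commit — the objective text is a placeholder):

```ts
import { benchConfigSchema } from './benchmarkConfig';

// Two variants with unique names are required; defaults are filled in for
// numberOfRuns, maxConcurrentRefactors, evaluationConfig and the per-variant
// command.
const parsed = benchConfigSchema.parse({
    refactorConfig: {
        name: 'test-refactoring',
        model: 'gpt-4-1106-preview',
        objective: 'Replace readFile with readFileSync in ...',
    },
    variants: [{ name: 'A' }, { name: 'B' }],
});

// parsed.numberOfRuns === 1
// parsed.maxConcurrentRefactors === 4
// parsed.evaluationConfig -> { model: 'gpt-4-1106-preview', choices: 3 }
// parsed.variants[0].command -> ['pnpm', 'refactor-bot', 'refactor']
```
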
80 changes: 80 additions & 0 deletions packages/refactor-bot/src/benchmark/cli.ts
@@ -0,0 +1,80 @@
import type { ArgumentsCamelCase, Argv, CommandModule } from 'yargs';
import yargs from 'yargs';

import { line } from '../text/line';

const builder = (yargs: Argv) =>
    yargs
        .option('id', {
            type: 'string',
            describe: line`
                Unique id of the benchmark run to identify cache directory
            `,
        })
        .option('config', {
            type: 'string',
            describe: line`
                Path to the config yaml file containing benchmark configuration
            `,
            demandOption: true,
        })
        .option('save-to-cache', {
            type: 'boolean',
            describe: line`
                Whether to enable saving results to the cache, by default
                it's enabled.
            `,
            default: true,
            hidden: true,
        })
        .option('enable-cache-for', {
            type: 'string',
            array: true,
            describe: line`
                Enable cache for specific steps - you can specify the name
                of the step or a name followed by a hash of the cache entry.
                This is for debugging purposes only.
            `,
            hidden: true,
        })
        .option('disable-cache-for', {
            type: 'string',
            array: true,
            describe: line`
                Disable cache for specific steps - you can specify the name
                of the step or a name followed by a hash of the cache entry.
                This is for debugging purposes only.
            `,
            hidden: true,
        });

type Args = {
    config: string;
};

const benchmarkCommand = {
    command: 'benchmark',
    describe: line`
        Performs refactoring using different versions of the refactor bot then
        evaluates the results and compares them.
    `,
    builder,
    handler: async (opts: ArgumentsCamelCase<Args>) => {
        await import('dotenv').then((m) =>
            m.config({
                override: true,
            })
        );
        const { cliHandler } = await import('./cliHandler');
        await cliHandler(opts);
    },
} satisfies CommandModule<Record<never, never>, Args>;

const opts = await builder(yargs(process.argv.slice(2)))
    .usage(benchmarkCommand.describe)
    .parseAsync();

await benchmarkCommand.handler(opts).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
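
A hedged usage sketch of the hidden debugging flags defined above (the id value
and the step name are hypothetical placeholders):

```sh
# Re-run a previous benchmark by id, replaying everything from cache except
# one step whose cache is disabled (step name is hypothetical):
pnpm benchmark \
    --config .refactor-bot/benchmarks/test-benchmark.yaml \
    --id abc12345 \
    --disable-cache-for summarizeRefactorResult
```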