-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: evaluate refactor outcomes using LLM to make decision of whethe…
…r file edit should be accepted or discarded
- Loading branch information
Showing
78 changed files
with
3,872 additions
and
958 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
--- | ||
'refactor-bot': patch | ||
--- | ||
|
||
feat: evaluate refactor outcomes using LLM to make decision of whether file edit | ||
should be accepted or discarded | ||
|
||
This is a big change which adds extra steps to the refactor process. Every time | ||
an LLM produces a file edit, we will pass that edit through an evaluation | ||
algorithm to assess whether it should be accepted or discarded. Previously, this | ||
decision was based only on the existence or absence of eslint errors. This will | ||
make the final result higher quality and more reliable. | ||
|
||
The new behavior can be disabled by setting `evaluate: false` in the `goal.md` | ||
file. | ||
|
||
In addition to that, this change also adds a new CLI command for internal use | ||
which allows us to compare results of multiple refactor runs. This is useful for | ||
benchmarking purposes. | ||
|
||
To run the benchmark, use the following command: | ||
|
||
```sh | ||
pnpm benchmark --config .refactor-bot/benchmarks/test-benchmark.yaml | ||
``` | ||
|
||
Where the config is: | ||
|
||
```yaml | ||
refactorConfig: | ||
name: test-refactoring | ||
ref: 8f1a3da55caeee3df75853042e57978c45513f18 | ||
budgetCents: 100 | ||
model: gpt-4-1106-preview | ||
objective: | ||
Replace all usages of `readFile` from `fs/promises` module with | ||
`readFileSync` from `fs` module in | ||
`packages/refactor-bot/src/refactor/planTasks.ts`, | ||
`packages/refactor-bot/src/refactor/loadRefactors.ts` and | ||
`packages/refactor-bot/src/refactor/discoverDependencies.ts`. | ||
|
||
numberOfRuns: 2 | ||
|
||
variants: | ||
- name: 'A' | ||
ids: # ids of refactor runs to save money on | ||
- VRixXEwC | ||
- k0FmgQjU | ||
- IpSOtP7d | ||
- xqydSrSU | ||
- name: 'B' | ||
``` | ||
This will run multiple refactor runs and compare the results. At the moment, no | ||
statistical analysis is performed, as I'm not convinced we can reach statistical | ||
significance with a number of runs that doesn't also make you poor. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
refactorConfig: | ||
name: test-refactoring | ||
ref: 8f1a3da55caeee3df75853042e57978c45513f18 | ||
budgetCents: 100 | ||
model: gpt-4-1106-preview | ||
objective: | ||
Replace all usages of `readFile` from `fs/promises` module with | ||
`readFileSync` from `fs` module in | ||
`packages/refactor-bot/src/refactor/planTasks.ts`, | ||
`packages/refactor-bot/src/refactor/loadRefactors.ts` and | ||
`packages/refactor-bot/src/refactor/discoverDependencies.ts`. | ||
|
||
numberOfRuns: 2 | ||
|
||
variants: | ||
- name: 'A' | ||
ids: | ||
- VRixXEwC | ||
- k0FmgQjU | ||
- IpSOtP7d | ||
- xqydSrSU | ||
- name: 'B' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import orderBy from 'lodash-es/orderBy'; | ||
import { join } from 'path'; | ||
import { from, lastValueFrom, mergeMap, toArray } from 'rxjs'; | ||
|
||
import { createCachedPipeline } from '../cache/state'; | ||
import type { CacheStateRef } from '../cache/types'; | ||
import { logger } from '../logger/logger'; | ||
import { randomText } from '../utils/randomText'; | ||
import { loadBenchmarkConfig } from './loadBenchmarkConfig'; | ||
import { reportBenchmarkSummary } from './reportBenchmarkSummary'; | ||
import { runVariant } from './runVariant'; | ||
import { summarizeRefactorResult } from './summarizeRefactorResult'; | ||
|
||
/**
 * Runs the refactor for every variant declared in the benchmark config,
 * summarizes the result files of each variant and reports the summaries
 * ordered by variant name.
 *
 * The whole run is executed through `createCachedPipeline`, with its state
 * located under `.refactor-bot/benchmarks/state/<id>`.
 *
 * @param opts.config Path passed to `loadBenchmarkConfig`.
 * @param opts.id Identifier of the benchmark run; when omitted a new one is
 * generated with `randomText(8)`.
 * @param opts.saveToCache Forwarded to the cached pipeline, defaults to `true`.
 * @param opts.enableCacheFor Forwarded to the cached pipeline as is.
 * @param opts.disableCacheFor Forwarded to the cached pipeline as is.
 * @returns Whatever the cached pipeline's `execute` resolves to.
 */
export async function benchmark(opts: {
    config: string;
    id?: string;
    // for debugging:
    saveToCache?: boolean;
    enableCacheFor?: string[];
    disableCacheFor?: string[];
}) {
    const config = await loadBenchmarkConfig(opts.config);

    const id = opts.id ?? randomText(8);

    const runVariantsAndCompare = async (
        input: typeof config,
        ctx?: CacheStateRef
    ) => {
        // Split the overall concurrency budget between "variants running at
        // the same time" and "refactors running at the same time within one
        // variant". Both limits are floored to whole numbers: a fractional
        // limit is not a meaningful concurrency value and could let the
        // combined concurrency exceed `maxConcurrentRefactors`
        // (e.g. 4 refactors over 3 variants).
        const variantAtATime = Math.max(
            1,
            Math.floor(input.maxConcurrentRefactors / input.variants.length)
        );
        const maxConcurrentRefactorsPerVariant = Math.max(
            1,
            Math.floor(input.maxConcurrentRefactors / variantAtATime)
        );

        logger.debug('Running refactors for multiple variants', {
            maxConcurrentRefactors: input.maxConcurrentRefactors,
            variantAtATime,
            maxConcurrentRefactorsPerVariant,
        });

        const results = await lastValueFrom(
            from(input.variants).pipe(
                // At most `variantAtATime` variants are in flight at once.
                mergeMap(async (variant) => {
                    const { resultFilePaths } = await runVariant(
                        {
                            id,
                            variant,
                            numberOfRuns: input.numberOfRuns,
                            evaluationConfig: input.evaluationConfig,
                            maxConcurrentRefactors:
                                maxConcurrentRefactorsPerVariant,
                            refactorConfig: {
                                ...input.refactorConfig,
                                ...config.refactorConfig,
                            },
                        },
                        ctx
                    );

                    return {
                        variant: variant.name,
                        resultFilePaths,
                    };
                }, variantAtATime),
                toArray()
            )
        );

        logger.debug('Finished running refactors for multiple variants', {
            results,
        });

        const summaries = await Promise.all(
            results.map(async (result) => ({
                variant: result.variant,
                summary: await summarizeRefactorResult({
                    resultFilePaths: result.resultFilePaths,
                }),
            }))
        );

        // Variants may finish in any order, so sort by name to keep the
        // report stable between runs.
        const orderedSummaries = orderBy(summaries, ['variant']);

        await reportBenchmarkSummary({
            summaries: orderedSummaries,
        });
    };

    const { execute } = createCachedPipeline({
        location: join(`.refactor-bot/benchmarks/state`, id),
        saveToCache: opts.saveToCache ?? true,
        enableCacheFor: opts.enableCacheFor,
        disableCacheFor: opts.disableCacheFor,
        pipeline: runVariantsAndCompare,
    });

    return await execute(config);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import { z } from 'zod'; | ||
|
||
import { modelsSchema } from '../chat-gpt/api'; | ||
import { ensureHasTwoElements } from '../utils/hasOne'; | ||
|
||
// Fields of the refactor config that this module validates itself.
const requiredRefactorConfigFields = {
    name: z.string(),
    model: modelsSchema,
    objective: z.string(),
};

/**
 * Schema of a refactor config as seen by the benchmark: `name`, `model` and
 * `objective` are required and validated, any other keys are kept untouched
 * (`passthrough`).
 */
export const passthroughRefactorConfigSchema = z
    .object(requiredRefactorConfigFields)
    .passthrough();
|
||
/**
 * Schema of a partial refactor config: `model` and `objective` are validated
 * when present, and any other keys are kept untouched (`passthrough`).
 */
const partialRefactorConfigSchema = z
    .object({
        model: z.optional(modelsSchema),
        objective: z.optional(z.string()),
    })
    .passthrough();
|
||
// Variant names are limited to letters, digits and dashes (case-insensitive).
const variantNamePattern = /^[a-z0-9-]+$/i;

/**
 * Schema of a single benchmark variant.
 *
 * Only `name` is required. `command` must be a non-empty list of strings and
 * defaults to `pnpm refactor-bot refactor`; `refactorConfig` accepts a
 * partial refactor config.
 */
export const appVariantSchema = z.object({
    name: z.string().regex(variantNamePattern),
    ref: z.string().optional(),
    repository: z.string().optional(),
    ids: z.array(z.string()).optional(),
    command: z
        .array(z.string())
        .nonempty()
        .default(['pnpm', 'refactor-bot', 'refactor']),
    refactorConfig: partialRefactorConfigSchema.optional(),
});
|
||
// `choices` falls back to 3 when it is not provided.
const evaluationConfigFields = {
    model: modelsSchema,
    choices: z.number().default(3),
};

/**
 * Schema of the evaluation settings: the model to use and a numeric
 * `choices` value.
 */
export const evaluationConfigSchema = z.object(evaluationConfigFields);
|
||
/**
 * Schema of the whole benchmark config file.
 *
 * Defaults: `evaluationConfig` is `{ model: 'gpt-4-1106-preview', choices: 3 }`,
 * `numberOfRuns` is 1 and `maxConcurrentRefactors` is 4. The list of variants
 * is passed through `ensureHasTwoElements` (presumably enforcing at least two
 * variants - confirm against that helper), and parsing fails with a custom
 * issue when two variants share the same name.
 */
export const benchConfigSchema = z
    .object({
        variants: z
            .array(appVariantSchema)
            .transform((variants) => ensureHasTwoElements(variants)),
        refactorConfig: passthroughRefactorConfigSchema,
        evaluationConfig: evaluationConfigSchema.default({
            model: 'gpt-4-1106-preview',
            choices: 3,
        }),
        numberOfRuns: z.number().default(1),
        maxConcurrentRefactors: z.number().default(4),
    })
    .transform((input, ctx) => {
        const names = input.variants.map(({ name }) => name);
        const allNamesUnique = new Set(names).size === names.length;

        if (allNamesUnique) {
            return input;
        }

        // Duplicate names: report the problem and abort the transform.
        ctx.addIssue({
            code: 'custom',
            message: 'Variants must have unique names',
        });
        return z.NEVER;
    });
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import type { ArgumentsCamelCase, Argv, CommandModule } from 'yargs'; | ||
import yargs from 'yargs'; | ||
|
||
import { line } from '../text/line'; | ||
|
||
/**
 * Declares the command line options of the benchmark command.
 *
 * `--config` is the only mandatory option. The cache related options are
 * hidden from the help output as they are meant for debugging.
 *
 * @param yargs The yargs instance to add the options to.
 * @returns The same yargs chain with all options declared.
 */
const builder = (yargs: Argv) =>
    yargs
        .option('id', {
            type: 'string',
            describe: line`
                Unique id of the benchmark run to identify cache directory
            `,
        })
        .option('config', {
            type: 'string',
            describe: line`
                Path to the config yaml file containing benchmark configuration
            `,
            demandOption: true,
        })
        .option('save-to-cache', {
            type: 'boolean',
            describe: line`
                Whether to enable saving results to the cache, by default
                it's enabled.
            `,
            default: true,
            hidden: true,
        })
        .option('enable-cache-for', {
            type: 'string',
            array: true,
            // This text used to be a copy of the `disable-cache-for`
            // description and wrongly said "Disable".
            describe: line`
                Enable cache for specific steps - you can specify the name
                of the step or a name followed by a hash of the cache entry.
                This is for debugging purposes only.
            `,
            hidden: true,
        })
        .option('disable-cache-for', {
            type: 'string',
            array: true,
            describe: line`
                Disable cache for specific steps - you can specify the name
                of the step or a name followed by a hash of the cache entry.
                This is for debugging purposes only.
            `,
            hidden: true,
        });
|
||
/** Arguments the benchmark command handler is typed against. */
type Args = {
    config: string;
};

/**
 * Definition of the `benchmark` command: its name, description, option
 * builder and handler.
 *
 * The handler loads `dotenv` (with `override: true`) and the actual
 * `cliHandler` lazily, then runs `cliHandler` with the parsed options.
 */
const benchmarkCommand = {
    command: 'benchmark',
    describe: line`
        Performs refactoring using different versions of the refactor bot then
        evaluates the results and compares them.
    `,
    builder,
    handler: async (opts: ArgumentsCamelCase<Args>) => {
        // Values from the `.env` file take precedence over the ones already
        // present in the environment.
        const dotenv = await import('dotenv');
        dotenv.config({
            override: true,
        });

        const { cliHandler } = await import('./cliHandler');
        await cliHandler(opts);
    },
} satisfies CommandModule<Record<never, never>, Args>;
|
||
// Entry point: parse the command line arguments (everything after the
// executable and the script path) and run the benchmark command with them.
const cliArgs = await builder(yargs(process.argv.slice(2)))
    .usage(benchmarkCommand.describe)
    .parseAsync();

try {
    await benchmarkCommand.handler(cliArgs);
} catch (err) {
    // Print the failure and set a non-zero exit code without terminating
    // the process abruptly.
    console.error(err);
    process.exitCode = 1;
}
Oops, something went wrong.