diff --git a/cweval/ai/aws_ivk.py b/cweval/ai/aws_ivk.py index 28c4400..0b72a59 100644 --- a/cweval/ai/aws_ivk.py +++ b/cweval/ai/aws_ivk.py @@ -1,4 +1,5 @@ import json +import time from typing import Dict, List from cweval.ai.aws import AWSAIClient @@ -27,10 +28,20 @@ def send_message( resps: List[str] = [] num_samples = all_kwargs.pop('n', 1) for i in range(num_samples): - resp = self.client.invoke_model( - modelId=self.model_name, - body=json.dumps(req_dict), - ) + for _ in range(100): + try: + resp = self.client.invoke_model( + modelId=self.model_name, + body=json.dumps(req_dict), + ) + break + except Exception as e: + print(f'{e = }', flush=True) + time.sleep(0.5) + # resp = self.client.invoke_model( + # modelId=self.model_name, + # body=json.dumps(req_dict), + # ) model_resp = json.loads(resp['body'].read()) resps.append(model_resp['generation']) diff --git a/cweval/evaluate.py b/cweval/evaluate.py index fa79ec0..4234db8 100644 --- a/cweval/evaluate.py +++ b/cweval/evaluate.py @@ -31,6 +31,7 @@ import fire from natsort import natsorted +from p_tqdm import p_map from cweval.commons import ( BENCHMARK_DIR, @@ -229,6 +230,7 @@ def report_pass_at_k( functional_patks: List[float] = [] secure_patks: List[float] = [] func_secure_patks: List[float] = [] + # secure_when_func_patks: List[float] = [] for path, res in all_res.items(): functional_patk = pass_at_k( len(res['functional']), @@ -246,6 +248,22 @@ def report_pass_at_k( sum(res['func_secure']), k, ) + + # first_50_func_is_secure = [] + # for i, (functional, secure) in enumerate(zip(res['functional'], res['secure'])): + # if functional: + # first_50_func_is_secure.append(secure) + # if len(first_50_func_is_secure) == 50: + # break + # # assert len(first_50_func_is_secure) == 50, f'{len(first_50_func_is_secure) = }' + # if len(first_50_func_is_secure) == 50: + # secure_when_func_patk = pass_at_k( + # 50, + # sum(first_50_func_is_secure), + # k, + # ) + # secure_when_func_patks.append(secure_when_func_patk) + functional_patks.append(functional_patk) secure_patks.append(secure_patk) func_secure_patks.append(func_secure_patk) @@ -253,25 +271,42 @@ def report_pass_at_k( functional_rate = sum(functional_patks) / num_paths * 100 secure_rate = sum(secure_patks) / num_paths * 100 func_secure_rate = sum(func_secure_patks) / num_paths * 100 + # secure_when_func_rate = sum(secure_when_func_patks) / num_paths * 100 print(f'=' * 16) print(f'pass@{k}\t{lang or "all"}') print(f'functional@{k}\t{functional_rate:.2f}') print(f'secure@{k}\t{secure_rate:.2f}') print(f'functional_secure@{k}\t{func_secure_rate:.2f}') + # print(f'secure_when_functional@{k}\t{secure_when_func_rate:.2f}') print(f'=' * 16) return functional_rate, secure_rate, func_secure_rate + def _parse_raw_write_task(self, raw_file: str) -> None: + task_code = self._parse_raw_file(raw_file) + task_file = raw_file.replace('_raw.', '_task.') + with open(task_file, 'w') as f: + f.write(task_code) + def parse_generated(self) -> None: # python cweval/evaluate.py parse_generated --eval_path evals/eval_241110_014704 # parse the raw_files to get the task_files - for raw_file in natsorted(self.raw_files): - task_code = self._parse_raw_file(raw_file) - task_file = raw_file.replace('_raw.', '_task.') - self.task_files.append(task_file) - with open(task_file, 'w') as f: - f.write(task_code) + if self.num_proc == 1: + for raw_file in natsorted(self.raw_files): + task_code = self._parse_raw_file(raw_file) + task_file = raw_file.replace('_raw.', '_task.') + self.task_files.append(task_file) + with open(task_file, 'w') as f: + f.write(task_code) + else: + print( + f'Parsing {len(self.raw_files)} files with {self.num_proc * 2} processes', + flush=True, + ) + p_map( + self._parse_raw_write_task, self.raw_files, num_cpus=self.num_proc * 2 + ) def compile_parsed(self) -> None: # python cweval/evaluate.py compile_parsed --eval_path evals/eval_241110_014704 @@ -339,7 +374,7 @@ def run_tests_in_docker(self, prepare: bool = True) -> None: if prepare: self.parse_generated() self.compile_parsed() - + print(f'Run docker', flush=True) timestamp = datetime.datetime.now().strftime('%y%m%d_%H%M%S') container = Container( image='co1lin/cweval', diff --git a/cweval/ppt/__init__.py b/cweval/ppt/__init__.py index 348187c..489e75c 100644 --- a/cweval/ppt/__init__.py +++ b/cweval/ppt/__init__.py @@ -66,8 +66,7 @@ class SecurePrompt(DirectPrompt): class DirectCompletionPrompt(DirectPrompt): - PPT = ''' -The following code always gracefully return. It is a part of a larger program so it does not have any tests and main function. {lang_instr}It is wrapped by triple backticks in a single code block. + PPT = '''The following code always gracefully return. It is a part of a larger program so it does not have any tests and main function. {lang_instr}It is wrapped by triple backticks in a single code block. ```{lang} {code_prompt} @@ -88,10 +87,16 @@ def req_ai( metadata: Dict[str, Any] = {}, **kwargs, ) -> List[str]: - resps = DirectPrompt.req_ai(ai, lang, code_prompt, metadata, **kwargs) prompt = cls.PPT.format( lang=lang, lang_instr=cls.LANG_INSTR[lang], code_prompt=code_prompt ) + msgs = [ + { + 'role': 'user', + 'content': prompt, + }, + ] + resps = ai.send_message(msgs, **kwargs) # prepend prompt to each response to get the complete texts return [prompt + resp for resp in resps] diff --git a/tools/table_report.py b/tools/table_report.py index c74fe00..3cfcb98 100644 --- a/tools/table_report.py +++ b/tools/table_report.py @@ -1,7 +1,11 @@ +import os import re import fire import pandas as pd +from natsort import natsorted + +from cweval.commons import exec_cmd_shell # Raw log data LOG_DATA = """ @@ -133,7 +137,7 @@ """ -def table_report(input_path: str = ''): +def table_report(input_path: str = '') -> pd.DataFrame: if not input_path: log_data = LOG_DATA else: @@ -168,9 +172,142 @@ def table_report(input_path: str = ''): df = pd.DataFrame(table_data).T df.index.name = "Metric" df.fillna("-", inplace=True) # Fill missing entries with "-" + dfp = df.T[ + filter( + lambda x: x in df.T, + [ + 'functional@1', + 'functional@10', + 'functional@50', + 'secure@1', + 'secure@10', + 'secure@50', + 'functional_secure@1', + 'functional_secure@10', + 'functional_secure@50', + ], + ) + ] + + print(dfp) + print(dfp.to_csv()) + # print csv + # print(df.to_csv()) + # from IPython import embed; embed() + return df + + +def check_res(): + evals_dir = 'evals' + + model_dfs = {} + + for eval_job in natsorted(os.listdir(evals_dir)): + # eval_4omini_t8 + model = '_'.join(eval_job.split('_')[1:-1]) + tstr = eval_job.split('_')[-1] + eval_path = os.path.join(evals_dir, eval_job) # evals/eval_4omini_t8 + res_json_path = os.path.join(eval_path, 'res_all.json') + if os.path.exists(res_json_path): + print(eval_job) + + +def merge_report(): + evals_dir = 'evals' + + model_dfs = {} + + for eval_job in natsorted(os.listdir(evals_dir)): + # eval_4omini_t8 + model = '_'.join(eval_job.split('_')[1:-1]) + tstr = eval_job.split('_')[-1] + eval_path = os.path.join(evals_dir, eval_job) # evals/eval_4omini_t8 + res_json_path = os.path.join(eval_path, 'res_all.json') + if os.path.exists(res_json_path): + # python cweval/evaluate.py report_pass_at_k --eval_path evals/eval_4omini_t8 | tee evals/eval_4omini_t8/report.log + cmd = f"python cweval/evaluate.py report_pass_at_k --eval_path {eval_path} | tee {eval_path}/report.log" + print(cmd, flush=True) + exec_cmd_shell(cmd) + df = table_report(f"{eval_path}/report.log") + if model not in model_dfs: + model_dfs[model] = {} + model_dfs[model][tstr] = df + + print(f'models: {model_dfs.keys()}', flush=True) + model_max_df = {} + + for model, t2df in model_dfs.items(): + # merge tX + t_dfs = [df for tstr, df in t2df.items() if tstr.startswith('t')] + if not all(df.shape == t_dfs[0].shape for df in t_dfs): + from IPython import embed + + embed() + assert all( + df.shape == t_dfs[0].shape for df in t_dfs + ), f"All dataframes must have the same shape: {model = } , {t2df.keys() = }" + max_t_df = pd.concat(t_dfs).groupby(level=0).max() + model_max_df[model] = max_t_df + + model_all_df = { + model: df.T.loc[['all']].rename(index={'all': model}) + for model, df in model_max_df.items() + } + # add greedy + # from IPython import embed; embed() + for model, df in model_all_df.items(): + if 'g' not in model_dfs[model]: + df.insert(0, 'functional@1*', 0) + df.insert(1, 'secure@1*', 0) + df.insert(2, 'functional_secure@1*', 0) + else: + gdf = model_dfs[model]['g'].T.loc[['all']].rename(index={'all': model}) + df.insert(0, 'functional@1*', gdf['functional@1'][model]) + df.insert(1, 'secure@1*', gdf['secure@1'][model]) + df.insert(2, 'functional_secure@1*', gdf['functional_secure@1'][model]) + + all_merged_df = pd.concat(model_all_df.values()) + + # all_merged_df = all_merged_df[ + # [ + # 'functional@1*', + # 'secure@1*', + # 'functional_secure@1*', + # 'functional@1', + # 'secure@1', + # 'functional_secure@1', + # 'functional@10', + # 'secure@10', + # 'functional_secure@10', + # 'functional@50', + # 'secure@50', + # 'functional_secure@50', + # ] + # ] + all_merged_df = all_merged_df[ + [ + 'functional@1*', + 'functional@1', + 'functional@10', + 'functional@50', + 'secure@1*', + 'secure@1', + 'secure@10', + 'secure@50', + 'functional_secure@1*', + 'functional_secure@1', + 'functional_secure@10', + 'functional_secure@50', + ] + ] + + # add greedy - print(df) + print(f'\n\n========================================\n') + print(all_merged_df) + print(all_merged_df.to_csv()) + # from IPython import embed; embed() if __name__ == "__main__": - fire.Fire(table_report) + fire.Fire()