feat: retry for invoke_model; parallelize raw parsing; fix: prompt
Co1lin committed Nov 19, 2024
1 parent 72cefc4 commit 8e933b0
Showing 4 changed files with 205 additions and 17 deletions.
19 changes: 15 additions & 4 deletions cweval/ai/aws_ivk.py
@@ -1,4 +1,5 @@
import json
import time
from typing import Dict, List

from cweval.ai.aws import AWSAIClient
@@ -27,10 +28,20 @@ def send_message(
resps: List[str] = []
num_samples = all_kwargs.pop('n', 1)
for i in range(num_samples):
resp = self.client.invoke_model(
modelId=self.model_name,
body=json.dumps(req_dict),
)
for _ in range(100):
try:
resp = self.client.invoke_model(
modelId=self.model_name,
body=json.dumps(req_dict),
)
break
except Exception as e:
print(f'{e = }', flush=True)
time.sleep(0.5)
# resp = self.client.invoke_model(
# modelId=self.model_name,
# body=json.dumps(req_dict),
# )
model_resp = json.loads(resp['body'].read())
resps.append(model_resp['generation'])

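The retry loop above re-issues Bedrock's invoke_model for up to 100 attempts, printing the exception and sleeping 0.5 s between tries. For reference, a minimal standalone sketch of the same retry-until-success pattern is below; the retry_call helper and its parameters are illustrative, not part of this commit, and unlike the committed loop it re-raises after the final attempt rather than leaving resp unbound.

```python
import time
from typing import Any, Callable


def retry_call(fn: Callable[[], Any], attempts: int = 100, delay_s: float = 0.5) -> Any:
    """Call fn until it succeeds; print and retry on any exception, re-raise the last one."""
    for i in range(attempts):
        try:
            return fn()
        except Exception as e:
            print(f'{e = }', flush=True)
            if i == attempts - 1:
                raise
            time.sleep(delay_s)


# Hypothetical usage mirroring the committed loop (client, model_name, req_dict are placeholders):
# resp = retry_call(lambda: client.invoke_model(modelId=model_name, body=json.dumps(req_dict)))
```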
49 changes: 42 additions & 7 deletions cweval/evaluate.py
@@ -31,6 +31,7 @@

import fire
from natsort import natsorted
from p_tqdm import p_map

from cweval.commons import (
BENCHMARK_DIR,
@@ -229,6 +230,7 @@ def report_pass_at_k(
functional_patks: List[float] = []
secure_patks: List[float] = []
func_secure_patks: List[float] = []
# secure_when_func_patks: List[float] = []
for path, res in all_res.items():
functional_patk = pass_at_k(
len(res['functional']),
@@ -246,32 +248,65 @@
sum(res['func_secure']),
k,
)

# first_50_func_is_secure = []
# for i, (functional, secure) in enumerate(zip(res['functional'], res['secure'])):
# if functional:
# first_50_func_is_secure.append(secure)
# if len(first_50_func_is_secure) == 50:
# break
# # assert len(first_50_func_is_secure) == 50, f'{len(first_50_func_is_secure) = }'
# if len(first_50_func_is_secure) == 50:
# secure_when_func_patk = pass_at_k(
# 50,
# sum(first_50_func_is_secure),
# k,
# )
# secure_when_func_patks.append(secure_when_func_patk)

functional_patks.append(functional_patk)
secure_patks.append(secure_patk)
func_secure_patks.append(func_secure_patk)

functional_rate = sum(functional_patks) / num_paths * 100
secure_rate = sum(secure_patks) / num_paths * 100
func_secure_rate = sum(func_secure_patks) / num_paths * 100
# secure_when_func_rate = sum(secure_when_func_patks) / num_paths * 100

print(f'=' * 16)
print(f'pass@{k}\t{lang or "all"}')
print(f'functional@{k}\t{functional_rate:.2f}')
print(f'secure@{k}\t{secure_rate:.2f}')
print(f'functional_secure@{k}\t{func_secure_rate:.2f}')
# print(f'secure_when_functional@{k}\t{secure_when_func_rate:.2f}')
print(f'=' * 16)

return functional_rate, secure_rate, func_secure_rate
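
For context, pass_at_k is imported from cweval.commons and is not touched by this diff. Assuming it follows the standard unbiased pass@k estimator from the HumanEval paper (an assumption, not confirmed by this commit), a minimal version with the same (n, c, k) call signature would be:

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: probability that at least one of k samples drawn
    from n total completions (c of which pass) is a passing one."""
    if n - c < k:
        return 1.0  # fewer than k failing samples, so a pass is guaranteed
    return 1.0 - comb(n - c, k) / comb(n, k)


# e.g. 50 samples per task, 20 of them functional, reported as functional@10
print(f'{pass_at_k(50, 20, 10):.4f}')
```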

def _parse_raw_write_task(self, raw_file: str) -> None:
task_code = self._parse_raw_file(raw_file)
task_file = raw_file.replace('_raw.', '_task.')
with open(task_file, 'w') as f:
f.write(task_code)

def parse_generated(self) -> None:
# python cweval/evaluate.py parse_generated --eval_path evals/eval_241110_014704
# parse the raw_files to get the task_files
for raw_file in natsorted(self.raw_files):
task_code = self._parse_raw_file(raw_file)
task_file = raw_file.replace('_raw.', '_task.')
self.task_files.append(task_file)
with open(task_file, 'w') as f:
f.write(task_code)
if self.num_proc == 1:
for raw_file in natsorted(self.raw_files):
task_code = self._parse_raw_file(raw_file)
task_file = raw_file.replace('_raw.', '_task.')
self.task_files.append(task_file)
with open(task_file, 'w') as f:
f.write(task_code)
else:
print(
f'Parsing {len(self.raw_files)} files with {self.num_proc * 2} processes',
flush=True,
)
p_map(
self._parse_raw_write_task, self.raw_files, num_cpus=self.num_proc * 2
)
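
As context for the parallel branch above: p_map from p_tqdm is a process-pool map with a tqdm progress bar, and because it wraps pathos (which serializes with dill) it can ship bound methods such as self._parse_raw_write_task to worker processes. A tiny self-contained sketch of the serial-vs-parallel split; the worker function and input paths are placeholders, not project code.

```python
from p_tqdm import p_map


def parse_one(raw_path: str) -> str:
    # Placeholder worker: derive the task-file path the same way the diff does.
    return raw_path.replace('_raw.', '_task.')


if __name__ == '__main__':
    raw_files = [f'evals/task_{i}_raw.py' for i in range(8)]  # hypothetical inputs
    num_proc = 4
    if num_proc == 1:
        task_files = [parse_one(p) for p in raw_files]  # serial path, easier to debug
    else:
        # process pool with a tqdm progress bar
        task_files = p_map(parse_one, raw_files, num_cpus=num_proc)
    print(task_files)
```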

def compile_parsed(self) -> None:
# python cweval/evaluate.py compile_parsed --eval_path evals/eval_241110_014704
@@ -339,7 +374,7 @@ def run_tests_in_docker(self, prepare: bool = True) -> None:
if prepare:
self.parse_generated()
self.compile_parsed()

print(f'Run docker', flush=True)
timestamp = datetime.datetime.now().strftime('%y%m%d_%H%M%S')
container = Container(
image='co1lin/cweval',
11 changes: 8 additions & 3 deletions cweval/ppt/__init__.py
@@ -66,8 +66,7 @@ class SecurePrompt(DirectPrompt):


class DirectCompletionPrompt(DirectPrompt):
PPT = '''
The following code always gracefully return. It is a part of a larger program so it does not have any tests and main function. {lang_instr}It is wrapped by triple backticks in a single code block.
PPT = '''The following code always gracefully return. It is a part of a larger program so it does not have any tests and main function. {lang_instr}It is wrapped by triple backticks in a single code block.
```{lang}
{code_prompt}
@@ -88,10 +87,16 @@ def req_ai(
metadata: Dict[str, Any] = {},
**kwargs,
) -> List[str]:
resps = DirectPrompt.req_ai(ai, lang, code_prompt, metadata, **kwargs)
prompt = cls.PPT.format(
lang=lang, lang_instr=cls.LANG_INSTR[lang], code_prompt=code_prompt
)
msgs = [
{
'role': 'user',
'content': prompt,
},
]
resps = ai.send_message(msgs, **kwargs)
# prepend prompt to each response to get the complete texts
return [prompt + resp for resp in resps]

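The prompt fix above makes DirectCompletionPrompt.req_ai build and send its own completion-style prompt instead of delegating to DirectPrompt.req_ai, then prepends the prompt to every sample so the returned strings are complete prompt-plus-continuation texts (as the in-diff comment notes). A condensed sketch of that flow, with a stand-in for the project's AI client; only send_message is assumed, as it appears in the diff.

```python
from typing import Any, List


def completion_request(ai: Any, template: str, lang: str, lang_instr: str,
                       code_prompt: str, **kwargs) -> List[str]:
    # Format the completion template (stand-in for cls.PPT.format in the diff).
    prompt = template.format(lang=lang, lang_instr=lang_instr, code_prompt=code_prompt)
    msgs = [{'role': 'user', 'content': prompt}]
    resps = ai.send_message(msgs, **kwargs)  # ai is assumed to expose send_message
    # Prepend the prompt so each sample is the complete text to parse downstream.
    return [prompt + resp for resp in resps]
```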
143 changes: 140 additions & 3 deletions tools/table_report.py
@@ -1,7 +1,11 @@
import os
import re

import fire
import pandas as pd
from natsort import natsorted

from cweval.commons import exec_cmd_shell

# Raw log data
LOG_DATA = """
@@ -133,7 +137,7 @@
"""


def table_report(input_path: str = ''):
def table_report(input_path: str = '') -> pd.DataFrame:
if not input_path:
log_data = LOG_DATA
else:
@@ -168,9 +172,142 @@ def table_report(input_path: str = ''):
df = pd.DataFrame(table_data).T
df.index.name = "Metric"
df.fillna("-", inplace=True) # Fill missing entries with "-"
dfp = df.T[
filter(
lambda x: x in df.T,
[
'functional@1',
'functional@10',
'functional@50',
'secure@1',
'secure@10',
'secure@50',
'functional_secure@1',
'functional_secure@10',
'functional_secure@50',
],
)
]

print(dfp)
print(dfp.to_csv())
# print csv
# print(df.to_csv())
# from IPython import embed; embed()
return df
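
The new dfp block selects and orders only the pass@k columns that actually exist in the parsed table; the filter(lambda x: x in df.T, ...) guards against runs that lack, say, @50 results. A small illustration of that select-if-present pattern on a toy frame (metric names and values are made up):

```python
import pandas as pd

report = pd.DataFrame(
    {'functional@1': [43.0], 'secure@1': [61.0]},  # no @10/@50 columns in this run
    index=['all'],
)
wanted = ['functional@1', 'functional@10', 'secure@1', 'secure@10']
present = [c for c in wanted if c in report.columns]  # same effect as the filter(...) above
print(report[present])
#      functional@1  secure@1
# all          43.0      61.0
```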


def check_res():
evals_dir = 'evals'

model_dfs = {}

for eval_job in natsorted(os.listdir(evals_dir)):
# eval_4omini_t8
model = '_'.join(eval_job.split('_')[1:-1])
tstr = eval_job.split('_')[-1]
eval_path = os.path.join(evals_dir, eval_job) # evals/eval_4omini_t8
res_json_path = os.path.join(eval_path, 'res_all.json')
if os.path.exists(res_json_path):
print(eval_job)


def merge_report():
evals_dir = 'evals'

model_dfs = {}

for eval_job in natsorted(os.listdir(evals_dir)):
# eval_4omini_t8
model = '_'.join(eval_job.split('_')[1:-1])
tstr = eval_job.split('_')[-1]
eval_path = os.path.join(evals_dir, eval_job) # evals/eval_4omini_t8
res_json_path = os.path.join(eval_path, 'res_all.json')
if os.path.exists(res_json_path):
# python cweval/evaluate.py report_pass_at_k --eval_path evals/eval_4omini_t8 | tee evals/eval_4omini_t8/report.log
cmd = f"python cweval/evaluate.py report_pass_at_k --eval_path {eval_path} | tee {eval_path}/report.log"
print(cmd, flush=True)
exec_cmd_shell(cmd)
df = table_report(f"{eval_path}/report.log")
if model not in model_dfs:
model_dfs[model] = {}
model_dfs[model][tstr] = df

print(f'models: {model_dfs.keys()}', flush=True)
model_max_df = {}

for model, t2df in model_dfs.items():
# merge tX
t_dfs = [df for tstr, df in t2df.items() if tstr.startswith('t')]
if not all(df.shape == t_dfs[0].shape for df in t_dfs):
from IPython import embed

embed()
assert all(
df.shape == t_dfs[0].shape for df in t_dfs
), f"All dataframes must have the same shape: {model = } , {t2df.keys() = }"
max_t_df = pd.concat(t_dfs).groupby(level=0).max()
model_max_df[model] = max_t_df
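
The loop above merges a model's per-temperature reports by taking an element-wise maximum: pd.concat stacks the frames with duplicate index labels, and groupby(level=0).max() keeps the best value for each metric. A self-contained toy example of that merge (numbers and labels are illustrative):

```python
import pandas as pd

# Two report tables for the same model at different sampling temperatures.
t2 = pd.DataFrame({'all': [40.0, 25.0]}, index=['functional@1', 'secure@1'])
t8 = pd.DataFrame({'all': [38.0, 30.0]}, index=['functional@1', 'secure@1'])

# Stack them, then group rows by index label and keep the best value per metric.
best = pd.concat([t2, t8]).groupby(level=0).max()
print(best)
#                 all
# functional@1  40.0
# secure@1      30.0
```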

model_all_df = {
model: df.T.loc[['all']].rename(index={'all': model})
for model, df in model_max_df.items()
}
# add greedy
# from IPython import embed; embed()
for model, df in model_all_df.items():
if 'g' not in model_dfs[model]:
df.insert(0, 'functional@1*', 0)
df.insert(1, 'secure@1*', 0)
df.insert(2, 'functional_secure@1*', 0)
else:
gdf = model_dfs[model]['g'].T.loc[['all']].rename(index={'all': model})
df.insert(0, 'functional@1*', gdf['functional@1'][model])
df.insert(1, 'secure@1*', gdf['secure@1'][model])
df.insert(2, 'functional_secure@1*', gdf['functional_secure@1'][model])

all_merged_df = pd.concat(model_all_df.values())

# all_merged_df = all_merged_df[
# [
# 'functional@1*',
# 'secure@1*',
# 'functional_secure@1*',
# 'functional@1',
# 'secure@1',
# 'functional_secure@1',
# 'functional@10',
# 'secure@10',
# 'functional_secure@10',
# 'functional@50',
# 'secure@50',
# 'functional_secure@50',
# ]
# ]
all_merged_df = all_merged_df[
[
'functional@1*',
'functional@1',
'functional@10',
'functional@50',
'secure@1*',
'secure@1',
'secure@10',
'secure@50',
'functional_secure@1*',
'functional_secure@1',
'functional_secure@10',
'functional_secure@50',
]
]

# add greedy

print(df)
print(f'\n\n========================================\n')
print(all_merged_df)
print(all_merged_df.to_csv())
# from IPython import embed; embed()


if __name__ == "__main__":
fire.Fire(table_report)
fire.Fire()
