diff --git a/analysis/process_nsight_deepcam.ipynb b/analysis/process_nsight_deepcam.ipynb
index 8037989..8a3003c 100644
--- a/analysis/process_nsight_deepcam.ipynb
+++ b/analysis/process_nsight_deepcam.ipynb
@@ -57,106 +57,113 @@
"source": [
"def transpose_frame(df_metrics):\n",
" #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up\n",
- " selectkeys = [\"Precision\", \"Network Name\", \"Batch Size\", \"Pass\", \"Name\"]\n",
+ " selectkeys = [\"ID\", \"Name\", \"Network Name\", \"Batch Size\", \"Pass\", \"Precision\"]\n",
+ " resultkeys = [\"Precision\", \"Network Name\", \"Batch Size\", \"Pass\", \"Name\"]\n",
" \n",
" tc_peak_perf_flops = 125*10**12\n",
- "\n",
+ " \n",
" #as metricdf use df_summary\n",
" metricdf = df_metrics.copy()\n",
- " metricdf.sort_values(by=selectkeys,inplace=True)\n",
- " metricdf.reset_index(drop=True, inplace=True)\n",
+ " profiledf = pd.DataFrame(columns=selectkeys)\n",
"\n",
" ####### Get timing information\n",
- "\n",
" ### CUDA Time\n",
- " cudatimedf = metricdf[ (metricdf[\"Metric Name\"].str.contains(\"smsp__cycles_elapsed\")) ].sort_values(selectkeys)\n",
- " # get cycles and rates\n",
- " cyclesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__cycles_elapsed\") & (metricdf[\"Metric Type\"]==\"total\"), selectkeys+[\"Metric Value\"]]\n",
- " ratesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__cycles_elapsed\") & (metricdf[\"Metric Type\"]==\"rate\"), selectkeys+[\"Metric Value\"]]\n",
- " \n",
- " # combine\n",
- " cudatimedf = cyclesdf.merge(ratesdf, on=selectkeys, how=\"outer\").fillna(0.)\n",
- " cudatimedf[\"CUDA Time Avg\"] = cudatimedf[\"Metric Value_x\"] / (cudatimedf[\"Metric Value_y\"] * 1e9)\n",
- " cudatimedf = cudatimedf.fillna(0.)\n",
- " # merge into results\n",
- " metricdf = metricdf.merge(cudatimedf[selectkeys+[\"CUDA Time Avg\"]], on=selectkeys, how=\"inner\")\n",
+ " # get cycles\n",
+ " metricname = \"CUDA Cycles\"\n",
+ " cyclesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__cycles_elapsed\") & (metricdf[\"Metric Type\"]==\"total\"),\n",
+ " selectkeys+[\"Metric Unit\", \"Metric Value\"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={\"Metric Value\": metricname}).copy()\n",
+ " # get rates\n",
+ " metricname = \"CUDA Rates\"\n",
+ " ratesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__cycles_elapsed\") & (metricdf[\"Metric Type\"]==\"rate\"),\n",
+ " selectkeys+[\"Metric Unit\", \"Metric Value\"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={\"Metric Value\": metricname}).copy()\n",
+ " # check consistency\n",
+ " if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):\n",
+ " raise ValueError(\"CUDA Time data not consistent\")\n",
+ " # adjust metric unit\n",
+ " ratesdf.loc[ratesdf[\"Metric Unit\"].str.contains(\"cycle/nsecond\"), [\"CUDA Rates\"]] *= 1e9\n",
+ " # manual merge and compute CUDA Time\n",
+ " cyclesdf[\"CUDA Rates\"] = list(ratesdf[\"CUDA Rates\"])\n",
+ " cyclesdf[\"CUDA Time\"] = cyclesdf[\"CUDA Cycles\"] / cyclesdf[\"CUDA Rates\"]\n",
+ " # merge with output\n",
+ " profiledf = cyclesdf[selectkeys+['CUDA Time']].copy()\n",
" \n",
" ### Tensor Core Time\n",
- " tctimedf = metricdf[ (metricdf[\"Metric Name\"].str.contains(\"smsp__pipe_tensor_op_hmma_cycles_active\")) ].sort_values(selectkeys)\n",
- " # get cycles and rates\n",
- " cyclesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__pipe_tensor_op_hmma_cycles_active\") & (metricdf[\"Metric Type\"]==\"total\"), selectkeys+[\"Metric Value\"]]\n",
- " ratesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__pipe_tensor_op_hmma_cycles_active\") & (metricdf[\"Metric Type\"]==\"rate\"), selectkeys+[\"Metric Value\"]]\n",
+ " # get cycles\n",
+ " metricname = \"TC Cycles\"\n",
+ " cyclesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__pipe_tensor_op_hmma_cycles_active\") & (metricdf[\"Metric Type\"]==\"total\"),\n",
+ " selectkeys+[\"Metric Unit\", \"Metric Value\"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={\"Metric Value\": metricname}).copy()\n",
+ " # get rates\n",
+ " metricname = \"TC Rates\"\n",
+ " ratesdf = metricdf.loc[(metricdf[\"Metric Name\"]==\"smsp__pipe_tensor_op_hmma_cycles_active\") & (metricdf[\"Metric Type\"]==\"rate\"),\n",
+ " selectkeys+[\"Metric Unit\", \"Metric Value\"]].reset_index(drop=True).sort_values(by=selectkeys).rename(columns={\"Metric Value\": metricname}).copy()\n",
+ " # check consistency\n",
+ " if not cyclesdf[['ID', 'Name']].equals(ratesdf[['ID', 'Name']]):\n",
+ " raise ValueError(\"TC Time data not consistent\")\n",
+ " # adjust metric unit\n",
+ " ratesdf.loc[ratesdf[\"Metric Unit\"].str.contains(\"cycle/nsecond\"), [\"TC Rates\"]] *= 1e9\n",
+ " # manual merge and compute TC Time\n",
+ " cyclesdf[\"TC Rates\"] = list(ratesdf[\"TC Rates\"])\n",
+ " cyclesdf[\"TC Time\"] = cyclesdf[\"TC Cycles\"] / cyclesdf[\"TC Rates\"]\n",
+ " # merge & cleanup\n",
+ " profiledf = profiledf.merge(cyclesdf[selectkeys+['TC Time']], on=selectkeys, how=\"outer\").fillna(0.)\n",
" \n",
- " # combine\n",
- " tctimedf = cyclesdf.merge(ratesdf, on=selectkeys, how=\"outer\").fillna(0.)\n",
- " tctimedf[\"TC Time Avg\"] = tctimedf[\"Metric Value_x\"] / (tctimedf[\"Metric Value_y\"] * 1e9).fillna(0.)\n",
- " tctimedf = tctimedf.fillna(0.)\n",
- " metricdf = metricdf.merge(tctimedf[selectkeys+[\"TC Time Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " ### check\n",
- " #tmpdf = metricdf.loc[(abs(metricdf[\"CUDA Time Avg\"] - metricdf[\"TC Time Avg\"])/metricdf[\"CUDA Time Avg\"] > 0.01) & (metricdf[\"TC Time Avg\"] != 0)]\n",
- " #if not tmpdf.empty:\n",
- " # print(tmpdf)\n",
- " # raise ValueError(\"CUDA Time not consistent wit TC Time\") \n",
- " \n",
- " \n",
+ " ### Combine\n",
+ " del profiledf['ID']\n",
+ " del metricdf['ID']\n",
+ " profiledf['Invocations'] = 1\n",
+ " profiledf = profiledf.groupby(resultkeys).sum().reset_index()\n",
+ " #profiledf.sort_values(by=resultkeys, inplace=True)\n",
+ " #profiledf.reset_index(drop=True, inplace=True)\n",
+ "\n",
" ####### Get number of FLOPs\n",
" \n",
" ### FMA FLOPs = number of FMA instructions x 2\n",
" metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"fma\"), [\"Metric Value\"]] *= 2\n",
" \n",
- "\n",
- " ### FP64 FLOPs\n",
- " #metrics = ['smsp__sass_thread_inst_executed_op_dadd_pred_on',\n",
- " # 'smsp__sass_thread_inst_executed_op_dfma_pred_on',\n",
- " # 'smsp__sass_thread_inst_executed_op_dmul_pred_on']\n",
- " #tmpdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), selectkeys+[\"Metric Value\"] ].copy()\n",
- " #tmpdf = tmpdf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"FP64 FLOPs\"})\n",
- " #metricdf = metricdf.merge(tmpdf[selectkeys+[\"FP64 FLOPs\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " \n",
" ### FP32 FLOPs\n",
" metrics = ['smsp__sass_thread_inst_executed_op_fadd_pred_on',\n",
" 'smsp__sass_thread_inst_executed_op_ffma_pred_on',\n",
" 'smsp__sass_thread_inst_executed_op_fmul_pred_on']\n",
- " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), selectkeys+[\"Metric Value\"] ].copy()\n",
- " tmpdf = tmpdf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"FP32 FLOPs Avg\"})\n",
- " metricdf = metricdf.merge(tmpdf[selectkeys+[\"FP32 FLOPs Avg\"]], on=selectkeys, how=\"inner\")\n",
+ " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"FP32 FLOPs\"})\n",
+ " # merge\n",
+ " profiledf = profiledf.merge(tmpdf[resultkeys+[\"FP32 FLOPs\"]], on=resultkeys, how=\"inner\")\n",
" \n",
" ### FP16 FLOPs\n",
" metrics = ['smsp__sass_thread_inst_executed_op_hadd_pred_on',\n",
" 'smsp__sass_thread_inst_executed_op_hfma_pred_on',\n",
" 'smsp__sass_thread_inst_executed_op_hmul_pred_on']\n",
- " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), selectkeys+[\"Metric Value\"] ].copy()\n",
- " tmpdf = tmpdf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"FP16 FLOPs Avg\"})\n",
- " metricdf = metricdf.merge(tmpdf[selectkeys+[\"FP16 FLOPs Avg\"]], on=selectkeys, how=\"inner\")\n",
+ " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"FP16 FLOPs\"})\n",
+ " # merge\n",
+ " profiledf = profiledf.merge(tmpdf[resultkeys+[\"FP16 FLOPs\"]], on=resultkeys, how=\"inner\")\n",
" \n",
" #### TC FLOPs\n",
- " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"] == \"sm__inst_executed_pipe_tensor_op_hmma\", selectkeys+[\"TC Time Avg\", \"Metric Value\"] ].copy()\n",
- " tmpdf[\"Utilization\"] = 0.01 * tmpdf[\"Metric Value\"]\n",
- " tmpdf[\"TC FLOPs Avg\"] = tc_peak_perf_flops * tmpdf[\"Utilization\"] * tmpdf[\"TC Time Avg\"]\n",
- " metricdf = metricdf.merge(tmpdf[selectkeys+[\"TC FLOPs Avg\"]], on=selectkeys, how=\"inner\")\n",
- "\n",
+ " tmpdf = metricdf.loc[ metricdf[\"Metric Name\"] == \"sm__inst_executed_pipe_tensor_op_hmma\", resultkeys+[\"Metric Value\"] ].copy()\n",
+ " tmpdf = tmpdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"TC Utilization\"})\n",
+ " tmpdf[\"TC Utilization\"] = 0.01 * tmpdf[\"TC Utilization\"]\n",
+ " profiledf = profiledf.merge(tmpdf[resultkeys+[\"TC Utilization\"]], on=resultkeys, how=\"inner\")\n",
+ " profiledf[\"TC Utilization\"] = profiledf[\"TC Utilization\"] / profiledf['Invocations']\n",
+ " profiledf[\"TC FLOPs\"] = tc_peak_perf_flops * profiledf[\"TC Utilization\"] * profiledf[\"TC Time\"]\n",
" \n",
" ### Total FLOPs\n",
- " metricdf[\"FLOPs Avg\"] = metricdf[\"FP32 FLOPs Avg\"] + metricdf[\"FP16 FLOPs Avg\"] + metricdf[\"TC FLOPs Avg\"] #+ metricdf[\"FP64 FLOPs\"]\n",
- " \n",
+ " profiledf[\"FLOPs\"] = profiledf[\"FP32 FLOPs\"] + profiledf[\"FP16 FLOPs\"] + profiledf[\"TC FLOPs\"] #+ metricdf[\"FP64 FLOPs\"]\n",
" \n",
" ### FLOPs fractions\n",
- " #metricdf[\"FP64 FLOPs Fraction\"] = metricdf[\"FP64 FLOPs\"]/metricdf[\"FLOPs\"]\n",
- " metricdf[\"FP32 FLOPs Fraction Avg\"] = metricdf[\"FP32 FLOPs Avg\"]/metricdf[\"FLOPs Avg\"]\n",
- " metricdf[\"FP16 FLOPs Fraction Avg\"] = metricdf[\"FP16 FLOPs Avg\"]/metricdf[\"FLOPs Avg\"]\n",
- " metricdf[\"TC FLOPs Fraction Avg\"] = metricdf[\"TC FLOPs Avg\"]/metricdf[\"FLOPs Avg\"]\n",
+ " #profiledf[\"FP64 FLOPs Fraction\"] = profiledf[\"FP64 FLOPs\"]/profiledf[\"FLOPs\"]\n",
+ " profiledf[\"FP32 FLOPs Fraction\"] = profiledf[\"FP32 FLOPs\"]/profiledf[\"FLOPs\"]\n",
+ " profiledf[\"FP16 FLOPs Fraction\"] = profiledf[\"FP16 FLOPs\"]/profiledf[\"FLOPs\"]\n",
+ " profiledf[\"TC FLOPs Fraction\"] = profiledf[\"TC FLOPs\"]/profiledf[\"FLOPs\"]\n",
+ " \n",
" \n",
" ####### Get number of bytes\n",
" \n",
" ### Shared transactions\n",
" #project out\n",
- " shareddf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__data_pipe_lsu_wavefronts_mem_shared_op\"), selectkeys+[\"Metric Value\"] ].copy()\n",
- " shareddf = shareddf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Shared Transactions Avg\"})\n",
+ " shareddf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__data_pipe_lsu_wavefronts_mem_shared_op\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " shareddf = shareddf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Shared Transactions\"})\n",
" #add to timings\n",
- " metricdf = metricdf.merge(shareddf[selectkeys+[\"Shared Transactions Avg\"]], on=selectkeys, how=\"inner\")\n",
- "\n",
+ " profiledf = profiledf.merge(shareddf[resultkeys+[\"Shared Transactions\"]], on=resultkeys, how=\"inner\")\n",
" \n",
" ### L1 atomic transactions\n",
" # project out\n",
@@ -164,117 +171,94 @@
" 'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',\n",
" 'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',\n",
" 'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']\n",
- " atomicdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), selectkeys+[\"Metric Value\"] ].copy()\n",
+ " atomicdf = metricdf.loc[ metricdf[\"Metric Name\"].isin(metrics), resultkeys+[\"Metric Value\"] ].copy()\n",
" # get reads and writes\n",
- " atomicdf = atomicdf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"L1 Atomic Transactions Avg\"})\n",
+ " atomicdf = atomicdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"L1 Atomic Transactions\"})\n",
" # add to timings\n",
- " metricdf = metricdf.merge(atomicdf[selectkeys+[\"L1 Atomic Transactions Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " \n",
+ " profiledf = profiledf.merge(atomicdf[resultkeys+[\"L1 Atomic Transactions\"]], on=resultkeys, how=\"inner\")\n",
+ "\n",
" ### Local transactions \n",
" # project out\n",
- " localdf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__t_sectors_pipe_lsu_mem_local_op\"), selectkeys+[\"Metric Value\"] ].copy()\n",
- " localdf = localdf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Local Transactions Avg\"})\n",
+ " localdf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__t_sectors_pipe_lsu_mem_local_op\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " localdf = localdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Local Transactions\"})\n",
" # add to timings\n",
- " metricdf = metricdf.merge(localdf[selectkeys+[\"Local Transactions Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
+ " profiledf = profiledf.merge(localdf[resultkeys+[\"Local Transactions\"]], on=resultkeys, how=\"inner\")\n",
" \n",
" ### Global transactions \n",
" # project out\n",
- " globaldf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__t_sectors_pipe_lsu_mem_global_op\"), selectkeys+[\"Metric Value\"] ].copy()\n",
- " globaldf = globaldf.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Global Transactions Avg\"})\n",
+ " globaldf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"l1tex__t_sectors_pipe_lsu_mem_global_op\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " globaldf = globaldf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"Global Transactions\"})\n",
" # add to timings\n",
- " metricdf = metricdf.merge(globaldf[selectkeys+[\"Global Transactions Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
+ " profiledf = profiledf.merge(globaldf[resultkeys+[\"Global Transactions\"]], on=resultkeys, how=\"inner\")\n",
" \n",
" ### L1 Bytes\n",
- " metricdf[\"L1 Transactions Avg\"] = (metricdf[\"Shared Transactions Avg\"] + metricdf[\"L1 Atomic Transactions Avg\"]\n",
- " + metricdf[\"Local Transactions Avg\"] + metricdf[\"Global Transactions Avg\"])\n",
- " metricdf[\"L1 Bytes Avg\"] = metricdf[\"L1 Transactions Avg\"] * 32\n",
- " \n",
+ " profiledf[\"L1 Transactions\"] = (profiledf[\"Shared Transactions\"] + profiledf[\"L1 Atomic Transactions\"]\n",
+ " + profiledf[\"Local Transactions\"] + profiledf[\"Global Transactions\"])\n",
+ " profiledf[\"L1 Bytes\"] = profiledf[\"L1 Transactions\"] * 32\n",
" \n",
" ### L2 atomic & reduction\n",
" metricdf.loc[(metricdf[\"Metric Name\"].str.contains(\"lts__t_sectors_op\")) & (metricdf[\"Metric Type\"]==\"total\"), [\"Metric Value\"]] *= 2\n",
"\n",
- " \n",
" ### L2 transactions\n",
" # project out\n",
- " l2df = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"lts__t_sectors_op\"), selectkeys+[\"Metric Value\"] ].copy()\n",
- " l2df = l2df.groupby(selectkeys).sum().reset_index().rename(columns={\"Metric Value\": \"L2 Transactions Avg\"})\n",
- " l2df[\"L2 Bytes Avg\"] = l2df[\"L2 Transactions Avg\"] * 32\n",
+ " l2df = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"lts__t_sectors_op\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " l2df = l2df.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"L2 Transactions\"})\n",
+ " l2df[\"L2 Bytes\"] = l2df[\"L2 Transactions\"] * 32\n",
" # add to timings\n",
- " metricdf = metricdf.merge(l2df[selectkeys+[\"L2 Transactions Avg\", \"L2 Bytes Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " \n",
+ " profiledf = profiledf.merge(l2df[resultkeys+[\"L2 Transactions\", \"L2 Bytes\"]], on=resultkeys, how=\"inner\")\n",
+ "\n",
" ### DRAM Bytes\n",
" # project out\n",
- " dramdf = metricdf[ metricdf[\"Metric Name\"].str.contains(\"dram__sectors\") ].sort_values(selectkeys)\n",
- " # get reads and writes\n",
- " dramreadsdf = dramdf.loc[(dramdf[\"Metric Name\"]==\"dram__sectors\") & (dramdf[\"Metric Type\"]==\"read\"), selectkeys+[\"Metric Value\"]]\n",
- " dramwritesdf = dramdf.loc[(dramdf[\"Metric Name\"]==\"dram__sectors\") & (dramdf[\"Metric Type\"]==\"write\"), selectkeys+[\"Metric Value\"]]\n",
- " # combine\n",
- " dramdf = dramwritesdf.merge(dramreadsdf, on=selectkeys, how=\"outer\").fillna(0.)\n",
- " dramdf[\"DRAM Transactions Avg\"] = dramdf[\"Metric Value_x\"] + dramdf[\"Metric Value_y\"]\n",
- " dramdf[\"DRAM Bytes Avg\"] = dramdf[\"DRAM Transactions Avg\"] * 32\n",
- " #print(dramdf[['Name', 'Metric Value_x', 'Metric Value_y']])\n",
- " metricdf = metricdf.merge(dramdf[selectkeys+[\"DRAM Transactions Avg\", \"DRAM Bytes Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " \n",
+ " dramdf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"dram__sectors\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " dramdf = dramdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"DRAM Transactions\"})\n",
+ " dramdf[\"DRAM Bytes\"] = dramdf[\"DRAM Transactions\"] * 32\n",
+ " # add to timings\n",
+ " profiledf = profiledf.merge(dramdf[resultkeys+[\"DRAM Transactions\", \"DRAM Bytes\"]], on=resultkeys, how=\"inner\")\n",
+ " \n",
" ### Host Memory Bytes\n",
" # project out\n",
- " sysmemdf = metricdf[ metricdf[\"Metric Name\"].str.contains(\"lts__t_sectors_aperture_sysmem_op\") ].sort_values(selectkeys)\n",
- " # get reads and writes\n",
- " sysmemreadsdf = sysmemdf.loc[(sysmemdf[\"Metric Name\"]==\"lts__t_sectors_aperture_sysmem_op\") & (sysmemdf[\"Metric Type\"]==\"read\"), selectkeys+[\"Metric Value\"]]\n",
- " sysmemwritesdf = sysmemdf.loc[(sysmemdf[\"Metric Name\"]==\"lts__t_sectors_aperture_sysmem_op\") & (sysmemdf[\"Metric Type\"]==\"write\"), selectkeys+[\"Metric Value\"]]\n",
- " # combine\n",
- " sysmemdf = sysmemwritesdf.merge(sysmemreadsdf, on=selectkeys, how=\"outer\").fillna(0.)\n",
- " sysmemdf[\"SYSMEM Transactions Avg\"] = sysmemdf[\"Metric Value_x\"] + sysmemdf[\"Metric Value_y\"]\n",
- " sysmemdf[\"SYSMEM Bytes Avg\"] = sysmemdf[\"SYSMEM Transactions Avg\"] * 32\n",
- " #print(dramdf[['Name', 'Metric Value_x', 'Metric Value_y']])\n",
- " metricdf = metricdf.merge(sysmemdf[selectkeys+[\"SYSMEM Transactions Avg\", \"SYSMEM Bytes Avg\"]], on=selectkeys, how=\"inner\")\n",
- " \n",
- " ####### Clean up and return:\n",
- " del metricdf[\"Metric Value\"]\n",
- " del metricdf[\"Metric Name\"]\n",
- " del metricdf[\"Metric Type\"]\n",
- " #del metricdf[\"Invocations\"]\n",
- " metricdf.drop_duplicates(keep = 'first', inplace = True)\n",
- " \n",
+ " sysmemdf = metricdf.loc[metricdf[\"Metric Name\"].str.contains(\"lts__t_sectors_aperture_sysmem_op\"), resultkeys+[\"Metric Value\"] ].copy()\n",
+ " sysmemdf = sysmemdf.groupby(resultkeys).sum().reset_index().rename(columns={\"Metric Value\": \"SYSMEM Transactions\"})\n",
+ " sysmemdf[\"SYSMEM Bytes\"] = sysmemdf[\"SYSMEM Transactions\"] * 32\n",
+ " # add to timings\n",
+ " profiledf = profiledf.merge(sysmemdf[resultkeys+[\"SYSMEM Transactions\", \"SYSMEM Bytes\"]], on=resultkeys, how=\"inner\")\n",
"\n",
+ " \n",
" ### Get performance\n",
- " metricdf[\"Performance GFlop/s\"] = metricdf[\"FLOPs Avg\"] / (metricdf[\"CUDA Time Avg\"]*10**9)\n",
- " metricdf[\"FP32 Performance GFlop/s\"] = metricdf[\"FP32 FLOPs Avg\"] / (metricdf[\"CUDA Time Avg\"]*10**9)\n",
- " metricdf[\"FP16 Performance GFlop/s\"] = metricdf[\"FP16 FLOPs Avg\"] / (metricdf[\"CUDA Time Avg\"]*10**9)\n",
- " metricdf[\"TC Performance GFlop/s\"] = metricdf[\"TC FLOPs Avg\"] / (metricdf[\"TC Time Avg\"]*10**9)\n",
+ " profiledf[\"Performance GFlop/s\"] = profiledf[\"FLOPs\"] / (profiledf[\"CUDA Time\"]*10**9)\n",
+ " profiledf[\"FP32 Performance GFlop/s\"] = profiledf[\"FP32 FLOPs\"] / (profiledf[\"CUDA Time\"]*10**9)\n",
+ " profiledf[\"FP16 Performance GFlop/s\"] = profiledf[\"FP16 FLOPs\"] / (profiledf[\"CUDA Time\"]*10**9)\n",
+ " profiledf[\"TC Performance GFlop/s\"] = profiledf[\"TC FLOPs\"] / (profiledf[\"TC Time\"]*10**9)\n",
"\n",
" \n",
" ### Get AI\n",
" # L1\n",
- " metricdf[\"L1 AI\"] = metricdf[\"FLOPs Avg\"] / metricdf[\"L1 Bytes Avg\"]\n",
- " metricdf[\"FP32 L1 AI\"] = metricdf[\"FP32 FLOPs Avg\"] / metricdf[\"L1 Bytes Avg\"]\n",
- " metricdf[\"FP16 L1 AI\"] = metricdf[\"FP16 FLOPs Avg\"] / metricdf[\"L1 Bytes Avg\"]\n",
- " metricdf[\"TC L1 AI\"] = metricdf[\"TC FLOPs Avg\"] / metricdf[\"L1 Bytes Avg\"]\n",
+ " profiledf[\"L1 AI\"] = profiledf[\"FLOPs\"] / profiledf[\"L1 Bytes\"]\n",
+ " profiledf[\"FP32 L1 AI\"] = profiledf[\"FP32 FLOPs\"] / profiledf[\"L1 Bytes\"]\n",
+ " profiledf[\"FP16 L1 AI\"] = profiledf[\"FP16 FLOPs\"] / profiledf[\"L1 Bytes\"]\n",
+ " profiledf[\"TC L1 AI\"] = profiledf[\"TC FLOPs\"] / profiledf[\"L1 Bytes\"]\n",
" # L2\n",
- " metricdf[\"L2 AI\"] = metricdf[\"FLOPs Avg\"] / metricdf[\"L2 Bytes Avg\"]\n",
- " metricdf[\"FP32 L2 AI\"] = metricdf[\"FP32 FLOPs Avg\"] / metricdf[\"L2 Bytes Avg\"]\n",
- " metricdf[\"FP16 L2 AI\"] = metricdf[\"FP16 FLOPs Avg\"] / metricdf[\"L2 Bytes Avg\"]\n",
- " metricdf[\"TC L2 AI\"] = metricdf[\"TC FLOPs Avg\"] / metricdf[\"L2 Bytes Avg\"]\n",
+ " profiledf[\"L2 AI\"] = profiledf[\"FLOPs\"] / profiledf[\"L2 Bytes\"]\n",
+ " profiledf[\"FP32 L2 AI\"] = profiledf[\"FP32 FLOPs\"] / profiledf[\"L2 Bytes\"]\n",
+ " profiledf[\"FP16 L2 AI\"] = profiledf[\"FP16 FLOPs\"] / profiledf[\"L2 Bytes\"]\n",
+ " profiledf[\"TC L2 AI\"] = profiledf[\"TC FLOPs\"] / profiledf[\"L2 Bytes\"]\n",
" # DRAM\n",
- " metricdf[\"DRAM AI\"] = metricdf[\"FLOPs Avg\"] / metricdf[\"DRAM Bytes Avg\"]\n",
- " metricdf[\"FP32 DRAM AI\"] = metricdf[\"FP32 FLOPs Avg\"] / metricdf[\"DRAM Bytes Avg\"]\n",
- " metricdf[\"FP16 DRAM AI\"] = metricdf[\"FP16 FLOPs Avg\"] / metricdf[\"DRAM Bytes Avg\"]\n",
- " metricdf[\"TC DRAM AI\"] = metricdf[\"TC FLOPs Avg\"] / metricdf[\"DRAM Bytes Avg\"]\n",
+ " profiledf[\"DRAM AI\"] = profiledf[\"FLOPs\"] / profiledf[\"DRAM Bytes\"]\n",
+ " profiledf[\"FP32 DRAM AI\"] = profiledf[\"FP32 FLOPs\"] / profiledf[\"DRAM Bytes\"]\n",
+ " profiledf[\"FP16 DRAM AI\"] = profiledf[\"FP16 FLOPs\"] / profiledf[\"DRAM Bytes\"]\n",
+ " profiledf[\"TC DRAM AI\"] = profiledf[\"TC FLOPs\"] / profiledf[\"DRAM Bytes\"]\n",
" # SYSMEM\n",
- " metricdf[\"SYSMEM AI\"] = metricdf[\"FLOPs Avg\"] / metricdf[\"SYSMEM Bytes Avg\"]\n",
- " metricdf[\"FP32 SYSMEM AI\"] = metricdf[\"FP32 FLOPs Avg\"] / metricdf[\"SYSMEM Bytes Avg\"]\n",
- " metricdf[\"FP16 SYSMEM AI\"] = metricdf[\"FP16 FLOPs Avg\"] / metricdf[\"SYSMEM Bytes Avg\"]\n",
- " metricdf[\"TC SYSMEM AI\"] = metricdf[\"TC FLOPs Avg\"] / metricdf[\"SYSMEM Bytes Avg\"]\n",
+ " profiledf[\"SYSMEM AI\"] = profiledf[\"FLOPs\"] / profiledf[\"SYSMEM Bytes\"]\n",
+ " profiledf[\"FP32 SYSMEM AI\"] = profiledf[\"FP32 FLOPs\"] / profiledf[\"SYSMEM Bytes\"]\n",
+ " profiledf[\"FP16 SYSMEM AI\"] = profiledf[\"FP16 FLOPs\"] / profiledf[\"SYSMEM Bytes\"]\n",
+ " profiledf[\"TC SYSMEM AI\"] = profiledf[\"TC FLOPs\"] / profiledf[\"SYSMEM Bytes\"]\n",
"\n",
+ " \n",
" ### Cleanup\n",
- " metricdf.sort_values(by=selectkeys).reset_index(drop=True, inplace=True)\n",
+ " profiledf.sort_values(by=resultkeys).reset_index(drop=True, inplace=True)\n",
" #print(metricdf[['CUDA Time Avg', 'TC Time Avg']])\n",
" \n",
- " return metricdf"
+ " return profiledf"
]
},
{
@@ -293,7 +277,7 @@
"#get all the files\n",
"files = []\n",
"for datadir in datadirs:\n",
- " files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == \".ncu-rep\"))]\n",
+ " files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == \".ncu-rep\") or (os.path.splitext(x)[-1] == \".csv\"))]\n",
"\n",
"#recs\n",
"records = []\n",
@@ -316,7 +300,7 @@
" records.append({\"prefix\": prefix, \"file\": os.path.join(path, file)})\n",
"\n",
"#put in df\n",
- "recorddf = pd.DataFrame(records).sort_values([\"prefix\"])\n",
+ "recorddf = pd.DataFrame(records).sort_values([\"prefix\"]).reset_index(drop=True)\n",
"#with pd.option_context('display.max_rows', None, 'display.max_columns', None):"
]
},
@@ -329,75 +313,70 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_sm__inst_executed_pipe_tensor_op_hmma.avg.pct_of_peak_sustained_active.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__cycles_elapsed.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__cycles_elapsed.sum.per_second.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__inst_executed_pipe_tensor_op_hmma.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.per_second.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hadd_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hfma_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hmul_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_dram__sectors_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_dram__sectors_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_atom.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_read.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_red.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_write.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_sm__inst_executed_pipe_tensor_op_hmma.avg.pct_of_peak_sustained_active.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__cycles_elapsed.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__cycles_elapsed.sum.per_second.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__inst_executed_pipe_tensor_op_hmma.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.per_second.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hadd_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hfma_pred_on.sum.ncu-rep\n",
- "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hmul_pred_on.sum.ncu-rep\n"
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_lts__t_sectors_op_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_sm__inst_executed_pipe_tensor_op_hmma.avg.pct_of_peak_sustained_active.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__cycles_elapsed.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__cycles_elapsed.sum.per_second.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__inst_executed_pipe_tensor_op_hmma.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.per_second.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hadd_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hfma_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hmul_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_dram__sectors_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_dram__sectors_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_l1tex__t_set_accesses_pipe_tex_mem_surface_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_aperture_sysmem_op_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_atom.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_read.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_red.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_lts__t_sectors_op_write.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_sm__inst_executed_pipe_tensor_op_hmma.avg.pct_of_peak_sustained_active.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__cycles_elapsed.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__cycles_elapsed.sum.per_second.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__inst_executed_pipe_tensor_op_hmma.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__pipe_tensor_op_hmma_cycles_active.sum.per_second.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fadd_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_fmul_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hadd_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hfma_pred_on.sum.csv\n",
+ "../data/pytorch_1.5/profile.pass_forward.batchsize_2.metric_smsp__sass_thread_inst_executed_op_hmul_pred_on.sum.csv\n"
]
}
],
"source": [
- "#sort by those keys:\n",
- "sortkeys = [\"Network Name\", \\\n",
- " \"Batch Size\", \"Pass\", \\\n",
- " \"Precision\", \"Device\", \"Name\"]\n",
- " \n",
"#group by prefixes and files\n",
"all_prefixes = set([x.split(\".pass\")[0] for x in recorddf[\"prefix\"]])\n",
"all_passes = set([re.match(r'.*\\.pass_(.*?)\\.', x).groups()[0] for x in recorddf[\"prefix\"].unique()])\n",
@@ -405,13 +384,7 @@
"#metrics\n",
"df_profiles = []\n",
"\n",
- "for pref in all_prefixes:\n",
- " \n",
- " #set empty lists\n",
- " df_times = []\n",
- " df_timeline = []\n",
- " df_summary = []\n",
- " \n",
+ "for pref in all_prefixes: \n",
" #print prefix\n",
" #print(pref)\n",
" \n",
@@ -422,9 +395,15 @@
" \n",
" #project frame\n",
" files = recorddf.loc[recorddf[\"prefix\"].apply(lambda x: re.match(r'.*\\.pass_(.*?)\\.', x).groups()[0]) == pas, \"file\"].values\n",
- " \n",
+ "\n",
" #project the invididual files\n",
" metricfiles = [x for x in files if x.endswith(\".ncu-rep\")]\n",
+ " metriccsvs = [x for x in files if x.endswith(\".csv\")]\n",
+ " \n",
+ " ImportFromNsight = True\n",
+ " if len(metricfiles) == len(metriccsvs):\n",
+ " ImportFromNsight = False\n",
+ " metricfiles = metriccsvs\n",
" \n",
" for metricfile in metricfiles:\n",
" \n",
@@ -435,7 +414,7 @@
" parameters = parse_filename_nsight(os.path.basename(metricfile))\n",
" \n",
" #metrics\n",
- " metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)\n",
+ " metricdf = import_nsight_metric(ImportFromNsight, metricfile, cuda_dir=cudadir)\n",
" for key in parameters:\n",
" metricdf[key] = parameters[key]\n",
" \n",
@@ -496,16 +475,16 @@
" \n",
" \n",
" | \n",
- " Name | \n",
- " Invocations | \n",
+ " Precision | \n",
" Network Name | \n",
" Batch Size | \n",
" Pass | \n",
- " Precision | \n",
- " CUDA Time Avg | \n",
- " TC Time Avg | \n",
- " FP32 FLOPs Avg | \n",
- " FP16 FLOPs Avg | \n",
+ " Name | \n",
+ " CUDA Time | \n",
+ " TC Time | \n",
+ " Invocations | \n",
+ " FP32 FLOPs | \n",
+ " FP16 FLOPs | \n",
" ... | \n",
" FP16 L2 AI | \n",
" TC L2 AI | \n",
@@ -522,71 +501,71 @@
"
\n",
" \n",
" 0 | \n",
- " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
- " 384.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" backward | \n",
- " mixed | \n",
- " 0.000165 | \n",
- " 0.000165 | \n",
- " 3.382784e+06 | \n",
+ " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
+ " 0.063457 | \n",
+ " 0.063608 | \n",
+ " 384 | \n",
+ " 1.298989e+09 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 86.269259 | \n",
- " 460.004408 | \n",
+ " 86.558155 | \n",
+ " 461.544485 | \n",
" 0.110112 | \n",
" 0.000000 | \n",
- " 459.894296 | \n",
- " 6.308905e+07 | \n",
+ " 461.434373 | \n",
+ " 6.330027e+07 | \n",
" 1.510171e+04 | \n",
" 0.000000 | \n",
- " 6.307394e+07 | \n",
+ " 6.328516e+07 | \n",
"
\n",
" \n",
" 1 | \n",
- " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
- " 12.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" backward | \n",
- " mixed | \n",
- " 0.000120 | \n",
- " 0.000120 | \n",
- " 2.048000e+06 | \n",
+ " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
+ " 0.001441 | \n",
+ " 0.001439 | \n",
+ " 12 | \n",
+ " 2.457600e+07 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 175.980559 | \n",
- " 411.945348 | \n",
+ " 176.590749 | \n",
+ " 413.373309 | \n",
" 0.117225 | \n",
" 0.000000 | \n",
- " 411.828123 | \n",
- " 3.212944e+07 | \n",
+ " 413.256085 | \n",
+ " 3.224081e+07 | \n",
" 9.142857e+03 | \n",
" 0.000000 | \n",
- " 3.212029e+07 | \n",
+ " 3.223167e+07 | \n",
"
\n",
" \n",
" 2 | \n",
- " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
- " 6.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" backward | \n",
- " mixed | \n",
- " 0.002004 | \n",
- " 0.002003 | \n",
- " 6.930432e+06 | \n",
+ " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... | \n",
+ " 0.012022 | \n",
+ " 0.012020 | \n",
+ " 6 | \n",
+ " 4.158259e+07 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 421.062825 | \n",
- " 1362.422104 | \n",
+ " 421.062833 | \n",
+ " 1362.422128 | \n",
" 0.056049 | \n",
" 0.000000 | \n",
- " 1362.366055 | \n",
+ " 1362.366079 | \n",
" 7.520624e+08 | \n",
" 3.093943e+04 | \n",
" 0.000000 | \n",
@@ -594,51 +573,51 @@
"
\n",
" \n",
" 3 | \n",
- " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2... | \n",
- " 12.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" backward | \n",
- " mixed | \n",
- " 0.002617 | \n",
- " 0.002624 | \n",
- " 3.538944e+06 | \n",
+ " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2... | \n",
+ " 0.031397 | \n",
+ " 0.031395 | \n",
+ " 12 | \n",
+ " 4.246733e+07 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 133.586092 | \n",
- " 1491.376331 | \n",
+ " 133.167388 | \n",
+ " 1486.701917 | \n",
" 0.020842 | \n",
" 0.000000 | \n",
- " 1491.355488 | \n",
- " 1.130491e+09 | \n",
+ " 1486.681075 | \n",
+ " 1.126947e+09 | \n",
" 1.579886e+04 | \n",
" 0.000000 | \n",
- " 1.130475e+09 | \n",
+ " 1.126932e+09 | \n",
"
\n",
" \n",
" 4 | \n",
- " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32... | \n",
- " 24.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" backward | \n",
- " mixed | \n",
- " 0.000211 | \n",
- " 0.000230 | \n",
- " 2.347008e+06 | \n",
+ " Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32... | \n",
+ " 0.005062 | \n",
+ " 0.005076 | \n",
+ " 24 | \n",
+ " 5.632819e+07 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 30.525030 | \n",
- " 70.019151 | \n",
+ " 28.108023 | \n",
+ " 64.476321 | \n",
" 0.017258 | \n",
" 0.000000 | \n",
- " 70.001893 | \n",
- " 4.251066e+07 | \n",
+ " 64.459064 | \n",
+ " 3.914544e+07 | \n",
" 1.047771e+04 | \n",
" 0.000000 | \n",
- " 4.250018e+07 | \n",
+ " 3.913497e+07 | \n",
"
\n",
" \n",
" ... | \n",
@@ -666,87 +645,87 @@
"
\n",
" \n",
" 110 | \n",
- " volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... | \n",
- " 6.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" forward | \n",
- " mixed | \n",
- " 0.000080 | \n",
- " 0.000080 | \n",
- " 1.061683e+07 | \n",
- " 663552.0 | \n",
+ " volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... | \n",
+ " 0.000477 | \n",
+ " 0.000477 | \n",
+ " 6 | \n",
+ " 6.370099e+07 | \n",
+ " 3981312.0 | \n",
" ... | \n",
" 0.010521 | \n",
- " 63.149830 | \n",
- " 262.910613 | \n",
+ " 63.151051 | \n",
+ " 262.915682 | \n",
" 0.698876 | \n",
" 0.043680 | \n",
- " 262.168057 | \n",
- " 1.783014e+07 | \n",
+ " 262.173126 | \n",
+ " 1.783048e+07 | \n",
" 4.739657e+04 | \n",
" 2962.285714 | \n",
- " 1.777978e+07 | \n",
+ " 1.778013e+07 | \n",
"
\n",
" \n",
" 111 | \n",
- " volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... | \n",
- " 6.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" forward | \n",
- " mixed | \n",
- " 0.000303 | \n",
- " 0.000307 | \n",
- " 5.662310e+07 | \n",
- " 3538944.0 | \n",
+ " volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... | \n",
+ " 0.001821 | \n",
+ " 0.001844 | \n",
+ " 6 | \n",
+ " 3.397386e+08 | \n",
+ " 21233664.0 | \n",
" ... | \n",
" 0.010270 | \n",
- " 57.673099 | \n",
- " 234.258467 | \n",
+ " 57.676963 | \n",
+ " 234.274116 | \n",
" 0.665453 | \n",
" 0.041591 | \n",
- " 233.551423 | \n",
- " 8.898636e+07 | \n",
+ " 233.567072 | \n",
+ " 8.899231e+07 | \n",
" 2.527817e+05 | \n",
" 15798.857143 | \n",
- " 8.871778e+07 | \n",
+ " 8.872373e+07 | \n",
"
\n",
" \n",
" 112 | \n",
- " volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt | \n",
- " 36.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" forward | \n",
- " mixed | \n",
- " 0.000425 | \n",
- " 0.000469 | \n",
- " 5.573837e+07 | \n",
+ " volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt | \n",
+ " 0.015462 | \n",
+ " 0.015516 | \n",
+ " 36 | \n",
+ " 2.006581e+09 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
- " 64.365833 | \n",
- " 246.901319 | \n",
+ " 59.159849 | \n",
+ " 226.962966 | \n",
" 0.387155 | \n",
" 0.000000 | \n",
- " 246.514164 | \n",
- " 1.586883e+08 | \n",
+ " 226.575811 | \n",
+ " 1.458735e+08 | \n",
" 2.488320e+05 | \n",
" 0.000000 | \n",
- " 1.584394e+08 | \n",
+ " 1.456247e+08 | \n",
"
\n",
" \n",
" 113 | \n",
- " volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1 | \n",
- " 12.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" forward | \n",
- " mixed | \n",
- " 0.000171 | \n",
+ " volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1 | \n",
+ " 0.002050 | \n",
" 0.000000 | \n",
- " 1.833173e+09 | \n",
+ " 12 | \n",
+ " 2.199808e+10 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
@@ -762,15 +741,15 @@
"
\n",
" \n",
" 114 | \n",
- " volta_fp16_sgemm_fp16_128x32_nt | \n",
- " 12.0 | \n",
+ " mixed | \n",
" deepCam | \n",
" 2 | \n",
" forward | \n",
- " mixed | \n",
- " 0.000301 | \n",
+ " volta_fp16_sgemm_fp16_128x32_nt | \n",
+ " 0.003614 | \n",
" 0.000000 | \n",
- " 3.630957e+09 | \n",
+ " 12 | \n",
+ " 4.357148e+10 | \n",
" 0.0 | \n",
" ... | \n",
" 0.000000 | \n",
@@ -786,76 +765,76 @@
"
\n",
" \n",
"\n",
- "115 rows × 47 columns
\n",
+ "115 rows × 48 columns
\n",
""
],
"text/plain": [
- " Name Invocations \\\n",
- "0 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 384.0 \n",
- "1 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 12.0 \n",
- "2 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 6.0 \n",
- "3 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2... 12.0 \n",
- "4 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32... 24.0 \n",
- ".. ... ... \n",
- "110 volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... 6.0 \n",
- "111 volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... 6.0 \n",
- "112 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt 36.0 \n",
- "113 volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1 12.0 \n",
- "114 volta_fp16_sgemm_fp16_128x32_nt 12.0 \n",
+ " Precision Network Name Batch Size Pass \\\n",
+ "0 mixed deepCam 2 backward \n",
+ "1 mixed deepCam 2 backward \n",
+ "2 mixed deepCam 2 backward \n",
+ "3 mixed deepCam 2 backward \n",
+ "4 mixed deepCam 2 backward \n",
+ ".. ... ... ... ... \n",
+ "110 mixed deepCam 2 forward \n",
+ "111 mixed deepCam 2 forward \n",
+ "112 mixed deepCam 2 forward \n",
+ "113 mixed deepCam 2 forward \n",
+ "114 mixed deepCam 2 forward \n",
"\n",
- " Network Name Batch Size Pass Precision CUDA Time Avg TC Time Avg \\\n",
- "0 deepCam 2 backward mixed 0.000165 0.000165 \n",
- "1 deepCam 2 backward mixed 0.000120 0.000120 \n",
- "2 deepCam 2 backward mixed 0.002004 0.002003 \n",
- "3 deepCam 2 backward mixed 0.002617 0.002624 \n",
- "4 deepCam 2 backward mixed 0.000211 0.000230 \n",
- ".. ... ... ... ... ... ... \n",
- "110 deepCam 2 forward mixed 0.000080 0.000080 \n",
- "111 deepCam 2 forward mixed 0.000303 0.000307 \n",
- "112 deepCam 2 forward mixed 0.000425 0.000469 \n",
- "113 deepCam 2 forward mixed 0.000171 0.000000 \n",
- "114 deepCam 2 forward mixed 0.000301 0.000000 \n",
+ " Name CUDA Time TC Time \\\n",
+ "0 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 0.063457 0.063608 \n",
+ "1 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 0.001441 0.001439 \n",
+ "2 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1... 0.012022 0.012020 \n",
+ "3 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2... 0.031397 0.031395 \n",
+ "4 Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32... 0.005062 0.005076 \n",
+ ".. ... ... ... \n",
+ "110 volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... 0.000477 0.000477 \n",
+ "111 volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f... 0.001821 0.001844 \n",
+ "112 volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt 0.015462 0.015516 \n",
+ "113 volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1 0.002050 0.000000 \n",
+ "114 volta_fp16_sgemm_fp16_128x32_nt 0.003614 0.000000 \n",
"\n",
- " FP32 FLOPs Avg FP16 FLOPs Avg ... FP16 L2 AI TC L2 AI DRAM AI \\\n",
- "0 3.382784e+06 0.0 ... 0.000000 86.269259 460.004408 \n",
- "1 2.048000e+06 0.0 ... 0.000000 175.980559 411.945348 \n",
- "2 6.930432e+06 0.0 ... 0.000000 421.062825 1362.422104 \n",
- "3 3.538944e+06 0.0 ... 0.000000 133.586092 1491.376331 \n",
- "4 2.347008e+06 0.0 ... 0.000000 30.525030 70.019151 \n",
- ".. ... ... ... ... ... ... \n",
- "110 1.061683e+07 663552.0 ... 0.010521 63.149830 262.910613 \n",
- "111 5.662310e+07 3538944.0 ... 0.010270 57.673099 234.258467 \n",
- "112 5.573837e+07 0.0 ... 0.000000 64.365833 246.901319 \n",
- "113 1.833173e+09 0.0 ... 0.000000 0.000000 30.543450 \n",
- "114 3.630957e+09 0.0 ... 0.000000 0.000000 29.308326 \n",
+ " Invocations FP32 FLOPs FP16 FLOPs ... FP16 L2 AI TC L2 AI \\\n",
+ "0 384 1.298989e+09 0.0 ... 0.000000 86.558155 \n",
+ "1 12 2.457600e+07 0.0 ... 0.000000 176.590749 \n",
+ "2 6 4.158259e+07 0.0 ... 0.000000 421.062833 \n",
+ "3 12 4.246733e+07 0.0 ... 0.000000 133.167388 \n",
+ "4 24 5.632819e+07 0.0 ... 0.000000 28.108023 \n",
+ ".. ... ... ... ... ... ... \n",
+ "110 6 6.370099e+07 3981312.0 ... 0.010521 63.151051 \n",
+ "111 6 3.397386e+08 21233664.0 ... 0.010270 57.676963 \n",
+ "112 36 2.006581e+09 0.0 ... 0.000000 59.159849 \n",
+ "113 12 2.199808e+10 0.0 ... 0.000000 0.000000 \n",
+ "114 12 4.357148e+10 0.0 ... 0.000000 0.000000 \n",
"\n",
- " FP32 DRAM AI FP16 DRAM AI TC DRAM AI SYSMEM AI FP32 SYSMEM AI \\\n",
- "0 0.110112 0.000000 459.894296 6.308905e+07 1.510171e+04 \n",
- "1 0.117225 0.000000 411.828123 3.212944e+07 9.142857e+03 \n",
- "2 0.056049 0.000000 1362.366055 7.520624e+08 3.093943e+04 \n",
- "3 0.020842 0.000000 1491.355488 1.130491e+09 1.579886e+04 \n",
- "4 0.017258 0.000000 70.001893 4.251066e+07 1.047771e+04 \n",
- ".. ... ... ... ... ... \n",
- "110 0.698876 0.043680 262.168057 1.783014e+07 4.739657e+04 \n",
- "111 0.665453 0.041591 233.551423 8.898636e+07 2.527817e+05 \n",
- "112 0.387155 0.000000 246.514164 1.586883e+08 2.488320e+05 \n",
- "113 30.543450 0.000000 0.000000 8.183808e+06 8.183808e+06 \n",
- "114 29.308326 0.000000 0.000000 1.620963e+07 1.620963e+07 \n",
+ " DRAM AI FP32 DRAM AI FP16 DRAM AI TC DRAM AI SYSMEM AI \\\n",
+ "0 461.544485 0.110112 0.000000 461.434373 6.330027e+07 \n",
+ "1 413.373309 0.117225 0.000000 413.256085 3.224081e+07 \n",
+ "2 1362.422128 0.056049 0.000000 1362.366079 7.520624e+08 \n",
+ "3 1486.701917 0.020842 0.000000 1486.681075 1.126947e+09 \n",
+ "4 64.476321 0.017258 0.000000 64.459064 3.914544e+07 \n",
+ ".. ... ... ... ... ... \n",
+ "110 262.915682 0.698876 0.043680 262.173126 1.783048e+07 \n",
+ "111 234.274116 0.665453 0.041591 233.567072 8.899231e+07 \n",
+ "112 226.962966 0.387155 0.000000 226.575811 1.458735e+08 \n",
+ "113 30.543450 30.543450 0.000000 0.000000 8.183808e+06 \n",
+ "114 29.308326 29.308326 0.000000 0.000000 1.620963e+07 \n",
"\n",
- " FP16 SYSMEM AI TC SYSMEM AI \n",
- "0 0.000000 6.307394e+07 \n",
- "1 0.000000 3.212029e+07 \n",
- "2 0.000000 7.520315e+08 \n",
- "3 0.000000 1.130475e+09 \n",
- "4 0.000000 4.250018e+07 \n",
- ".. ... ... \n",
- "110 2962.285714 1.777978e+07 \n",
- "111 15798.857143 8.871778e+07 \n",
- "112 0.000000 1.584394e+08 \n",
- "113 0.000000 0.000000e+00 \n",
- "114 0.000000 0.000000e+00 \n",
+ " FP32 SYSMEM AI FP16 SYSMEM AI TC SYSMEM AI \n",
+ "0 1.510171e+04 0.000000 6.328516e+07 \n",
+ "1 9.142857e+03 0.000000 3.223167e+07 \n",
+ "2 3.093943e+04 0.000000 7.520315e+08 \n",
+ "3 1.579886e+04 0.000000 1.126932e+09 \n",
+ "4 1.047771e+04 0.000000 3.913497e+07 \n",
+ ".. ... ... ... \n",
+ "110 4.739657e+04 2962.285714 1.778013e+07 \n",
+ "111 2.527817e+05 15798.857143 8.872373e+07 \n",
+ "112 2.488320e+05 0.000000 1.456247e+08 \n",
+ "113 8.183808e+06 0.000000 0.000000e+00 \n",
+ "114 1.620963e+07 0.000000 0.000000e+00 \n",
"\n",
- "[115 rows x 47 columns]"
+ "[115 rows x 48 columns]"
]
},
"execution_count": 7,
@@ -886,74 +865,257 @@
"#copy profiledf\n",
"combineddf = profiledf.copy()\n",
"\n",
- "#get the aggregated performance, including all kernels:\n",
- "#compute weights: multiply all measures by the number of invocations\n",
- "weighted = True\n",
- "if weighted:\n",
- " #first, get all the names of metrics which need to be weighted\n",
- " metrics = [x for x in combineddf.columns if \"Avg\" in x]\n",
- " for metric in metrics:\n",
- " combineddf[metric] *= combineddf[\"Invocations\"]\n",
- "\n",
"#sum up\n",
"combineddf = combineddf.groupby(by=combinedselectkeys).sum()#.reset_index()\n",
"\n",
"\n",
"#the flop fractions need to be recomputed\n",
- "combineddf[\"FP32 FLOPs Fraction Avg\"] = combineddf[\"FP32 FLOPs Avg\"] / combineddf[\"FLOPs Avg\"]\n",
- "combineddf[\"FP16 FLOPs Fraction Avg\"] = combineddf[\"FP16 FLOPs Avg\"] / combineddf[\"FLOPs Avg\"]\n",
- "combineddf[\"TC FLOPs Fraction Avg\"] = combineddf[\"TC FLOPs Avg\"] / combineddf[\"FLOPs Avg\"]\n",
+ "combineddf[\"FP32 FLOPs Fraction\"] = combineddf[\"FP32 FLOPs\"] / combineddf[\"FLOPs\"]\n",
+ "combineddf[\"FP16 FLOPs Fraction\"] = combineddf[\"FP16 FLOPs\"] / combineddf[\"FLOPs\"]\n",
+ "combineddf[\"TC FLOPs Fraction\"] = combineddf[\"TC FLOPs\"] / combineddf[\"FLOPs\"]\n",
"\n",
"### Get performance\n",
- "combineddf[\"Performance GFlop/s\"] = combineddf[\"FLOPs Avg\"] / (combineddf[\"CUDA Time Avg\"]*10**9)\n",
- "combineddf[\"FP32 Performance GFlop/s\"] = combineddf[\"FP32 FLOPs Avg\"] / (combineddf[\"CUDA Time Avg\"]*10**9)\n",
- "combineddf[\"FP16 Performance GFlop/s\"] = combineddf[\"FP16 FLOPs Avg\"] / (combineddf[\"CUDA Time Avg\"]*10**9)\n",
- "combineddf[\"TC Performance GFlop/s\"] = combineddf[\"TC FLOPs Avg\"] / (combineddf[\"TC Time Avg\"]*10**9)\n",
+ "combineddf[\"Performance GFlop/s\"] = combineddf[\"FLOPs\"] / (combineddf[\"CUDA Time\"]*10**9)\n",
+ "combineddf[\"FP32 Performance GFlop/s\"] = combineddf[\"FP32 FLOPs\"] / (combineddf[\"CUDA Time\"]*10**9)\n",
+ "combineddf[\"FP16 Performance GFlop/s\"] = combineddf[\"FP16 FLOPs\"] / (combineddf[\"CUDA Time\"]*10**9)\n",
+ "combineddf[\"TC Performance GFlop/s\"] = combineddf[\"TC FLOPs\"] / (combineddf[\"TC Time\"]*10**9)\n",
"\n",
"\n",
"### Get AI\n",
"# L1\n",
- "combineddf[\"L1 AI\"] = combineddf[\"FLOPs Avg\"] / combineddf[\"L1 Bytes Avg\"]\n",
- "combineddf[\"FP32 L1 AI\"] = combineddf[\"FP32 FLOPs Avg\"] / combineddf[\"L1 Bytes Avg\"]\n",
- "combineddf[\"FP16 L1 AI\"] = combineddf[\"FP16 FLOPs Avg\"] / combineddf[\"L1 Bytes Avg\"]\n",
- "combineddf[\"TC L1 AI\"] = combineddf[\"TC FLOPs Avg\"] / combineddf[\"L1 Bytes Avg\"]\n",
+ "combineddf[\"L1 AI\"] = combineddf[\"FLOPs\"] / combineddf[\"L1 Bytes\"]\n",
+ "combineddf[\"FP32 L1 AI\"] = combineddf[\"FP32 FLOPs\"] / combineddf[\"L1 Bytes\"]\n",
+ "combineddf[\"FP16 L1 AI\"] = combineddf[\"FP16 FLOPs\"] / combineddf[\"L1 Bytes\"]\n",
+ "combineddf[\"TC L1 AI\"] = combineddf[\"TC FLOPs\"] / combineddf[\"L1 Bytes\"]\n",
"# L2\n",
- "combineddf[\"L2 AI\"] = combineddf[\"FLOPs Avg\"] / combineddf[\"L2 Bytes Avg\"]\n",
- "combineddf[\"FP32 L2 AI\"] = combineddf[\"FP32 FLOPs Avg\"] / combineddf[\"L2 Bytes Avg\"]\n",
- "combineddf[\"FP16 L2 AI\"] = combineddf[\"FP16 FLOPs Avg\"] / combineddf[\"L2 Bytes Avg\"]\n",
- "combineddf[\"TC L2 AI\"] = combineddf[\"TC FLOPs Avg\"] / combineddf[\"L2 Bytes Avg\"]\n",
+ "combineddf[\"L2 AI\"] = combineddf[\"FLOPs\"] / combineddf[\"L2 Bytes\"]\n",
+ "combineddf[\"FP32 L2 AI\"] = combineddf[\"FP32 FLOPs\"] / combineddf[\"L2 Bytes\"]\n",
+ "combineddf[\"FP16 L2 AI\"] = combineddf[\"FP16 FLOPs\"] / combineddf[\"L2 Bytes\"]\n",
+ "combineddf[\"TC L2 AI\"] = combineddf[\"TC FLOPs\"] / combineddf[\"L2 Bytes\"]\n",
"# DRAM\n",
- "combineddf[\"DRAM AI\"] = combineddf[\"FLOPs Avg\"] / combineddf[\"DRAM Bytes Avg\"]\n",
- "combineddf[\"FP32 DRAM AI\"] = combineddf[\"FP32 FLOPs Avg\"] / combineddf[\"DRAM Bytes Avg\"]\n",
- "combineddf[\"FP16 DRAM AI\"] = combineddf[\"FP16 FLOPs Avg\"] / combineddf[\"DRAM Bytes Avg\"]\n",
- "combineddf[\"TC DRAM AI\"] = combineddf[\"TC FLOPs Avg\"] / combineddf[\"DRAM Bytes Avg\"]\n",
+ "combineddf[\"DRAM AI\"] = combineddf[\"FLOPs\"] / combineddf[\"DRAM Bytes\"]\n",
+ "combineddf[\"FP32 DRAM AI\"] = combineddf[\"FP32 FLOPs\"] / combineddf[\"DRAM Bytes\"]\n",
+ "combineddf[\"FP16 DRAM AI\"] = combineddf[\"FP16 FLOPs\"] / combineddf[\"DRAM Bytes\"]\n",
+ "combineddf[\"TC DRAM AI\"] = combineddf[\"TC FLOPs\"] / combineddf[\"DRAM Bytes\"]\n",
"\n",
"combineddf.sort_values(by=combinedselectkeys).reset_index(drop=True, inplace=True)"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 9,
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " CUDA Time | \n",
+ " TC Time | \n",
+ " Invocations | \n",
+ " FP32 FLOPs | \n",
+ " FP16 FLOPs | \n",
+ " TC Utilization | \n",
+ " TC FLOPs | \n",
+ " FLOPs | \n",
+ " FP32 FLOPs Fraction | \n",
+ " FP16 FLOPs Fraction | \n",
+ " ... | \n",
+ " FP16 L2 AI | \n",
+ " TC L2 AI | \n",
+ " DRAM AI | \n",
+ " FP32 DRAM AI | \n",
+ " FP16 DRAM AI | \n",
+ " TC DRAM AI | \n",
+ " SYSMEM AI | \n",
+ " FP32 SYSMEM AI | \n",
+ " FP16 SYSMEM AI | \n",
+ " TC SYSMEM AI | \n",
+ "
\n",
+ " \n",
+ " Precision | \n",
+ " Network Name | \n",
+ " Batch Size | \n",
+ " Pass | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " mixed | \n",
+ " deepCam | \n",
+ " 2 | \n",
+ " backward | \n",
+ " 1.349231 | \n",
+ " 0.260532 | \n",
+ " 9136 | \n",
+ " 1.799501e+12 | \n",
+ " 3.725807e+11 | \n",
+ " 10.091165 | \n",
+ " 2.173761e+13 | \n",
+ " 2.390969e+13 | \n",
+ " 0.075262 | \n",
+ " 0.015583 | \n",
+ " ... | \n",
+ " 0.579800 | \n",
+ " 33.827449 | \n",
+ " 98.231720 | \n",
+ " 7.393158 | \n",
+ " 1.530729 | \n",
+ " 89.307834 | \n",
+ " 5.278744e+09 | \n",
+ " 5.562857e+08 | \n",
+ " 1.882723e+08 | \n",
+ " 4.534186e+09 | \n",
+ "
\n",
+ " \n",
+ " forward | \n",
+ " 0.615992 | \n",
+ " 0.119549 | \n",
+ " 4782 | \n",
+ " 1.388716e+12 | \n",
+ " 6.145969e+10 | \n",
+ " 3.279819 | \n",
+ " 9.519808e+12 | \n",
+ " 1.096998e+13 | \n",
+ " 0.126592 | \n",
+ " 0.005603 | \n",
+ " ... | \n",
+ " 0.255036 | \n",
+ " 39.503903 | \n",
+ " 90.478692 | \n",
+ " 11.453912 | \n",
+ " 0.506910 | \n",
+ " 78.517870 | \n",
+ " 1.458617e+09 | \n",
+ " 3.423932e+08 | \n",
+ " 3.427154e+06 | \n",
+ " 1.112797e+09 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 43 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CUDA Time TC Time Invocations \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 1.349231 0.260532 9136 \n",
+ " forward 0.615992 0.119549 4782 \n",
+ "\n",
+ " FP32 FLOPs FP16 FLOPs \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 1.799501e+12 3.725807e+11 \n",
+ " forward 1.388716e+12 6.145969e+10 \n",
+ "\n",
+ " TC Utilization TC FLOPs \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 10.091165 2.173761e+13 \n",
+ " forward 3.279819 9.519808e+12 \n",
+ "\n",
+ " FLOPs FP32 FLOPs Fraction \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 2.390969e+13 0.075262 \n",
+ " forward 1.096998e+13 0.126592 \n",
+ "\n",
+ " FP16 FLOPs Fraction ... \\\n",
+ "Precision Network Name Batch Size Pass ... \n",
+ "mixed deepCam 2 backward 0.015583 ... \n",
+ " forward 0.005603 ... \n",
+ "\n",
+ " FP16 L2 AI TC L2 AI DRAM AI \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 0.579800 33.827449 98.231720 \n",
+ " forward 0.255036 39.503903 90.478692 \n",
+ "\n",
+ " FP32 DRAM AI FP16 DRAM AI \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 7.393158 1.530729 \n",
+ " forward 11.453912 0.506910 \n",
+ "\n",
+ " TC DRAM AI SYSMEM AI \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 89.307834 5.278744e+09 \n",
+ " forward 78.517870 1.458617e+09 \n",
+ "\n",
+ " FP32 SYSMEM AI FP16 SYSMEM AI \\\n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 5.562857e+08 1.882723e+08 \n",
+ " forward 3.423932e+08 3.427154e+06 \n",
+ "\n",
+ " TC SYSMEM AI \n",
+ "Precision Network Name Batch Size Pass \n",
+ "mixed deepCam 2 backward 4.534186e+09 \n",
+ " forward 1.112797e+09 \n",
+ "\n",
+ "[2 rows x 43 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Export Data"
+ "combineddf"
]
},
{
- "cell_type": "code",
- "execution_count": 9,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "metricdf.to_csv(\"./metrics.csv\")\n",
- "profiledf.to_csv(\"./profile.csv\")"
+ "# Export Data"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "combineddf.to_csv(\"./combined.csv\")\n",
+ "profiledf.to_csv(\"./profile.csv\")"
+ ]
}
],
"metadata": {
diff --git a/analysis/roofline_plot.ipynb b/analysis/roofline_plot.ipynb
index 278eaca..2a2d31e 100644
--- a/analysis/roofline_plot.ipynb
+++ b/analysis/roofline_plot.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -26,7 +26,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +54,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -144,7 +144,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -156,7 +156,7 @@
" \n",
" #Mixed data\n",
" df_mixed = df[ df[\"Precision\"] == \"mixed\" ]\n",
- " Times_mixed = df_mixed[\"CUDA Time Avg\"].values\n",
+ " Times_mixed = df_mixed[\"CUDA Time\"].values\n",
" FLOPs_mixed = df_mixed[\"Performance GFlop/s\"].values #list(df_fp16[\"FP16 Performance GFlop/s\"])\n",
" \n",
" if mem_level == \"L1\":\n",
@@ -212,7 +212,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -222,12 +222,12 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "\n",
"text/plain": [
"