Add CSV export endpoint to Everest data API

equinor · Jan 23, 2025 · 9870a08 · 9870a08
1 parent 73e2816
commit 9870a08
Show file tree

Hide file tree

Showing 11 changed files with 467 additions and 0 deletions.
diff --git a/src/everest/api/everest_data_api.py b/src/everest/api/everest_data_api.py
@@ -270,3 +270,177 @@ def summary_values(self, batches=None, keys=None):
     @property
     def output_folder(self):
+        """Return the output directory stored in the configuration (``output_dir``)."""
         return self._config.output_dir
+
+    def export_dataframes(
+        self,
+    ) -> tuple[polars.DataFrame, polars.DataFrame, polars.DataFrame]:
+        """Assemble the stored optimization results into three dataframes.
+
+        Data is collected batch by batch from ``self._ever_storage.data.batches``.
+        The frames belonging to one batch are joined on their index columns and
+        the per-batch results are then concatenated diagonally.
+
+        Returns:
+            A tuple ``(combined_df, pert_real_df, batch_df)``:
+
+            * ``combined_df`` -- ``pert_real_df`` full-joined on ``batch_id``
+              with the batch-level data. Batch-level columns other than
+              ``batch_id`` and those starting with ``grad`` carry a ``batch_``
+              prefix to avoid name collisions.
+            * ``pert_real_df`` -- the realization rows and the perturbation
+              rows, concatenated diagonally.
+            * ``batch_df`` -- the batch-level objectives, constraints and
+              pivoted gradients (gradient columns are wrapped as ``grad(...)``).
+
+            In every returned frame the index columns come first, the remaining
+            columns are sorted by name, and rows are sorted by the index columns.
+        """
+        # Frames collected per batch id, one dict per level of detail.
+        batch_dfs_to_join: dict[int, list[polars.DataFrame]] = {}
+        realization_dfs_to_join: dict[int, list[polars.DataFrame]] = {}
+        perturbation_dfs_to_join: dict[int, list[polars.DataFrame]] = {}
+
+        batches = self._ever_storage.data.batches
+        batch_ids = [b.batch_id for b in batches]
+        all_controls = self._ever_storage.data.controls["control_name"].to_list()
+        # Built once; used for the subset checks when joining frames.
+        control_set = set(all_controls)
+
+        def _try_append_df(
+            batch_id: int,
+            df: polars.DataFrame | None,
+            target: dict[int, list[polars.DataFrame]],
+        ) -> None:
+            """Append ``df`` to ``target[batch_id]``; ``None`` is ignored."""
+            if df is not None:
+                # Key on the batch_id argument, not on the enclosing loop
+                # variable, so the helper does not depend on its call site.
+                target.setdefault(batch_id, []).append(df)
+
+        def try_append_batch_dfs(
+            batch_id: int, *dfs: polars.DataFrame | None
+        ) -> None:
+            for df_ in dfs:
+                _try_append_df(batch_id, df_, batch_dfs_to_join)
+
+        def try_append_realization_dfs(
+            batch_id: int, *dfs: polars.DataFrame | None
+        ) -> None:
+            for df_ in dfs:
+                _try_append_df(batch_id, df_, realization_dfs_to_join)
+
+        def try_append_perturbation_dfs(
+            batch_id: int, *dfs: polars.DataFrame | None
+        ) -> None:
+            for df_ in dfs:
+                _try_append_df(batch_id, df_, perturbation_dfs_to_join)
+
+        def pivot_gradient(df: polars.DataFrame) -> polars.DataFrame:
+            """Pivot a gradient frame to one column per control, per batch.
+
+            Every resulting column except ``batch_id`` and columns named exactly
+            like a control is renamed to ``grad(<column>)``.
+            """
+            pivoted_ = df.pivot(on="control_name", index="batch_id", separator=" wrt ")
+            return pivoted_.rename(
+                {
+                    col: f"grad({col})"
+                    for col in pivoted_.columns
+                    if col != "batch_id" and col not in control_set
+                }
+            )
+
+        for batch in batches:
+            try_append_perturbation_dfs(
+                batch.batch_id,
+                batch.perturbation_objectives,
+                batch.perturbation_constraints,
+            )
+
+            try_append_realization_dfs(
+                batch.batch_id,
+                batch.realization_objectives,
+                batch.realization_controls,
+                batch.realization_constraints,
+            )
+
+            # Gradients are appended before objectives/constraints: the first
+            # frame of a batch is the base that the others are joined onto.
+            if batch.batch_objective_gradient is not None:
+                try_append_batch_dfs(
+                    batch.batch_id, pivot_gradient(batch.batch_objective_gradient)
+                )
+
+            if batch.batch_constraint_gradient is not None:
+                try_append_batch_dfs(
+                    batch.batch_id,
+                    pivot_gradient(batch.batch_constraint_gradient),
+                )
+
+            try_append_batch_dfs(
+                batch.batch_id, batch.batch_objectives, batch.batch_constraints
+            )
+
+        def _join_by_batch(
+            dfs: dict[int, list[polars.DataFrame]], on: list[str]
+        ) -> list[polars.DataFrame]:
+            """
+            Creates one dataframe per batch, with one column per input/output,
+            including control, objective, constraint, gradient value.
+            """
+            dfs_to_concat_ = []
+            for batch_id in batch_ids:
+                if batch_id not in dfs:
+                    continue
+
+                batch_df_ = dfs[batch_id][0]
+                for bdf_ in dfs[batch_id][1:]:
+                    # When both sides already carry every control column, drop
+                    # them from the right-hand frame before joining.
+                    if control_set.issubset(bdf_.columns) and control_set.issubset(
+                        batch_df_.columns
+                    ):
+                        bdf_ = bdf_.drop(all_controls)
+
+                    batch_df_ = batch_df_.join(
+                        bdf_,
+                        on=on,
+                    )
+
+                dfs_to_concat_.append(batch_df_)
+
+            return dfs_to_concat_
+
+        # NOTE(review): polars.concat rejects an empty list, so the code below
+        # presumably assumes at least one batch contributed batch-level,
+        # realization and perturbation data -- confirm against callers.
+        batch_dfs_to_concat = _join_by_batch(batch_dfs_to_join, on=["batch_id"])
+        batch_df = polars.concat(batch_dfs_to_concat, how="diagonal")
+
+        realization_dfs_to_concat = _join_by_batch(
+            realization_dfs_to_join, on=["batch_id", "realization", "simulation_id"]
+        )
+        realization_df = polars.concat(realization_dfs_to_concat, how="diagonal")
+
+        perturbation_dfs_to_concat = _join_by_batch(
+            perturbation_dfs_to_join, on=["batch_id", "realization", "perturbation"]
+        )
+        perturbation_df = polars.concat(perturbation_dfs_to_concat, how="diagonal")
+
+        pert_real_df = polars.concat([realization_df, perturbation_df], how="diagonal")
+
+        # Move the index columns to the front; the remaining columns keep
+        # their existing (deterministic) order.
+        pert_real_index = ["batch_id", "realization", "perturbation"]
+        pert_real_df = pert_real_df.select(
+            *pert_real_index,
+            *[col for col in pert_real_df.columns if col not in pert_real_index],
+        )
+
+        # Avoid name collisions when joining with simulations
+        batch_df_renamed = batch_df.rename(
+            {
+                col: f"batch_{col}"
+                for col in batch_df.columns
+                if col != "batch_id" and not col.startswith("grad")
+            }
+        )
+        combined_df = pert_real_df.join(
+            batch_df_renamed, on="batch_id", how="full", coalesce=True
+        )
+
+        def _sort_df(df: polars.DataFrame, index: list[str]) -> polars.DataFrame:
+            """Put ``index`` columns first, sort the rest by name, sort rows by ``index``."""
+            sorted_cols = index + sorted(set(df.columns) - set(index))
+            return df.select(sorted_cols).sort(by=index)
+
+        return (
+            _sort_df(
+                combined_df,
+                ["batch_id", "realization", "simulation_id", "perturbation"],
+            ),
+            _sort_df(
+                pert_real_df,
+                [
+                    "batch_id",
+                    "realization",
+                    "perturbation",
+                    "simulation_id",
+                ],
+            ),
+            _sort_df(batch_df, ["batch_id", "total_objective_value"]),
+        )
+
+    @property
+    def everest_csv(self):
+        """Return the path of the CSV export, writing the file first if it is missing.
+
+        The file name is ``export.csv_output_filepath`` from the configuration
+        when an ``export`` section is set, otherwise ``<config_file>.csv``. It
+        is joined onto ``self.output_folder``. An already existing file is
+        returned as-is and is not regenerated.
+        """
+        export_config = self._config.export
+        if export_config is None:
+            csv_name = f"{self._config.config_file}.csv"
+        else:
+            csv_name = export_config.csv_output_filepath
+
+        csv_path = os.path.join(self.output_folder, csv_name)
+        if os.path.exists(csv_path):
+            return csv_path
+
+        # Only the combined frame (first element) is written to disk.
+        frames = self.export_dataframes()
+        frames[0].write_csv(csv_path)
+        return csv_path
diff --git a/tests/everest/snapshots/test_api_snapshots/test_csv_export/config_advanced.yml/batch_df.csv b/tests/everest/snapshots/test_api_snapshots/test_csv_export/config_advanced.yml/batch_df.csv
@@ -0,0 +1,7 @@
+batch_id,total_objective_value,distance,grad(distance wrt point_x-0),grad(distance wrt point_x-1),grad(distance wrt point_x-2),grad(distance.total wrt point_x-0),grad(distance.total wrt point_x-1),grad(distance.total wrt point_x-2),merit_value,point_x-0,point_x-1,point_x-2,x-0_coord,x-0_coord.violation
+0,-1.6875,-1.6875,-0.4908,-0.5019,0.5042,-0.4908,-0.5019,0.5042,,1.0,-0.0,-0.0,0.15,0.0
+1,-1.6928,-1.6928,-0.5221,-0.2858,0.6487,-0.5221,-0.2858,0.6487,389.8557,1.0,0.0,0.0001,0.159,0.0
+2,-1.7356,-1.7356,-0.6428,-0.0052,0.7315,-0.6428,-0.0052,0.7315,381.4612,1.0,0.0,0.0,0.2186,0.0
+3,-1.6237,-1.6237,-0.4728,-0.0785,0.5154,-0.4728,-0.0785,0.5154,29.5855,1.0,-0.0,0.0,0.1412,0.0
+4,-1.5398,-1.5398,-0.3037,0.0231,0.2583,-0.3037,0.0231,0.2583,13.247,1.0,0.0,-0.0,0.0509,0.0
+5,-1.5256,-1.5256,-0.2166,-0.0058,0.2172,-0.2166,-0.0058,0.2172,11.7503,1.0,-0.0,0.0,0.0145,0.0