added parameter for metric and largest or smaller

Laouen · Jun 13, 2024 · 3c48845 · 3c48845
1 parent d9dac6a
commit 3c48845
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 59 deletions.
diff --git a/thoi/collectors.py b/thoi/collectors.py
@@ -68,13 +68,15 @@ def concatenate_csv(batched_dataframes, output_path, sep='\t'):
 
 
 def batch_to_tensor(partition_idxs: np.ndarray,
-                    nplets_o: np.ndarray,
-                    nplets_s: np.ndarray,
-                    nplets_tc: np.ndarray,
-                    nplets_dtc: np.ndarray,
+                    nplets_o:np.ndarray,
+                    nplets_s:np.ndarray,
+                    nplets_tc:np.ndarray,
+                    nplets_dtc:np.ndarray,
                     bn:int,
-                    only_synergestic: bool=False,
-                    top_k: Optional[int] = None):
+                    only_synergestic:bool=False,
+                    top_k:Optional[int] = None,
+                    metric:str = 'o',
+                    largest:bool = False):
 
     # If only_synergestic; remove nplets with nplet_o >= 0
     if only_synergestic:
@@ -86,7 +88,11 @@ def batch_to_tensor(partition_idxs: np.ndarray,
         partition_idxs = partition_idxs[to_keep.cpu()]
 
     if top_k is not None and len(nplets_o) > top_k:
-        _, indices = torch.topk(nplets_o, top_k, largest=False, sorted=True)
+
+        values_idxs = ['tc','dtc','o','s'].index(metric)
+        values = [nplets_tc, nplets_dtc, nplets_o, nplets_s][values_idxs]
+
+        _, indices = torch.topk(values, top_k, largest=largest, sorted=True)
 
         nplets_tc = nplets_tc[indices]
         nplets_dtc = nplets_dtc[indices]
@@ -104,7 +110,9 @@ def batch_to_tensor(partition_idxs: np.ndarray,
 
 
 def concat_tensors(batched_tensors: List[Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]],
-                   top_k: Optional[int] = None):
+                   top_k: Optional[int] = None,
+                   metric:str = 'o',
+                   largest:bool = False):
 
     nplets_tc = torch.cat([batch[0] for batch in batched_tensors])
     nplets_dtc = torch.cat([batch[1] for batch in batched_tensors])
@@ -113,7 +121,11 @@ def concat_tensors(batched_tensors: List[Tuple[torch.tensor, torch.tensor, torch
     partition_idxs = torch.cat([batch[4] for batch in batched_tensors])
 
     if top_k is not None and len(nplets_o) > top_k:
-        _, indices = torch.topk(nplets_o, top_k, largest=False, sorted=True)
+
+        values_idxs = ['tc','dtc','o','s'].index(metric)
+        values = [nplets_tc, nplets_dtc, nplets_o, nplets_s][values_idxs]
+
+        _, indices = torch.topk(values, top_k, largest=largest, sorted=True)
         nplets_tc = nplets_tc[indices]
         nplets_dtc = nplets_dtc[indices]
         nplets_o = nplets_o[indices]

diff --git a/thoi/heuristics/greedy.py b/thoi/heuristics/greedy.py
@@ -3,46 +3,43 @@
 import torch
 from functools import partial
 
-from thoi.measures.gaussian_copula import nplets_measures, gaussian_copula, multi_order_measures
+from thoi.measures.gaussian_copula import gaussian_copula, multi_order_measures
 from thoi.collectors import batch_to_tensor, concat_tensors
+from thoi.heuristics.scoring import _evaluate_nplet
 
 
-def _gc_oinfo(covmat: torch.tensor, T:int, batched_nplets: torch.tensor):
+def greedy(X:np.ndarray,
+           order:int,
+           initial_order:int=3,
+           repeat:int=10,
+           use_cpu:bool=False,
+           batch_size:int=1000000,
+           metric:str='o',
+           largest:bool=False):
 
-    """
-        X (torch.tensor): The covariance matrix with shape (n_variables, n_variables)
-        batched_nplets (torch.tensor): The nplets to calculate the inverse of the oinformation with shape (batch_size, order)
-    """
-    # |batch_size| x |4 = (tc, dtc, o, s)|
-    batched_res = nplets_measures(covmat, batched_nplets, T=T, covmat_precomputed=True)
-
-    # Return minus the o information score to make it an maximum optimization
-    # |batch_size|
-    return batched_res[:,2].flatten()
 
-
-def greedy(X:np.ndarray, order:int, initial_order:int=3, repeat:int=10, use_cpu:bool=False, batch_size:int=1000000):
+    assert metric in ['tc', 'dtc', 'o', 's'], f'metric must be one of tc, dtc, o or s. invalid value: {metric}'
 
     current_solution = multi_order_measures(
         X, initial_order, initial_order, batch_size=batch_size, use_cpu=use_cpu,
-        batch_data_collector=partial(batch_to_tensor, top_k=repeat),
-        batch_aggregation=partial(concat_tensors, top_k=repeat)
+        batch_data_collector=partial(batch_to_tensor, top_k=repeat, metric=metric, largest=largest),
+        batch_aggregation=partial(concat_tensors, top_k=repeat, metric=metric, largest=largest)
     )[-1]
 
     T, N = X.shape
 
     covmat = torch.tensor(gaussian_copula(X)[1])
 
-    # make device cpu if not cuda available or cuda if available
+    # Make device cpu if not cuda available or cuda if available
     using_GPU = torch.cuda.is_available() and not use_cpu
     device = torch.device('cuda' if using_GPU else 'cpu')
 
     covmat = covmat.to(device).contiguous()
     current_solution = current_solution.to(device).contiguous()
 
-    best_scores = [_gc_oinfo(covmat, T, current_solution)]
+    best_scores = [_evaluate_nplet(covmat, T, current_solution, metric)]
     for _ in trange(initial_order, order, leave=False, desc='Order'):
-        best_candidate, best_score = next_order_greedy(covmat, T, current_solution)
+        best_candidate, best_score = next_order_greedy(covmat, T, current_solution, metric, min)
         best_scores.append(best_score)
 
         current_solution = torch.cat((current_solution, best_candidate.unsqueeze(1)) , dim=1)
@@ -53,7 +50,11 @@ def greedy(X:np.ndarray, order:int, initial_order:int=3, repeat:int=10, use_cpu:
 
 def next_order_greedy(covmat: torch.tensor,
                       T: int,
-                      initial_solution: torch.tensor):
+                      initial_solution: torch.tensor,
+                      metric:str='o',
+                      largest:bool=False):
+
+    assert metric in ['tc', 'dtc', 'o', 's'], f'metric must be one of tc, dtc, o or s. invalid value: {metric}'
 
     # Get parameters attributes
     device = covmat.device
@@ -68,16 +69,19 @@ def next_order_greedy(covmat: torch.tensor,
         for b in torch.arange(batch_size, device=device)
     ]).contiguous()
 
-    # current_solution constructed by adding first element of valid_candidate to input solution
+    # Current_solution constructed by adding first element of valid_candidate to input solution
     current_solution = torch.cat((initial_solution, valid_candidates[:, [0]]) , dim=1)
 
-    # start best solution first_candidate
+    # Start best solution first_candidate
     # |batch_size| x |order+1|
     best_candidates = valid_candidates[:, 0]
     # |batch_size|
-    best_score = _gc_oinfo(covmat, T, current_solution)
+    best_score = _evaluate_nplet(covmat, T, current_solution, metric)
+
+    if not largest:
+        best_score = -best_score
 
-    # iterate candidate from 1 since candidate 0 is already in the initial best solution 
+    # Iterate candidate from 1 since candidate 0 is already in the initial best solution 
     for i_cand in trange(1,valid_candidates.shape[1], leave=False, desc=f'Candidates'):
 
         # Update current solution to the next candidate inplace to avoid memory overhead
@@ -86,11 +90,16 @@ def next_order_greedy(covmat: torch.tensor,
 
         # Calculate score of new solution
         # |batch_size|
-        new_score = _gc_oinfo(covmat, T, current_solution)
+        new_score = _evaluate_nplet(covmat, T, current_solution, metric)
+
+        # if minimizing, then return score to optimizing
+        if not largest:
+            new_score = -new_score
 
         # Determine if we should accept the new solution
+        # new_score is bigger (more optimal) than best_score
         # |batch_size|
-        new_global_maxima = new_score < best_score
+        new_global_maxima = new_score > best_score
 
         # update best solution based on accpetance criteria
         # |batch_size| x |order|
@@ -106,5 +115,9 @@ def next_order_greedy(covmat: torch.tensor,
             new_score,
             best_score
         )
+
+    # If minimizing, then return score to its real value
+    if not largest:
+        best_score = -best_score
 
     return best_candidates, best_score
diff --git a/thoi/heuristics/scoring.py b/thoi/heuristics/scoring.py
@@ -0,0 +1,24 @@
+import torch
+from thoi.measures.gaussian_copula import nplets_measures
+
+
+def _evaluate_nplet(covmat: torch.tensor, T:int, batched_nplets: torch.tensor, metric:str):
+
+    """
+        X (torch.tensor): The covariance matrix with shape (n_variables, n_variables)
+        batched_nplets (torch.tensor): The nplets to calculate the inverse of the oinformation with shape (batch_size, order)
+        metric (str): The metric to evaluate. One of tc, dtc, o or s
+    """
+
+    # get the metric index to get from the result of nplets
+    METRICS = ['tc', 'dtc', 'o', 's']
+    metric_idx = METRICS.index(metric)
+
+    # |batch_size| x |4 = (tc, dtc, o, s)|
+    batched_res = nplets_measures(covmat, batched_nplets, T=T, covmat_precomputed=True)
+
+    # Return minus the o information score to make it an maximum optimization (energy)
+    # |batch_size|
+    res = batched_res[:,metric_idx].flatten()
+
+    return res
diff --git a/thoi/heuristics/simulated_annealing.py b/thoi/heuristics/simulated_annealing.py
@@ -4,17 +4,18 @@
 import torch
 from functools import partial
 
-from thoi.measures.gaussian_copula import nplets_measures, gaussian_copula, multi_order_measures
+from thoi.measures.gaussian_copula import gaussian_copula, multi_order_measures
 from thoi.collectors import batch_to_tensor, concat_tensors
+from thoi.heuristics.scoring import _evaluate_nplet
 
-def init_lower_order(X: np.ndarray, order:int, lower_order:int, repeat:int, use_cpu:bool, device:torch.device):
+def init_lower_order(X: np.ndarray, order:int, lower_order:int, repeat:int, metric:str, largest:bool, use_cpu:bool, device:torch.device):
     N = X.shape[1]
 
     # |repeat| x |lower_order|
     current_solution = multi_order_measures(
         X, lower_order, lower_order, batch_size=repeat, use_cpu=use_cpu,
-        batch_data_collector=partial(batch_to_tensor, top_k=repeat),
-        batch_aggregation=partial(concat_tensors, top_k=repeat)
+        batch_data_collector=partial(batch_to_tensor, top_k=repeat, metric=metric, largest=largest),
+        batch_aggregation=partial(concat_tensors, top_k=repeat, metric=metric, largest=largest)
     )[-1].to(device)
 
     # |N|
@@ -35,6 +36,7 @@ def init_lower_order(X: np.ndarray, order:int, lower_order:int, repeat:int, use_
     # |repeat| x |order|
     return torch.cat((current_solution, valid_candidates) , dim=1).contiguous()
 
+
 def random_sampler(N:int, order:int, repeat:int, device:torch.device=None):
 
     if device is None:
@@ -46,21 +48,6 @@ def random_sampler(N:int, order:int, repeat:int, device:torch.device=None):
     ])
 
 
-def _gc_oinfo_energy(covmat: torch.tensor, T:int, batched_nplets: torch.tensor):
-
-    """
-        X (torch.tensor): The covariance matrix with shape (n_variables, n_variables)
-        batched_nplets (torch.tensor): The nplets to calculate the inverse of the oinformation with shape (batch_size, order)
-    """
-    # |batch_size| x |4 = (tc, dtc, o, s)|
-    batched_res = nplets_measures(covmat, batched_nplets, T=T, covmat_precomputed=True)
-
-    # Return minus the o information score to make it an maximum optimization (energy)
-    # |batch_size|
-    return -batched_res[:,2].flatten()
-
-
-# TODO: add optimization value option as parameter
 def simulated_annealing(X: np.ndarray, 
                         order: int,
                         initial_temp:float=100.0,
@@ -71,11 +58,15 @@ def simulated_annealing(X: np.ndarray,
                         init_method:str='random', # lower_order, 'random', 'precumputed', 'precomputed_lower_order';
                         lower_order:int=None,
                         early_stop:int=100,
-                        current_solution: Optional[torch.tensor]=None):
+                        current_solution: Optional[torch.tensor]=None,
+                        metric:str='o', # tc, dtc, o, s
+                        largest:bool=False):
 
     lower_order = order-1 if lower_order is None else lower_order
     assert init_method != 'lower_order' or lower_order < order, 'Init from optima lower order cannot start from a lower_order higher than the order to compute.' 
 
+    assert metric in ['tc', 'dtc', 'o', 's'], f'metric must be one of tc, dtc, o or s. invalid value: {metric}'
+
     # make device cpu if not cuda available or cuda if available
     using_GPU = torch.cuda.is_available() and not use_cpu
     device = torch.device('cuda' if using_GPU else 'cpu')
@@ -85,17 +76,20 @@ def simulated_annealing(X: np.ndarray,
     covmat = torch.tensor(gaussian_copula(X)[1])
     covmat = covmat.to(device).contiguous()
 
-    # Initialize random solution
+    # Compute initial solution
     # |batch_size| x |order|
     if init_method == 'random':
         current_solution = random_sampler(N, order, repeat, device)
     elif init_method == 'lower_order':
-        current_solution = init_lower_order(X, order, lower_order, repeat, use_cpu, device)
+        current_solution = init_lower_order(X, order, lower_order, repeat, metric, largest, use_cpu, device)
     elif init_method == 'precomputed':
         assert current_solution is not None, 'current_solution must be a torch tensor'
 
     # |batch_size|
-    current_energy = _gc_oinfo_energy(covmat, T, current_solution)
+    current_energy = _evaluate_nplet(covmat, T, current_solution, metric)
+
+    if not largest:
+        current_energy = -current_energy
 
     # Initial valid candidates
     all_elements = torch.arange(N, device=device)
@@ -132,7 +126,10 @@ def simulated_annealing(X: np.ndarray,
 
         # Calculate energy of new solution
         # |batch_size|
-        new_energy = _gc_oinfo_energy(covmat, T, new_solution)
+        new_energy = _evaluate_nplet(covmat, T, new_solution, metric)
+
+        if not largest:
+            new_energy = -new_energy
 
         # Calculate change in energy
         # delca_energy > 0 means new_energy is bigger (more optimal) than current_energy
@@ -196,4 +193,8 @@ def simulated_annealing(X: np.ndarray,
             print('Early stop reached')
             break
 
+    # If minimizing, then return score to its real value
+    if not largest:
+        best_energy = -best_energy
+
     return best_solution, best_energy