Commit f7d1366

fixed types and device in new nplets dataset

Laouen committed Oct 16, 2024
1 parent 7b54360 commit f7d1366

Showing 10 changed files with 59 additions and 49 deletions.
22 changes: 11 additions & 11 deletions README.md
@@ -60,28 +60,28 @@ import numpy as np
 X = np.random.normal(0,1, (1000, 10))
-# Computation of O information for the entire system
+# Computation of O information for the nplet that considers all the variables of X
 measures = nplets_measures(X)
-# Computation of O info for the sub-system composed by 0, 1 and 3
-measures = nplets_measures(X, [0,1,3])
+# Computation of O info for a single nplet (it must be a list of nplets even if it is a single nplet)
+measures = nplets_measures(X, [[0,1,3]])
-# Computation of O info for the sub-system composed by 0, 1 and 3
+# Computation of O info for multiple nplets
 measures = nplets_measures(X, [[0,1,3],[3,7,4],[2,6,3]])
-# Extensive computation of O information measures over all combinations of X
+# Extensive computation of O information measures over all combinations of features in X
 measures = multi_order_measures(X)
-# compute the best 10 combinations using greedy, starting by exaustive search in
+# Compute the best 10 combinations of features (nplets) using greedy search, starting by exhaustive search in
 # lower order and building from there. Result shows best O information for
 # each built optimal orders
-best_partitions, best_scores = greedy(X, 3, 5, repeat=10)
+best_nplets, best_scores = greedy(X, 3, 5, repeat=10)
-# compute the best 10 combinations using simulated annealing: There are two initialization options
-# 1. Starting by exaustive search in lower order, then building with gready.
-# 2. Selection random sample of initial solutions.
+# Compute the best 10 combinations of features (nplets) using simulated annealing. There are two initialization options:
+# 1. Starting from a custom initial solution with shape (repeat, order) explicitly provided by the user.
+# 2. Selecting random samples of the given order.
 # Result shows best O information for each built optimal orders
-best_partitions, best_scores = simulated_annealing(X, 5, repeat=10)
+best_nplets, best_scores = simulated_annealing(X, 5, repeat=10)
 ```
 
 For detailed usage and examples, please refer to the [documentation](https://github.com/Laouen/THOI).
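Assembled from the + side of this hunk, the updated README example runs as below. The import paths are an assumption taken from the modules touched in this commit; the hunk itself does not show them.

```python
import numpy as np
# Assumed import locations, based on the files changed in this commit:
from thoi.measures.gaussian_copula import nplets_measures, multi_order_measures
from thoi.heuristics.greedy import greedy
from thoi.heuristics.simulated_annealing import simulated_annealing

X = np.random.normal(0, 1, (1000, 10))  # T=1000 samples, N=10 variables

measures = nplets_measures(X)                                     # one nplet covering all of X
measures = nplets_measures(X, [[0, 1, 3]])                        # a single nplet, wrapped in a list
measures = nplets_measures(X, [[0, 1, 3], [3, 7, 4], [2, 6, 3]])  # several nplets at once
measures = multi_order_measures(X)                                # exhaustive sweep over all orders

best_nplets, best_scores = greedy(X, 3, 5, repeat=10)
best_nplets, best_scores = simulated_annealing(X, 5, repeat=10)
```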
4 changes: 2 additions & 2 deletions tests/test_multiorder_measures.py
@@ -103,7 +103,7 @@ def test_multiorder_measures_precomputed_hot_encoded(self):
         T, N = self.X.shape
         covmat = gaussian_copula_covmat(self.X)
 
-        df_res = multi_order_measures_hot_encoded(covmat, batch_size=10000, use_cpu=True)
+        df_res = multi_order_measures_hot_encoded(covmat, batch_size=200000, use_cpu=True)
 
         dfs = []
         for order in sorted(df_res['order'].unique()):
@@ -122,7 +122,7 @@ def test_multiorder_measures_precomputed_hot_encoded(self):
         df_desc_order = df_desc_order.sort_index()
         df_stats_order = df_stats_order.sort_index()
 
-        self.assertTrue(np.allclose(df_desc_order.values, df_stats_order.values, atol=1e-6, equal_nan=True))
+        self.assertTrue(np.allclose(df_desc_order.values, df_stats_order.values, atol=1e-4, equal_nan=True))
 
     def test_multiple_times_same_datasets(self):
         # TODO: implement
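For reference, the relaxed check only widens the absolute term of np.allclose's per-element bound |a - b| <= atol + rtol * |b| (rtol defaults to 1e-5). A minimal illustration with hypothetical values:

```python
import numpy as np

a = np.array([1.0, 2.0])
b = a + 5e-5  # element-wise difference of 5e-5, larger than the old atol

np.allclose(a, b, atol=1e-6)  # False: 5e-5 exceeds 1e-6 + 1e-5 * |b|
np.allclose(a, b, atol=1e-4)  # True:  5e-5 is within 1e-4 + 1e-5 * |b|
```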
34 changes: 13 additions & 21 deletions thoi/commons.py
@@ -3,6 +3,8 @@
 import scipy as sp
 import torch
 
+from thoi.typing import TensorLikeArray
+
 
 def _get_string_metric(batched_res: np.ndarray, metric:str):
     '''
@@ -60,16 +62,15 @@ def _to_numpy(X):
         return X.detach().cpu().numpy()
     elif isinstance(X, np.ndarray):
         return X
-    else:
-        raise TypeError(f"Unsupported type: {type(X)}")
+    return np.array(X)
 
 def _get_device(use_cpu:bool=False):
     """Set the use of GPU if available"""
     using_GPU = torch.cuda.is_available() and not use_cpu
     device = torch.device('cuda' if using_GPU else 'cpu')
     return device
 
-def _normalize_input_data(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def _normalize_input_data(X: TensorLikeArray,
                           covmat_precomputed: bool=False,
                           T: Optional[Union[int, List[int]]]=None,
                           use_cpu: bool=False):
@@ -88,30 +89,21 @@ def _normalize_input_data(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
 
     # Handle different options for X parameter. Accept multivariate data or covariance matrix
     if covmat_precomputed:
-
-        if isinstance(X, (np.ndarray, torch.Tensor)):
-            assert X.shape[-2] == X.shape[-1], 'Covariance matrix should be square'
-            assert len(X.shape) in [2, 3], 'Covariance matrix should have dimensions (N, N) or (D, N, N)'
-            covmats = torch.as_tensor(X)
-            covmats = covmats.unsqueeze(0) if len(covmats.shape) == 2 else covmats
-        else:
-            assert all([len(x.shape) == 2 for x in X]), 'All covariance matrices should have dimensions (N, N)'
-            assert all([x.shape[0] == x.shape[1] == X[0].shape[0] for x in X]), 'All covariance matrices should have same dimensions (N, N)'
-            covmats = torch.stack([torch.as_tensor(x) for x in X])
+        covmats = torch.as_tensor(X)
+        covmats = covmats.unsqueeze(0) if len(covmats.shape) == 2 else covmats
+        assert covmats.shape[-2] == covmats.shape[-1], 'Covariance matrix should be square'
+        assert len(covmats.shape) == 3, 'Covariance matrix should have dimensions (N, N) or (D, N, N)'
     else:
-
-        if isinstance(X, (np.ndarray, torch.Tensor)):
+        try:
             X = _to_numpy(X)
             assert len(X.shape) in [2, 3], 'Input data should have dimensions (T, N) or (D, T, N)'
-            X = [X] if len(X.shape) == 2 else [X[i] for i in range(X.shape[0])]
-        else:
-            assert all([len(x.shape) == 2 for x in X]), 'All multivariate series should have dimensions (T, N) where T my vary and N be constant across all series'
-            assert all([x.shape[1] == X[0].shape[1] for x in X]), 'All multivariate series should have dimensions (T, N) where T my vary and N be constant across all series'
-            X = [_to_numpy(x) for x in X]
+            if len(X.shape) == 2:
+                X = [X]
+            else:
+                X = [X[i] for i in range(X.shape[0])]
+        except:
+            X = [_to_numpy(x) for x in X]
 
         covmats = torch.stack([torch.from_numpy(gaussian_copula_covmat(x)) for x in X])
         T = [x.shape[0] for x in X]
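The net effect of the _to_numpy change is duck typing: instead of rejecting anything that is not a tensor or ndarray, the helper now hands the input to np.array, which is what lets _normalize_input_data try the dense path first and fall back to a per-series list on failure. A standalone sketch mirroring the updated helper (the tensor branch's condition is inferred; the hunk only shows its body):

```python
import numpy as np
import torch

def _to_numpy(X):
    # Mirrors the updated thoi/commons.py helper: tensors are detached and
    # moved to CPU, ndarrays pass through, and any other sequence (lists,
    # tuples, nested lists) falls back to np.array instead of raising.
    if isinstance(X, torch.Tensor):
        return X.detach().cpu().numpy()
    elif isinstance(X, np.ndarray):
        return X
    return np.array(X)

_to_numpy([[0.1, 0.2], [0.3, 0.4]])  # plain nested list -> ndarray of shape (2, 2)
_to_numpy(torch.randn(5, 3))         # tensor -> ndarray of shape (5, 3)
```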
5 changes: 3 additions & 2 deletions thoi/heuristics/greedy.py
@@ -12,10 +12,11 @@
 
 @torch.no_grad()
 def greedy(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-           covmat_precomputed: bool=False,
-           T: Optional[Union[int, List[int]]]=None,
            initial_order: int=3,
            order: Optional[int]=None,
+           *,
+           covmat_precomputed: bool=False,
+           T: Optional[Union[int, List[int]]]=None,
            repeat: int=10,
            use_cpu: bool=False,
            batch_size: int=1000000,
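Since covmat_precomputed and T now sit behind a bare *, they become keyword-only while initial_order and order stay positional. A sketch of the call-site effect; the parameter names are grounded in the signature above, the data is hypothetical:

```python
import numpy as np
from thoi.heuristics.greedy import greedy

X = np.random.normal(0, 1, (1000, 10))

best_nplets, best_scores = greedy(X, 3, 5, repeat=10)  # initial_order=3, order=5

# A precomputed covariance must now be flagged by keyword:
covmat = np.cov(X, rowvar=False)
best_nplets, best_scores = greedy(covmat, 3, 5,
                                  covmat_precomputed=True, T=1000, repeat=10)

# Old-style positional calls such as greedy(covmat, True, 1000, 3, 5)
# now raise TypeError: only X, initial_order and order are positional.
```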
3 changes: 2 additions & 1 deletion thoi/heuristics/simulated_annealing.py
@@ -18,10 +18,11 @@ def random_sampler(N:int, order:int, repeat:int, device:Optional[torch.device]=None):
 
 @torch.no_grad()
 def simulated_annealing(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+                        order: Optional[int]=None,
+                        *,
                         covmat_precomputed: bool=False,
                         T: Optional[Union[int, List[int]]]=None,
                         initial_solution: Optional[torch.Tensor] = None,
-                        order: Optional[int]=None,
                         repeat: int = 10,
                         use_cpu: bool = False,
                         max_iterations: int = 1000,
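Here order moves up to be the second positional parameter, matching the README call simulated_annealing(X, 5, repeat=10), and everything after the bare * becomes keyword-only. A sketch; the initial_solution shape follows the README comment and is otherwise an assumption:

```python
import numpy as np
import torch
from thoi.heuristics.simulated_annealing import simulated_annealing

X = np.random.normal(0, 1, (1000, 10))

# order is now the second positional argument:
best_nplets, best_scores = simulated_annealing(X, 5, repeat=10)

# Hypothetical custom start: one candidate nplet per repeat, shape (repeat, order).
init = torch.stack([torch.randperm(10)[:5] for _ in range(10)])
best_nplets, best_scores = simulated_annealing(X, 5, repeat=10, initial_solution=init)
```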
1 change: 1 addition & 0 deletions thoi/heuristics/simulated_annealing_multi_order.py
@@ -28,6 +28,7 @@ def hot_encode_to_indexes(nplets):
 
 @torch.no_grad()
 def simulated_annealing_multi_order(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+                                    *,
                                     covmat_precomputed: bool=False,
                                     T: Optional[Union[int, List[int]]]=None,
                                     initial_solution: Optional[torch.Tensor] = None,
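simulated_annealing_multi_order gets the same treatment with a bare * directly after X, so every option becomes keyword-only. A sketch using only parameters visible in this hunk:

```python
import numpy as np
from thoi.heuristics.simulated_annealing_multi_order import simulated_annealing_multi_order

X = np.random.normal(0, 1, (1000, 10))

res = simulated_annealing_multi_order(X)                            # fine
res = simulated_annealing_multi_order(X, covmat_precomputed=False)  # fine, keyword
# simulated_annealing_multi_order(X, False)  # TypeError after this commit
```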
14 changes: 9 additions & 5 deletions thoi/measures/gaussian_copula.py
@@ -7,6 +7,7 @@
 import torch
 from torch.utils.data import DataLoader
 
+from thoi.typing import TensorLikeArray
 from thoi.commons import _normalize_input_data, _get_device
 from thoi.dataset import CovarianceDataset
 from thoi.collectors import batch_to_csv, concat_and_sort_csv
@@ -98,8 +99,8 @@ def _get_tc_dtc_from_batched_covmat(covmats: torch.Tensor, allmin1: torch.Tensor):
     return nplet_tc, nplet_dtc, nplet_o, nplet_s
 
 @torch.no_grad()
-def nplets_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-                    nplets: Optional[Union[np.ndarray,torch.Tensor]] = None,
+def nplets_measures(X: Union[TensorLikeArray],
+                    nplets: Optional[TensorLikeArray] = None,
                     covmat_precomputed: bool = False,
                     T: Optional[Union[int, List[int]]] = None,
                     use_cpu: bool = False):
@@ -164,7 +165,7 @@ def nplets_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
                         nplets_s.view(batch_size, D)], dim=-1)
 
 @torch.no_grad()
-def multi_order_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def multi_order_measures(X: TensorLikeArray,
                          covmat_precomputed: bool=False,
                          T: Optional[Union[int, List[int]]]=None,
                          min_order: int=3,
@@ -235,11 +236,14 @@ def multi_order_measures(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
     # calculate measurments for each batch
     for bn, nplets in enumerate(tqdm(dataloader, total=len(dataloader), leave=False, desc='Batch')):
         curr_batch_size = nplets.shape[0]
 
+        # Send nplets to the device in case it is not there
+        nplets = nplets.to(device)
+
         # Create the covariance matrices for each nplet in the batch
         # |curr_batch_size| x |D| x |N| x |N|
         nplets_covmats = _generate_nplets_covmants(covmats, nplets)
 
         # Pack covmats in a single batch
         # |curr_batch_size x D| x |N| x |N|
         nplets_covmats = nplets_covmats.view(curr_batch_size*D, order, order)
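Two small things happen in this hunk: nplets coming out of the DataLoader are moved to the compute device, and the (batch, D, N, N) covariance block is flattened so one batched computation covers every (nplet, dataset) pair. The shape bookkeeping in isolation, with hypothetical sizes:

```python
import torch

curr_batch_size, D, order = 128, 3, 5  # hypothetical sizes
nplets_covmats = torch.randn(curr_batch_size, D, order, order)

# Pack: |curr_batch_size x D| x |N| x |N|, as in the comment above.
packed = nplets_covmats.view(curr_batch_size * D, order, order)

# Batched measures run on `packed`; the view is reversible without copying:
unpacked = packed.view(curr_batch_size, D, order, order)
assert torch.equal(unpacked, nplets_covmats)
```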
7 changes: 4 additions & 3 deletions thoi/measures/gaussian_copula_hot_encoded.py
@@ -7,6 +7,7 @@
 import torch
 from torch.utils.data import DataLoader
 
+from thoi.typing import TensorLikeArray
 from thoi.dataset import HotEncodedMultiOrderDataset
 from thoi.collectors import batch_to_csv, concat_and_sort_csv
 from thoi.measures.utils import _all_min_1_ids, _gaussian_entropy_bias_correction, _gaussian_entropy_estimation, _get_single_exclusion_covmats
@@ -177,8 +178,8 @@ def _compute_nplets_measures_hot_encoded(covmats: torch.Tensor,
     )
 
 @torch.no_grad()
-def nplets_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
-                                nplets: Optional[Union[np.ndarray,torch.Tensor]] = None,
+def nplets_measures_hot_encoded(X: TensorLikeArray,
+                                nplets: Optional[TensorLikeArray] = None,
                                 covmat_precomputed: bool = False,
                                 T: Optional[int] = None,
                                 use_cpu: bool = False):
@@ -204,7 +205,7 @@ def nplets_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
 
 @torch.no_grad()
-def multi_order_measures_hot_encoded(X: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
+def multi_order_measures_hot_encoded(X: TensorLikeArray,
                                      covmat_precomputed: bool=False,
                                      T: Optional[int]=None,
                                      min_order: int=3,
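With TensorLikeArray in both signatures, plain Python lists should now be accepted for X and nplets alike. A minimal sketch, assuming this module path and that list inputs normalize as in thoi/commons.py:

```python
import numpy as np
from thoi.measures.gaussian_copula_hot_encoded import nplets_measures_hot_encoded

X = np.random.normal(0, 1, (1000, 10))
res = nplets_measures_hot_encoded(X, nplets=[[0, 1, 3], [2, 6, 3]])
```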
9 changes: 5 additions & 4 deletions thoi/measures/utils.py
@@ -5,7 +5,7 @@
 from thoi.measures.constants import TWOPIE
 
 
-def _all_min_1_ids(N, device=torch.device('cpu')):
+def _all_min_1_ids(N: int, device: torch.device=torch.device('cpu')):
     base_tensor = torch.arange(N, device=device).unsqueeze(0).repeat(N, 1) # Shape: (N, N)
     mask = base_tensor != torch.arange(N, device=device).unsqueeze(1) # Shape: (N, N)
     result = base_tensor[mask].view(N, N - 1) # Shape: (N, N-1)
@@ -28,20 +28,21 @@ def _get_single_exclusion_covmats(covmats: torch.Tensor, allmin1: torch.Tensor):
     batch_size, N, _ = covmats.shape
 
     # Step 1: Expand allmin1 to match the batch size
-    # Shape: (batch_size, N, N-1)
+    # |batch_size| |N| |N-1|
     allmin1_expanded = allmin1.unsqueeze(0).expand(batch_size, -1, -1)
 
     # Step 2: Expand covmats to include the N dimension for variable exclusion
-    # Shape: (batch_size, N, N, N)
+    # |batch_size| |N| |N| |N|
     covmats_expanded = covmats.unsqueeze(1).expand(-1, N, -1, -1)
 
     # Step 3: Gather the rows corresponding to the indices in allmin1
-    # Shape of indices_row: (batch_size, N, N-1, N)
+    # |batch_size| |N| |N-1| |N|
     indices_row = allmin1_expanded.unsqueeze(-1).expand(-1, -1, -1, N)
     gathered_rows = torch.gather(covmats_expanded, 2, indices_row)
 
     # Step 4: Gather the columns corresponding to the indices in allmin1
-    # Shape of indices_col: (batch_size, N, N-1, N-1)
+    # |batch_size| |N| |N-1| |N-1|
     indices_col = allmin1_expanded.unsqueeze(-2).expand(-1, -1, N-1, -1)
     covmats_sub = torch.gather(gathered_rows, 3, indices_col)
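Concretely, _all_min_1_ids builds the single-exclusion index table (row i lists every index except i), and the four gather steps are equivalent to slicing each covariance matrix down to its leave-one-out principal submatrices. A self-contained check of both facts:

```python
import torch

N = 4
base = torch.arange(N).unsqueeze(0).repeat(N, 1)
allmin1 = base[base != torch.arange(N).unsqueeze(1)].view(N, N - 1)
# allmin1 -> [[1, 2, 3], [0, 2, 3], [0, 1, 3], [0, 1, 2]]

covmats = torch.randn(2, N, N)
# Leave-one-out principal submatrices, |batch| |N| |N-1| |N-1|,
# computed by plain indexing for comparison with the gather pipeline:
subs = torch.stack([covmats[:, idx][:, :, idx] for idx in allmin1], dim=1)
assert subs.shape == (2, N, N - 1, N - 1)
```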
9 changes: 9 additions & 0 deletions thoi/typing.py
@@ -0,0 +1,9 @@
+from typing import Union, Sequence, Any
+import torch
+import numpy as np
+
+TensorLikeArray = Union[
+    torch.Tensor,
+    np.ndarray,
+    Sequence[Union[np.ndarray, Sequence[Any]]],
+]
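The alias is deliberately permissive: anything below satisfies it, which is what lets the signatures in this commit accept tensors, arrays, ragged lists of series, or nested Python lists.

```python
import numpy as np
import torch
from thoi.typing import TensorLikeArray

examples: list = [
    torch.randn(100, 4),                       # torch.Tensor
    np.zeros((100, 4)),                        # np.ndarray
    [np.zeros((80, 4)), np.zeros((120, 4))],   # sequence of ndarrays (T may vary)
    [[0, 1, 3], [2, 6, 3]],                    # plain nested lists
]
```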
