From 71fbea062a1f5b8857498d6278e431f17308c381 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Mon, 11 Mar 2024 22:01:04 +0900 Subject: [PATCH 01/48] feat(skorch): add prototype of an inherited class from skorch.NeuralNet that is compatible with PyTorch Frame --- torch_frame/utils/skorch.py | 146 ++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 torch_frame/utils/skorch.py diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py new file mode 100644 index 000000000..d54bef399 --- /dev/null +++ b/torch_frame/utils/skorch.py @@ -0,0 +1,146 @@ +from skorch import NeuralNetClassifier, NeuralNet +from skorch.dataset import Dataset as SkorchDataset +import torch.nn as nn +import torch_frame +from torch_frame.data.tensor_frame import TensorFrame +from torch_frame.utils import infer_df_stype +from torch_frame.data.dataset import DataFrameToTensorFrameConverter, Dataset +from torch_frame.data.loader import DataLoader +import torch +from torch_frame.typing import IndexSelectType +from torch import Tensor +from pandas import DataFrame +from typing import Any +import pandas as pd +from numpy.typing import ArrayLike +from torch_frame.config import ( + ImageEmbedderConfig, + TextEmbedderConfig, + TextTokenizerConfig, +) + +class NeuralNetPytorchFrameDataLoader(DataLoader): + def __init__( + self, dataset: Dataset | TensorFrame, *args, device: torch.device, **kwargs + ): + super().__init__(dataset, *args, **kwargs) + self.device = device + + def collate_fn(self, index: IndexSelectType) -> tuple[TensorFrame, Tensor | None]: + index = torch.tensor(index) + res = super().collate_fn(index).to(self.device) + return res, res.y + + +class NeuralNetPytorchFrame(NeuralNet): + def __init__( + self, + # NeuralNet parameters + module, + criterion, + optimizer=torch.optim.SGD, + lr=0.01, + max_epochs=10, + batch_size=128, + iterator_train=..., + iterator_valid=..., + dataset=..., + train_split=..., + callbacks=None, + predict_nonlinearity="auto", + warm_start=False, + verbose=1, + device="cpu", + compile=False, + use_caching="auto", + # torch_frame.Dataset parameters + col_to_stype: dict[str, torch_frame.stype] | None = None, + target_col: str | None = None, + split_col: str | None = None, + col_to_sep: str | None | dict[str, str | None] = None, + col_to_text_embedder_cfg: dict[str, TextEmbedderConfig] + | TextEmbedderConfig | None = None, + col_to_text_tokenizer_cfg: dict[str, TextTokenizerConfig] + | TextTokenizerConfig | None = None, + col_to_image_embedder_cfg: dict[str, ImageEmbedderConfig] + | ImageEmbedderConfig | None = None, + col_to_time_format: str | None | dict[str, str | None] = None, + # other NeuralNet parameters + **kwargs, + ): + super().__init__( + module=module, + criterion=criterion, + optimizer=optimizer, + lr=lr, + max_epochs=max_epochs, + batch_size=batch_size, + iterator_train=self.iterator_train_valid, # changed + iterator_valid=self.iterator_train_valid, # changed + dataset=self.create_dataset, # changed + train_split=self.split_dataset, # changed + callbacks=callbacks, + predict_nonlinearity=predict_nonlinearity, + warm_start=warm_start, + verbose=verbose, + device=device, + compile=compile, + use_caching=use_caching, + **kwargs, + ) + self.col_to_stype = col_to_stype + self.target_col = target_col + self.split_col = split_col + self.col_to_sep = col_to_sep + self.col_to_text_embedder_cfg = col_to_text_embedder_cfg + self.col_to_text_tokenizer_cfg = col_to_text_tokenizer_cfg + self.col_to_image_embedder_cfg = col_to_image_embedder_cfg + self.col_to_time_format = col_to_time_format + + def create_dataset(self, df: DataFrame, _: Any) -> Dataset: + dataset_ = Dataset( + df, + self.dataset_.col_to_stype, + split_col=self.dataset_.split_col, + target_col=self.dataset_.target_col, + col_to_sep=self.dataset_.col_to_sep, + col_to_text_embedder_cfg=self.dataset_.col_to_text_embedder_cfg, + col_to_text_tokenizer_cfg=self.dataset_.col_to_text_tokenizer_cfg, + col_to_image_embedder_cfg=self.dataset_.col_to_image_embedder_cfg, + col_to_time_format=self.dataset_.col_to_time_format, + ) + dataset_.materialize() + return dataset_ + + def split_dataset(self, dataset: Dataset) -> tuple[TensorFrame, TensorFrame]: + datasets = dataset.split()[:2] + return datasets[0].tensor_frame, datasets[1].tensor_frame + + def iterator_train_valid(self, dataset: Dataset, **kwargs: Any) -> DataLoader: + return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, **kwargs) + + def fit(self, X: Dataset | DataFrame, y: ArrayLike | None=None, **fit_params): + if isinstance(X, DataFrame): + if y is not None: + X["target_col"] = y + self.dataset_ = Dataset( + X, + self.col_to_stype or infer_df_stype(X), + split_col=self.split_col, + target_col=self.target_col, + col_to_sep=self.col_to_sep, + col_to_text_embedder_cfg=self.col_to_text_embedder_cfg, + col_to_text_tokenizer_cfg=self.col_to_text_tokenizer_cfg, + col_to_image_embedder_cfg=self.col_to_image_embedder_cfg, + col_to_time_format=self.col_to_time_format, + ) + else: + self.dataset_ = X + return super().fit(self.dataset_.df, None, **fit_params) + +# TODO: make this behave more like NeuralNetClassifier +class NeuralNetClassifierPytorchFrame(NeuralNetPytorchFrame): + def fit(self, X: Dataset | DataFrame, y: ArrayLike | None=None, **fit_params): + fit_result = super().fit(X, y, **fit_params) + self.classes = self.dataset_.df["target_col"].unique() + return fit_result \ No newline at end of file From b8e8ae4adef7dfae1dc80744b2273a7687782a71 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Mon, 11 Mar 2024 22:01:24 +0900 Subject: [PATCH 02/48] docs: add tutorial for the last commit --- examples/tutorial.py | 48 ++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/examples/tutorial.py b/examples/tutorial.py index 1b58b0e3a..3d6ad57eb 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -34,6 +34,7 @@ parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--seed', type=int, default=0) +parser.add_argument("--framework", type=str, default="torch") args = parser.parse_args() torch.manual_seed(args.seed) @@ -223,7 +224,7 @@ def train(epoch: int) -> float: model.train() loss_accum = total_count = 0 - for tf in tqdm(train_loader, desc=f'Epoch: {epoch}'): + for tf in tqdm(train_loader, desc=f"Epoch: {epoch}"): tf = tf.to(device) pred = model(tf) loss = F.cross_entropy(pred, tf.y) @@ -250,17 +251,34 @@ def test(loader: DataLoader) -> float: return accum / total_count -best_val_acc = 0 -best_test_acc = 0 -for epoch in range(1, args.epochs + 1): - train_loss = train(epoch) - train_acc = test(train_loader) - val_acc = test(val_loader) - test_acc = test(test_loader) - if best_val_acc < val_acc: - best_val_acc = val_acc - best_test_acc = test_acc - print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, ' - f'Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}') - -print(f'Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}') +if args.framework == "torch": + best_val_acc = 0 + best_test_acc = 0 + for epoch in range(1, args.epochs + 1): + train_loss = train(epoch) + train_acc = test(train_loader) + val_acc = test(val_loader) + test_acc = test(test_loader) + if best_val_acc < val_acc: + best_val_acc = val_acc + best_test_acc = test_acc + print( + f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, " + f"Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}" + ) + + print(f"Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}") +elif args.framework == "skorch": + from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame + import torch.nn as nn + + net = NeuralNetClassifierPytorchFrame( + module=model, + criterion=nn.CrossEntropyLoss, + max_epochs=args.epochs, + lr=args.lr, + device=device, + verbose=1, + batch_size=args.batch_size, + ) + net.fit(dataset) From df8ecc462a95f1a7fda7b74a6e49fcd758fb0c69 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:06:54 +0000 Subject: [PATCH 03/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/tutorial.py | 13 ++++----- torch_frame/utils/skorch.py | 53 ++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/examples/tutorial.py b/examples/tutorial.py index 3d6ad57eb..9db4b6bb1 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -262,16 +262,17 @@ def test(loader: DataLoader) -> float: if best_val_acc < val_acc: best_val_acc = val_acc best_test_acc = test_acc - print( - f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, " - f"Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}" - ) + print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, " + f"Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}") - print(f"Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}") + print( + f"Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}" + ) elif args.framework == "skorch": - from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame import torch.nn as nn + from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame + net = NeuralNetClassifierPytorchFrame( module=model, criterion=nn.CrossEntropyLoss, diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index d54bef399..44c3cf2a7 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -1,32 +1,35 @@ -from skorch import NeuralNetClassifier, NeuralNet -from skorch.dataset import Dataset as SkorchDataset -import torch.nn as nn -import torch_frame -from torch_frame.data.tensor_frame import TensorFrame -from torch_frame.utils import infer_df_stype -from torch_frame.data.dataset import DataFrameToTensorFrameConverter, Dataset -from torch_frame.data.loader import DataLoader -import torch -from torch_frame.typing import IndexSelectType -from torch import Tensor -from pandas import DataFrame from typing import Any + import pandas as pd +import torch +import torch.nn as nn from numpy.typing import ArrayLike +from pandas import DataFrame +from skorch import NeuralNet, NeuralNetClassifier +from skorch.dataset import Dataset as SkorchDataset +from torch import Tensor + +import torch_frame from torch_frame.config import ( ImageEmbedderConfig, TextEmbedderConfig, TextTokenizerConfig, ) +from torch_frame.data.dataset import DataFrameToTensorFrameConverter, Dataset +from torch_frame.data.loader import DataLoader +from torch_frame.data.tensor_frame import TensorFrame +from torch_frame.typing import IndexSelectType +from torch_frame.utils import infer_df_stype + class NeuralNetPytorchFrameDataLoader(DataLoader): - def __init__( - self, dataset: Dataset | TensorFrame, *args, device: torch.device, **kwargs - ): + def __init__(self, dataset: Dataset | TensorFrame, *args, + device: torch.device, **kwargs): super().__init__(dataset, *args, **kwargs) self.device = device - def collate_fn(self, index: IndexSelectType) -> tuple[TensorFrame, Tensor | None]: + def collate_fn( + self, index: IndexSelectType) -> tuple[TensorFrame, Tensor | None]: index = torch.tensor(index) res = super().collate_fn(index).to(self.device) return res, res.y @@ -112,14 +115,18 @@ def create_dataset(self, df: DataFrame, _: Any) -> Dataset: dataset_.materialize() return dataset_ - def split_dataset(self, dataset: Dataset) -> tuple[TensorFrame, TensorFrame]: + def split_dataset(self, + dataset: Dataset) -> tuple[TensorFrame, TensorFrame]: datasets = dataset.split()[:2] return datasets[0].tensor_frame, datasets[1].tensor_frame - def iterator_train_valid(self, dataset: Dataset, **kwargs: Any) -> DataLoader: - return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, **kwargs) + def iterator_train_valid(self, dataset: Dataset, + **kwargs: Any) -> DataLoader: + return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, + **kwargs) - def fit(self, X: Dataset | DataFrame, y: ArrayLike | None=None, **fit_params): + def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, + **fit_params): if isinstance(X, DataFrame): if y is not None: X["target_col"] = y @@ -138,9 +145,11 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None=None, **fit_params): self.dataset_ = X return super().fit(self.dataset_.df, None, **fit_params) + # TODO: make this behave more like NeuralNetClassifier class NeuralNetClassifierPytorchFrame(NeuralNetPytorchFrame): - def fit(self, X: Dataset | DataFrame, y: ArrayLike | None=None, **fit_params): + def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, + **fit_params): fit_result = super().fit(X, y, **fit_params) self.classes = self.dataset_.df["target_col"].unique() - return fit_result \ No newline at end of file + return fit_result From ca95b8f2582952e47d292a47cfbe6ca631a3efeb Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Mon, 11 Mar 2024 22:19:09 +0900 Subject: [PATCH 04/48] fix: patch `skorch.utils.to_tensor()` --- torch_frame/utils/skorch.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 44c3cf2a7..a6300992e 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -1,3 +1,17 @@ +import skorch.utils + +# TODO: make it more safe +old_to_tensor = skorch.utils.to_tensor + +def to_tensor(X, device, accept_sparse=False): + if isinstance(X, TensorFrame): + return X + return old_to_tensor(X, device, accept_sparse) + +skorch.utils.to_tensor = to_tensor +import importlib +importlib.reload(skorch.net) + from typing import Any import pandas as pd From 0b9426f262ec64d03ee0a9c87ef8f3119672117d Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 12:52:07 +0900 Subject: [PATCH 05/48] style: format code --- torch_frame/utils/skorch.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index a6300992e..d16f1f425 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -1,26 +1,11 @@ -import skorch.utils - -# TODO: make it more safe -old_to_tensor = skorch.utils.to_tensor - -def to_tensor(X, device, accept_sparse=False): - if isinstance(X, TensorFrame): - return X - return old_to_tensor(X, device, accept_sparse) - -skorch.utils.to_tensor = to_tensor import importlib -importlib.reload(skorch.net) - from typing import Any -import pandas as pd +import skorch.utils import torch -import torch.nn as nn from numpy.typing import ArrayLike from pandas import DataFrame -from skorch import NeuralNet, NeuralNetClassifier -from skorch.dataset import Dataset as SkorchDataset +from skorch import NeuralNet from torch import Tensor import torch_frame @@ -29,12 +14,26 @@ def to_tensor(X, device, accept_sparse=False): TextEmbedderConfig, TextTokenizerConfig, ) -from torch_frame.data.dataset import DataFrameToTensorFrameConverter, Dataset +from torch_frame.data.dataset import Dataset from torch_frame.data.loader import DataLoader from torch_frame.data.tensor_frame import TensorFrame from torch_frame.typing import IndexSelectType from torch_frame.utils import infer_df_stype +# TODO: make it more safe +old_to_tensor = skorch.utils.to_tensor + + +def to_tensor(X, device, accept_sparse=False): + if isinstance(X, TensorFrame): + return X + return old_to_tensor(X, device, accept_sparse) + + +skorch.utils.to_tensor = to_tensor + +importlib.reload(skorch.net) + class NeuralNetPytorchFrameDataLoader(DataLoader): def __init__(self, dataset: Dataset | TensorFrame, *args, @@ -42,7 +41,7 @@ def __init__(self, dataset: Dataset | TensorFrame, *args, super().__init__(dataset, *args, **kwargs) self.device = device - def collate_fn( + def collate_fn( # type: ignore self, index: IndexSelectType) -> tuple[TensorFrame, Tensor | None]: index = torch.tensor(index) res = super().collate_fn(index).to(self.device) From 198b749a50e595ad952e6b906af7bd53dd0af48a Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 13:54:02 +0900 Subject: [PATCH 06/48] feat: fix multiple issues, support sklearn-like datasets and predict() --- examples/tutorial.py | 36 +++++++++++++++++++++++++ test/nn/models/test_resnet.py | 6 ++--- test/utils/test_skorch.py | 0 torch_frame/utils/skorch.py | 51 ++++++++++++++++++++++++++++------- 4 files changed, 81 insertions(+), 12 deletions(-) create mode 100644 test/utils/test_skorch.py diff --git a/examples/tutorial.py b/examples/tutorial.py index 9db4b6bb1..b573b4d0f 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -283,3 +283,39 @@ def test(loader: DataLoader) -> float: batch_size=args.batch_size, ) net.fit(dataset) + y_pred = net.predict(test_dataset) + test_acc = (torch.Tensor(y_pred).argmax( + dim=-1) == test_tensor_frame.y).float().mean() + print(f"Test Acc: {test_acc:.4f}") +elif args.framework == "skorch-dataframe": + import pandas as pd + import torch.nn as nn + + from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame + + df = dataset.df + df_train = pd.concat([train_dataset.df, val_dataset.df]) + X_train, y_train = df_train.drop( + columns=[dataset.target_col, dataset.split_col]), df_train[ + dataset.target_col] + df_test = test_dataset.df + X_test, y_test = df_test.drop( + columns=[dataset.target_col, dataset.split_col]), df_test[ + dataset.target_col] + + # use DataFrames with no `split_col` or `target_col` + # like normal sklearn datasets from now on + net = NeuralNetClassifierPytorchFrame( + module=model, + criterion=nn.CrossEntropyLoss, + max_epochs=args.epochs, + lr=args.lr, + device=device, + verbose=1, + col_to_stype={"C_feature_7": stype.categorical}, + batch_size=args.batch_size, + ) + net.fit(X_train, y_train) + y_pred = net.predict(X_test) + test_acc = (y_pred.argmax(-1) == y_test).mean() + print(f"Test Acc: {test_acc:.4f}") diff --git a/test/nn/models/test_resnet.py b/test/nn/models/test_resnet.py index 24be0df67..6a67a7338 100644 --- a/test/nn/models/test_resnet.py +++ b/test/nn/models/test_resnet.py @@ -2,11 +2,11 @@ from torch_frame.data.dataset import Dataset from torch_frame.datasets import FakeDataset -from torch_frame.nn import ResNet +from torch_frame.nn import MLP @pytest.mark.parametrize('batch_size', [0, 5]) -def test_resnet(batch_size): +def test_mlp(batch_size): channels = 8 out_channels = 1 num_layers = 3 @@ -14,7 +14,7 @@ def test_resnet(batch_size): dataset.materialize() tensor_frame = dataset.tensor_frame[:batch_size] # Feature-based embeddings - model = ResNet( + model = MLP( channels=channels, out_channels=out_channels, num_layers=num_layers, diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py new file mode 100644 index 000000000..e69de29bb diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index d16f1f425..660014961 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -3,8 +3,9 @@ import skorch.utils import torch -from numpy.typing import ArrayLike +from numpy.typing import ArrayLike, NDArray from pandas import DataFrame +from sklearn.model_selection import train_test_split from skorch import NeuralNet from torch import Tensor @@ -58,10 +59,10 @@ def __init__( lr=0.01, max_epochs=10, batch_size=128, - iterator_train=..., - iterator_valid=..., - dataset=..., - train_split=..., + iterator_train=None, + iterator_valid=None, + dataset=None, + train_split=None, callbacks=None, predict_nonlinearity="auto", warm_start=False, @@ -71,8 +72,8 @@ def __init__( use_caching="auto", # torch_frame.Dataset parameters col_to_stype: dict[str, torch_frame.stype] | None = None, - target_col: str | None = None, - split_col: str | None = None, + target_col: str | None = "target_col", + split_col: str | None = "split_col", col_to_sep: str | None | dict[str, str | None] = None, col_to_text_embedder_cfg: dict[str, TextEmbedderConfig] | TextEmbedderConfig | None = None, @@ -112,6 +113,9 @@ def __init__( self.col_to_text_tokenizer_cfg = col_to_text_tokenizer_cfg self.col_to_image_embedder_cfg = col_to_image_embedder_cfg self.col_to_time_format = col_to_time_format + # save dataset for partial_fit + self.train_split_original = train_split or ( + lambda x: train_test_split(x, test_size=0.2)) def create_dataset(self, df: DataFrame, _: Any) -> Dataset: dataset_ = Dataset( @@ -142,10 +146,18 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, **fit_params): if isinstance(X, DataFrame): if y is not None: - X["target_col"] = y + X[self.target_col] = y + if self.split_col not in X: + X_train, X_val = self.train_split_original(X, **fit_params) + # if index is in X_train, 0, otherwise 1 + X[self.split_col] = (X.index.isin(X_train.index)).astype(int) self.dataset_ = Dataset( X, - self.col_to_stype or infer_df_stype(X), + { + k: v + for k, v in infer_df_stype(X).items() + if k not in (self.split_col, ) + } | (self.col_to_stype or {}), split_col=self.split_col, target_col=self.target_col, col_to_sep=self.col_to_sep, @@ -158,6 +170,27 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, self.dataset_ = X return super().fit(self.dataset_.df, None, **fit_params) + def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: + if isinstance(X, DataFrame): + self.dataset_ = Dataset( + X, + { + k: v + for k, v in self.dataset_.col_to_stype.items() + if k not in (self.target_col, ) + }, + split_col=None, + target_col=None, + col_to_sep=self.col_to_sep, + col_to_text_embedder_cfg=self.col_to_text_embedder_cfg, + col_to_text_tokenizer_cfg=self.col_to_text_tokenizer_cfg, + col_to_image_embedder_cfg=self.col_to_image_embedder_cfg, + col_to_time_format=self.col_to_time_format, + ) + else: + self.dataset_ = X + return super().predict(self.dataset_.df) + # TODO: make this behave more like NeuralNetClassifier class NeuralNetClassifierPytorchFrame(NeuralNetPytorchFrame): From d26448843897f0ae83cc5372ccd39aca2522b33f Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:07:26 +0900 Subject: [PATCH 07/48] chore(example): test with regression as well --- examples/revisiting.py | 113 +++++++++++++++++++++++++++++++---------- examples/tutorial.py | 3 +- 2 files changed, 88 insertions(+), 28 deletions(-) diff --git a/examples/revisiting.py b/examples/revisiting.py index 7a8eef6bd..3030e1fea 100644 --- a/examples/revisiting.py +++ b/examples/revisiting.py @@ -46,6 +46,8 @@ parser.add_argument('--epochs', type=int, default=100) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--compile', action='store_true') +parser.add_argument("--framework", type=str, default="torch", + choices=["torch", "skorch-dataframe"]) args = parser.parse_args() torch.manual_seed(args.seed) @@ -156,30 +158,87 @@ def test(loader: DataLoader) -> float: return rmse -if is_classification: - metric = 'Acc' - best_val_metric = 0 - best_test_metric = 0 -else: - metric = 'RMSE' - best_val_metric = float('inf') - best_test_metric = float('inf') - -for epoch in range(1, args.epochs + 1): - train_loss = train(epoch) - train_metric = test(train_loader) - val_metric = test(val_loader) - test_metric = test(test_loader) - - if is_classification and val_metric > best_val_metric: - best_val_metric = val_metric - best_test_metric = test_metric - elif not is_classification and val_metric < best_val_metric: - best_val_metric = val_metric - best_test_metric = test_metric - - print(f'Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, ' - f'Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}') - -print(f'Best Val {metric}: {best_val_metric:.4f}, ' - f'Best Test {metric}: {best_test_metric:.4f}') +if args.framework == "torch": + if is_classification: + metric = 'Acc' + best_val_metric = 0 + best_test_metric = 0 + else: + metric = 'RMSE' + best_val_metric = float('inf') + best_test_metric = float('inf') + + for epoch in range(1, args.epochs + 1): + train_loss = train(epoch) + train_metric = test(train_loader) + val_metric = test(val_loader) + test_metric = test(test_loader) + + if is_classification and val_metric > best_val_metric: + best_val_metric = val_metric + best_test_metric = test_metric + elif not is_classification and val_metric < best_val_metric: + best_val_metric = val_metric + best_test_metric = test_metric + + print( + f'Train Loss: {train_loss:.4f}, ' + f'Train {metric}: {train_metric:.4f}, ' + f'Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}' + ) + + print(f'Best Val {metric}: {best_val_metric:.4f}, ' + f'Best Test {metric}: {best_test_metric:.4f}') +elif args.framework == "skorch-dataframe": + import numpy as np + import pandas as pd + import torch.nn as nn + + from torch_frame.utils.skorch import ( + NeuralNetClassifierPytorchFrame, + NeuralNetPytorchFrame, + ) + + df = dataset.df + df_train = pd.concat([train_dataset.df, val_dataset.df]) + X_train, y_train = df_train.drop( + columns=[dataset.target_col, dataset.split_col]), df_train[ + dataset.target_col] + df_test = test_dataset.df + X_test, y_test = df_test.drop( + columns=[dataset.target_col, dataset.split_col]), df_test[ + dataset.target_col] + + # use DataFrames with no `split_col` or `target_col` + # like normal sklearn datasets from now on + if is_classification: + net = NeuralNetClassifierPytorchFrame( + module=model, + criterion=nn.CrossEntropyLoss, + max_epochs=args.epochs, + lr=args.lr, + device=device, + verbose=1, + # col_to_stype={"C_feature_7": stype.categorical}, + batch_size=args.batch_size, + ) + else: + net = NeuralNetPytorchFrame( + module=model, + criterion=nn.MSELoss, + max_epochs=args.epochs, + lr=args.lr, + device=device, + verbose=1, + # col_to_stype={"C_feature_7": stype.categorical}, + batch_size=args.batch_size, + ) + net.fit(X_train, y_train) + y_pred = net.predict(X_test) + + if is_classification: + test_acc = (y_pred.argmax(-1) == y_test).mean() + print(f"Test Acc: {test_acc:.4f}") + else: + test_rmse = np.sqrt(((y_pred.squeeze() - y_test)**2).mean()) + print(f"Test RMSE: {test_rmse:.4f}") diff --git a/examples/tutorial.py b/examples/tutorial.py index b573b4d0f..8ad8c89c4 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -34,7 +34,8 @@ parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--seed', type=int, default=0) -parser.add_argument("--framework", type=str, default="torch") +parser.add_argument("--framework", type=str, default="torch", + choices=["torch", "skorch", "skorch-dataframe"]) args = parser.parse_args() torch.manual_seed(args.seed) From 9cc4fe1c0f0867fe303512ca6eb43cd61ba63c73 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:09:27 +0900 Subject: [PATCH 08/48] docs: add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cc9cc97f..54bd1c220 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Added light-weight MLP ([#372](https://github.com/pyg-team/pytorch-frame/pull/372)) +- Added an inherited class from skorch.NeuralNet that is compatible with PyTorch Frame ([#375](https://github.com/pyg-team/pytorch-frame/pull/375)) ### Changed From 98aea5c722b4a77fd00d10301e2c754f004d8b2a Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:10:23 +0900 Subject: [PATCH 09/48] fix(skorch): import annotations from __future__ --- torch_frame/utils/skorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 660014961..5ec953509 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib from typing import Any From 0f650d89e8cc5a73356b365680e229fccb6939ec Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:13:59 +0900 Subject: [PATCH 10/48] revert: revert wrong changes --- test/nn/models/test_resnet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/nn/models/test_resnet.py b/test/nn/models/test_resnet.py index 6a67a7338..24be0df67 100644 --- a/test/nn/models/test_resnet.py +++ b/test/nn/models/test_resnet.py @@ -2,11 +2,11 @@ from torch_frame.data.dataset import Dataset from torch_frame.datasets import FakeDataset -from torch_frame.nn import MLP +from torch_frame.nn import ResNet @pytest.mark.parametrize('batch_size', [0, 5]) -def test_mlp(batch_size): +def test_resnet(batch_size): channels = 8 out_channels = 1 num_layers = 3 @@ -14,7 +14,7 @@ def test_mlp(batch_size): dataset.materialize() tensor_frame = dataset.tensor_frame[:batch_size] # Feature-based embeddings - model = MLP( + model = ResNet( channels=channels, out_channels=out_channels, num_layers=num_layers, From 95688e376816d900c6d2e5099a90fab8a25befad Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:49:32 +0900 Subject: [PATCH 11/48] style(skorch): fix typing --- torch_frame/utils/skorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 5ec953509..d30d73624 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -155,7 +155,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, X[self.split_col] = (X.index.isin(X_train.index)).astype(int) self.dataset_ = Dataset( X, - { + { # type: ignore k: v for k, v in infer_df_stype(X).items() if k not in (self.split_col, ) From 7594b443d499d25d4f2e33a7487f423729275243 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 16 Mar 2024 14:50:47 +0900 Subject: [PATCH 12/48] fix(skorch): use `classes` if specified --- torch_frame/utils/skorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index d30d73624..9d0c3f1da 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -199,5 +199,6 @@ class NeuralNetClassifierPytorchFrame(NeuralNetPytorchFrame): def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, **fit_params): fit_result = super().fit(X, y, **fit_params) - self.classes = self.dataset_.df["target_col"].unique() + self.classes = getattr( + self, "classes", None) or self.dataset_.df["target_col"].unique() return fit_result From bacf31f6a967cec2f34e5541bb5b8b2182ac4564 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 20:17:22 +0900 Subject: [PATCH 13/48] chore: remove comments --- examples/revisiting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/revisiting.py b/examples/revisiting.py index 3030e1fea..6a4f9b532 100644 --- a/examples/revisiting.py +++ b/examples/revisiting.py @@ -219,7 +219,6 @@ def test(loader: DataLoader) -> float: lr=args.lr, device=device, verbose=1, - # col_to_stype={"C_feature_7": stype.categorical}, batch_size=args.batch_size, ) else: @@ -230,7 +229,6 @@ def test(loader: DataLoader) -> float: lr=args.lr, device=device, verbose=1, - # col_to_stype={"C_feature_7": stype.categorical}, batch_size=args.batch_size, ) net.fit(X_train, y_train) From 1c50a59cdfa3519ef7c0ab68073f4200d198c965 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:04:40 +0900 Subject: [PATCH 14/48] chore(skorch): add more comments --- torch_frame/utils/skorch.py | 79 ++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 9d0c3f1da..43715f1b7 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -2,6 +2,7 @@ import importlib from typing import Any +import warnings import skorch.utils import torch @@ -86,7 +87,55 @@ def __init__( col_to_time_format: str | None | dict[str, str | None] = None, # other NeuralNet parameters **kwargs, - ): + ) -> None: + """`skorch.NeuralNet` with `torch_frame` support. + + Additional parameters are **ONLY** used when creating a dummy torch_frame.data.dataset.Dataset + if pandas.DataFrame is passed as X in `fit` or `predict` methods. + + Parameters + ---------- + col_to_stype (Dict[str, torch_frame.stype]): A dictionary that maps + each column in the data frame to a semantic type. + target_col (str, optional): The column used as target. + (default: :obj:`None`) + split_col (str, optional): The column that stores the pre-defined split + information. The column should only contain :obj:`0`, :obj:`1`, or + :obj:`2`. (default: :obj:`None`). + col_to_sep (Union[str, Dict[str, Optional[str]]]): A dictionary or a + string/:obj:`None` specifying the separator/delimiter for the + multi-categorical columns. If a string/:obj:`None` is specified, + then the same separator will be used throughout all the + multi-categorical columns. Note that if :obj:`None` is specified, + it assumes a multi-category is given as a :obj:`list` of + categories. If a dictionary is given, we use a separator specified + for each column. (default: :obj:`None`) + col_to_text_embedder_cfg (TextEmbedderConfig or dict, optional): + A text embedder configuration or a dictionary of configurations + specifying :obj:`text_embedder` that embeds texts into vectors and + :obj:`batch_size` that specifies the mini-batch size for + :obj:`text_embedder`. (default: :obj:`None`) + col_to_text_tokenizer_cfg (TextTokenizerConfig or dict, optional): + A text tokenizer configuration or dictionary of configurations + specifying :obj:`text_tokenizer` that maps sentences into a + list of dictionary of tensors. Each element in the list + corresponds to each sentence, keys are input arguments to + the model such as :obj:`input_ids`, and values are tensors + such as tokens. :obj:`batch_size` specifies the mini-batch + size for :obj:`text_tokenizer`. (default: :obj:`None`) + col_to_image_embedder_cfg (ImageEmbedderConfig or dict, optional): + No documentation provided. + col_to_time_format (Union[str, Dict[str, Optional[str]]], optional): A + dictionary or a string specifying the format for the timestamp + columns. See `strfttime documentation + `_ + for more information on formats. If a string is specified, + then the same format will be used throughout all the timestamp + columns. If a dictionary is given, we use a different format + specified for each column. If not specified, pandas's internal + to_datetime function will be used to auto parse time columns. + (default: :obj:`None`) + """ super().__init__( module=module, criterion=criterion, @@ -107,6 +156,8 @@ def __init__( use_caching=use_caching, **kwargs, ) + # additional parameters used when creating a dummy + # torch_frame.data.dataset.Dataset self.col_to_stype = col_to_stype self.target_col = target_col self.split_col = split_col @@ -118,8 +169,10 @@ def __init__( # save dataset for partial_fit self.train_split_original = train_split or ( lambda x: train_test_split(x, test_size=0.2)) + # 0.2 is the default test_size in train_test_split in skorch def create_dataset(self, df: DataFrame, _: Any) -> Dataset: + # skorch API dataset_ = Dataset( df, self.dataset_.col_to_stype, @@ -136,25 +189,40 @@ def create_dataset(self, df: DataFrame, _: Any) -> Dataset: def split_dataset(self, dataset: Dataset) -> tuple[TensorFrame, TensorFrame]: + # skorch API datasets = dataset.split()[:2] return datasets[0].tensor_frame, datasets[1].tensor_frame def iterator_train_valid(self, dataset: Dataset, **kwargs: Any) -> DataLoader: + # skorch API return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, **kwargs) def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, **fit_params): if isinstance(X, DataFrame): + # create target_col if not exists if y is not None: X[self.target_col] = y + elif self.target_col not in X: + warnings.warn( + f"target_col {self.target_col} not found in X and y is None", + UserWarning + , stacklevel=2) + + # create split_col if not exists if self.split_col not in X: + # first split the data with the split function X_train, X_val = self.train_split_original(X, **fit_params) # if index is in X_train, 0, otherwise 1 + # X[self.split_col] = (X.index.isin(X_train.index)).astype(int) + # split_col uses iloc instead of loc, this is weird X[self.split_col] = (X.index.isin(X_train.index)).astype(int) + self.dataset_ = Dataset( X, + # do not include split_col { # type: ignore k: v for k, v in infer_df_stype(X).items() @@ -168,8 +236,16 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, col_to_image_embedder_cfg=self.col_to_image_embedder_cfg, col_to_time_format=self.col_to_time_format, ) + # materialize the dataset to add col_stats and col_names_dict + self.dataset_.materialize() else: self.dataset_ = X + + # self.module.encoder.col_stats = self.dataset_.col_stats + # self.module.encoder.col_names_dict = self.dataset_.tensor_frame.col_names_dict + # self.module = self.module.__class__(col_stats=self.dataset_.col_stats, + # col_names_dict=self.dataset_.tensor_frame.col_names_dict, + # **self.module.__dict__ return super().fit(self.dataset_.df, None, **fit_params) def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: @@ -189,6 +265,7 @@ def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: col_to_image_embedder_cfg=self.col_to_image_embedder_cfg, col_to_time_format=self.col_to_time_format, ) + # no need to materialize probably else: self.dataset_ = X return super().predict(self.dataset_.df) From 3a903922976f2339f59911d9b5c4b0e418764b41 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Jul 2024 13:06:03 +0000 Subject: [PATCH 15/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- torch_frame/utils/skorch.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 43715f1b7..df4b21293 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -1,8 +1,8 @@ from __future__ import annotations import importlib -from typing import Any import warnings +from typing import Any import skorch.utils import torch @@ -89,7 +89,7 @@ def __init__( **kwargs, ) -> None: """`skorch.NeuralNet` with `torch_frame` support. - + Additional parameters are **ONLY** used when creating a dummy torch_frame.data.dataset.Dataset if pandas.DataFrame is passed as X in `fit` or `predict` methods. @@ -156,7 +156,7 @@ def __init__( use_caching=use_caching, **kwargs, ) - # additional parameters used when creating a dummy + # additional parameters used when creating a dummy # torch_frame.data.dataset.Dataset self.col_to_stype = col_to_stype self.target_col = target_col @@ -208,9 +208,8 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, elif self.target_col not in X: warnings.warn( f"target_col {self.target_col} not found in X and y is None", - UserWarning - , stacklevel=2) - + UserWarning, stacklevel=2) + # create split_col if not exists if self.split_col not in X: # first split the data with the split function @@ -219,7 +218,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # X[self.split_col] = (X.index.isin(X_train.index)).astype(int) # split_col uses iloc instead of loc, this is weird X[self.split_col] = (X.index.isin(X_train.index)).astype(int) - + self.dataset_ = Dataset( X, # do not include split_col From 474caffe156a1ebe174175b7009017c5a0f35fd2 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:07:40 +0900 Subject: [PATCH 16/48] test: add prototype test --- test/utils/test_skorch.py | 101 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index e69de29bb..32167490e 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -0,0 +1,101 @@ +import pandas as pd +import pytest +import torch.nn as nn + +from torch_frame import TaskType, stype +from torch_frame.config.text_embedder import TextEmbedderConfig +from torch_frame.data.dataset import Dataset +from torch_frame.datasets.fake import FakeDataset +from torch_frame.nn.models.mlp import MLP +from torch_frame.testing.text_embedder import HashTextEmbedder +from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame + + +@pytest.mark.parametrize('cls', ["mlp"]) +@pytest.mark.parametrize( + 'stypes', + [ + [stype.numerical], + [stype.categorical], + # [stype.text_embedded], + # [stype.numerical, stype.numerical, stype.text_embedded], + ]) +@pytest.mark.parametrize('task_type_and_loss_cls', [ + (TaskType.REGRESSION, nn.MSELoss), + (TaskType.BINARY_CLASSIFICATION, nn.BCEWithLogitsLoss), + (TaskType.MULTICLASS_CLASSIFICATION, nn.CrossEntropyLoss), +]) +@pytest.mark.parametrize('pass_dataset', [False, True]) +def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, + pass_dataset: bool): + task_type, loss_cls = task_type_and_loss_cls + loss = loss_cls() + + # initialize dataset + dataset: Dataset = FakeDataset( + num_rows=30, + with_nan=True, + stypes=stypes, + create_split=True, + task_type=task_type, + col_to_text_embedder_cfg=TextEmbedderConfig( + text_embedder=HashTextEmbedder(8)), + ) + dataset.materialize() + train_dataset, val_dataset, test_dataset = dataset.split() + + # convert to dataframe + if not pass_dataset: + df = dataset.df + df_train = pd.concat([train_dataset.df, val_dataset.df]) + X_train, y_train = df_train.drop( + columns=[dataset.target_col, dataset.split_col]), df_train[ + dataset.target_col] + df_test = test_dataset.df + X_test, y_test = df_test.drop( + columns=[dataset.target_col, dataset.split_col]), df_test[ + dataset.target_col] + + # never use dataset again + # we assume that only dataframes are available + del dataset, train_dataset, val_dataset, test_dataset + + if cls == "mlp": + channels = 8 + out_channels = 1 + num_layers = 3 + model = MLP( + channels=channels, + out_channels=out_channels, + num_layers=num_layers, + col_stats={}, + col_names_dict={}, + normalization="layer_norm", + ) + else: + raise NotImplementedError + + if pass_dataset: + net = NeuralNetClassifierPytorchFrame( + module=model, + criterion=loss, + max_epochs=2, + # lr=args.lr, + # device=device, + verbose=1, + batch_size=1, + ) + net.fit(dataset) + y_pred = net.predict(test_dataset) + else: + net = NeuralNetClassifierPytorchFrame( + module=model, + criterion=loss, + max_epochs=2, + # lr=args.lr, + # device=device, + verbose=1, + batch_size=1, + ) + net.fit(X_train, y_train) + y_pred = net.predict(X_test) From cda1524981e2a4622870a3440ae1fec3f47f35f2 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:53:41 +0900 Subject: [PATCH 17/48] feat: add NeuralNetBinaryClassifierPytorchFrame --- torch_frame/utils/skorch.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index df4b21293..39693af71 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -4,6 +4,7 @@ import warnings from typing import Any +import numpy as np import skorch.utils import torch from numpy.typing import ArrayLike, NDArray @@ -242,9 +243,11 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # self.module.encoder.col_stats = self.dataset_.col_stats # self.module.encoder.col_names_dict = self.dataset_.tensor_frame.col_names_dict + # import inspect + # init_param_names = inspect.signature(self.module.__class__).parameters.keys() # self.module = self.module.__class__(col_stats=self.dataset_.col_stats, # col_names_dict=self.dataset_.tensor_frame.col_names_dict, - # **self.module.__dict__ + # **{k: v for k, v in self.module.__dict__.items() if k in init_param_names}) return super().fit(self.dataset_.df, None, **fit_params) def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: @@ -278,3 +281,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, self.classes = getattr( self, "classes", None) or self.dataset_.df["target_col"].unique() return fit_result + +# class NeuralNetBinaryClassifierPytorchFrame(NeuralNetPytorchFrame, skorch.NeuralNetBinaryClassifier): +class NeuralNetBinaryClassifierPytorchFrame(NeuralNetPytorchFrame): + num_classes = np.array([0, 1]) \ No newline at end of file From 568a6ded4d99c476c987b0af5f60e2384fc7608b Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:54:04 +0900 Subject: [PATCH 18/48] test: update test --- test/utils/test_skorch.py | 56 +++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 32167490e..5b9acf54d 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -8,8 +8,13 @@ from torch_frame.datasets.fake import FakeDataset from torch_frame.nn.models.mlp import MLP from torch_frame.testing.text_embedder import HashTextEmbedder -from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame - +from torch_frame.utils.skorch import NeuralNetBinaryClassifierPytorchFrame, NeuralNetClassifierPytorchFrame +import torch.nn.functional as F +class BCEWithLogitsLossSigmoidSqueeze(nn.BCEWithLogitsLoss): + def forward(self, input, target): + # float to long + input = F.sigmoid(input).long() + return super().forward(input.squeeze(), target.long()) @pytest.mark.parametrize('cls', ["mlp"]) @pytest.mark.parametrize( @@ -22,10 +27,10 @@ ]) @pytest.mark.parametrize('task_type_and_loss_cls', [ (TaskType.REGRESSION, nn.MSELoss), - (TaskType.BINARY_CLASSIFICATION, nn.BCEWithLogitsLoss), + # (TaskType.BINARY_CLASSIFICATION, BCEWithLogitsLossSqueeze), (TaskType.MULTICLASS_CLASSIFICATION, nn.CrossEntropyLoss), ]) -@pytest.mark.parametrize('pass_dataset', [False, True]) +@pytest.mark.parametrize('pass_dataset', [False]) def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, pass_dataset: bool): task_type, loss_cls = task_type_and_loss_cls @@ -34,7 +39,7 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, # initialize dataset dataset: Dataset = FakeDataset( num_rows=30, - with_nan=True, + # with_nan=True, stypes=stypes, create_split=True, task_type=task_type, @@ -43,10 +48,16 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, ) dataset.materialize() train_dataset, val_dataset, test_dataset = dataset.split() - - # convert to dataframe + # print(dataset.col_stats) + # # convert to dataframe + # col_to_stype = dataset.col_to_stype + # # remove split_col and target_col + # col_to_stype = { + # k: v + # for k, v in col_to_stype.items() + # if k not in [dataset.split_col, dataset.target_col] + # } if not pass_dataset: - df = dataset.df df_train = pd.concat([train_dataset.df, val_dataset.df]) X_train, y_train = df_train.drop( columns=[dataset.target_col, dataset.split_col]), df_train[ @@ -58,24 +69,24 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, # never use dataset again # we assume that only dataframes are available - del dataset, train_dataset, val_dataset, test_dataset + # del dataset, train_dataset, val_dataset, test_dataset if cls == "mlp": channels = 8 - out_channels = 1 + out_channels = dataset.num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 num_layers = 3 model = MLP( channels=channels, out_channels=out_channels, num_layers=num_layers, - col_stats={}, - col_names_dict={}, + col_stats=dataset.col_stats, + col_names_dict=dataset.tensor_frame.col_names_dict, normalization="layer_norm", ) else: raise NotImplementedError - if pass_dataset: + if task_type in [TaskType.REGRESSION, TaskType.MULTICLASS_CLASSIFICATION]: net = NeuralNetClassifierPytorchFrame( module=model, criterion=loss, @@ -83,19 +94,24 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, # lr=args.lr, # device=device, verbose=1, - batch_size=1, + batch_size=3, + # col_to_stype=col_to_stype, ) - net.fit(dataset) - y_pred = net.predict(test_dataset) - else: - net = NeuralNetClassifierPytorchFrame( + elif task_type == TaskType.BINARY_CLASSIFICATION: + net = NeuralNetBinaryClassifierPytorchFrame( module=model, criterion=loss, max_epochs=2, # lr=args.lr, # device=device, verbose=1, - batch_size=1, - ) + batch_size=3, + # col_to_stype=col_to_stype, + ) + + if pass_dataset: + net.fit(dataset) + y_pred = net.predict(test_dataset) + else: net.fit(X_train, y_train) y_pred = net.predict(X_test) From 7cc2f30f5e1f087c0fcbe4d77242413ec743bb28 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 22:54:24 +0900 Subject: [PATCH 19/48] fix(dataset): fix dataset --- torch_frame/data/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_frame/data/dataset.py b/torch_frame/data/dataset.py index 02102c3af..a897f2894 100644 --- a/torch_frame/data/dataset.py +++ b/torch_frame/data/dataset.py @@ -7,6 +7,7 @@ from collections import defaultdict from typing import Any, Dict +import numpy as np import pandas as pd import torch from torch import Tensor @@ -733,8 +734,7 @@ def get_split(self, split: str) -> Dataset: if split not in ["train", "val", "test"]: raise ValueError(f"The split named '{split}' is not available. " f"Needs to be either 'train', 'val', or 'test'.") - indices = self.df.index[self.df[self.split_col] == - SPLIT_TO_NUM[split]].tolist() + indices = np.where(self.df[self.split_col] == SPLIT_TO_NUM[split])[0] return self[indices] def split(self) -> tuple[Dataset, Dataset, Dataset]: From 95f22c1e63146e8db6ff55b91ad14dd2d834cee0 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:15:36 +0900 Subject: [PATCH 20/48] feat: allow creating module later --- test/utils/test_skorch.py | 31 +++++++++++++++++-------------- torch_frame/utils/skorch.py | 10 ++++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 5b9acf54d..4feff7eb5 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,3 +1,4 @@ +from typing import Any import pandas as pd import pytest import torch.nn as nn @@ -7,6 +8,7 @@ from torch_frame.data.dataset import Dataset from torch_frame.datasets.fake import FakeDataset from torch_frame.nn.models.mlp import MLP +from torch_frame.data.stats import StatType from torch_frame.testing.text_embedder import HashTextEmbedder from torch_frame.utils.skorch import NeuralNetBinaryClassifierPytorchFrame, NeuralNetClassifierPytorchFrame import torch.nn.functional as F @@ -69,26 +71,27 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, # never use dataset again # we assume that only dataframes are available - # del dataset, train_dataset, val_dataset, test_dataset + del train_dataset, val_dataset, test_dataset if cls == "mlp": - channels = 8 - out_channels = dataset.num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 - num_layers = 3 - model = MLP( - channels=channels, - out_channels=out_channels, - num_layers=num_layers, - col_stats=dataset.col_stats, - col_names_dict=dataset.tensor_frame.col_names_dict, - normalization="layer_norm", - ) + def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dict[stype, list[str]]) -> MLP: + channels = 8 + out_channels = dataset.num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 + num_layers = 3 + return MLP( + channels=channels, + out_channels=out_channels, + num_layers=num_layers, + col_stats=col_stats, + col_names_dict=col_names_dict, + normalization="layer_norm", + ) else: raise NotImplementedError if task_type in [TaskType.REGRESSION, TaskType.MULTICLASS_CLASSIFICATION]: net = NeuralNetClassifierPytorchFrame( - module=model, + module=get_module, criterion=loss, max_epochs=2, # lr=args.lr, @@ -99,7 +102,7 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, ) elif task_type == TaskType.BINARY_CLASSIFICATION: net = NeuralNetBinaryClassifierPytorchFrame( - module=model, + module=get_module, criterion=loss, max_epochs=2, # lr=args.lr, diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 39693af71..deaa96918 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -14,6 +14,7 @@ from torch import Tensor import torch_frame +from torch_frame import nn from torch_frame.config import ( ImageEmbedderConfig, TextEmbedderConfig, @@ -199,6 +200,12 @@ def iterator_train_valid(self, dataset: Dataset, # skorch API return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, **kwargs) + + def initialize_module(self): + if isinstance(self.module, nn.Module): + return super().initialize_module() + self.module_ = staticmethod(self.module)(col_stats=self.dataset_.col_stats,col_names_dict= self.dataset_.tensor_frame.col_names_dict) + return self def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, **fit_params): @@ -248,6 +255,9 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # self.module = self.module.__class__(col_stats=self.dataset_.col_stats, # col_names_dict=self.dataset_.tensor_frame.col_names_dict, # **{k: v for k, v in self.module.__dict__.items() if k in init_param_names}) + + # if function + return super().fit(self.dataset_.df, None, **fit_params) def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: From 7f2ec3e8066558ec5ab737025ba61a53ef0f6057 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:19:44 +0900 Subject: [PATCH 21/48] test: add binary test --- test/utils/test_skorch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 4feff7eb5..e95da9822 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -15,8 +15,9 @@ class BCEWithLogitsLossSigmoidSqueeze(nn.BCEWithLogitsLoss): def forward(self, input, target): # float to long - input = F.sigmoid(input).long() - return super().forward(input.squeeze(), target.long()) + input = F.sigmoid(input).float().squeeze() + target = target.float() + return super().forward(input, target) @pytest.mark.parametrize('cls', ["mlp"]) @pytest.mark.parametrize( @@ -29,7 +30,7 @@ def forward(self, input, target): ]) @pytest.mark.parametrize('task_type_and_loss_cls', [ (TaskType.REGRESSION, nn.MSELoss), - # (TaskType.BINARY_CLASSIFICATION, BCEWithLogitsLossSqueeze), + (TaskType.BINARY_CLASSIFICATION, BCEWithLogitsLossSigmoidSqueeze), (TaskType.MULTICLASS_CLASSIFICATION, nn.CrossEntropyLoss), ]) @pytest.mark.parametrize('pass_dataset', [False]) From 098cb4f38177e435facd71aaf829687d30cc89ef Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:35:11 +0900 Subject: [PATCH 22/48] feat: add sklearn test --- test/utils/test_skorch.py | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index e95da9822..b49fb8f6d 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,8 +1,10 @@ from typing import Any import pandas as pd import pytest +from sklearn.model_selection import train_test_split import torch.nn as nn +from sklearn.datasets import load_iris, load_diabetes from torch_frame import TaskType, stype from torch_frame.config.text_embedder import TextEmbedderConfig from torch_frame.data.dataset import Dataset @@ -119,3 +121,47 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dic else: net.fit(X_train, y_train) y_pred = net.predict(X_test) + +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, mean_squared_error +@pytest.mark.parametrize('task_type', [TaskType.MULTICLASS_CLASSIFICATION, TaskType.REGRESSION]) +def test_sklearn_only(task_type) -> None: + if task_type == TaskType.MULTICLASS_CLASSIFICATION: + X, y = load_iris(return_X_y=True, as_frame=True) + num_classes = 3 + else: + X, y = load_diabetes(return_X_y=True, as_frame=True) + + X_train, X_test, y_train, y_test = train_test_split(X, y) + + def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dict[stype, list[str]]) -> MLP: + channels = 8 + out_channels = num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 + num_layers = 3 + return MLP( + channels=channels, + out_channels=out_channels, + num_layers=num_layers, + col_stats=col_stats, + col_names_dict=col_names_dict, + normalization="layer_norm", + ) + net = NeuralNetClassifierPytorchFrame( + module=get_module, + criterion=nn.CrossEntropyLoss() if task_type == TaskType.MULTICLASS_CLASSIFICATION else nn.MSELoss(), + max_epochs=2, + verbose=1, + lr=0.0001, + batch_size=3, + ) + net.fit(X_train, y_train) + y_pred = net.predict(X_test) + + if task_type == TaskType.MULTICLASS_CLASSIFICATION: + assert y_pred.shape == (len(y_test), num_classes) + acc = accuracy_score(y_test, y_pred.argmax(-1)) + print(acc) + else: + assert y_pred.shape == (len(y_test), 1) + mse = mean_squared_error(y_test, y_pred) + print(mse) \ No newline at end of file From d8a1ca5e3a901c8758ab54416ba85544a631cc0d Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:55:50 +0900 Subject: [PATCH 23/48] style: format code --- test/utils/test_skorch.py | 61 ++++++++++++++++++++++++------------- torch_frame/utils/skorch.py | 30 ++++++++---------- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index b49fb8f6d..502f8decc 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,19 +1,26 @@ from typing import Any + import pandas as pd import pytest -from sklearn.model_selection import train_test_split import torch.nn as nn +import torch.nn.functional as F +from sklearn.datasets import load_diabetes, load_iris +from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.model_selection import train_test_split -from sklearn.datasets import load_iris, load_diabetes from torch_frame import TaskType, stype from torch_frame.config.text_embedder import TextEmbedderConfig from torch_frame.data.dataset import Dataset +from torch_frame.data.stats import StatType from torch_frame.datasets.fake import FakeDataset from torch_frame.nn.models.mlp import MLP -from torch_frame.data.stats import StatType from torch_frame.testing.text_embedder import HashTextEmbedder -from torch_frame.utils.skorch import NeuralNetBinaryClassifierPytorchFrame, NeuralNetClassifierPytorchFrame -import torch.nn.functional as F +from torch_frame.utils.skorch import ( + NeuralNetBinaryClassifierPytorchFrame, + NeuralNetClassifierPytorchFrame, +) + + class BCEWithLogitsLossSigmoidSqueeze(nn.BCEWithLogitsLoss): def forward(self, input, target): # float to long @@ -21,6 +28,7 @@ def forward(self, input, target): target = target.float() return super().forward(input, target) + @pytest.mark.parametrize('cls', ["mlp"]) @pytest.mark.parametrize( 'stypes', @@ -68,7 +76,7 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, columns=[dataset.target_col, dataset.split_col]), df_train[ dataset.target_col] df_test = test_dataset.df - X_test, y_test = df_test.drop( + X_test, _ = df_test.drop( columns=[dataset.target_col, dataset.split_col]), df_test[ dataset.target_col] @@ -77,9 +85,13 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, del train_dataset, val_dataset, test_dataset if cls == "mlp": - def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dict[stype, list[str]]) -> MLP: + + def get_module(*, col_stats: dict[str, dict[StatType, Any]], + col_names_dict: dict[stype, list[str]]) -> MLP: channels = 8 - out_channels = dataset.num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 + out_channels = 1 + if task_type == TaskType.MULTICLASS_CLASSIFICATION: + out_channels = dataset.num_classes num_layers = 3 return MLP( channels=channels, @@ -113,30 +125,33 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dic verbose=1, batch_size=3, # col_to_stype=col_to_stype, - ) - + ) + if pass_dataset: net.fit(dataset) - y_pred = net.predict(test_dataset) + _ = net.predict(test_dataset) else: net.fit(X_train, y_train) - y_pred = net.predict(X_test) + _ = net.predict(X_test) -from sklearn.impute import SimpleImputer -from sklearn.metrics import accuracy_score, mean_squared_error -@pytest.mark.parametrize('task_type', [TaskType.MULTICLASS_CLASSIFICATION, TaskType.REGRESSION]) + +@pytest.mark.parametrize( + 'task_type', [TaskType.MULTICLASS_CLASSIFICATION, TaskType.REGRESSION]) def test_sklearn_only(task_type) -> None: if task_type == TaskType.MULTICLASS_CLASSIFICATION: X, y = load_iris(return_X_y=True, as_frame=True) num_classes = 3 else: X, y = load_diabetes(return_X_y=True, as_frame=True) - + X_train, X_test, y_train, y_test = train_test_split(X, y) - - def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dict[stype, list[str]]) -> MLP: + + def get_module(*, col_stats: dict[str, dict[StatType, Any]], + col_names_dict: dict[stype, list[str]]) -> MLP: channels = 8 - out_channels = num_classes if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1 + out_channels = 1 + if task_type == TaskType.MULTICLASS_CLASSIFICATION: + out_channels = num_classes num_layers = 3 return MLP( channels=channels, @@ -146,9 +161,11 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dic col_names_dict=col_names_dict, normalization="layer_norm", ) + net = NeuralNetClassifierPytorchFrame( module=get_module, - criterion=nn.CrossEntropyLoss() if task_type == TaskType.MULTICLASS_CLASSIFICATION else nn.MSELoss(), + criterion=nn.CrossEntropyLoss() + if task_type == TaskType.MULTICLASS_CLASSIFICATION else nn.MSELoss(), max_epochs=2, verbose=1, lr=0.0001, @@ -156,7 +173,7 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dic ) net.fit(X_train, y_train) y_pred = net.predict(X_test) - + if task_type == TaskType.MULTICLASS_CLASSIFICATION: assert y_pred.shape == (len(y_test), num_classes) acc = accuracy_score(y_test, y_pred.argmax(-1)) @@ -164,4 +181,4 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], col_names_dict: dic else: assert y_pred.shape == (len(y_test), 1) mse = mean_squared_error(y_test, y_pred) - print(mse) \ No newline at end of file + print(mse) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index deaa96918..405461ce6 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -92,7 +92,8 @@ def __init__( ) -> None: """`skorch.NeuralNet` with `torch_frame` support. - Additional parameters are **ONLY** used when creating a dummy torch_frame.data.dataset.Dataset + Additional parameters are **ONLY** used + when creating a dummy torch_frame.data.dataset.Dataset if pandas.DataFrame is passed as X in `fit` or `predict` methods. Parameters @@ -200,11 +201,14 @@ def iterator_train_valid(self, dataset: Dataset, # skorch API return NeuralNetPytorchFrameDataLoader(dataset, device=self.device, **kwargs) - + def initialize_module(self): + # skorch API if isinstance(self.module, nn.Module): return super().initialize_module() - self.module_ = staticmethod(self.module)(col_stats=self.dataset_.col_stats,col_names_dict= self.dataset_.tensor_frame.col_names_dict) + self.module_ = staticmethod(self.module)( + col_stats=self.dataset_.col_stats, + col_names_dict=self.dataset_.tensor_frame.col_names_dict) return self def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, @@ -215,8 +219,8 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, X[self.target_col] = y elif self.target_col not in X: warnings.warn( - f"target_col {self.target_col} not found in X and y is None", - UserWarning, stacklevel=2) + f"target_col {self.target_col}" + " not found in X and y is None", UserWarning, stacklevel=2) # create split_col if not exists if self.split_col not in X: @@ -230,7 +234,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, self.dataset_ = Dataset( X, # do not include split_col - { # type: ignore + { k: v for k, v in infer_df_stype(X).items() if k not in (self.split_col, ) @@ -248,16 +252,6 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, else: self.dataset_ = X - # self.module.encoder.col_stats = self.dataset_.col_stats - # self.module.encoder.col_names_dict = self.dataset_.tensor_frame.col_names_dict - # import inspect - # init_param_names = inspect.signature(self.module.__class__).parameters.keys() - # self.module = self.module.__class__(col_stats=self.dataset_.col_stats, - # col_names_dict=self.dataset_.tensor_frame.col_names_dict, - # **{k: v for k, v in self.module.__dict__.items() if k in init_param_names}) - - # if function - return super().fit(self.dataset_.df, None, **fit_params) def predict(self, X: Dataset | DataFrame) -> NDArray[Any]: @@ -292,6 +286,6 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, self, "classes", None) or self.dataset_.df["target_col"].unique() return fit_result -# class NeuralNetBinaryClassifierPytorchFrame(NeuralNetPytorchFrame, skorch.NeuralNetBinaryClassifier): + class NeuralNetBinaryClassifierPytorchFrame(NeuralNetPytorchFrame): - num_classes = np.array([0, 1]) \ No newline at end of file + num_classes = np.array([0, 1]) From 4d12972e9833615361a032a2b789114bfab93ec8 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:56:06 +0900 Subject: [PATCH 24/48] docs: update docs --- README.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ba825b4b4..a8408e83c 100644 --- a/README.md +++ b/README.md @@ -68,12 +68,15 @@ PyTorch Frame democratizes deep learning research for tabular data, catering to
-* [Library Highlights](#library-highlights) -* [Architecture Overview](#architecture-overview) -* [Quick Tour](#quick-tour) -* [Implemented Deep Tabular Models](#implemented-deep-tabular-models) -* [Benchmark](#benchmark) -* [Installation](#installation) +- [Library Highlights](#library-highlights) +- [Architecture Overview](#architecture-overview) +- [Quick Tour](#quick-tour) + - [Build and train your own deep tabular model](#build-and-train-your-own-deep-tabular-model) + - [Scikit-learn Compatible API](#scikit-learn-compatible-api) +- [Implemented Deep Tabular Models](#implemented-deep-tabular-models) +- [Benchmark](#benchmark) +- [Installation](#installation) +- [Cite](#cite) ## Library Highlights @@ -206,6 +209,65 @@ for epoch in range(50): loss.backward() ``` +### Scikit-learn Compatible API (Experimental) + +A scikit-learn compliant API based on skorch allows DataFrame to be trained directly as a dataset. However, this has many limitations. + +```python +from typing import Any + +import torch.nn as nn +from sklearn.datasets import load_diabetes +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + +from torch_frame import stype +from torch_frame.data.stats import StatType +from torch_frame.nn import Trompt +from torch_frame.nn.models.trompt import Trompt +from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame + +# load the diabetes dataset +X, y = load_diabetes(return_X_y=True, as_frame=True) + +# split the data into training and testing sets +X_train, X_test, y_train, y_test = train_test_split(X, y) + + +# define the function to get the module +def get_module(*, col_stats: dict[str, dict[StatType, Any]], + col_names_dict: dict[stype, list[str]]) -> Trompt: + channels = 8 + out_channels = 1 + num_prompts = 2 + num_layers = 3 + return Trompt(channels=channels, out_channels=out_channels, + num_prompts=num_prompts, num_layers=num_layers, + col_stats=col_stats, col_names_dict=col_names_dict, + stype_encoder_dicts=None) + + +# wrap the function in a NeuralNetClassifierPytorchFrame +net = NeuralNetClassifierPytorchFrame( + module=get_module, + criterion=nn.MSELoss(), + max_epochs=10, + verbose=1, + lr=0.0001, + batch_size=30, +) + +# fit the model +net.fit(X_train, y_train) + +# predict on the test set +y_pred = net.predict(X_test) + +# calculate the mean squared error +mse = mean_squared_error(y_test, y_pred) +print(mse) +``` + ## Implemented Deep Tabular Models We list currently supported deep tabular models: From 03e9d56e4b76d80e29aae258c63b41c80814654e Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 4 Jul 2024 23:56:49 +0900 Subject: [PATCH 25/48] chore(deps): add skorch as deps --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7f8d44d36..763bf7879 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ full=[ "lightgbm", "datasets", "torchmetrics", + "skorch", ] [project.urls] From ca284f3aa15527a7c06df55956fcf9747e8e4eee Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 00:42:12 +0900 Subject: [PATCH 26/48] test_skorch.py --- test/utils/test_skorch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 502f8decc..a2d56aaf2 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,3 +1,5 @@ +from __furure__ import annotations + from typing import Any import pandas as pd From 60529107b44bc4ab57a27efdc1291d79af3bf25e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:42:28 +0000 Subject: [PATCH 27/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/utils/test_skorch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index a2d56aaf2..8a014090a 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,11 +1,10 @@ -from __furure__ import annotations - from typing import Any import pandas as pd import pytest import torch.nn as nn import torch.nn.functional as F +from __furure__ import annotations from sklearn.datasets import load_diabetes, load_iris from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.model_selection import train_test_split From 282610073a165cb7ab62dbfc714fb222a43bd39e Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 00:46:28 +0900 Subject: [PATCH 28/48] test_skorch.py --- test/utils/test_skorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 8a014090a..afd579942 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -1,10 +1,11 @@ +from __future__ import annotations + from typing import Any import pandas as pd import pytest import torch.nn as nn import torch.nn.functional as F -from __furure__ import annotations from sklearn.datasets import load_diabetes, load_iris from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.model_selection import train_test_split From ba740ba14082511cd2104e2ef38e1799e308a78a Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:05:32 +0900 Subject: [PATCH 29/48] fix: use dict.update instead of dict | dict --- torch_frame/utils/skorch.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 405461ce6..22cbffc22 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -231,14 +231,19 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # split_col uses iloc instead of loc, this is weird X[self.split_col] = (X.index.isin(X_train.index)).astype(int) + # col_to_stype + col_to_stype = { + k: v + for k, v in infer_df_stype(X).items() + if k not in (self.split_col, ) + } + if self.col_to_stype is not None: + col_to_stype.update(self.col_to_stype) + self.dataset_ = Dataset( X, # do not include split_col - { - k: v - for k, v in infer_df_stype(X).items() - if k not in (self.split_col, ) - } | (self.col_to_stype or {}), + col_to_stype=col_to_stype, split_col=self.split_col, target_col=self.target_col, col_to_sep=self.col_to_sep, @@ -248,6 +253,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, col_to_time_format=self.col_to_time_format, ) # materialize the dataset to add col_stats and col_names_dict + # in initialize_module() self.dataset_.materialize() else: self.dataset_ = X From a0b3f519ad8468a61b5b67951489b8f2abc64b04 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:08:41 +0900 Subject: [PATCH 30/48] fix(dataset): convert indices to list --- torch_frame/data/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch_frame/data/dataset.py b/torch_frame/data/dataset.py index a897f2894..4117029f0 100644 --- a/torch_frame/data/dataset.py +++ b/torch_frame/data/dataset.py @@ -734,7 +734,8 @@ def get_split(self, split: str) -> Dataset: if split not in ["train", "val", "test"]: raise ValueError(f"The split named '{split}' is not available. " f"Needs to be either 'train', 'val', or 'test'.") - indices = np.where(self.df[self.split_col] == SPLIT_TO_NUM[split])[0] + indices = np.where( + self.df[self.split_col] == SPLIT_TO_NUM[split])[0].tolist() return self[indices] def split(self) -> tuple[Dataset, Dataset, Dataset]: From 90f57d45e16728d113b6d3d0da34e40fcc5eea15 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:23:20 +0900 Subject: [PATCH 31/48] fix: fix staticmethod usage for < Python 310 https://stackoverflow.com/a/12718272/14819077 --- torch_frame/utils/skorch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 22cbffc22..a06592aa4 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -173,6 +173,16 @@ def __init__( self.train_split_original = train_split or ( lambda x: train_test_split(x, test_size=0.2)) # 0.2 is the default test_size in train_test_split in skorch + for name, v in zip( + ["iterator_train", "iterator_valid", "dataset"], + [iterator_train, iterator_valid, dataset], + ): + if v is not None: + warnings.warn( + "NeuralNetPytorchFrame does not support" + f" specifying {name}, " + "consider overriding the methods instead", UserWarning, + stacklevel=2) def create_dataset(self, df: DataFrame, _: Any) -> Dataset: # skorch API @@ -206,7 +216,7 @@ def initialize_module(self): # skorch API if isinstance(self.module, nn.Module): return super().initialize_module() - self.module_ = staticmethod(self.module)( + self.module_ = staticmethod(self.module).__func__( col_stats=self.dataset_.col_stats, col_names_dict=self.dataset_.tensor_frame.col_names_dict) return self From 10689d9bbfa5fe2cf67fd51f0cd01d9b7a106a8f Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:50:23 +0900 Subject: [PATCH 32/48] fix: safer patch --- test/utils/test_skorch.py | 4 ---- torch_frame/utils/skorch.py | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index afd579942..4eadab8a3 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -111,8 +111,6 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], module=get_module, criterion=loss, max_epochs=2, - # lr=args.lr, - # device=device, verbose=1, batch_size=3, # col_to_stype=col_to_stype, @@ -122,8 +120,6 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], module=get_module, criterion=loss, max_epochs=2, - # lr=args.lr, - # device=device, verbose=1, batch_size=3, # col_to_stype=col_to_stype, diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index a06592aa4..cd2ccdbc2 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -26,19 +26,19 @@ from torch_frame.typing import IndexSelectType from torch_frame.utils import infer_df_stype -# TODO: make it more safe -old_to_tensor = skorch.utils.to_tensor - -def to_tensor(X, device, accept_sparse=False): - if isinstance(X, TensorFrame): - return X - return old_to_tensor(X, device, accept_sparse) +# TODO: make it more safe +def _patch_skorch_support_tenforframe() -> None: + old_to_tensor = skorch.utils.to_tensor + def to_tensor(X, device, accept_sparse=False): + if isinstance(X, TensorFrame): + return X + return old_to_tensor(X, device, accept_sparse) -skorch.utils.to_tensor = to_tensor + skorch.utils.to_tensor = to_tensor -importlib.reload(skorch.net) + importlib.reload(skorch.net) class NeuralNetPytorchFrameDataLoader(DataLoader): @@ -139,6 +139,7 @@ def __init__( to_datetime function will be used to auto parse time columns. (default: :obj:`None`) """ + _patch_skorch_support_tenforframe() super().__init__( module=module, criterion=criterion, @@ -214,8 +215,10 @@ def iterator_train_valid(self, dataset: Dataset, def initialize_module(self): # skorch API - if isinstance(self.module, nn.Module): + # if module, behave like the original NeuralNet + if isinstance(self.module, nn.Module) or isinstance(self.module, type): return super().initialize_module() + # assume that self.module is a function self.module_ = staticmethod(self.module).__func__( col_stats=self.dataset_.col_stats, col_names_dict=self.dataset_.tensor_frame.col_names_dict) @@ -237,8 +240,6 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # first split the data with the split function X_train, X_val = self.train_split_original(X, **fit_params) # if index is in X_train, 0, otherwise 1 - # X[self.split_col] = (X.index.isin(X_train.index)).astype(int) - # split_col uses iloc instead of loc, this is weird X[self.split_col] = (X.index.isin(X_train.index)).astype(int) # col_to_stype From 71d97639fb9b17025e1984f2db270068e121883d Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 12:53:25 +0900 Subject: [PATCH 33/48] fix: do not call twice --- torch_frame/utils/skorch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index cd2ccdbc2..b7e169e02 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -41,6 +41,9 @@ def to_tensor(X, device, accept_sparse=False): importlib.reload(skorch.net) +_patch_skorch_support_tenforframe() + + class NeuralNetPytorchFrameDataLoader(DataLoader): def __init__(self, dataset: Dataset | TensorFrame, *args, device: torch.device, **kwargs): @@ -139,7 +142,6 @@ def __init__( to_datetime function will be used to auto parse time columns. (default: :obj:`None`) """ - _patch_skorch_support_tenforframe() super().__init__( module=module, criterion=criterion, From eca190531be059fe3342cec510675ea7c04ec65d Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:12:56 +0900 Subject: [PATCH 34/48] fix: copy dataframe before adding columns --- torch_frame/utils/skorch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index b7e169e02..450181e86 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -231,6 +231,7 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, if isinstance(X, DataFrame): # create target_col if not exists if y is not None: + X = X.copy() X[self.target_col] = y elif self.target_col not in X: warnings.warn( @@ -239,6 +240,8 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, # create split_col if not exists if self.split_col not in X: + if y is None: + X = X.copy() # first split the data with the split function X_train, X_val = self.train_split_original(X, **fit_params) # if index is in X_train, 0, otherwise 1 From 33009b62f0bcd93e41c2fb5ef852a55e59160310 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 15:58:34 +0900 Subject: [PATCH 35/48] docs: add docs to _patch_skorch_support_tenforframe --- torch_frame/utils/skorch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 450181e86..95d7570bc 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -29,6 +29,9 @@ # TODO: make it more safe def _patch_skorch_support_tenforframe() -> None: + """Patch skorch.utils.to_tensor to support TensorFrame + as it raises an error when TensorFrame is passed. + """ old_to_tensor = skorch.utils.to_tensor def to_tensor(X, device, accept_sparse=False): From e624953dd856310aace4d31ffdf8eb01778db5c7 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:02:38 +0900 Subject: [PATCH 36/48] fix(skorch): wrap with functools.wraps --- torch_frame/utils/skorch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 95d7570bc..086ad8229 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -2,6 +2,7 @@ import importlib import warnings +from functools import wraps from typing import Any import numpy as np @@ -32,12 +33,13 @@ def _patch_skorch_support_tenforframe() -> None: """Patch skorch.utils.to_tensor to support TensorFrame as it raises an error when TensorFrame is passed. """ - old_to_tensor = skorch.utils.to_tensor + original_to_tensor = skorch.utils.to_tensor + @wraps(original_to_tensor) def to_tensor(X, device, accept_sparse=False): if isinstance(X, TensorFrame): return X - return old_to_tensor(X, device, accept_sparse) + return original_to_tensor(X, device, accept_sparse) skorch.utils.to_tensor = to_tensor From 32769031938ac1a5375bd269db2520e5f155e3dc Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:10:59 +0900 Subject: [PATCH 37/48] fix: move imports --- examples/revisiting.py | 6 +++--- examples/tutorial.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/revisiting.py b/examples/revisiting.py index 6a4f9b532..65fa93579 100644 --- a/examples/revisiting.py +++ b/examples/revisiting.py @@ -17,7 +17,10 @@ import argparse import os.path as osp +import numpy as np +import pandas as pd import torch +import torch.nn as nn import torch.nn.functional as F from tqdm import tqdm @@ -190,9 +193,6 @@ def test(loader: DataLoader) -> float: print(f'Best Val {metric}: {best_val_metric:.4f}, ' f'Best Test {metric}: {best_test_metric:.4f}') elif args.framework == "skorch-dataframe": - import numpy as np - import pandas as pd - import torch.nn as nn from torch_frame.utils.skorch import ( NeuralNetClassifierPytorchFrame, diff --git a/examples/tutorial.py b/examples/tutorial.py index 8ad8c89c4..19b382cf2 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -7,7 +7,9 @@ import os.path as osp from typing import Any, Dict, List +import pandas as pd import torch +import torch.nn as nn import torch.nn.functional as F from torch import Tensor from torch.nn import LayerNorm, Linear, Module, ModuleList @@ -270,7 +272,6 @@ def test(loader: DataLoader) -> float: f"Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}" ) elif args.framework == "skorch": - import torch.nn as nn from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame @@ -289,8 +290,6 @@ def test(loader: DataLoader) -> float: dim=-1) == test_tensor_frame.y).float().mean() print(f"Test Acc: {test_acc:.4f}") elif args.framework == "skorch-dataframe": - import pandas as pd - import torch.nn as nn from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame From 18a50eec2e47a8f27a9c369f03f2b229f92f1058 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:12:32 +0900 Subject: [PATCH 38/48] chore: do not use NeuralNetClassifierPytorchFrame for regression although it can be used --- README.md | 10 ++++++---- test/utils/test_skorch.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index a8408e83c..24e2e687d 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ PyTorch Frame democratizes deep learning research for tabular data, catering to - [Architecture Overview](#architecture-overview) - [Quick Tour](#quick-tour) - [Build and train your own deep tabular model](#build-and-train-your-own-deep-tabular-model) - - [Scikit-learn Compatible API](#scikit-learn-compatible-api) + - [Scikit-learn Compatible API (Experimental)](#scikit-learn-compatible-api-experimental) - [Implemented Deep Tabular Models](#implemented-deep-tabular-models) - [Benchmark](#benchmark) - [Installation](#installation) @@ -225,7 +225,7 @@ from torch_frame import stype from torch_frame.data.stats import StatType from torch_frame.nn import Trompt from torch_frame.nn.models.trompt import Trompt -from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame +from torch_frame.utils.skorch import NeuralNetPytorchFrame # load the diabetes dataset X, y = load_diabetes(return_X_y=True, as_frame=True) @@ -247,8 +247,10 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], stype_encoder_dicts=None) -# wrap the function in a NeuralNetClassifierPytorchFrame -net = NeuralNetClassifierPytorchFrame( +# wrap the function in a NeuralNetPytorchFrame +# NeuralNetClassifierPytorchFrame and NeuralNetBinaryClassifierPytorchFrame +# are also available +net = NeuralNetPytorchFrame( module=get_module, criterion=nn.MSELoss(), max_epochs=10, diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 4eadab8a3..2acd21a6a 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -20,6 +20,7 @@ from torch_frame.utils.skorch import ( NeuralNetBinaryClassifierPytorchFrame, NeuralNetClassifierPytorchFrame, + NeuralNetPytorchFrame, ) @@ -106,7 +107,16 @@ def get_module(*, col_stats: dict[str, dict[StatType, Any]], else: raise NotImplementedError - if task_type in [TaskType.REGRESSION, TaskType.MULTICLASS_CLASSIFICATION]: + if task_type == TaskType.REGRESSION: + net = NeuralNetPytorchFrame( + module=get_module, + criterion=loss, + max_epochs=2, + verbose=1, + batch_size=3, + # col_to_stype=col_to_stype, + ) + if task_type == TaskType.MULTICLASS_CLASSIFICATION: net = NeuralNetClassifierPytorchFrame( module=get_module, criterion=loss, From d53061d6fba38457cfc7e0e951e70ff830915db2 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:26:13 +0900 Subject: [PATCH 39/48] fix(skorch): add typing only for module --- torch_frame/utils/skorch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 086ad8229..731b373ad 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -3,7 +3,7 @@ import importlib import warnings from functools import wraps -from typing import Any +from typing import Any, Callable import numpy as np import skorch.utils @@ -15,7 +15,7 @@ from torch import Tensor import torch_frame -from torch_frame import nn +from torch_frame import nn, stype from torch_frame.config import ( ImageEmbedderConfig, TextEmbedderConfig, @@ -23,6 +23,7 @@ ) from torch_frame.data.dataset import Dataset from torch_frame.data.loader import DataLoader +from torch_frame.data.stats import StatType from torch_frame.data.tensor_frame import TensorFrame from torch_frame.typing import IndexSelectType from torch_frame.utils import infer_df_stype @@ -66,7 +67,9 @@ class NeuralNetPytorchFrame(NeuralNet): def __init__( self, # NeuralNet parameters - module, + module: type[nn.Module] | nn.Module + | Callable[[dict[str, dict[StatType, Any]], dict[stype, list[str]]], + nn.Module], criterion, optimizer=torch.optim.SGD, lr=0.01, From a09beb27439adb143737754a4996e3568e7122a2 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:40:39 +0900 Subject: [PATCH 40/48] fix: support specifying module as class --- test/utils/test_skorch.py | 92 ++++++++++++++++++++----------------- torch_frame/utils/skorch.py | 9 +++- 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 2acd21a6a..bcfc0adfc 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -28,7 +28,7 @@ class BCEWithLogitsLossSigmoidSqueeze(nn.BCEWithLogitsLoss): def forward(self, input, target): # float to long input = F.sigmoid(input).float().squeeze() - target = target.float() + target = target.float().squeeze() return super().forward(input, target) @@ -46,9 +46,11 @@ def forward(self, input, target): (TaskType.BINARY_CLASSIFICATION, BCEWithLogitsLossSigmoidSqueeze), (TaskType.MULTICLASS_CLASSIFICATION, nn.CrossEntropyLoss), ]) -@pytest.mark.parametrize('pass_dataset', [False]) +@pytest.mark.parametrize('pass_dataset', [False, True]) +@pytest.mark.parametrize('module_as_function', [False, True]) def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, - pass_dataset: bool): + pass_dataset: bool, + module_as_function: bool): task_type, loss_cls = task_type_and_loss_cls loss = loss_cls() @@ -88,52 +90,56 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, del train_dataset, val_dataset, test_dataset if cls == "mlp": - - def get_module(*, col_stats: dict[str, dict[StatType, Any]], - col_names_dict: dict[stype, list[str]]) -> MLP: - channels = 8 - out_channels = 1 - if task_type == TaskType.MULTICLASS_CLASSIFICATION: - out_channels = dataset.num_classes - num_layers = 3 - return MLP( - channels=channels, - out_channels=out_channels, - num_layers=num_layers, - col_stats=col_stats, - col_names_dict=col_names_dict, - normalization="layer_norm", - ) + if module_as_function: + + def get_module(col_stats: dict[str, dict[StatType, Any]], + col_names_dict: dict[stype, list[str]]) -> MLP: + channels = 8 + out_channels = 1 + if task_type == TaskType.MULTICLASS_CLASSIFICATION: + out_channels = dataset.num_classes + num_layers = 3 + return MLP( + channels=channels, + out_channels=out_channels, + num_layers=num_layers, + col_stats=col_stats, + col_names_dict=col_names_dict, + normalization="layer_norm", + ) + + module = get_module + kwargs = {} + else: + module = MLP + kwargs = { + "channels": + 8, + "out_channels": + dataset.num_classes + if task_type == TaskType.MULTICLASS_CLASSIFICATION else 1, + "num_layers": + 3, + "normalization": + "layer_norm", + } + kwargs = {f"module__{k}": v for k, v in kwargs.items()} else: raise NotImplementedError + kwargs.update({ + "module": module, + "criterion": loss, + "max_epochs": 2, + "verbose": 1, + "batch_size": 3, + }) if task_type == TaskType.REGRESSION: - net = NeuralNetPytorchFrame( - module=get_module, - criterion=loss, - max_epochs=2, - verbose=1, - batch_size=3, - # col_to_stype=col_to_stype, - ) + net = NeuralNetPytorchFrame(**kwargs, ) if task_type == TaskType.MULTICLASS_CLASSIFICATION: - net = NeuralNetClassifierPytorchFrame( - module=get_module, - criterion=loss, - max_epochs=2, - verbose=1, - batch_size=3, - # col_to_stype=col_to_stype, - ) + net = NeuralNetClassifierPytorchFrame(**kwargs, ) elif task_type == TaskType.BINARY_CLASSIFICATION: - net = NeuralNetBinaryClassifierPytorchFrame( - module=get_module, - criterion=loss, - max_epochs=2, - verbose=1, - batch_size=3, - # col_to_stype=col_to_stype, - ) + net = NeuralNetBinaryClassifierPytorchFrame(**kwargs, ) if pass_dataset: net.fit(dataset) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 731b373ad..9478d46b0 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -8,6 +8,7 @@ import numpy as np import skorch.utils import torch +import torch.nn as nn from numpy.typing import ArrayLike, NDArray from pandas import DataFrame from sklearn.model_selection import train_test_split @@ -15,7 +16,7 @@ from torch import Tensor import torch_frame -from torch_frame import nn, stype +from torch_frame import stype from torch_frame.config import ( ImageEmbedderConfig, TextEmbedderConfig, @@ -227,6 +228,9 @@ def initialize_module(self): # skorch API # if module, behave like the original NeuralNet if isinstance(self.module, nn.Module) or isinstance(self.module, type): + self.module__col_stats = self.dataset_.col_stats + self.module__col_names_dict = ( + self.dataset_.tensor_frame.col_names_dict) return super().initialize_module() # assume that self.module is a function self.module_ = staticmethod(self.module).__func__( @@ -313,7 +317,8 @@ def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, **fit_params): fit_result = super().fit(X, y, **fit_params) self.classes = getattr( - self, "classes", None) or self.dataset_.df["target_col"].unique() + self, "classes", + None) or self.dataset_.df[self.dataset_.target_col].unique() return fit_result From a967b0dc95aadf0feea9e35a182e796f46627c8f Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:43:24 +0900 Subject: [PATCH 41/48] docs: add docs --- torch_frame/utils/skorch.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 9478d46b0..3d4c74ebd 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -52,6 +52,10 @@ def to_tensor(X, device, accept_sparse=False): class NeuralNetPytorchFrameDataLoader(DataLoader): + """Custom DataLoader for NeuralNetPytorchFrame. + + Converts the index to a tensor and separates the input and target tensors. + """ def __init__(self, dataset: Dataset | TensorFrame, *args, device: torch.device, **kwargs): super().__init__(dataset, *args, **kwargs) From bc07d7bb8a81ef6e4cc5977a251d0431113171bd Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Sat, 6 Jul 2024 16:53:48 +0900 Subject: [PATCH 42/48] fix: fix dtype --- test/utils/test_skorch.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index bcfc0adfc..f6bd5581c 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -4,8 +4,8 @@ import pandas as pd import pytest +import torch import torch.nn as nn -import torch.nn.functional as F from sklearn.datasets import load_diabetes, load_iris from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.model_selection import train_test_split @@ -24,12 +24,18 @@ ) -class BCEWithLogitsLossSigmoidSqueeze(nn.BCEWithLogitsLoss): +class EnsureDtypeLoss(nn.Module): + def __init__(self, loss: nn.Module, dtype_input: torch.dtype = torch.float, + dtype_target: torch.dtype = torch.float): + super().__init__() + self.loss = loss + self.dtype_input = dtype_input + self.dtype_target = dtype_target + def forward(self, input, target): - # float to long - input = F.sigmoid(input).float().squeeze() - target = target.float().squeeze() - return super().forward(input, target) + return self.loss( + input.to(dtype=self.dtype_input).squeeze(), + target.to(dtype=self.dtype_target).squeeze()) @pytest.mark.parametrize('cls', ["mlp"]) @@ -43,7 +49,7 @@ def forward(self, input, target): ]) @pytest.mark.parametrize('task_type_and_loss_cls', [ (TaskType.REGRESSION, nn.MSELoss), - (TaskType.BINARY_CLASSIFICATION, BCEWithLogitsLossSigmoidSqueeze), + (TaskType.BINARY_CLASSIFICATION, nn.BCEWithLogitsLoss), (TaskType.MULTICLASS_CLASSIFICATION, nn.CrossEntropyLoss), ]) @pytest.mark.parametrize('pass_dataset', [False, True]) @@ -53,6 +59,9 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, module_as_function: bool): task_type, loss_cls = task_type_and_loss_cls loss = loss_cls() + loss = EnsureDtypeLoss( + loss, dtype_target=torch.long + if task_type == TaskType.MULTICLASS_CLASSIFICATION else torch.float) # initialize dataset dataset: Dataset = FakeDataset( From 2fe4f69123e7431c32cfb665bdf87a6225d6bcdb Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 18:51:24 +0900 Subject: [PATCH 43/48] test: remove comment --- test/utils/test_skorch.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index f6bd5581c..5d980b480 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -75,15 +75,6 @@ def test_skorch_torchframe_dataset(cls, stypes, task_type_and_loss_cls, ) dataset.materialize() train_dataset, val_dataset, test_dataset = dataset.split() - # print(dataset.col_stats) - # # convert to dataframe - # col_to_stype = dataset.col_to_stype - # # remove split_col and target_col - # col_to_stype = { - # k: v - # for k, v in col_to_stype.items() - # if k not in [dataset.split_col, dataset.target_col] - # } if not pass_dataset: df_train = pd.concat([train_dataset.df, val_dataset.df]) X_train, y_train = df_train.drop( From 14f0e7bf3c4ec94f4528f88dff18fb78fbe165aa Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 18:54:03 +0900 Subject: [PATCH 44/48] Discard changes to examples/revisiting.py --- examples/revisiting.py | 111 ++++++++++------------------------------- 1 file changed, 27 insertions(+), 84 deletions(-) diff --git a/examples/revisiting.py b/examples/revisiting.py index 65fa93579..7a8eef6bd 100644 --- a/examples/revisiting.py +++ b/examples/revisiting.py @@ -17,10 +17,7 @@ import argparse import os.path as osp -import numpy as np -import pandas as pd import torch -import torch.nn as nn import torch.nn.functional as F from tqdm import tqdm @@ -49,8 +46,6 @@ parser.add_argument('--epochs', type=int, default=100) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--compile', action='store_true') -parser.add_argument("--framework", type=str, default="torch", - choices=["torch", "skorch-dataframe"]) args = parser.parse_args() torch.manual_seed(args.seed) @@ -161,82 +156,30 @@ def test(loader: DataLoader) -> float: return rmse -if args.framework == "torch": - if is_classification: - metric = 'Acc' - best_val_metric = 0 - best_test_metric = 0 - else: - metric = 'RMSE' - best_val_metric = float('inf') - best_test_metric = float('inf') - - for epoch in range(1, args.epochs + 1): - train_loss = train(epoch) - train_metric = test(train_loader) - val_metric = test(val_loader) - test_metric = test(test_loader) - - if is_classification and val_metric > best_val_metric: - best_val_metric = val_metric - best_test_metric = test_metric - elif not is_classification and val_metric < best_val_metric: - best_val_metric = val_metric - best_test_metric = test_metric - - print( - f'Train Loss: {train_loss:.4f}, ' - f'Train {metric}: {train_metric:.4f}, ' - f'Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}' - ) - - print(f'Best Val {metric}: {best_val_metric:.4f}, ' - f'Best Test {metric}: {best_test_metric:.4f}') -elif args.framework == "skorch-dataframe": - - from torch_frame.utils.skorch import ( - NeuralNetClassifierPytorchFrame, - NeuralNetPytorchFrame, - ) - - df = dataset.df - df_train = pd.concat([train_dataset.df, val_dataset.df]) - X_train, y_train = df_train.drop( - columns=[dataset.target_col, dataset.split_col]), df_train[ - dataset.target_col] - df_test = test_dataset.df - X_test, y_test = df_test.drop( - columns=[dataset.target_col, dataset.split_col]), df_test[ - dataset.target_col] - - # use DataFrames with no `split_col` or `target_col` - # like normal sklearn datasets from now on - if is_classification: - net = NeuralNetClassifierPytorchFrame( - module=model, - criterion=nn.CrossEntropyLoss, - max_epochs=args.epochs, - lr=args.lr, - device=device, - verbose=1, - batch_size=args.batch_size, - ) - else: - net = NeuralNetPytorchFrame( - module=model, - criterion=nn.MSELoss, - max_epochs=args.epochs, - lr=args.lr, - device=device, - verbose=1, - batch_size=args.batch_size, - ) - net.fit(X_train, y_train) - y_pred = net.predict(X_test) - - if is_classification: - test_acc = (y_pred.argmax(-1) == y_test).mean() - print(f"Test Acc: {test_acc:.4f}") - else: - test_rmse = np.sqrt(((y_pred.squeeze() - y_test)**2).mean()) - print(f"Test RMSE: {test_rmse:.4f}") +if is_classification: + metric = 'Acc' + best_val_metric = 0 + best_test_metric = 0 +else: + metric = 'RMSE' + best_val_metric = float('inf') + best_test_metric = float('inf') + +for epoch in range(1, args.epochs + 1): + train_loss = train(epoch) + train_metric = test(train_loader) + val_metric = test(val_loader) + test_metric = test(test_loader) + + if is_classification and val_metric > best_val_metric: + best_val_metric = val_metric + best_test_metric = test_metric + elif not is_classification and val_metric < best_val_metric: + best_val_metric = val_metric + best_test_metric = test_metric + + print(f'Train Loss: {train_loss:.4f}, Train {metric}: {train_metric:.4f}, ' + f'Val {metric}: {val_metric:.4f}, Test {metric}: {test_metric:.4f}') + +print(f'Best Val {metric}: {best_val_metric:.4f}, ' + f'Best Test {metric}: {best_test_metric:.4f}') From 06ec88ee1d7af921b1ebabac76cf747e022454e2 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 18:54:09 +0900 Subject: [PATCH 45/48] Discard changes to examples/tutorial.py --- examples/tutorial.py | 85 ++++++++------------------------------------ 1 file changed, 15 insertions(+), 70 deletions(-) diff --git a/examples/tutorial.py b/examples/tutorial.py index 19b382cf2..1b58b0e3a 100644 --- a/examples/tutorial.py +++ b/examples/tutorial.py @@ -7,9 +7,7 @@ import os.path as osp from typing import Any, Dict, List -import pandas as pd import torch -import torch.nn as nn import torch.nn.functional as F from torch import Tensor from torch.nn import LayerNorm, Linear, Module, ModuleList @@ -36,8 +34,6 @@ parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=10) parser.add_argument('--seed', type=int, default=0) -parser.add_argument("--framework", type=str, default="torch", - choices=["torch", "skorch", "skorch-dataframe"]) args = parser.parse_args() torch.manual_seed(args.seed) @@ -227,7 +223,7 @@ def train(epoch: int) -> float: model.train() loss_accum = total_count = 0 - for tf in tqdm(train_loader, desc=f"Epoch: {epoch}"): + for tf in tqdm(train_loader, desc=f'Epoch: {epoch}'): tf = tf.to(device) pred = model(tf) loss = F.cross_entropy(pred, tf.y) @@ -254,68 +250,17 @@ def test(loader: DataLoader) -> float: return accum / total_count -if args.framework == "torch": - best_val_acc = 0 - best_test_acc = 0 - for epoch in range(1, args.epochs + 1): - train_loss = train(epoch) - train_acc = test(train_loader) - val_acc = test(val_loader) - test_acc = test(test_loader) - if best_val_acc < val_acc: - best_val_acc = val_acc - best_test_acc = test_acc - print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, " - f"Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}") - - print( - f"Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}" - ) -elif args.framework == "skorch": - - from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame - - net = NeuralNetClassifierPytorchFrame( - module=model, - criterion=nn.CrossEntropyLoss, - max_epochs=args.epochs, - lr=args.lr, - device=device, - verbose=1, - batch_size=args.batch_size, - ) - net.fit(dataset) - y_pred = net.predict(test_dataset) - test_acc = (torch.Tensor(y_pred).argmax( - dim=-1) == test_tensor_frame.y).float().mean() - print(f"Test Acc: {test_acc:.4f}") -elif args.framework == "skorch-dataframe": - - from torch_frame.utils.skorch import NeuralNetClassifierPytorchFrame - - df = dataset.df - df_train = pd.concat([train_dataset.df, val_dataset.df]) - X_train, y_train = df_train.drop( - columns=[dataset.target_col, dataset.split_col]), df_train[ - dataset.target_col] - df_test = test_dataset.df - X_test, y_test = df_test.drop( - columns=[dataset.target_col, dataset.split_col]), df_test[ - dataset.target_col] - - # use DataFrames with no `split_col` or `target_col` - # like normal sklearn datasets from now on - net = NeuralNetClassifierPytorchFrame( - module=model, - criterion=nn.CrossEntropyLoss, - max_epochs=args.epochs, - lr=args.lr, - device=device, - verbose=1, - col_to_stype={"C_feature_7": stype.categorical}, - batch_size=args.batch_size, - ) - net.fit(X_train, y_train) - y_pred = net.predict(X_test) - test_acc = (y_pred.argmax(-1) == y_test).mean() - print(f"Test Acc: {test_acc:.4f}") +best_val_acc = 0 +best_test_acc = 0 +for epoch in range(1, args.epochs + 1): + train_loss = train(epoch) + train_acc = test(train_loader) + val_acc = test(val_loader) + test_acc = test(test_loader) + if best_val_acc < val_acc: + best_val_acc = val_acc + best_test_acc = test_acc + print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, ' + f'Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}') + +print(f'Best Val Acc: {best_val_acc:.4f}, Best Test Acc: {best_test_acc:.4f}') From 8d4d32d87a95c5d356b48742c77584603040e6af Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 18:54:59 +0900 Subject: [PATCH 46/48] Discard changes to README.md --- README.md | 76 +++++-------------------------------------------------- 1 file changed, 6 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 24e2e687d..ba825b4b4 100644 --- a/README.md +++ b/README.md @@ -68,15 +68,12 @@ PyTorch Frame democratizes deep learning research for tabular data, catering to
-- [Library Highlights](#library-highlights) -- [Architecture Overview](#architecture-overview) -- [Quick Tour](#quick-tour) - - [Build and train your own deep tabular model](#build-and-train-your-own-deep-tabular-model) - - [Scikit-learn Compatible API (Experimental)](#scikit-learn-compatible-api-experimental) -- [Implemented Deep Tabular Models](#implemented-deep-tabular-models) -- [Benchmark](#benchmark) -- [Installation](#installation) -- [Cite](#cite) +* [Library Highlights](#library-highlights) +* [Architecture Overview](#architecture-overview) +* [Quick Tour](#quick-tour) +* [Implemented Deep Tabular Models](#implemented-deep-tabular-models) +* [Benchmark](#benchmark) +* [Installation](#installation) ## Library Highlights @@ -209,67 +206,6 @@ for epoch in range(50): loss.backward() ``` -### Scikit-learn Compatible API (Experimental) - -A scikit-learn compliant API based on skorch allows DataFrame to be trained directly as a dataset. However, this has many limitations. - -```python -from typing import Any - -import torch.nn as nn -from sklearn.datasets import load_diabetes -from sklearn.metrics import mean_squared_error -from sklearn.model_selection import train_test_split - -from torch_frame import stype -from torch_frame.data.stats import StatType -from torch_frame.nn import Trompt -from torch_frame.nn.models.trompt import Trompt -from torch_frame.utils.skorch import NeuralNetPytorchFrame - -# load the diabetes dataset -X, y = load_diabetes(return_X_y=True, as_frame=True) - -# split the data into training and testing sets -X_train, X_test, y_train, y_test = train_test_split(X, y) - - -# define the function to get the module -def get_module(*, col_stats: dict[str, dict[StatType, Any]], - col_names_dict: dict[stype, list[str]]) -> Trompt: - channels = 8 - out_channels = 1 - num_prompts = 2 - num_layers = 3 - return Trompt(channels=channels, out_channels=out_channels, - num_prompts=num_prompts, num_layers=num_layers, - col_stats=col_stats, col_names_dict=col_names_dict, - stype_encoder_dicts=None) - - -# wrap the function in a NeuralNetPytorchFrame -# NeuralNetClassifierPytorchFrame and NeuralNetBinaryClassifierPytorchFrame -# are also available -net = NeuralNetPytorchFrame( - module=get_module, - criterion=nn.MSELoss(), - max_epochs=10, - verbose=1, - lr=0.0001, - batch_size=30, -) - -# fit the model -net.fit(X_train, y_train) - -# predict on the test set -y_pred = net.predict(X_test) - -# calculate the mean squared error -mse = mean_squared_error(y_test, y_pred) -print(mse) -``` - ## Implemented Deep Tabular Models We list currently supported deep tabular models: From 947daf1cba89b879199211f636ef6f27c8deab5f Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 19:00:01 +0900 Subject: [PATCH 47/48] fix: use args instead of kwargs to match typing --- test/utils/test_skorch.py | 2 +- torch_frame/utils/skorch.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/test/utils/test_skorch.py b/test/utils/test_skorch.py index 5d980b480..391bb210a 100644 --- a/test/utils/test_skorch.py +++ b/test/utils/test_skorch.py @@ -160,7 +160,7 @@ def test_sklearn_only(task_type) -> None: X_train, X_test, y_train, y_test = train_test_split(X, y) - def get_module(*, col_stats: dict[str, dict[StatType, Any]], + def get_module(col_stats: dict[str, dict[StatType, Any]], col_names_dict: dict[stype, list[str]]) -> MLP: channels = 8 out_channels = 1 diff --git a/torch_frame/utils/skorch.py b/torch_frame/utils/skorch.py index 3d4c74ebd..672bf43f8 100644 --- a/torch_frame/utils/skorch.py +++ b/torch_frame/utils/skorch.py @@ -238,8 +238,7 @@ def initialize_module(self): return super().initialize_module() # assume that self.module is a function self.module_ = staticmethod(self.module).__func__( - col_stats=self.dataset_.col_stats, - col_names_dict=self.dataset_.tensor_frame.col_names_dict) + self.dataset_.col_stats, self.dataset_.tensor_frame.col_names_dict) return self def fit(self, X: Dataset | DataFrame, y: ArrayLike | None = None, From 3769f2d193544623f45de3ce009fef0f4786c768 Mon Sep 17 00:00:00 2001 From: 34j <55338215+34j@users.noreply.github.com> Date: Thu, 11 Jul 2024 19:00:37 +0900 Subject: [PATCH 48/48] feat: add example for sklearn api --- examples/sklearn_api.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 examples/sklearn_api.py diff --git a/examples/sklearn_api.py b/examples/sklearn_api.py new file mode 100644 index 000000000..6b195fc86 --- /dev/null +++ b/examples/sklearn_api.py @@ -0,0 +1,54 @@ +from typing import Any + +import torch.nn as nn +from sklearn.datasets import load_diabetes +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split + +from torch_frame import stype +from torch_frame.data.stats import StatType +from torch_frame.nn import Trompt +from torch_frame.nn.models.trompt import Trompt +from torch_frame.utils.skorch import NeuralNetPytorchFrame + +# load the diabetes dataset +X, y = load_diabetes(return_X_y=True, as_frame=True) + +# split the data into training and testing sets +X_train, X_test, y_train, y_test = train_test_split(X, y) + + +# define the function to get the module +def get_module(col_stats: dict[str, dict[StatType, Any]], + col_names_dict: dict[stype, list[str]]) -> Trompt: + channels = 8 + out_channels = 1 + num_prompts = 2 + num_layers = 3 + return Trompt(channels=channels, out_channels=out_channels, + num_prompts=num_prompts, num_layers=num_layers, + col_stats=col_stats, col_names_dict=col_names_dict, + stype_encoder_dicts=None) + + +# wrap the function in a NeuralNetPytorchFrame +# NeuralNetClassifierPytorchFrame and NeuralNetBinaryClassifierPytorchFrame +# are also available +net = NeuralNetPytorchFrame( + module=get_module, + criterion=nn.MSELoss(), + max_epochs=10, + verbose=1, + lr=0.0001, + batch_size=30, +) + +# fit the model +net.fit(X_train, y_train) + +# predict on the test set +y_pred = net.predict(X_test) + +# calculate the mean squared error +mse = mean_squared_error(y_test, y_pred) +print(mse)