From c535767f4d577394372d9681343d16d420816b5b Mon Sep 17 00:00:00 2001
From: Josh Horton
Date: Fri, 28 Jun 2024 10:43:38 +0100
Subject: [PATCH] lint

---
 naglmbis/__init__.py                          |  1 +
 .../dataset/analysis_train_val_test_split.py  | 19 ++++++------
 scripts/dataset/setup_labeled_data.py         | 31 +++++++++++++------
 scripts/dataset/split_by_deepchem.py          | 16 ++++++----
 scripts/training/train_model.py               |  8 +++--
 5 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/naglmbis/__init__.py b/naglmbis/__init__.py
index 060f7cb..edf4351 100644
--- a/naglmbis/__init__.py
+++ b/naglmbis/__init__.py
@@ -2,6 +2,7 @@
 naglmbis
 Models built with NAGL to predict MBIS properties.
 """
+
 from . import _version
 
 __version__ = _version.get_versions()["version"]
diff --git a/scripts/dataset/analysis_train_val_test_split.py b/scripts/dataset/analysis_train_val_test_split.py
index 1e80c80..453dc02 100644
--- a/scripts/dataset/analysis_train_val_test_split.py
+++ b/scripts/dataset/analysis_train_val_test_split.py
@@ -16,7 +16,7 @@ def calculate_stats(dataset_name: str):
 
     # load the dataset
     dataset = dc.data.DiskDataset(dataset_name)
-    
+
     for smiles in dataset.ids:
         mol = Chem.MolFromSmiles(smiles, ps)
         charges = []
@@ -27,7 +27,7 @@
                 elements[atomic_number] += 1
             else:
                 elements[atomic_number] = 1
-        
+
         total_charge = sum(charges)
         if total_charge in formal_charges:
             formal_charges[total_charge] += 1
@@ -40,11 +40,12 @@ def calculate_stats(dataset_name: str):
     return formal_charges, molecular_weights, elements, heavy_atom_count
 
 
-for dataset in ['maxmin-train', 'maxmin-valid', 'maxmin-test']:
+for dataset in ["maxmin-train", "maxmin-valid", "maxmin-test"]:
     charges, weights, atoms, heavy_atoms = calculate_stats(dataset_name=dataset)
-    print(f'Running {dataset} number of molecules {len(weights)}')
-    print('Total formal charges ', charges)
-    print('Total elements', atoms)
-    print(f'Average mol weight {np.mean(weights)} and std {np.std(weights)}')
-    print(f'Average number of heavy atoms {np.mean(heavy_atoms)} and std {np.std(heavy_atoms)}')
-
+    print(f"Running {dataset} number of molecules {len(weights)}")
+    print("Total formal charges ", charges)
+    print("Total elements", atoms)
+    print(f"Average mol weight {np.mean(weights)} and std {np.std(weights)}")
+    print(
+        f"Average number of heavy atoms {np.mean(heavy_atoms)} and std {np.std(heavy_atoms)}"
+    )
diff --git a/scripts/dataset/setup_labeled_data.py b/scripts/dataset/setup_labeled_data.py
index 986689c..d4b7469 100644
--- a/scripts/dataset/setup_labeled_data.py
+++ b/scripts/dataset/setup_labeled_data.py
@@ -9,12 +9,16 @@
 # setup the parquet datasets using the splits generated by deepchem
 
-
 
 # load up both files
 training_db = h5py.File("TrainingSet-v1.hdf5", "r")
-valid_test_db = h5py.File('ValSet-v1.hdf5', 'r')
+valid_test_db = h5py.File("ValSet-v1.hdf5", "r")
+
 
-def create_parquet_dataset(parquet_name: str, deep_chem_dataset: dc.data.DiskDataset, reference_datasets: typing.List[h5py.File]):
+def create_parquet_dataset(
+    parquet_name: str,
+    deep_chem_dataset: dc.data.DiskDataset,
+    reference_datasets: typing.List[h5py.File],
+):
     dataset_keys = deep_chem_dataset.X
     dataset_smiles = deep_chem_dataset.ids
     coloumn_names = ["smiles", "conformation", "dipole", "mbis-charges"]
@@ -39,8 +43,9 @@ def create_parquet_dataset(parquet_name: str, deep_chem_dataset: dc.data.DiskDat
             results["mbis-charges"].append(charges[i])
             results["dipole"].append(dipoles[i])
             # make to store in bohr
-            results["conformation"].append(conformations[i].m_as(unit.bohr).flatten())
-
+            results["conformation"].append(
+                conformations[i].m_as(unit.bohr).flatten()
+            )
 
     for key, values in results.items():
         assert len(values) == total_records, print(key)
@@ -50,10 +55,18 @@ def create_parquet_dataset(parquet_name: str, deep_chem_dataset: dc.data.DiskDat
     pyarrow.parquet.write_table(table, parquet_name)
 
 
-for file_name, dataset_name in [('training.parquet', 'maxmin-train'), ('validation.parquet', 'maxmin-valid'), ('testing.parquet', 'maxmin-test')]:
-    print('creating parquet for ', dataset_name)
+for file_name, dataset_name in [
+    ("training.parquet", "maxmin-train"),
+    ("validation.parquet", "maxmin-valid"),
+    ("testing.parquet", "maxmin-test"),
+]:
+    print("creating parquet for ", dataset_name)
     dc_dataset = dc.data.DiskDataset(dataset_name)
-    create_parquet_dataset(parquet_name=file_name, deep_chem_dataset=dc_dataset, reference_datasets=[training_db, valid_test_db])
+    create_parquet_dataset(
+        parquet_name=file_name,
+        deep_chem_dataset=dc_dataset,
+        reference_datasets=[training_db, valid_test_db],
+    )
 
 training_db.close()
-valid_test_db.close()
\ No newline at end of file
+valid_test_db.close()
diff --git a/scripts/dataset/split_by_deepchem.py b/scripts/dataset/split_by_deepchem.py
index 122b60e..c023451 100644
--- a/scripts/dataset/split_by_deepchem.py
+++ b/scripts/dataset/split_by_deepchem.py
@@ -5,9 +5,9 @@
 
 dataset_keys = []
 smiles_ids = []
-training_set = h5py.File('TrainingSet-v1.hdf5', 'r')
+training_set = h5py.File("TrainingSet-v1.hdf5", "r")
 for key, group in training_set.items():
-    smiles_ids.append(group['smiles'].asstr()[0])
+    smiles_ids.append(group["smiles"].asstr()[0])
     # use the key to quickly split the datasets later
     dataset_keys.append(key)
 training_set.close()
@@ -20,13 +20,17 @@
 # val_set.close()
 
 
-print(f'The total number of unique molecules {len(smiles_ids)}')
-print('Running MaxMin Splitter ...')
+print(f"The total number of unique molecules {len(smiles_ids)}")
+print("Running MaxMin Splitter ...")
 
 xs = np.array(dataset_keys)
 
 total_dataset = dc.data.DiskDataset.from_numpy(X=xs, ids=smiles_ids)
 
 max_min_split = dc.splits.MaxMinSplitter()
-train, validation, test = max_min_split.train_valid_test_split(total_dataset, train_dir='maxmin-train', valid_dir='maxmin-valid', test_dir='maxmin-test')
-
+train, validation, test = max_min_split.train_valid_test_split(
+    total_dataset,
+    train_dir="maxmin-train",
+    valid_dir="maxmin-valid",
+    test_dir="maxmin-test",
+)
diff --git a/scripts/training/train_model.py b/scripts/training/train_model.py
index 439aae8..38cdd16 100644
--- a/scripts/training/train_model.py
+++ b/scripts/training/train_model.py
@@ -209,15 +209,17 @@ def main():
 
     n_gpus = 0 if not torch.cuda.is_available() else 1
     print(f"Using {n_gpus} GPUs")
-    model_checkpoint = ModelCheckpoint(monitor='val/loss', dirpath=output_dir.joinpath(''))
+    model_checkpoint = ModelCheckpoint(
+        monitor="val/loss", dirpath=output_dir.joinpath("")
+    )
     trainer = pl.Trainer(
-        accelerator='cpu',
+        accelerator="cpu",
         # devices=n_gpus,
         min_epochs=n_epochs,
         max_epochs=n_epochs,
         logger=logger,
         log_every_n_steps=50,
-        callbacks=[model_checkpoint]
+        callbacks=[model_checkpoint],
     )
 
     trainer.fit(model, datamodule=data)