From 00740789bc7a00cba16516c3e72501a060bd4f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 19 Nov 2024 14:41:29 +0100 Subject: [PATCH 1/3] Speed up spot reading --- ppanggolin/formats/readBinaries.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 7d03f144..da370e01 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -773,23 +773,27 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ - Read hotspot in pangenome hdf5 file to add in pangenome object + Read hotspots in the pangenome HDF5 file and add them to the pangenome object. - :param pangenome: Pangenome object without spot - :param h5f: Pangenome HDF5 file with spot computed - :param disable_bar: Disable the progress bar + Args: + pangenome (Pangenome): Pangenome object. + h5f (tables.File): Pangenome HDF5 file with spots computed. + disable_bar (bool): Whether to disable the progress bar. """ table = h5f.root.spots spots = {} + curr_spot_id = None for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar): - curr_spot = spots.get(int(row["spot"])) - if curr_spot is None: - curr_spot = Spot(int(row["spot"])) - spots[row["spot"]] = curr_spot + if curr_spot_id != int(row["spot"]): + curr_spot_id = int(row["spot"]) + curr_spot = spots.get(curr_spot_id) + if curr_spot is None: + curr_spot = Spot(int(row["spot"])) + spots[row["spot"]] = curr_spot region = pangenome.get_region(row["RGP"].decode()) curr_spot.add(region) - curr_spot.spot_2_families() for spot in spots.values(): + spot.spot_2_families() pangenome.add_spot(spot) pangenome.status["spots"] = "Loaded" @@ -1174,9 +1178,8 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa logging.getLogger("PPanGGOLiN").info("Reading the spots...") read_spots(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception(f"The pangenome in file '{filename}' does not have spots information, " - f"or has been improperly filled") - + raise AttributeError(f"The pangenome in file '{pangenome.file}' does not have spots information, " + f"or has been improperly filled") if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") From 789e3cc414e358193bf4320c5e8299b5efa893fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 20 Nov 2024 11:13:02 +0100 Subject: [PATCH 2/3] Reformat with black --- ppanggolin/formats/readBinaries.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 46bc61a5..b817ee24 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -989,7 +989,12 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False table = h5f.root.spots spots = {} curr_spot_id = None - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar): + for row in tqdm( + read_chunks(table, chunk=20000), + total=table.nrows, + unit="spot", + disable=disable_bar, + ): if curr_spot_id != int(row["spot"]): curr_spot_id = int(row["spot"]) curr_spot = spots.get(curr_spot_id) From 0086fe3bdbc00abe733fb08df50833ddfdf81767 Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Tue, 26 Nov 2024 09:37:25 +0100 Subject: [PATCH 3/3] readBinaries.py: revert commenting style to match the rest of the code --- ppanggolin/formats/readBinaries.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index b817ee24..650b4bc7 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -981,10 +981,9 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False """ Read hotspots in the pangenome HDF5 file and add them to the pangenome object. - Args: - pangenome (Pangenome): Pangenome object. - h5f (tables.File): Pangenome HDF5 file with spots computed. - disable_bar (bool): Whether to disable the progress bar. + :param pangenome: Pangenome object without spot + :param h5f: Pangenome HDF5 file with spot computed + :param disable_bar: Disable the progress bar """ table = h5f.root.spots spots = {}