diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index 5bacb583..f177f116 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -14,7 +14,7 @@ from lsdb.catalog.margin_catalog import MarginCatalog from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm -from lsdb.core.search import ConeSearch, IndexSearch, PolygonSearch +from lsdb.core.search import ConeSearch, IndexSearch, OrderSearch, PolygonSearch from lsdb.core.search.abstract_search import AbstractSearch from lsdb.core.search.box_search import BoxSearch from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data @@ -266,6 +266,19 @@ def index_search(self, ids, catalog_index: HCIndexCatalog, fine: bool = True) -> """ return self._search(IndexSearch(ids, catalog_index), fine) + def order_search(self, min_order: int = 0, max_order: int | None = None) -> Catalog: + """ + Filter catalog by order of HEALPix + + Args: + min_order (int): Minimum HEALPix order to select. Defaults to 0. + max_order (int): Maximum HEALPix order to select. Defaults to maximum catalog order. + + Returns: + A new Catalog containing only the pixels of orders specified (inclusive) + """ + return self._search(OrderSearch(min_order, max_order), fine=False) + def _search(self, search: AbstractSearch, fine: bool = True): """Find rows by reusable search algorithm. diff --git a/src/lsdb/core/search/__init__.py b/src/lsdb/core/search/__init__.py index 884fc109..dbf19130 100644 --- a/src/lsdb/core/search/__init__.py +++ b/src/lsdb/core/search/__init__.py @@ -1,3 +1,4 @@ from .cone_search import ConeSearch from .index_search import IndexSearch +from .order_search import OrderSearch from .polygon_search import PolygonSearch diff --git a/src/lsdb/core/search/order_search.py b/src/lsdb/core/search/order_search.py new file mode 100644 index 00000000..cda99804 --- /dev/null +++ b/src/lsdb/core/search/order_search.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from typing import List + +import pandas as pd +from hipscat.pixel_math import HealpixPixel + +from lsdb.core.search.abstract_search import AbstractSearch + + +class OrderSearch(AbstractSearch): + """Filter the catalog by HEALPix order. + + Filters partitions in the catalog to those that are in the orders specified. + Does not filter points inside those partitions. + """ + + def __init__(self, min_order: int = 0, max_order: int | None = None): + if max_order and min_order > max_order: + raise ValueError("The minimum order should be lower than or equal to the maximum order") + self.min_order = min_order + self.max_order = max_order + + def search_partitions(self, pixels: List[HealpixPixel]) -> List[HealpixPixel]: + """Determine the target partitions for further filtering.""" + max_catalog_order = max(pixel.order for pixel in pixels) + max_order = max_catalog_order if self.max_order is None else self.max_order + if self.min_order > max_order: + raise ValueError("The minimum order is higher than the catalog's maximum order") + return [pixel for pixel in pixels if self.min_order <= pixel.order <= max_order] + + def search_points(self, frame: pd.DataFrame, _) -> pd.DataFrame: + """Determine the search results within a data frame.""" + return frame diff --git a/tests/conftest.py b/tests/conftest.py index 605a3711..42b64241 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ DATA_DIR_NAME = "data" SMALL_SKY_DIR_NAME = "small_sky" SMALL_SKY_LEFT_XMATCH_NAME = "small_sky_left_xmatch" +SMALL_SKY_SOURCE_DIR_NAME = "small_sky_source" SMALL_SKY_SOURCE_MARGIN_NAME = "small_sky_source_margin" SMALL_SKY_ORDER3_SOURCE_MARGIN_NAME = "small_sky_order3_source_margin" SMALL_SKY_XMATCH_NAME = "small_sky_xmatch" @@ -77,6 +78,16 @@ def small_sky_order1_source_dir(test_data_dir): return os.path.join(test_data_dir, SMALL_SKY_ORDER1_SOURCE_NAME) +@pytest.fixture +def small_sky_source_dir(test_data_dir): + return os.path.join(test_data_dir, SMALL_SKY_SOURCE_DIR_NAME) + + +@pytest.fixture +def small_sky_source_catalog(small_sky_source_dir): + return lsdb.read_hipscat(small_sky_source_dir) + + @pytest.fixture def small_sky_order1_source_margin_dir(test_data_dir): return os.path.join(test_data_dir, SMALL_SKY_ORDER1_SOURCE_MARGIN_NAME) diff --git a/tests/data/generate_data.ipynb b/tests/data/generate_data.ipynb index a803be91..0da71549 100644 --- a/tests/data/generate_data.ipynb +++ b/tests/data/generate_data.ipynb @@ -35,8 +35,6 @@ "tmp_path = tempfile.TemporaryDirectory()\n", "tmp_dir = tmp_path.name\n", "\n", - "hipscat_import_dir = \"../../../hipscat-import/tests/hipscat_import/data/\"\n", - "\n", "client = Client(n_workers=1, threads_per_worker=1, local_directory=tmp_dir)" ] }, @@ -176,9 +174,36 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### small_sky_source_margin\n", - "\n", - "This one is tricky, as it uses a catalog that we only have in the `hipscat` and `hipscat-import` test directories." + "### small_sky_source" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args = ImportArguments(\n", + " input_file_list=[\"raw/small_sky_source/small_sky_source.csv\"],\n", + " output_path=\".\",\n", + " file_reader=\"csv\",\n", + " ra_column=\"source_ra\",\n", + " dec_column=\"source_dec\",\n", + " catalog_type=\"source\",\n", + " output_artifact_name=\"small_sky_source\",\n", + " highest_healpix_order=2,\n", + " pixel_threshold=3000,\n", + " overwrite=True,\n", + " tmp_dir=tmp_dir,\n", + ")\n", + "runner.pipeline_with_client(args, client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### small_sky_source_margin" ] }, { @@ -188,7 +213,7 @@ "outputs": [], "source": [ "args = MarginCacheArguments(\n", - " input_catalog_path=Path(hipscat_import_dir) / \"small_sky_source_catalog\",\n", + " input_catalog_path=\"small_sky_source\",\n", " output_path=\".\",\n", " output_artifact_name=\"small_sky_source_margin\",\n", " margin_threshold=180,\n", diff --git a/tests/data/small_sky_source/Norder=0/Dir=0/Npix=4.parquet b/tests/data/small_sky_source/Norder=0/Dir=0/Npix=4.parquet new file mode 100644 index 00000000..b30f05ed Binary files /dev/null and b/tests/data/small_sky_source/Norder=0/Dir=0/Npix=4.parquet differ diff --git a/tests/data/small_sky_source/Norder=1/Dir=0/Npix=47.parquet b/tests/data/small_sky_source/Norder=1/Dir=0/Npix=47.parquet new file mode 100644 index 00000000..62081204 Binary files /dev/null and b/tests/data/small_sky_source/Norder=1/Dir=0/Npix=47.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=176.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=176.parquet new file mode 100644 index 00000000..822eb36a Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=176.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=177.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=177.parquet new file mode 100644 index 00000000..1b94613b Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=177.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=178.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=178.parquet new file mode 100644 index 00000000..96832d80 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=178.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=179.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=179.parquet new file mode 100644 index 00000000..8c875905 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=179.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=180.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=180.parquet new file mode 100644 index 00000000..69aa9f7b Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=180.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=181.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=181.parquet new file mode 100644 index 00000000..b783a0a9 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=181.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=182.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=182.parquet new file mode 100644 index 00000000..325a75f1 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=182.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=183.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=183.parquet new file mode 100644 index 00000000..d28c219d Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=183.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=184.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=184.parquet new file mode 100644 index 00000000..4b87714c Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=184.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=185.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=185.parquet new file mode 100644 index 00000000..b3576459 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=185.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=186.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=186.parquet new file mode 100644 index 00000000..d769ab8d Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=186.parquet differ diff --git a/tests/data/small_sky_source/Norder=2/Dir=0/Npix=187.parquet b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=187.parquet new file mode 100644 index 00000000..e8f52446 Binary files /dev/null and b/tests/data/small_sky_source/Norder=2/Dir=0/Npix=187.parquet differ diff --git a/tests/data/small_sky_source/_common_metadata b/tests/data/small_sky_source/_common_metadata new file mode 100644 index 00000000..5d6359eb Binary files /dev/null and b/tests/data/small_sky_source/_common_metadata differ diff --git a/tests/data/small_sky_source/_metadata b/tests/data/small_sky_source/_metadata new file mode 100644 index 00000000..ab2d06bb Binary files /dev/null and b/tests/data/small_sky_source/_metadata differ diff --git a/tests/data/small_sky_source/catalog_info.json b/tests/data/small_sky_source/catalog_info.json new file mode 100644 index 00000000..97e68711 --- /dev/null +++ b/tests/data/small_sky_source/catalog_info.json @@ -0,0 +1,8 @@ +{ + "catalog_name": "small_sky_source", + "catalog_type": "source", + "total_rows": 17161, + "epoch": "J2000", + "ra_column": "source_ra", + "dec_column": "source_dec" +} diff --git a/tests/data/small_sky_source/partition_info.csv b/tests/data/small_sky_source/partition_info.csv new file mode 100644 index 00000000..3a67166b --- /dev/null +++ b/tests/data/small_sky_source/partition_info.csv @@ -0,0 +1,15 @@ +Norder,Npix,Dir +0,4,0 +1,47,0 +2,176,0 +2,177,0 +2,178,0 +2,179,0 +2,180,0 +2,181,0 +2,182,0 +2,183,0 +2,184,0 +2,185,0 +2,186,0 +2,187,0 diff --git a/tests/data/small_sky_source/provenance_info.json b/tests/data/small_sky_source/provenance_info.json new file mode 100644 index 00000000..4424f019 --- /dev/null +++ b/tests/data/small_sky_source/provenance_info.json @@ -0,0 +1,53 @@ +{ + "catalog_name": "small_sky_source", + "catalog_type": "source", + "total_rows": 17161, + "epoch": "J2000", + "ra_column": "source_ra", + "dec_column": "source_dec", + "version": "0.2.8", + "generation_date": "2024.03.15", + "tool_args": { + "tool_name": "hipscat_import", + "version": "0.2.5.dev8+g9d0bfc4", + "runtime_args": { + "catalog_name": "small_sky_source", + "output_path": ".", + "output_artifact_name": "small_sky_source", + "tmp_dir": "/var/folders/x4/rmzh8l_s0zxc74nwr72z12340000gn/T/tmpax37bk_h", + "overwrite": true, + "dask_tmp": "", + "dask_n_workers": 1, + "dask_threads_per_worker": 1, + "catalog_path": "./small_sky_source", + "tmp_path": "/var/folders/x4/rmzh8l_s0zxc74nwr72z12340000gn/T/tmpax37bk_h/small_sky_source/intermediate", + "epoch": "J2000", + "catalog_type": "source", + "input_path": null, + "input_paths": [ + "raw/small_sky_source/small_sky_source.csv" + ], + "input_file_list": [ + "raw/small_sky_source/small_sky_source.csv" + ], + "ra_column": "source_ra", + "dec_column": "source_dec", + "use_hipscat_index": false, + "sort_columns": null, + "constant_healpix_order": -1, + "highest_healpix_order": 2, + "pixel_threshold": 3000, + "mapping_healpix_order": 2, + "debug_stats_only": false, + "file_reader_info": { + "input_reader_type": "CsvReader", + "chunksize": 500000, + "header": "infer", + "schema_file": null, + "separator": ",", + "column_names": null, + "type_map": {} + } + } + } +} diff --git a/tests/lsdb/catalog/test_order_search.py b/tests/lsdb/catalog/test_order_search.py new file mode 100644 index 00000000..7a56245e --- /dev/null +++ b/tests/lsdb/catalog/test_order_search.py @@ -0,0 +1,41 @@ +import pandas as pd +import pytest + +from lsdb.core.search import OrderSearch + + +def test_order_search_filters_correct_pixels(small_sky_source_catalog, assert_divisions_are_correct): + order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=1) + pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()] + assert all(order == 1 for order in pixel_orders) + assert_divisions_are_correct(order_search_catalog) + + order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=2) + pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()] + assert all(1 <= order <= 2 for order in pixel_orders) + assert_divisions_are_correct(order_search_catalog) + + order_search_catalog = small_sky_source_catalog.order_search(min_order=1) + pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()] + assert all(1 <= order <= 2 for order in pixel_orders) + assert_divisions_are_correct(order_search_catalog) + + order_search_catalog = small_sky_source_catalog.order_search(max_order=1) + pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()] + assert all(0 <= order <= 1 for order in pixel_orders) + assert_divisions_are_correct(order_search_catalog) + + +def test_order_search_keeps_all_points(small_sky_source_catalog): + metadata = small_sky_source_catalog.hc_structure + partition_df = small_sky_source_catalog._ddf.partitions[0].compute() + search = OrderSearch(min_order=1, max_order=2) + filtered_df = search.search_points(partition_df, metadata) + pd.testing.assert_frame_equal(partition_df, filtered_df) + + +def test_order_search_invalid_args(small_sky_source_catalog): + with pytest.raises(ValueError, match="lower than or equal to the maximum order"): + small_sky_source_catalog.order_search(min_order=2, max_order=1) + with pytest.raises(ValueError, match="minimum order is higher than"): + small_sky_source_catalog.order_search(min_order=3) diff --git a/tests/lsdb/loaders/hipscat/test_read_hipscat.py b/tests/lsdb/loaders/hipscat/test_read_hipscat.py index 7b9cf8a6..707effb9 100644 --- a/tests/lsdb/loaders/hipscat/test_read_hipscat.py +++ b/tests/lsdb/loaders/hipscat/test_read_hipscat.py @@ -6,7 +6,7 @@ from hipscat.catalog.index.index_catalog import IndexCatalog import lsdb -from lsdb.core.search import ConeSearch, IndexSearch, PolygonSearch +from lsdb.core.search import ConeSearch, IndexSearch, OrderSearch, PolygonSearch from lsdb.core.search.box_search import BoxSearch @@ -133,6 +133,17 @@ def test_read_hipscat_subset_with_index_search( assert index_search_catalog.get_healpix_pixels() == index_search_catalog_2.get_healpix_pixels() +def test_read_hipscat_subset_with_order_search(small_sky_source_catalog, small_sky_source_dir): + order_search = OrderSearch(min_order=1, max_order=2) + # Filtering using catalog's order_search + order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=2) + # Filtering when calling `read_hipscat` + order_search_catalog_2 = lsdb.read_hipscat(small_sky_source_dir, search_filter=order_search) + assert isinstance(order_search_catalog_2, lsdb.Catalog) + # The partitions of the catalogs are equivalent + assert order_search_catalog.get_healpix_pixels() == order_search_catalog_2.get_healpix_pixels() + + def test_read_hipscat_subset_no_partitions(small_sky_order1_dir, small_sky_order1_id_index_dir): with pytest.raises(ValueError, match="no partitions"): catalog_index = IndexCatalog.read_from_hipscat(small_sky_order1_id_index_dir)