Skip to content

Commit

Permalink
Merge pull request #238 from ykwang1/issue/237/select_pixel_order
Browse files Browse the repository at this point in the history
Filter based on HEALpix order
  • Loading branch information
camposandro authored Mar 18, 2024
2 parents e6ffb6b + 8cc82f6 commit 45f7752
Show file tree
Hide file tree
Showing 26 changed files with 220 additions and 8 deletions.
15 changes: 14 additions & 1 deletion src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from lsdb.catalog.margin_catalog import MarginCatalog
from lsdb.core.crossmatch.abstract_crossmatch_algorithm import AbstractCrossmatchAlgorithm
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.core.search import ConeSearch, IndexSearch, PolygonSearch
from lsdb.core.search import ConeSearch, IndexSearch, OrderSearch, PolygonSearch
from lsdb.core.search.abstract_search import AbstractSearch
from lsdb.core.search.box_search import BoxSearch
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data
Expand Down Expand Up @@ -266,6 +266,19 @@ def index_search(self, ids, catalog_index: HCIndexCatalog, fine: bool = True) ->
"""
return self._search(IndexSearch(ids, catalog_index), fine)

def order_search(self, min_order: int = 0, max_order: int | None = None) -> Catalog:
"""
Filter catalog by order of HEALPix
Args:
min_order (int): Minimum HEALPix order to select. Defaults to 0.
max_order (int): Maximum HEALPix order to select. Defaults to maximum catalog order.
Returns:
A new Catalog containing only the pixels of orders specified (inclusive)
"""
return self._search(OrderSearch(min_order, max_order), fine=False)

def _search(self, search: AbstractSearch, fine: bool = True):
"""Find rows by reusable search algorithm.
Expand Down
1 change: 1 addition & 0 deletions src/lsdb/core/search/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .cone_search import ConeSearch
from .index_search import IndexSearch
from .order_search import OrderSearch
from .polygon_search import PolygonSearch
34 changes: 34 additions & 0 deletions src/lsdb/core/search/order_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from __future__ import annotations

from typing import List

import pandas as pd
from hipscat.pixel_math import HealpixPixel

from lsdb.core.search.abstract_search import AbstractSearch


class OrderSearch(AbstractSearch):
"""Filter the catalog by HEALPix order.
Filters partitions in the catalog to those that are in the orders specified.
Does not filter points inside those partitions.
"""

def __init__(self, min_order: int = 0, max_order: int | None = None):
if max_order and min_order > max_order:
raise ValueError("The minimum order should be lower than or equal to the maximum order")
self.min_order = min_order
self.max_order = max_order

def search_partitions(self, pixels: List[HealpixPixel]) -> List[HealpixPixel]:
"""Determine the target partitions for further filtering."""
max_catalog_order = max(pixel.order for pixel in pixels)
max_order = max_catalog_order if self.max_order is None else self.max_order
if self.min_order > max_order:
raise ValueError("The minimum order is higher than the catalog's maximum order")
return [pixel for pixel in pixels if self.min_order <= pixel.order <= max_order]

def search_points(self, frame: pd.DataFrame, _) -> pd.DataFrame:
"""Determine the search results within a data frame."""
return frame
11 changes: 11 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
DATA_DIR_NAME = "data"
SMALL_SKY_DIR_NAME = "small_sky"
SMALL_SKY_LEFT_XMATCH_NAME = "small_sky_left_xmatch"
SMALL_SKY_SOURCE_DIR_NAME = "small_sky_source"
SMALL_SKY_SOURCE_MARGIN_NAME = "small_sky_source_margin"
SMALL_SKY_ORDER3_SOURCE_MARGIN_NAME = "small_sky_order3_source_margin"
SMALL_SKY_XMATCH_NAME = "small_sky_xmatch"
Expand Down Expand Up @@ -77,6 +78,16 @@ def small_sky_order1_source_dir(test_data_dir):
return os.path.join(test_data_dir, SMALL_SKY_ORDER1_SOURCE_NAME)


@pytest.fixture
def small_sky_source_dir(test_data_dir):
return os.path.join(test_data_dir, SMALL_SKY_SOURCE_DIR_NAME)


@pytest.fixture
def small_sky_source_catalog(small_sky_source_dir):
return lsdb.read_hipscat(small_sky_source_dir)


@pytest.fixture
def small_sky_order1_source_margin_dir(test_data_dir):
return os.path.join(test_data_dir, SMALL_SKY_ORDER1_SOURCE_MARGIN_NAME)
Expand Down
37 changes: 31 additions & 6 deletions tests/data/generate_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@
"tmp_path = tempfile.TemporaryDirectory()\n",
"tmp_dir = tmp_path.name\n",
"\n",
"hipscat_import_dir = \"../../../hipscat-import/tests/hipscat_import/data/\"\n",
"\n",
"client = Client(n_workers=1, threads_per_worker=1, local_directory=tmp_dir)"
]
},
Expand Down Expand Up @@ -176,9 +174,36 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### small_sky_source_margin\n",
"\n",
"This one is tricky, as it uses a catalog that we only have in the `hipscat` and `hipscat-import` test directories."
"### small_sky_source"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"args = ImportArguments(\n",
" input_file_list=[\"raw/small_sky_source/small_sky_source.csv\"],\n",
" output_path=\".\",\n",
" file_reader=\"csv\",\n",
" ra_column=\"source_ra\",\n",
" dec_column=\"source_dec\",\n",
" catalog_type=\"source\",\n",
" output_artifact_name=\"small_sky_source\",\n",
" highest_healpix_order=2,\n",
" pixel_threshold=3000,\n",
" overwrite=True,\n",
" tmp_dir=tmp_dir,\n",
")\n",
"runner.pipeline_with_client(args, client)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### small_sky_source_margin"
]
},
{
Expand All @@ -188,7 +213,7 @@
"outputs": [],
"source": [
"args = MarginCacheArguments(\n",
" input_catalog_path=Path(hipscat_import_dir) / \"small_sky_source_catalog\",\n",
" input_catalog_path=\"small_sky_source\",\n",
" output_path=\".\",\n",
" output_artifact_name=\"small_sky_source_margin\",\n",
" margin_threshold=180,\n",
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/data/small_sky_source/_common_metadata
Binary file not shown.
Binary file added tests/data/small_sky_source/_metadata
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/data/small_sky_source/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"catalog_name": "small_sky_source",
"catalog_type": "source",
"total_rows": 17161,
"epoch": "J2000",
"ra_column": "source_ra",
"dec_column": "source_dec"
}
15 changes: 15 additions & 0 deletions tests/data/small_sky_source/partition_info.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Norder,Npix,Dir
0,4,0
1,47,0
2,176,0
2,177,0
2,178,0
2,179,0
2,180,0
2,181,0
2,182,0
2,183,0
2,184,0
2,185,0
2,186,0
2,187,0
53 changes: 53 additions & 0 deletions tests/data/small_sky_source/provenance_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"catalog_name": "small_sky_source",
"catalog_type": "source",
"total_rows": 17161,
"epoch": "J2000",
"ra_column": "source_ra",
"dec_column": "source_dec",
"version": "0.2.8",
"generation_date": "2024.03.15",
"tool_args": {
"tool_name": "hipscat_import",
"version": "0.2.5.dev8+g9d0bfc4",
"runtime_args": {
"catalog_name": "small_sky_source",
"output_path": ".",
"output_artifact_name": "small_sky_source",
"tmp_dir": "/var/folders/x4/rmzh8l_s0zxc74nwr72z12340000gn/T/tmpax37bk_h",
"overwrite": true,
"dask_tmp": "",
"dask_n_workers": 1,
"dask_threads_per_worker": 1,
"catalog_path": "./small_sky_source",
"tmp_path": "/var/folders/x4/rmzh8l_s0zxc74nwr72z12340000gn/T/tmpax37bk_h/small_sky_source/intermediate",
"epoch": "J2000",
"catalog_type": "source",
"input_path": null,
"input_paths": [
"raw/small_sky_source/small_sky_source.csv"
],
"input_file_list": [
"raw/small_sky_source/small_sky_source.csv"
],
"ra_column": "source_ra",
"dec_column": "source_dec",
"use_hipscat_index": false,
"sort_columns": null,
"constant_healpix_order": -1,
"highest_healpix_order": 2,
"pixel_threshold": 3000,
"mapping_healpix_order": 2,
"debug_stats_only": false,
"file_reader_info": {
"input_reader_type": "CsvReader",
"chunksize": 500000,
"header": "infer",
"schema_file": null,
"separator": ",",
"column_names": null,
"type_map": {}
}
}
}
}
41 changes: 41 additions & 0 deletions tests/lsdb/catalog/test_order_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
import pytest

from lsdb.core.search import OrderSearch


def test_order_search_filters_correct_pixels(small_sky_source_catalog, assert_divisions_are_correct):
order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=1)
pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()]
assert all(order == 1 for order in pixel_orders)
assert_divisions_are_correct(order_search_catalog)

order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=2)
pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()]
assert all(1 <= order <= 2 for order in pixel_orders)
assert_divisions_are_correct(order_search_catalog)

order_search_catalog = small_sky_source_catalog.order_search(min_order=1)
pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()]
assert all(1 <= order <= 2 for order in pixel_orders)
assert_divisions_are_correct(order_search_catalog)

order_search_catalog = small_sky_source_catalog.order_search(max_order=1)
pixel_orders = [pixel.order for pixel in order_search_catalog.get_healpix_pixels()]
assert all(0 <= order <= 1 for order in pixel_orders)
assert_divisions_are_correct(order_search_catalog)


def test_order_search_keeps_all_points(small_sky_source_catalog):
metadata = small_sky_source_catalog.hc_structure
partition_df = small_sky_source_catalog._ddf.partitions[0].compute()
search = OrderSearch(min_order=1, max_order=2)
filtered_df = search.search_points(partition_df, metadata)
pd.testing.assert_frame_equal(partition_df, filtered_df)


def test_order_search_invalid_args(small_sky_source_catalog):
with pytest.raises(ValueError, match="lower than or equal to the maximum order"):
small_sky_source_catalog.order_search(min_order=2, max_order=1)
with pytest.raises(ValueError, match="minimum order is higher than"):
small_sky_source_catalog.order_search(min_order=3)
13 changes: 12 additions & 1 deletion tests/lsdb/loaders/hipscat/test_read_hipscat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from hipscat.catalog.index.index_catalog import IndexCatalog

import lsdb
from lsdb.core.search import ConeSearch, IndexSearch, PolygonSearch
from lsdb.core.search import ConeSearch, IndexSearch, OrderSearch, PolygonSearch
from lsdb.core.search.box_search import BoxSearch


Expand Down Expand Up @@ -133,6 +133,17 @@ def test_read_hipscat_subset_with_index_search(
assert index_search_catalog.get_healpix_pixels() == index_search_catalog_2.get_healpix_pixels()


def test_read_hipscat_subset_with_order_search(small_sky_source_catalog, small_sky_source_dir):
order_search = OrderSearch(min_order=1, max_order=2)
# Filtering using catalog's order_search
order_search_catalog = small_sky_source_catalog.order_search(min_order=1, max_order=2)
# Filtering when calling `read_hipscat`
order_search_catalog_2 = lsdb.read_hipscat(small_sky_source_dir, search_filter=order_search)
assert isinstance(order_search_catalog_2, lsdb.Catalog)
# The partitions of the catalogs are equivalent
assert order_search_catalog.get_healpix_pixels() == order_search_catalog_2.get_healpix_pixels()


def test_read_hipscat_subset_no_partitions(small_sky_order1_dir, small_sky_order1_id_index_dir):
with pytest.raises(ValueError, match="no partitions"):
catalog_index = IndexCatalog.read_from_hipscat(small_sky_order1_id_index_dir)
Expand Down

0 comments on commit 45f7752

Please sign in to comment.