Cross-matching fails because left partition is empty #310

hombit · 2024-05-07T23:47:17Z

Bug report

Reproducible example:

import lsdb

ztf_path = 'https://epyc.astro.washington.edu/~lincc-frameworks/hipscat_surveys/ztf/ztf_dr14/'
gaia_path = 'https://epyc.astro.washington.edu/~lincc-frameworks/hipscat_surveys/gaia_dr3/gaia/'
gaia_margin_path = 'https://epyc.astro.washington.edu/~lincc-frameworks/hipscat_surveys/gaia_dr3/gaia_10arcs/'

cone_search = dict(ra=283.44639, dec=33.06623, radius_arcsec=600)

ztf_catalog = lsdb.read_hipscat(ztf_path, columns=["ra", "dec"]).cone_search(**cone_search)
gaia_margin_catalog = lsdb.read_hipscat(gaia_margin_path, columns=["ra", "dec"])
gaia_catalog = lsdb.read_hipscat(
    gaia_path,
    columns=["ra", "dec"],
    margin_cache=gaia_margin_catalog,
).cone_search(**cone_search)

lsdb_result = ztf_catalog.crossmatch(gaia_catalog).compute()

Traceback

/ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/lsdb/dask/merge_catalog_functions.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.
  joined_df = pd.concat([partition, margin_filtered]) if margin_filtered is not None else partition

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 21
     14 gaia_margin_catalog = lsdb.read_hipscat(gaia_margin_path, columns=["ra", "dec"])
     15 gaia_catalog = lsdb.read_hipscat(
     16     gaia_path,
     17     columns=["ra", "dec"],
     18     margin_cache=gaia_margin_catalog,
     19 ).cone_search(**cone_search)
---> 21 lsdb_result = ztf_catalog.crossmatch(gaia_catalog).compute()

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/lsdb/catalog/dataset/dataset.py:39, in Dataset.compute(self)
     37 def compute(self) -> pd.DataFrame:
     38     """Compute dask distributed dataframe to pandas dataframe"""
---> 39     return self._ddf.compute()

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/dask/base.py:375, in DaskMethodsMixin.compute(self, **kwargs)
    351 def compute(self, **kwargs):
    352     """Compute this dask collection
    353 
    354     This turns a lazy Dask collection into its in-memory equivalent.
   (...)
    373     dask.compute
    374     """
--> 375     (result,) = compute(self, traverse=False, **kwargs)
    376     return result

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/dask/base.py:661, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    658     postcomputes.append(x.__dask_postcompute__())
    660 with shorten_traceback():
--> 661     results = schedule(dsk, keys, **kwargs)
    663 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/lsdb/dask/crossmatch_catalog_data.py:57, in perform_crossmatch(left_df, right_df, right_margin_df, left_pix, right_pix, right_margin_pix, left_hc_structure, right_hc_structure, right_margin_hc_structure, algorithm, suffixes, right_columns, **kwargs)
     53     left_df = filter_by_hipscat_index_to_pixel(left_df, right_pix.order, right_pix.pixel)
     55 right_joined_df = concat_partition_and_margin(right_df, right_margin_df, right_columns)
---> 57 return algorithm(
     58     left_df,
     59     right_joined_df,
     60     left_pix.order,
     61     left_pix.pixel,
     62     right_pix.order,
     63     right_pix.pixel,
     64     left_hc_structure,
     65     right_hc_structure,
     66     right_margin_hc_structure,
     67     suffixes,
     68 ).crossmatch(**kwargs)

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/lsdb/core/crossmatch/kdtree_match.py:69, in KdTreeCrossmatch.crossmatch(self, n_neighbors, radius_arcsec, **kwargs)
     67 left_xyz, right_xyz = self._get_point_coordinates()
     68 # get matching indices for cross-matched rows
---> 69 chord_distances, left_idx, right_idx = _find_crossmatch_indices(
     70     left_xyz, right_xyz, n_neighbors=n_neighbors, max_distance=max_d_chord
     71 )
     72 arc_distances = np.degrees(2.0 * np.arcsin(0.5 * chord_distances)) * 3600
     73 return self._create_crossmatch_df(left_idx, right_idx, arc_distances)

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/lsdb/core/crossmatch/kdtree_utils.py:27, in _find_crossmatch_indices(left_xyz, right_xyz, n_neighbors, max_distance, min_distance)
     20 tree = KDTree(right_xyz, compact_nodes=True, balanced_tree=True, copy_data=False)
     22 # find the indices for the nearest neighbors
     23 # this is the cross-match calculation
     24 distances, right_index = (
     25     _query_min_max_neighbors(tree, left_xyz, right_xyz, n_neighbors, min_distance, max_distance)
     26     if min_distance > 0
---> 27     else tree.query(left_xyz, k=n_neighbors, distance_upper_bound=max_distance)
     28 )
     30 # index of the corresponding row in the left table [[0, 0, 0], [1, 1, 1], [2, 2, 2], ...]
     31 left_index = np.arange(left_xyz.shape[0])

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/scipy/spatial/_kdtree.py:475, in KDTree.query(self, x, k, eps, p, distance_upper_bound, workers)
    472 if k is None:
    473     raise ValueError("k must be an integer or a sequence of integers")
--> 475 d, i = super().query(x, k, eps, p, distance_upper_bound, workers)
    476 if isinstance(i, int):
    477     i = np.intp(i)

File _ckdtree.pyx:818, in scipy.spatial._ckdtree.cKDTree.query()

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/numpy/core/fromnumeric.py:2810, in max(a, axis, out, keepdims, initial, where)
   2692 @array_function_dispatch(_max_dispatcher)
   2693 @set_module('numpy')
   2694 def max(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
   2695          where=np._NoValue):
   2696     """
   2697     Return the maximum of an array or maximum along an axis.
   2698 
   (...)
   2808     5
   2809     """
-> 2810     return _wrapreduction(a, np.maximum, 'max', axis, None, out,
   2811                           keepdims=keepdims, initial=initial, where=where)

File /ocean/projects/phy210048p/malanche/lsdb-xmatch-perf/cenv/lib/python3.12/site-packages/numpy/core/fromnumeric.py:88, in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     85         else:
     86             return reduction(axis=axis, out=out, **passkwargs)
---> 88 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

ValueError: zero-size array to reduction operation maximum which has no identity

Should we have a short path when one of the partitions is empty? On a step before we call cross-matching algorithm

Before submitting
Please check the following:

I have described the situation in which the bug arose, including what code was executed, information about my environment, and any applicable data others will need to reproduce the problem.
I have included available evidence of the unexpected behavior (including error messages, screenshots, and/or plots) as well as a descriprion of what I expected instead.
If I have a solution in mind, I have provided an explanation and/or pseudocode and/or task list.

The text was updated successfully, but these errors were encountered:

nevencaplar · 2024-05-08T03:01:57Z

Should we have a short path when one of the partitions is empty?
You mean, should we have a shortcut? Like, check if it is empty and return empty without executing? Is that what you meant?

hombit · 2024-05-08T11:59:16Z

@nevencaplar Yes, I'd do it in this function

lsdb/src/lsdb/dask/crossmatch_catalog_data.py

Lines 31 to 68 in 2b102a3

    
           @dask.delayed 
        
           def perform_crossmatch( 
        
               left_df, 
        
               right_df, 
        
               right_margin_df, 
        
               left_pix, 
        
               right_pix, 
        
               right_margin_pix, 
        
               left_hc_structure, 
        
               right_hc_structure, 
        
               right_margin_hc_structure, 
        
               algorithm, 
        
               suffixes, 
        
               right_columns, 
        
               **kwargs, 
        
           ): 
        
               """Performs a crossmatch on data from a HEALPix pixel in each catalog 
        
               Filters the left catalog before performing the cross-match to stop duplicate points appearing in 
        
               the result. 
        
               """ 
        
               if right_pix.order > left_pix.order: 
        
                   left_df = filter_by_hipscat_index_to_pixel(left_df, right_pix.order, right_pix.pixel) 
        
               right_joined_df = concat_partition_and_margin(right_df, right_margin_df, right_columns) 
        
               return algorithm( 
        
                   left_df, 
        
                   right_joined_df, 
        
                   left_pix.order, 
        
                   left_pix.pixel, 
        
                   right_pix.order, 
        
                   right_pix.pixel, 
        
                   left_hc_structure, 
        
                   right_hc_structure, 
        
                   right_margin_hc_structure, 
        
                   suffixes, 
        
               ).crossmatch(**kwargs)

hombit added the bug Something isn't working label May 7, 2024

delucchi-cmu added this to HATS / LSDB May 7, 2024

camposandro mentioned this issue May 10, 2024

Allow empty left partitions on crossmatch #315

Merged

camposandro closed this as completed in #315 May 13, 2024

github-project-automation bot moved this to Done in HATS / LSDB May 13, 2024

hombit mentioned this issue Jun 6, 2024

Cross-matching fails with "zero-size array to reduction operation maximum which has no identity" #348

Closed

3 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Cross-matching fails because left partition is empty #310

Cross-matching fails because left partition is empty #310

hombit commented May 7, 2024 •

edited

Loading

nevencaplar commented May 8, 2024

hombit commented May 8, 2024

Cross-matching fails because left partition is empty #310

Cross-matching fails because left partition is empty #310

Comments

hombit commented May 7, 2024 • edited Loading

nevencaplar commented May 8, 2024

hombit commented May 8, 2024

hombit commented May 7, 2024 •

edited

Loading