diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index eefe08859c1e9..471a0b0a1f984 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -23,6 +23,7 @@
     iNaT,
     lib,
 )
+from pandas._libs.missing import NA
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -544,10 +545,13 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     # Ensure np.isin doesn't get object types or it *may* throw an exception
     # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
     # isin is faster for small sizes
     if (
         len(comps_array) > _MINIMUM_COMP_ARR_LEN
         and len(values) <= 26
         and comps_array.dtype != object
+        # GH#60678: the np.isin fast path raises when values contains pd.NA.
+        # Checked last so that only short (<= 26 element) values are scanned.
+        and not any(v is NA for v in values)
     ):
         # If the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan
diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
index e997ae32cf2e2..5b1ff210d134c 100644
--- a/pandas/tests/series/methods/test_isin.py
+++ b/pandas/tests/series/methods/test_isin.py
@@ -211,6 +211,28 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "dtype, data, values, expected",
+    [
+        ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]),
+        ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]),
+        ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]),
+    ],
+)
+def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch):
+    # https://github.com/pandas-dev/pandas/issues/60678
+    # Lower _MINIMUM_COMP_ARR_LEN so these 3-element Series count as "large"
+    # and isin is exercised with pd.NA among the values
+    min_isin_comp = 2
+    ser = Series(data, dtype=dtype)
+    expected = Series(expected, dtype="boolean")
+
+    with monkeypatch.context() as m:
+        m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
+        result = ser.isin(values)
+    tm.assert_series_equal(result, expected)
+
+
 def test_isin_complex_numbers():
     # GH 17927
     array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j]