Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pandas api isin #30

Merged
merged 15 commits into from
Jan 22, 2024
99 changes: 99 additions & 0 deletions docs/user-guide/advanced/Pandas_API.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3014,6 +3014,105 @@
"\n",
"Example Table."
]
},
{
"cell_type": "markdown",
"id": "7f08eb84",
"metadata": {},
"source": [
"## Comparison\n",
"\n",
"### Table.isin()\n",
"\n",
"```\n",
"Table.isin(\n",
" values\n",
")\n",
"```\n",
"\n",
"Whether each element in the Table is contained in values.\n",
"\n",
"**Parameters:**\n",
"\n",
"| Name | Type | Description | Default |\n",
"| :--------------: | :---------------------------------: | :-------------------------------------------------------------------------- | :------: |\n",
"| values | Union[List, dict, Table, KeyedTable] | The values to test for membership. The result will only be true at a location if all the labels match. If values is a dict, the keys must be the column names, which must match. If values is a Table or KeyedTable, then both the index and column labels must match. | _required_ |\n",
"\n",
"\n",
"**Returns:**\n",
"\n",
"| Type | Description |\n",
"| :-----------------------: | :---------------------------------------------- |\n",
"| Table | Boolean type Table/KeyedTable showing whether each element in the Table is contained in values.|\n",
"\n",
"**Examples:**\n",
"\n",
"Example Table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6e453c8",
"metadata": {},
"outputs": [],
"source": [
"tab = kx.Table(data={'x': list(range(3)), 'y': [\"A\", \"B\", \"C\"]})"
]
},
{
"cell_type": "markdown",
"id": "aadd23c1",
"metadata": {},
"source": [
"Find if element \"A\" or 1 is in the table:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d41d40e0",
"metadata": {},
"outputs": [],
"source": [
"tab.isin([\"A\", 1])"
]
},
{
"cell_type": "markdown",
"id": "cff856fe",
"metadata": {},
"source": [
"Find if element \"A\" is in column \"y\":"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bccf59d9",
"metadata": {},
"outputs": [],
"source": [
"tab.isin({\"y\": [\"A\"]})"
]
},
{
"cell_type": "markdown",
"id": "ed704cce",
"metadata": {},
"source": [
"Find if element \"A\" is in the first position of the \"y\" column:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41840cc0",
"metadata": {},
"outputs": [],
"source": [
"tab.isin(kx.Table(data={\"y\":[\"A\"]}))"
]
}
],
"metadata": {
Expand Down
49 changes: 49 additions & 0 deletions src/pykx/pandas_api/pandas_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,55 @@ def abs(self, numeric_only=False):
tab = _get_numeric_only_subtable(self)
return q.abs(tab)

@api_return
def isin(self, values):
    """Return a boolean table marking which elements of this table are in ``values``.

    Parameters:
        values: A list, a dict keyed by column name, a Table or a KeyedTable.

    Returns:
        A table of booleans with the same columns as this table.  When this
        table is a KeyedTable the result is re-keyed on the original key
        columns.

    Behaviour by argument kind, as implemented below:
        * KeyedTable ``values`` against an unkeyed table: every element is
          false.
        * Unkeyed Table ``values`` against a KeyedTable: every element is
          false.
        * dict ``values``: each column is tested against the entry stored
          under that column's name.
        * Table ``values``: columns are compared position by position over the
          rows both sides have; remaining rows are false.
        * anything else: each column is tested for membership in ``values``.
    """
    tab = self
    # Keyed-ness is detected from the type name rather than isinstance.
    key_table = 'KeyedTable' in str(type(tab))
    key_value = 'KeyedTable' in str(type(values))
    # Number of extra all-false rows to append; only non-zero when both
    # sides are keyed and some keys of `tab` are missing from `values`.
    n_rows = 0
    # q helper: all-false table with the same columns and row count as x.
    false_dataframe_f = q("""{u:(cols x);
                              v:(count[u],count[x])#0b;
                              flip u!v}""")
    if key_value and not key_table:
        # Keyed values can never line up with an unkeyed table.
        return false_dataframe_f(tab)
    if key_table:
        # Remember the key columns so the result can be re-keyed at the end.
        kcols = q.key(tab)
        if key_value:
            # Keep only the rows of `tab` whose key also occurs in `values`
            # and count how many rows were dropped.
            # NOTE(review): the kept rows follow the key order of `values`
            # and the dropped rows are padded at the end as false -- confirm
            # this lines up with `kcols` when the shared keys are not a
            # leading prefix of the table's keys.
            n_rows, tab = q("""{n_rows:max 0, count[x]-
                                count rows:(key y) inter key x;
                                (n_rows;
                                x each rows)}""", tab, values)
            values = q.value(values)
        else:
            tab = q.value(tab)
    # A table (type 98h) is flipped into a column dictionary and flagged, so
    # the per-column lambda below can look its columns up by name.
    dic_value, is_tab = q("""{$[98h = type x;
                                (flip x; 1b);
                                (x; 0b)]}""", values)
    if key_table and not key_value and is_tab:
        # Unkeyed table values never match the index of a keyed table.
        ftable = false_dataframe_f(tab)
    else:
        # Raw literal: the q adverb sequence `/:\:` contains a backslash that
        # is not a valid Python escape sequence.
        # Per column: pick the candidate values (dictionary entry or the
        # whole list); unless the values came from a table or the column's
        # .Q.ty character is " ", keep only candidates whose type character
        # equals the column's; then compare, using match (~) for " "/"C"
        # columns and equality (=) otherwise.
        ftable = q(r"""{ [table; values; is_tab; n_rows]
                        flip (cols table)!
                            {[col_name; tab; values; v_is_tab; n_rows]
                                col: tab col_name;
                                ltype: .Q.ty col;
                                values: $[99h~type values; values col_name; values];
                                $[v_is_tab or ltype=" "; ;
                                    values@:where (lower ltype) = .Q.t abs type each values];
                                $[0 = count values;
                                    (n_rows + count[col])#0b;
                                    $[v_is_tab;
                                        $[any ltype = (" ";"C"); ~'; =]
                                            [mlen#col;mlen#values],
                                            (n_rows + max 0,count[col]-
                                                mlen: min count[values],
                                                count[col])#0b;
                                        any $[any ltype = (" ";"C"); ~/:\:; =\:][values;col]
                                    ]
                                ]}[; table; values; is_tab; n_rows]
                            each cols table}""", tab, dic_value, is_tab, n_rows)
    return ftable.set_index(kcols) if key_table else ftable

@convert_result
def all(self, axis=0, bool_only=False, skipna=True):
res, cols = preparse_computations(self, axis, skipna, bool_only=bool_only)
Expand Down
24 changes: 24 additions & 0 deletions tests/test_pandas_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2029,3 +2029,27 @@ def test_keyed_loc_fixes(q):
mkt[['k1', 'y']]
with pytest.raises(KeyError):
mkt['k1']

marcosvm13 marked this conversation as resolved.
Show resolved Hide resolved

def test_pandas_isin(kx):
    """``isin`` on PyKX tables must agree with ``pandas.DataFrame.isin``."""
    unkeyed = kx.q("""([] k1: 0n 1. 0n 2. 0n;
                      k2: ("A";" ";"B";" ";"A");
                      k3: (`a;1.;`c;5;`d))""")
    keyed = kx.q("""([`a`b`c`d`e]
                    k1: 0n 1. 0n 2. 0n;
                    k2: ("A";" ";"B";" ";"A");
                    k3: (`a;1.;`c;5;`d))""")

    as_list = kx.q('(`a;1.;"A")')
    as_dict = {"k1": [1., 2., 3.]}
    as_table = kx.q('([] k1: 1. 2. 3.; k2: ("A";"B";"C"))')
    as_keyed = kx.q('([`a`b] k1: 1. 2.; k2: ("A";"B"))')

    # Each scenario is (table under test, argument handed to the PyKX isin,
    # argument handed to the pandas isin).  In the two keyed/unkeyed mismatch
    # scenarios pandas receives the PyKX object itself, unconverted.
    scenarios = [
        (unkeyed, as_list, as_list.py()),
        (unkeyed, as_dict, as_dict),
        (unkeyed, as_table, as_table.pd()),
        (unkeyed, as_keyed, as_keyed),
        (keyed, as_list, as_list.py()),
        (keyed, as_dict, as_dict),
        (keyed, as_keyed, as_keyed.pd()),
        (keyed, as_table, as_table),
    ]
    for table, kx_values, pd_values in scenarios:
        assert table.isin(kx_values).pd().equals(table.pd().isin(pd_values))