Merge branch 'main' into frag_feature_fix
gtca authored Oct 17, 2024
2 parents a5ddb50 + eee8df0 commit a485795
Showing 10 changed files with 203 additions and 144 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pythonpackage.yml
@@ -2,9 +2,9 @@ name: Python package

on:
push:
branches: [master]
branches: [main]
pull_request:
branches: [master]
branches: [main]
schedule:
- cron: "0 5 1,15 * *"

@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.8, 3.12]
python-version: ["3.10", "3.12"]

steps:
- uses: actions/checkout@v4
3 changes: 3 additions & 0 deletions .gitignore
@@ -6,6 +6,9 @@ __pycache__/
# C extensions
*.so

# cached data
data/

# Distribution / packaging
.Python
build/
2 changes: 1 addition & 1 deletion muon/_atac/preproc.py
@@ -116,7 +116,7 @@ def tfidf(
if log_tfidf:
tf_idf = np.log1p(tf_idf)

res = np.nan_to_num(tf_idf, 0)
res = np.nan_to_num(tf_idf, nan=0.0)
if not inplace:
return res

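The one-line change to tfidf above replaces a positional argument with an explicit keyword: in NumPy, np.nan_to_num(x, 0) binds the 0 to the copy parameter (i.e. copy=False), while the NaN fill value stays at its default of 0.0. A minimal sketch of the difference, not part of the commit:

import numpy as np

tf_idf = np.array([0.5, np.nan, 2.0])

# Old call: the positional 0 is taken as copy=False, so the input array is
# modified in place; NaN is still replaced by the default fill value 0.0.
res_old = np.nan_to_num(tf_idf.copy(), 0)

# New call: copy keeps its default (True) and the NaN fill value is explicit.
res_new = np.nan_to_num(tf_idf, nan=0.0)

assert np.array_equal(res_old, res_new)  # both give [0.5, 0.0, 2.0]
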
20 changes: 8 additions & 12 deletions muon/_core/plot.py
@@ -1,4 +1,4 @@
from typing import Union, List, Optional, Iterable, Sequence, Dict
from typing import Dict, Iterable, List, Optional, Sequence, Union
import warnings

from matplotlib.axes import Axes
@@ -43,7 +43,7 @@ def scatter(
y : Optional[str]
y coordinate
color : Optional[Union[str, Sequence[str]]], optional (default: None)
Keys for variables or annotations of observations (.obs columns),
Keys or a single key for variables or annotations of observations (.obs columns),
or a hex colour specification.
use_raw : Optional[bool], optional (default: None)
Use `.raw` attribute of the modality where a feature (from `color`) is derived from.
@@ -53,9 +53,7 @@
No layer is used by default. A single layer value will be expanded to [layer, layer, layer].
"""
if isinstance(data, AnnData):
return sc.pl.embedding(
data, x=x, y=y, color=color, use_raw=use_raw, layers=layers, **kwargs
)
return sc.pl.scatter(data, x=x, y=y, color=color, use_raw=use_raw, layers=layers, **kwargs)

if isinstance(layers, str) or layers is None:
layers = [layers, layers, layers]
@@ -72,10 +70,9 @@
if isinstance(color, str):
color_obs = _get_values(data, color, use_raw=use_raw, layer=layers[2])
color_obs = pd.DataFrame({color: color_obs})
color = [color]
else:
# scanpy#311 / scanpy#1497 has to be fixed for this to work
color_obs = _get_values(data, color, use_raw=use_raw, layer=layers[2])

color_obs.index = data.obs_names
obs = pd.concat([obs, color_obs], axis=1, ignore_index=False)

@@ -86,11 +83,10 @@
# and are now stored in .obs
retval = sc.pl.scatter(ad, x=x, y=y, color=color, **kwargs)
if color is not None:
for col in color:
try:
data.uns[f"{col}_colors"] = ad.uns[f"{col}_colors"]
except KeyError:
pass
try:
data.uns[f"{color}_colors"] = ad.uns[f"{color}_colors"]
except KeyError:
pass
return retval


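The plot.py change above makes scatter forward plain AnnData input to sc.pl.scatter (rather than sc.pl.embedding) and treat color as a single key instead of a list of keys. A usage sketch, assuming the function is exposed as mu.pl.scatter and that scanpy's bundled pbmc68k_reduced dataset is available:

import scanpy as sc
import muon as mu

adata = sc.datasets.pbmc68k_reduced()

# With a plain AnnData, the call is forwarded to sc.pl.scatter directly;
# `color` is a single .obs column or variable name, not a list.
mu.pl.scatter(adata, x="n_genes", y="percent_mito", color="bulk_labels")
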
237 changes: 114 additions & 123 deletions muon/_core/preproc.py
@@ -657,23 +657,24 @@ def intersect_obs(mdata: MuData):
return


# Utility functions: filtering observations
# Utility functions: filtering observations or variables


def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
def _filter_attr(
data: Union[AnnData, MuData],
attr: Literal["obs", "var"],
key: Union[str, Sequence[str]],
func: Optional[Callable] = None,
) -> None:
"""
Filter observations (samples or cells) in-place
using any column in .obs or in .X.
Filter observations or variables in-place.
Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
key: str or Sequence[str]
Column name in .obs/.var used for filtering; alternatively, names can be provided directly
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
@@ -694,51 +695,76 @@ def func(x):
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.obs.columns:
assert attr in ("obs", "var"), "Attribute has to be either 'obs' or 'var'."

df = getattr(data, attr)
names = getattr(data, f"{attr}_names")
other = "obs" if attr == "var" else "var"
other_names = getattr(data, f"{other}_names")
attrm = getattr(data, f"{attr}m")
attrp = getattr(data, f"{attr}p")

if isinstance(key, str):
if key in df.columns:
if func is None:
if data.obs[var].dtypes.name == "bool":
if df[key].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
obs_subset = func(data.obs[var].values)
elif var in data.var_names:
obs_subset = func(data.X[:, np.where(data.var_names == var)[0]].reshape(-1))
raise ValueError(f"Function has to be provided since {key} is not boolean")
subset = func(df[key].values)
elif key in other_names:
if attr == "obs":
subset = func(data.X[:, np.where(other_names == key)[0]].reshape(-1))
else:
subset = func(data.X[np.where(other_names == key)[0], :].reshape(-1))
else:
raise ValueError(
f"Column name from .obs or one of the var_names was expected but got {var}."
f"Column name from .{attr} or one of the {other}_names was expected but got {key}."
)
else:
if func is None:
if np.array(var).dtype == bool:
obs_subset = np.array(var)
if np.array(key).dtype == bool:
subset = np.array(key)
else:
obs_subset = data.obs_names.isin(var)
subset = names.isin(key)
else:
raise ValueError("When providing obs_names directly, func has to be None.")
raise ValueError(f"When providing {attr}_names directly, func has to be None.")

# Subset .obs
data._obs = data.obs[obs_subset]
data._n_obs = data.obs.shape[0]
if isinstance(data, AnnData):
# Collect elements to subset
# NOTE: accessing them after subsetting .obs/.var
# will fail due to _validate_value()
attrm = dict(attrm)
attrp = dict(attrp)

# Subset .obsm
for k, v in data.obsm.items():
data.obsm[k] = v[obs_subset]
# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsp
for k, v in data.obsp.items():
data.obsp[k] = v[obs_subset][:, obs_subset]
# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp/.varp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

if isinstance(data, AnnData):
# Subset .X
if data._X is not None:
try:
data._X = data.X[obs_subset, :]
if attr == "obs":
data._X = data.X[subset, :]
else:
data._X = data.X[:, subset]
except TypeError:
data._X = data.X[np.where(obs_subset)[0], :]
if attr == "obs":
data._X = data.X[np.where(subset)[0], :]
else:
data._X = data.X[:, np.where(subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847

@@ -748,29 +774,71 @@ def func(x):

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][obs_subset, :]
if attr == "obs":
data.layers[layer] = data.layers[layer][subset, :]
else:
data.layers[layer] = data.layers[layer][:, subset]

# Subset raw
if data.raw is not None:
data.raw._X = data.raw.X[obs_subset, :]
data.raw._n_obs = data.raw.X.shape[0]
# Subset raw - only when subsetting obs
if attr == "obs" and data.raw is not None:
data.raw._X = data.raw.X[subset, :]

else:
# filter_obs() for each modality
attrmap = getattr(data, f"{attr}map")

# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp/.varp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

# _filter_attr() for each modality
for m, mod in data.mod.items():
obsmap = data.obsmap[m][obs_subset]
obsidx = obsmap > 0
filter_obs(mod, mod.obs_names[obsmap[obsidx] - 1])
maporder = np.argsort(obsmap[obsidx])
map_subset = attrmap[m][subset]
attridx = map_subset > 0
orig_attr = getattr(mod, attr).copy()
mod_names = getattr(mod, f"{attr}_names")
_filter_attr(mod, attr, mod_names[map_subset[attridx] - 1])
data.mod[m]._remove_unused_categories(orig_attr, getattr(mod, attr), mod.uns)
maporder = np.argsort(map_subset[attridx])
nobsmap = np.empty(maporder.size)
nobsmap[maporder] = np.arange(1, maporder.size + 1)
obsmap[obsidx] = nobsmap
data.obsmap[m] = obsmap
map_subset[attridx] = nobsmap
getattr(data, f"{attr}map")[m] = map_subset

return


# Utility functions: filtering variables
def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
) -> None:
"""
Filter observations (samples or cells) in-place
using any column in .obs or in .X.
Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
the func argument can be omitted.
"""

_filter_attr(data, "obs", var, func)

return


def filter_var(
@@ -793,84 +861,7 @@ def filter_var(
the func argument can be omitted.
"""

if data.is_view:
raise ValueError(
"The provided adata is a view. In-place filtering does not operate on views."
)
if data.isbacked:
if isinstance(data, AnnData):
warnings.warn(
"AnnData object is backed. The requested subset of the matrix .X will be read into memory, and the object will not be backed anymore."
)
else:
warnings.warn(
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.var.columns:
if func is None:
if data.var[var].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
var_subset = func(data.var[var].values)
elif var in data.obs_names:
var_subset = func(data.X[:, np.where(data.obs_names == var)[0]].reshape(-1))
else:
raise ValueError(
f"Column name from .var or one of the obs_names was expected but got {var}."
)
else:
if func is None:
var_subset = var if np.array(var).dtype == bool else data.var_names.isin(var)
else:
raise ValueError("When providing var_names directly, func has to be None.")

# Subset .var
data._var = data.var[var_subset]
data._n_vars = data.var.shape[0]

# Subset .varm
for k, v in data.varm.items():
data.varm[k] = v[var_subset]

# Subset .varp
for k, v in data.varp.items():
data.varp[k] = v[var_subset][:, var_subset]

if isinstance(data, AnnData):
# Subset .X
try:
data._X = data.X[:, var_subset]
except TypeError:
data._X = data.X[:, np.where(var_subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847
if data.isbacked:
data.file.close()
data.filename = None

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][:, var_subset]

# NOTE: .raw is not subsetted

else:
# filter_var() for each modality
for m, mod in data.mod.items():
varmap = data.varmap[m][var_subset]
varidx = varmap > 0
filter_var(mod, mod.var_names[varmap[varidx] - 1])
maporder = np.argsort(varmap[varidx])
nvarmap = np.empty(maporder.size)
nvarmap[maporder] = np.arange(1, maporder.size + 1)
varmap[varidx] = nvarmap
data.varmap[m] = varmap
_filter_attr(data, "var", var, func)

return

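The preproc.py refactor above folds the shared logic of filter_obs and filter_var into a single private helper, _filter_attr(data, attr, key, func), so both public functions become thin wrappers and MuData modalities are filtered recursively through the same path. A usage sketch of the unchanged public API on a hypothetical AnnData, assuming the functions are exposed as mu.pp.filter_obs and mu.pp.filter_var:

import numpy as np
import muon as mu
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(100, 50)).astype(np.float32))
adata.obs["total_counts"] = adata.X.sum(axis=1)
adata.var["n_cells"] = (adata.X > 0).sum(axis=0)

# Filter observations by an .obs column with a predicate...
mu.pp.filter_obs(adata, "total_counts", lambda x: x >= 30)
# ...or by passing obs_names directly (func must then be None).
mu.pp.filter_obs(adata, adata.obs_names[:50])

# Variables go through the same helper with attr="var".
mu.pp.filter_var(adata, "n_cells", lambda x: x >= 3)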