# Preprocessing

## valency-anndata methods

### valency_anndata.preprocessing.rebuild_vote_matrix
```python
rebuild_vote_matrix(
    data: AnnData,
    trim_rule: int | float | str | datetime = 1.0,
    time_col: str = "timestamp",
    inplace: bool = True,
) -> Optional[AnnData]
```
Rebuild a vote matrix from votes stored in `adata.uns['votes']`.

- Trims votes by time according to `trim_rule`.
- Deduplicates votes by keeping the last vote per voter-comment pair.
- Returns a new AnnData with `.obs` = voters, `.var` = comments, `.X` = vote values.
- Preserves existing `uns`, `obsm`, and `layers`.
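The trim-and-deduplicate behavior can be sketched with pandas. This is an illustrative sketch, not the library's implementation; the long-format column names (`voter`, `comment`, `vote`, `timestamp`) are assumptions for the example.

```python
import pandas as pd

# Hypothetical long-format vote log, as might be stored in adata.uns['votes'].
votes = pd.DataFrame({
    "voter": ["a", "a", "b", "b"],
    "comment": ["c1", "c1", "c1", "c2"],
    "vote": [1, -1, 1, 0],
    "timestamp": pd.to_datetime(
        ["2024-01-01", "2024-01-02", "2024-01-01", "2024-01-03"]
    ),
})

# Keep only the last vote per voter-comment pair (the dedup rule above).
latest = (
    votes.sort_values("timestamp")
    .drop_duplicates(["voter", "comment"], keep="last")
)

# Pivot into a voters-by-comments matrix, analogous to the rebuilt .X;
# pairs with no vote become NaN.
matrix = latest.pivot(index="voter", columns="comment", values="vote")
```

Voter `a` revised their vote on `c1`, so only the later `-1` survives.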
Source code in src/valency_anndata/preprocessing/_rebuild_vote_matrix.py
### valency_anndata.preprocessing.calculate_qc_metrics
```python
calculate_qc_metrics(
    adata: AnnData, *, inplace: bool = False
) -> Optional[Tuple[DataFrame, DataFrame]]
```
Compute participant- and statement-level metrics using `describe_obs` and `describe_var`.
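As a sketch of the kind of per-participant and per-statement metrics involved (illustrative only; `describe_obs` and `describe_var` define the real columns), using numpy:

```python
import numpy as np

# Toy vote matrix: rows = participants, columns = statements;
# 1 = agree, -1 = disagree, 0 = pass, NaN = not seen.
X = np.array([
    [1.0, -1.0, np.nan],
    [1.0, 0.0, 1.0],
    [np.nan, 1.0, -1.0],
])

seen = ~np.isnan(X)

# Participant-level: how many statements each participant voted on.
n_votes_per_participant = seen.sum(axis=1)

# Statement-level: coverage and mean vote among those who saw it.
coverage = seen.sum(axis=0)
mean_vote = np.nanmean(X, axis=0)
```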
Source code in src/valency_anndata/preprocessing/_qc.py
### valency_anndata.preprocessing.impute
```python
impute(
    adata: AnnData,
    *,
    strategy: Literal["zero", "mean", "median"] = "mean",
    source_layer: Optional[str] = None,
    target_layer: Optional[str] = None,
    overwrite: bool = False,
) -> None
```
Impute NaN values in an AnnData matrix and store the result in a layer.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `adata` | `AnnData` | AnnData object. | required |
| `strategy` | `Literal['zero', 'mean', 'median']` | Imputation strategy. `"zero"` replaces NaNs with 0; `"mean"` uses the column-wise mean; `"median"` uses the column-wise median. | `'mean'` |
| `source_layer` | `Optional[str]` | Layer to read from. If `None`, uses `adata.X`. | `None` |
| `target_layer` | `Optional[str]` | Layer to write to. If `None`, defaults to `"X_imputed_"` followed by the strategy name. | `None` |
| `overwrite` | `bool` | Whether to overwrite an existing target layer. | `False` |
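A minimal numpy sketch of the `"mean"` strategy (illustrative only, not the library's implementation):

```python
import numpy as np

# Column-wise mean imputation: each NaN is replaced by the mean
# of the non-NaN values in its column (statement).
X = np.array([
    [1.0, np.nan],
    [3.0, 2.0],
    [np.nan, 4.0],
])

col_means = np.nanmean(X, axis=0)          # per-statement means
filled = np.where(np.isnan(X), col_means, X)
```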
Source code in src/valency_anndata/preprocessing/_impute.py
### valency_anndata.preprocessing.highly_variable_statements
```python
highly_variable_statements(
    adata: AnnData,
    *,
    layer: str | None = None,
    n_bins: int | None = 1,
    min_disp: float | None = None,
    max_disp: float | None = None,
    min_cov: int | None = 2,
    max_cov: int | None = None,
    n_top_statements: int | None = None,
    subset: bool = False,
    inplace: bool = True,
    key_added: str = "highly_variable",
    variance_mode: str = "overall",
    bin_by: str = "coverage",
)
```
Identify highly variable statements in a vote matrix (AnnData).
Analogous to `scanpy.pp.highly_variable_genes` for single-cell data, this function identifies statements with high variability across participants. It computes various dispersion metrics, normalizes them within bins, and marks statements as highly variable based on user-defined criteria.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `adata` | `AnnData` | AnnData object containing the vote matrix. | required |
| `layer` | `str \| None` | Layer to use for computation. If `None`, uses `adata.X`. | `None` |
| `n_bins` | `int \| None` | Number of bins for dispersion normalization. Values <= 1 or `None` disable binning. | `1` |
| `min_disp` | `float \| None` | Minimum normalized dispersion threshold for selecting highly variable statements. Ignored if `n_top_statements` is set. | `None` |
| `max_disp` | `float \| None` | Maximum normalized dispersion threshold for selecting highly variable statements. Ignored if `n_top_statements` is set. | `None` |
| `min_cov` | `int \| None` | Minimum coverage (number of non-NaN votes) required for a statement. | `2` |
| `max_cov` | `int \| None` | Maximum coverage threshold for selecting highly variable statements. Ignored if `n_top_statements` is set. | `None` |
| `n_top_statements` | `int \| None` | Select this many top statements by normalized dispersion. If provided, overrides the `min_disp`/`max_disp` cutoffs. | `None` |
| `subset` | `bool` | If `True`, subset the AnnData object to highly variable statements. | `False` |
| `inplace` | `bool` | If `True`, add results to `adata.var`; otherwise return them as a DataFrame. | `True` |
| `key_added` | `str` | Key under which to store the highly variable boolean mask in `adata.var`. | `'highly_variable'` |
| `variance_mode` | `str` | Which variance metric to use for computing dispersion: `"overall"` (variance of raw votes, with NaN as missing), `"valence"` (variance of engaged votes only, excluding passes/NaN), or `"engagement"` (variance of engagement: 1 if ±1, 0 if pass). | `'overall'` |
| `bin_by` | `str` | Variable to bin on for normalization: `"coverage"` (number of non-NaN votes), `"p_engaged"` (proportion of engaged votes, ±1), `"mean_valence"` (average valence of engaged votes), or `"mean_abs_valence"` (absolute value of the mean valence). | `'coverage'` |

Returns:

| Type | Description |
|---|---|
| `DataFrame \| None` | If `inplace=False`, a DataFrame with the computed metrics; otherwise `None`, with results written to `adata.var`. |
Examples:
Select top 50 most variable statements:
```python
import valency_anndata as val

adata = val.datasets.aufstehen()
val.preprocessing.highly_variable_statements(adata, n_top_statements=50)
```
Use normalized dispersion thresholds with binning:
```python
val.preprocessing.highly_variable_statements(
    adata,
    n_bins=10,
    min_disp=0.5,
    min_cov=5,
    bin_by="coverage",
)
```
Focus on valence variance instead of overall variance:
```python
val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=100,
    variance_mode="valence",
)
```
Run multiple times with different settings using key_added:
```python
# Identify top 50 statements
val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=50,
    key_added="highly_variable_top50",
)

# Also identify top 100 statements
val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=100,
    key_added="highly_variable_top100",
)

# Now you can use either mask with recipe_polis
val.tools.recipe_polis(adata, mask_var="highly_variable_top50")
```
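The core idea behind dispersion-based selection can be sketched with numpy. This assumes a plain per-statement variance and no binning, so it illustrates the idea rather than the function's exact metric:

```python
import numpy as np

# Toy vote matrix: rows = participants, columns = statements.
X = np.array([
    [1.0, 1.0, -1.0],
    [1.0, -1.0, 1.0],
    [1.0, 1.0, -1.0],
    [1.0, -1.0, 1.0],
])

disp = np.nanvar(X, axis=0)          # per-statement dispersion
k = 2
top = np.argsort(disp)[::-1][:k]     # indices of the k most variable

# Boolean mask analogous to the one stored under key_added.
highly_variable = np.zeros(X.shape[1], dtype=bool)
highly_variable[top] = True
```

The unanimous first statement has zero variance, so it is never selected; the two contested statements are.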
Source code in src/valency_anndata/preprocessing/_highly_variable_statements.py
## scanpy methods (inherited)

> **Note:** These methods are simply quick convenience wrappers around methods in scanpy, a tool for single-cell gene expression analysis. They use terms like "cells", "genes", and "counts", but you can read these as "participants", "statements", and "votes".
> See `scanpy.pp` for more methods you can experiment with via the `val.scanpy.pp` namespace.
### valency_anndata.preprocessing.neighbors
```python
neighbors(
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: int | None = None,
    *,
    use_rep: str | None = None,
    knn: bool = True,
    method: _Method = "umap",
    transformer: KnnTransformerLike | _KnownTransformer | None = None,
    metric: _Metric | _MetricFn = "euclidean",
    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
    random_state: _LegacyRandom = 0,
    key_added: str | None = None,
    copy: bool = False,
) -> AnnData | None
```
Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`.

The efficiency of the neighbor search relies heavily on UMAP :cite:p:`McInnes2018`, which also provides a method for estimating connectivities of data points, i.e. the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`, connectivities are computed according to :cite:t:`Coifman2005`, in the adaptation of :cite:t:`Haghverdi2016`.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `adata` | `AnnData` | Annotated data matrix. | required |
| `n_neighbors` | `int` | The size of the local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. Ignored if `transformer` is an instance. | `15` |
| `knn` | `bool` | If `True`, use a hard threshold to restrict the number of neighbors to `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian kernel to assign low weights to neighbors more distant than the `n_neighbors` nearest neighbor. | `True` |
| `method` | `_Method` | Use `'umap'` :cite:p:`McInnes2018` or `'gauss'` (Gauss kernel following :cite:t:`Coifman2005` with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities. | `'umap'` |
| `transformer` | `KnnTransformerLike \| _KnownTransformer \| None` | Approximate kNN search implementation following the API of :class:`~sklearn.neighbors.KNeighborsTransformer`. | `None` |
| `metric` | `_Metric \| _MetricFn` | A known metric's name or a callable that returns a distance. Ignored if `transformer` is an instance. | `'euclidean'` |
| `metric_kwds` | `Mapping[str, Any]` | Options for the metric. Ignored if `transformer` is an instance. | `MappingProxyType({})` |
| `random_state` | `_LegacyRandom` | A numpy random seed. Ignored if `transformer` is an instance. | `0` |
| `key_added` | `str \| None` | If not specified, the neighbors data is stored in `.uns['neighbors']`, with distances and connectivities in `.obsp['distances']` and `.obsp['connectivities']`. If specified, the neighbors data is added to `.uns[key_added]`, distances to `.obsp[key_added+'_distances']`, and connectivities to `.obsp[key_added+'_connectivities']`. | `None` |
| `copy` | `bool` | Return a copy instead of writing to `adata`. | `False` |
Returns:

Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields:

- `adata.obsp['distances' | key_added+'_distances']` (:class:`scipy.sparse.csr_matrix`, dtype `float`): Distance matrix of the nearest neighbors search. Each row (cell) holds the distances to its nearest neighbors.
- `adata.obsp['connectivities' | key_added+'_connectivities']` (:class:`scipy.sparse.csr_matrix`, dtype `float`): Weighted adjacency matrix of the neighborhood graph of data points. Weights should be interpreted as connectivities.
- `adata.uns['neighbors' | key_added]` (:class:`dict`): Neighbors parameters.
Examples:
```python
>>> import scanpy as sc
>>> adata = sc.datasets.pbmc68k_reduced()
>>> # Basic usage
>>> sc.pp.neighbors(adata, 20, metric="cosine")
>>> # Provide your own transformer for more control and flexibility
>>> from sklearn.neighbors import KNeighborsTransformer
>>> transformer = KNeighborsTransformer(
...     n_neighbors=10, metric="manhattan", algorithm="kd_tree"
... )
>>> sc.pp.neighbors(adata, transformer=transformer)
>>> # now you can e.g. access the index: `transformer._tree`
```
See Also
:doc:/how-to/knn-transformers
Source code in .venv/lib/python3.10/site-packages/scanpy/neighbors/__init__.py