
Preprocessing

valency-anndata methods

valency_anndata.preprocessing.rebuild_vote_matrix

rebuild_vote_matrix(
    data: AnnData,
    trim_rule: int | float | str | datetime = 1.0,
    time_col: str = "timestamp",
    inplace: bool = True,
) -> Optional[AnnData]

Rebuild a vote matrix from votes stored in adata.uns['votes'].

  • Trims votes by time according to trim_rule.
  • Deduplicates votes by keeping the last vote per voter-comment pair.
  • Returns a new AnnData with .obs = voters, .var = comments, .X = vote values.
  • Preserves existing uns, obsm, and layers.
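A minimal usage sketch (the example dataset is the one used elsewhere in these docs; the exact interpretation of trim_rule is delegated to the internal trim_by_time helper and is not documented here):

import valency_anndata as val

adata = val.datasets.aufstehen()

# Rebuild the vote matrix in place from adata.uns['votes'],
# trimming with the default rule and keeping each voter's last vote per comment
val.preprocessing.rebuild_vote_matrix(adata)

# Or leave the original object untouched and work on the returned copy
rebuilt = val.preprocessing.rebuild_vote_matrix(adata, inplace=False)
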
Source code in src/valency_anndata/preprocessing/_rebuild_vote_matrix.py
def rebuild_vote_matrix(
    data: AnnData,
    trim_rule: int | float | str | datetime = 1.0,
    time_col: str = "timestamp",
    inplace: bool = True,
) -> Optional[AnnData]:
    """
    Rebuild a vote matrix from votes stored in `adata.uns['votes']`.

    - Trims votes by time according to `trim_rule`.
    - Deduplicates votes by keeping the last vote per voter-comment pair.
    - Returns a new AnnData with `.obs` = voters, `.var` = comments, `.X` = vote values.
    - Preserves existing `uns`, `obsm`, and `layers`.
    """

    # Load the votes table stored in .uns['votes']
    votes_df = data.uns.get("votes")
    if votes_df is None:
        raise KeyError("`uns['votes']` not found in AnnData")
    votes_df = votes_df.copy()

    # Trim by time
    votes_df = votes_df.pipe(trim_by_time, rule=trim_rule, col=time_col)

    # Sort & deduplicate
    votes_df = votes_df.sort_values(time_col)
    votes_df = votes_df.drop_duplicates(
        subset=["voter-id", "comment-id"], keep="last"
    )

    # Pivot into voter × comment
    vote_matrix_df = votes_df.pivot(
        index="voter-id", columns="comment-id", values="vote"
    )

    # Build a new AnnData
    new_adata = AnnData(
        X=vote_matrix_df.to_numpy(dtype=float),
        obs=data.obs.reindex(vote_matrix_df.index.astype(str)),
        var=data.var.reindex(vote_matrix_df.columns.astype(str))
    )

    # Copy over other metadata
    new_adata.uns.update(data.uns)
    new_adata.obsm.update(data.obsm)
    new_adata.layers.update(data.layers)

    if inplace:
        # Replace all internal state of the original AnnData
        data._init_as_actual(new_adata)
        return None
    else:
        return new_adata

valency_anndata.preprocessing.calculate_qc_metrics

calculate_qc_metrics(
    adata: AnnData, *, inplace: bool = False
) -> Optional[Tuple[DataFrame, DataFrame]]

Compute participant- and statement-level metrics using describe_obs and describe_var.
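
A quick usage sketch (the metric column names are defined by describe_obs and describe_var and are not listed here):

import valency_anndata as val

adata = val.datasets.aufstehen()

# Get the metrics as two DataFrames (participants, statements)
obs_metrics, var_metrics = val.preprocessing.calculate_qc_metrics(adata)

# Or write them straight into adata.obs and adata.var
val.preprocessing.calculate_qc_metrics(adata, inplace=True)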

Source code in src/valency_anndata/preprocessing/_qc.py
def calculate_qc_metrics(
    adata: ad.AnnData,
    *,
    inplace: bool = False,
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
    """Compute participant- and statement-level metrics using describe_obs and describe_var."""
    X = adata.X
    if X is None:
        raise ValueError("adata.X is None")
    obs_metrics = describe_obs(X, obs_names=adata.obs_names)
    var_metrics = describe_var(X, var_names=adata.var_names)

    if inplace:
        adata.obs[obs_metrics.columns] = obs_metrics
        adata.var[var_metrics.columns] = var_metrics
        return None

    return obs_metrics, var_metrics

valency_anndata.preprocessing.impute

impute(
    adata: AnnData,
    *,
    strategy: Literal["zero", "mean", "median"] = "mean",
    source_layer: Optional[str] = None,
    target_layer: Optional[str] = None,
    overwrite: bool = False,
) -> None

Impute NaN values in an AnnData matrix and store the result in a layer.

Parameters:

adata : AnnData (required)
    AnnData object.
strategy : Literal['zero', 'mean', 'median'], default 'mean'
    Imputation strategy. Currently supports:
    - "zero": replace NaNs with 0
    - "mean": column-wise mean
    - "median": column-wise median
source_layer : Optional[str], default None
    Layer to read from. If None, uses adata.X.
target_layer : Optional[str], default None
    Layer to write to. Defaults to "X_imputed_<strategy>".
overwrite : bool, default False
    Whether to overwrite an existing target layer.
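
A usage sketch (the non-default target layer name below is illustrative, not part of the library):

import valency_anndata as val

adata = val.datasets.aufstehen()

# Column-wise mean imputation, stored in adata.layers["X_imputed_mean"]
val.preprocessing.impute(adata, strategy="mean")

# Re-run with a different strategy and an explicit (hypothetical) target layer
val.preprocessing.impute(adata, strategy="zero", target_layer="votes_zero_filled", overwrite=True)
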
Source code in src/valency_anndata/preprocessing/_impute.py
def impute(
    adata: AnnData,
    *,
    strategy: Literal["zero", "mean", "median"] = "mean",
    source_layer: Optional[str] = None,
    target_layer: Optional[str] = None,
    overwrite: bool = False,
) -> None:
    """
    Impute NaN values in an AnnData matrix and store the result in a layer.

    Parameters
    ----------
    adata
        AnnData object.
    strategy
        Imputation strategy. Currently supports:
        - "zero": replace NaNs with 0
        - "mean": column-wise mean
        - "median": column-wise median
    source_layer
        Layer to read from. If None, uses adata.X.
    target_layer
        Layer to write to. Defaults to "X_imputed_<strategy>".
    overwrite
        Whether to overwrite an existing target layer.
    """
    if target_layer is None:
        target_layer = f"X_imputed_{strategy}"

    if not overwrite and target_layer in adata.layers:
        return

    # Select source matrix
    if source_layer is None:
        X = adata.X
    else:
        X = adata.layers[source_layer]

    if X is None:
        raise ValueError("No source matrix available for imputation.")

    # Work on a copy
    X = np.asarray(X, dtype=float).copy()

    nan_mask = np.isnan(X)
    if not nan_mask.any():
        adata.layers[target_layer] = X
        return

    if strategy == "zero":
        X[nan_mask] = 0.0

    elif strategy in {"mean", "median"}:
        reducer = np.nanmean if strategy == "mean" else np.nanmedian
        col_stats = reducer(X, axis=0)

        # Replace NaNs column-wise
        rows, cols = np.where(nan_mask)
        X[rows, cols] = col_stats[cols]

    else:
        raise ValueError(f"Unknown imputation strategy: {strategy!r}")

    adata.layers[target_layer] = X

valency_anndata.preprocessing.highly_variable_statements

highly_variable_statements(
    adata: AnnData,
    *,
    layer: str | None = None,
    n_bins: int | None = 1,
    min_disp: float | None = None,
    max_disp: float | None = None,
    min_cov: int | None = 2,
    max_cov: int | None = None,
    n_top_statements: int | None = None,
    subset: bool = False,
    inplace: bool = True,
    key_added: str = "highly_variable",
    variance_mode: str = "overall",
    bin_by: str = "coverage",
)

Identify highly variable statements in a vote matrix (AnnData).

Analogous to scanpy.pp.highly_variable_genes for single-cell data, this function identifies statements with high variability across participants. The function computes various dispersion metrics, normalizes them within bins, and marks statements as highly variable based on user-defined criteria.

Parameters:

adata : AnnData (required)
    AnnData object containing vote matrix.
layer : str | None, default None
    Layer to use for computation. If None, uses adata.X.
n_bins : int | None, default 1
    Number of bins for dispersion normalization. Values <= 1 or None disable binning. Default is 1 (no binning).
min_disp : float | None, default None
    Minimum normalized dispersion threshold for selecting highly variable statements. Only used if n_top_statements is None.
max_disp : float | None, default None
    Maximum normalized dispersion threshold for selecting highly variable statements. Only used if n_top_statements is None.
min_cov : int | None, default 2
    Minimum coverage (number of non-NaN votes) required for a statement.
max_cov : int | None, default None
    Maximum coverage threshold for selecting highly variable statements. Only used if n_top_statements is None.
n_top_statements : int | None, default None
    Select this many top statements by normalized dispersion. If provided, overrides the min_disp, max_disp, and max_cov filters.
subset : bool, default False
    If True, subset the AnnData object to highly variable statements.
inplace : bool, default True
    If True, add results to adata.var and adata.uns[key_added]. If False, return results as a DataFrame.
key_added : str, default 'highly_variable'
    Key under which to store the highly variable boolean mask in adata.var and metadata in adata.uns.
variance_mode : str, default 'overall'
    Which variance metric to use for computing dispersion:
    - "overall": variance of raw votes (including NaN as missing)
    - "valence": variance of engaged votes only (excluding passes/NaN)
    - "engagement": variance of engagement (1 if ±1, 0 if pass)
bin_by : str, default 'coverage'
    Variable to bin on for normalization. Options:
    - "coverage": number of non-NaN votes
    - "p_engaged": proportion of engaged votes (±1)
    - "mean_valence": average valence of engaged votes
    - "mean_abs_valence": absolute value of mean valence

Returns:

DataFrame | None
    If inplace=False, returns a DataFrame with columns coverage, mean_valence, mean_abs_valence, p_engaged, bin_idx, var_overall, var_valence, var_engagement, dispersions, dispersions_norm, and a boolean column named by key_added.
    If inplace=True, modifies adata in place and returns None.

Examples:

Select top 50 most variable statements:

import valency_anndata as val
adata = val.datasets.aufstehen()
val.preprocessing.highly_variable_statements(adata, n_top_statements=50)

Use normalized dispersion thresholds with binning:

val.preprocessing.highly_variable_statements(
    adata,
    n_bins=10,
    min_disp=0.5,
    min_cov=5,
    bin_by="coverage"
)

Focus on valence variance instead of overall variance:

val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=100,
    variance_mode="valence"
)

Run multiple times with different settings using key_added:

# Identify top 50 statements
val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=50,
    key_added="highly_variable_top50"
)
# Also identify top 100 statements
val.preprocessing.highly_variable_statements(
    adata,
    n_top_statements=100,
    key_added="highly_variable_top100"
)
# Now you can use either mask with recipe_polis
val.tools.recipe_polis(adata, mask_var="highly_variable_top50")
Source code in src/valency_anndata/preprocessing/_highly_variable_statements.py
def highly_variable_statements(
    adata: AnnData,
    *,
    layer: str | None = None,
    n_bins: int | None = 1,
    min_disp: float | None = None,
    max_disp: float | None = None,
    min_cov: int | None = 2,
    max_cov: int | None = None,
    n_top_statements: int | None = None,
    subset: bool = False,
    inplace: bool = True,
    key_added: str = "highly_variable",
    variance_mode: str = "overall",  # "overall", "valence", "engagement"
    bin_by: str = "coverage",        # "coverage", "p_engaged", "mean_valence", "mean_abs_valence"
):
    """
    Identify highly variable statements in a vote matrix (AnnData).

    Analogous to [scanpy.pp.highly_variable_genes][] for single-cell data, this function
    identifies statements with high variability across participants. The function computes
    various dispersion metrics, normalizes them within bins, and marks statements as highly
    variable based on user-defined criteria.

    Parameters
    ----------
    adata
        AnnData object containing vote matrix.
    layer
        Layer to use for computation. If None, uses `adata.X`.
    n_bins
        Number of bins for dispersion normalization. Values <=1 or None disable binning.
        Default is 1 (no binning).
    min_disp
        Minimum normalized dispersion threshold for selecting highly variable statements.
        Only used if `n_top_statements` is None.
    max_disp
        Maximum normalized dispersion threshold for selecting highly variable statements.
        Only used if `n_top_statements` is None.
    min_cov
        Minimum coverage (number of non-NaN votes) required for a statement.
        Default is 2.
    max_cov
        Maximum coverage threshold for selecting highly variable statements.
        Only used if `n_top_statements` is None.
    n_top_statements
        Select this many top statements by normalized dispersion. If provided, overrides
        `min_disp`, `max_disp`, and `max_cov` filters.
    subset
        If True, subset the AnnData object to highly variable statements.
    inplace
        If True, add results to `adata.var` and `adata.uns[key_added]`.
        If False, return results as DataFrame.
    key_added
        Key under which to store the highly variable boolean mask in `adata.var`
        and metadata in `adata.uns`. Default is "highly_variable".
    variance_mode
        Which variance metric to use for computing dispersion:
        - "overall": variance of raw votes (including NaN as missing)
        - "valence": variance of engaged votes only (excluding passes/NaN)
        - "engagement": variance of engagement (1 if ±1, 0 if pass)
        Default is "overall".
    bin_by
        Variable to bin on for normalization. Options:
        - "coverage": number of non-NaN votes
        - "p_engaged": proportion of engaged votes (±1)
        - "mean_valence": average valence of engaged votes
        - "mean_abs_valence": absolute value of mean valence
        Default is "coverage".

    Returns
    -------
    pd.DataFrame | None
        If `inplace=False`, returns a DataFrame with columns:
        `coverage`, `mean_valence`, `mean_abs_valence`, `p_engaged`,
        `bin_idx`, `var_overall`, `var_valence`, `var_engagement`,
        `dispersions`, `dispersions_norm`, and a boolean column named by `key_added`.
        If `inplace=True`, modifies `adata` in place and returns None.

    Examples
    --------
    Select top 50 most variable statements:

    ```py
    import valency_anndata as val
    adata = val.datasets.aufstehen()
    val.preprocessing.highly_variable_statements(adata, n_top_statements=50)
    ```

    Use normalized dispersion thresholds with binning:

    ```py
    val.preprocessing.highly_variable_statements(
        adata,
        n_bins=10,
        min_disp=0.5,
        min_cov=5,
        bin_by="coverage"
    )
    ```

    Focus on valence variance instead of overall variance:

    ```py
    val.preprocessing.highly_variable_statements(
        adata,
        n_top_statements=100,
        variance_mode="valence"
    )
    ```

    Run multiple times with different settings using `key_added`:

    ```py
    # Identify top 50 statements
    val.preprocessing.highly_variable_statements(
        adata,
        n_top_statements=50,
        key_added="highly_variable_top50"
    )
    # Also identify top 100 statements
    val.preprocessing.highly_variable_statements(
        adata,
        n_top_statements=100,
        key_added="highly_variable_top100"
    )
    # Now you can use either mask with recipe_polis
    val.tools.recipe_polis(adata, mask_var="highly_variable_top50")
    ```
    """

    # ---- 0. select matrix ---------------------------------------------
    X = adata.layers[layer] if layer is not None else adata.X
    X = np.asarray(X)
    n_statements = X.shape[1]

    # ---- 1. coverage and engagement -----------------------------------
    coverage = np.sum(~np.isnan(X), axis=0)
    engaged = (~np.isnan(X)) & (X != 0)
    p_engaged = engaged.sum(axis=0) / np.maximum(coverage, 1)

    # average valence for engaged votes only
    mean_valence = np.full(X.shape[1], np.nan)
    for j in range(X.shape[1]):
        vals = X[engaged[:, j], j]
        if vals.size > 0:
            mean_valence[j] = np.mean(vals)

    # optional: absolute version
    mean_abs_valence = np.abs(mean_valence)

    # ---- 2. compute variances -----------------------------------------
    # overall variance
    var_overall = np.nanvar(X, axis=0, ddof=1)

    # engagement variance: 1 if engaged, 0 if pass
    X_eng = np.where(np.isnan(X), np.nan, np.where(X != 0, 1.0, 0.0))
    var_engagement = np.nanvar(X_eng, axis=0, ddof=1)

    # valence variance: only consider engaged votes
    X_val = np.where(X == 0, np.nan, X)
    var_valence = np.nanvar(X_val, axis=0, ddof=1)

    # choose variance based on mode
    if variance_mode == "overall":
        dispersions = var_overall
    elif variance_mode == "valence":
        dispersions = var_valence
    elif variance_mode == "engagement":
        dispersions = var_engagement
    else:
        raise ValueError(f"Unknown variance_mode: {variance_mode}")

    valid = coverage >= 2  # need at least 2 votes to compute a variance with ddof=1

    # ---- 3. binning ---------------------------------------------------
    if n_bins is None or n_bins <= 1:
        bin_idx = np.zeros(n_statements, dtype=int)
    else:
        if bin_by == "coverage":
            bin_idx = pd.cut(coverage, bins=n_bins, labels=False)
        elif bin_by == "mean_valence":
            bin_idx = pd.cut(mean_valence, bins=n_bins, labels=False)
        elif bin_by == "mean_abs_valence":
            bin_idx = pd.cut(mean_abs_valence, bins=n_bins, labels=False)
        elif bin_by == "p_engaged":
            bin_idx = pd.cut(p_engaged, bins=n_bins, labels=False)
        else:
            raise ValueError(f"Unknown bin_by: {bin_by}")

    # ---- 4. normalize within bins ------------------------------------
    dispersions_norm = np.full(n_statements, np.nan)
    for b in np.unique(bin_idx[valid]):
        mask = (bin_idx == b) & valid
        if mask.sum() < 2:
            continue
        d = dispersions[mask]
        mu = d.mean()
        sd = d.std()
        if sd == 0 or not np.isfinite(sd):
            continue
        dispersions_norm[mask] = (d - mu) / sd

    # ---- 5. stats table ----------------------------------------------
    stats = pd.DataFrame(
        {
            "coverage": coverage,
            "mean_valence": mean_valence,
            "mean_abs_valence": mean_abs_valence,
            "p_engaged": p_engaged,
            "bin_idx": bin_idx,
            "var_overall": var_overall,
            "var_valence": var_valence,
            "var_engagement": var_engagement,
            "dispersions": dispersions,
            "dispersions_norm": dispersions_norm,
        },
        index=adata.var_names,
    )

    # ---- 6. selection -------------------------------------------------
    if n_top_statements is not None:
        # rank by normalized dispersion first, then raw
        order = np.lexsort(
            (-stats["dispersions"].values, -stats["dispersions_norm"].values)
        )
        hv = np.zeros(n_statements, dtype=bool)
        hv[order[:n_top_statements]] = True
    else:
        hv = valid.copy()
        if min_cov is not None:
            hv &= stats["coverage"].values >= min_cov
        if max_cov is not None:
            hv &= stats["coverage"].values <= max_cov
        if min_disp is not None:
            hv &= stats["dispersions_norm"].values >= min_disp
        if max_disp is not None:
            hv &= stats["dispersions_norm"].values <= max_disp

    stats[key_added] = hv

    # ---- 7. output ----------------------------------------------------
    if not inplace:
        return stats

    for k in stats.columns:
        adata.var[k] = stats[k].values

    # store metadata in .uns
    adata.uns[key_added] = {
        "variance_mode": variance_mode,
        "bin_by": bin_by,
        "n_bins": n_bins,
        "min_disp": min_disp,
        "max_disp": max_disp,
        "min_cov": min_cov,
        "max_cov": max_cov,
        "n_top_statements": n_top_statements,
        "subset": subset,
        "valid": valid,
        "statement_names": adata.var_names.tolist(),
    }

    if subset:
        adata._inplace_subset_var(hv)

scanpy methods (inherited)

Note

These methods are thin convenience wrappers around methods in scanpy, a toolkit for single-cell gene expression analysis. They use terms like "cells", "genes" and "counts", which you can read here as "participants", "statements" and "votes".

See scanpy.pp for more methods you can experiment with via the val.scanpy.pp namespace.
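
For instance, assuming the val.scanpy.pp namespace simply re-exports scanpy.pp as the note above describes, a sketch might look like:

import valency_anndata as val

adata = val.datasets.aufstehen()

# scanpy's filter_genes, reached through the wrapper namespace:
# keeps "genes" (statements) with at least 5 non-zero (engaged) votes
val.scanpy.pp.filter_genes(adata, min_cells=5)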

valency_anndata.preprocessing.neighbors

neighbors(
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: int | None = None,
    *,
    use_rep: str | None = None,
    knn: bool = True,
    method: _Method = "umap",
    transformer: KnnTransformerLike
    | _KnownTransformer
    | None = None,
    metric: _Metric | _MetricFn = "euclidean",
    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
    random_state: _LegacyRandom = 0,
    key_added: str | None = None,
    copy: bool = False,
) -> AnnData | None

Compute the nearest neighbors distance matrix and a neighborhood graph of observations (McInnes et al., 2018).

The efficiency of the neighbor search relies heavily on UMAP (McInnes et al., 2018), which also provides a method for estimating connectivities of data points, i.e. the connectivity of the manifold (method=='umap'). If method=='gauss', connectivities are computed according to Coifman et al. (2005), in the adaptation of Haghverdi et al. (2016).

Parameters:

adata : AnnData (required)
    Annotated data matrix.
n_neighbors : int, default 15
    The size of the local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values preserve more local structure. In general, values should be in the range 2 to 100. If knn is True, this is the number of nearest neighbors searched. If knn is False, a Gaussian kernel width is set to the distance of the n_neighbors-th neighbor. Ignored if transformer is an instance.
knn : bool, default True
    If True, use a hard threshold to restrict the number of neighbors to n_neighbors, that is, consider a knn graph. Otherwise, use a Gaussian kernel to assign low weights to neighbors more distant than the n_neighbors-th nearest neighbor.
method : _Method, default 'umap'
    Use 'umap' (McInnes et al., 2018) or 'gauss' (Gauss kernel following Coifman et al. (2005), with adaptive width per Haghverdi et al. (2016)) for computing connectivities.
transformer : KnnTransformerLike | _KnownTransformer | None, default None
    Approximate kNN search implementation following the API of sklearn.neighbors.KNeighborsTransformer. See the knn-transformers how-to in the scanpy documentation for more details. Also accepts the following known options:
    - None (the default): behavior depends on data size. For small data, exact kNN is calculated; otherwise pynndescent's PyNNDescentTransformer is used.
    - 'pynndescent': pynndescent's PyNNDescentTransformer.
    - 'rapids': a transformer based on cuml.neighbors.NearestNeighbors. Deprecated since 1.10.0; use rapids_singlecell.pp.neighbors instead.
metric : _Metric | _MetricFn, default 'euclidean'
    A known metric's name or a callable that returns a distance. Ignored if transformer is an instance.
metric_kwds : Mapping[str, Any], default MappingProxyType({})
    Options for the metric. Ignored if transformer is an instance.
random_state : _LegacyRandom, default 0
    A numpy random seed. Ignored if transformer is an instance.
key_added : str | None, default None
    If not specified, the neighbors data is stored in .uns['neighbors'], and distances and connectivities are stored in .obsp['distances'] and .obsp['connectivities'] respectively. If specified, the neighbors data is added to .uns[key_added], distances are stored in .obsp[key_added+'_distances'], and connectivities in .obsp[key_added+'_connectivities'].
copy : bool, default False
    Return a copy instead of writing to adata.

Returns:

Returns None if copy=False, else returns an AnnData object. Sets the following fields:

adata.obsp['distances' | key_added+'_distances'] : scipy.sparse.csr_matrix (dtype float)
    Distance matrix from the nearest neighbors search. Each row (cell) has n_neighbors-1 non-zero entries: the distances to its n_neighbors-1 nearest neighbors (excluding the cell itself).

adata.obsp['connectivities' | key_added+'_connectivities'] : scipy.sparse.csr_matrix (dtype float)
    Weighted adjacency matrix of the neighborhood graph of data points. Weights should be interpreted as connectivities.

adata.uns['neighbors' | key_added] : dict
    Neighbors parameters.

Examples:

>>> import scanpy as sc
>>> adata = sc.datasets.pbmc68k_reduced()
>>> # Basic usage
>>> sc.pp.neighbors(adata, 20, metric="cosine")
>>> # Provide your own transformer for more control and flexibility
>>> from sklearn.neighbors import KNeighborsTransformer
>>> transformer = KNeighborsTransformer(
...     n_neighbors=10, metric="manhattan", algorithm="kd_tree"
... )
>>> sc.pp.neighbors(adata, transformer=transformer)
>>> # now you can e.g. access the index: `transformer._tree`
See Also

The knn-transformers how-to guide in the scanpy documentation.

Source code in .venv/lib/python3.10/site-packages/scanpy/neighbors/__init__.py
@_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
def neighbors(  # noqa: PLR0913
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: int | None = None,
    *,
    use_rep: str | None = None,
    knn: bool = True,
    method: _Method = "umap",
    transformer: KnnTransformerLike | _KnownTransformer | None = None,
    metric: _Metric | _MetricFn = "euclidean",
    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
    random_state: _LegacyRandom = 0,
    key_added: str | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`.

    The neighbor search efficiency of this heavily relies on UMAP :cite:p:`McInnes2018`,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to :cite:t:`Coifman2005`, in the adaption of
    :cite:t:`Haghverdi2016`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.

        *ignored if ``transformer`` is an instance.*
    {n_pcs}
    {use_rep}
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    method
        Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
        with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities.
    transformer
        Approximate kNN search implementation following the API of
        :class:`~sklearn.neighbors.KNeighborsTransformer`.
        See :doc:`/how-to/knn-transformers` for more details.
        Also accepts the following known options:

        `None` (the default)
            Behavior depends on data size.
            For small data, we will calculate exact kNN, otherwise we use
            :class:`~pynndescent.pynndescent_.PyNNDescentTransformer`
        `'pynndescent'`
            :class:`~pynndescent.pynndescent_.PyNNDescentTransformer`
        `'rapids'`
            A transformer based on :class:`cuml.neighbors.NearestNeighbors`.

            .. deprecated:: 1.10.0
               Use :func:`rapids_singlecell.pp.neighbors` instead.
    metric
        A known metric’s name or a callable that returns a distance.

        *ignored if ``transformer`` is an instance.*
    metric_kwds
        Options for the metric.

        *ignored if ``transformer`` is an instance.*
    random_state
        A numpy random seed.

        *ignored if ``transformer`` is an instance.*
    key_added
        If not specified, the neighbors data is stored in `.uns['neighbors']`,
        distances and connectivities are stored in `.obsp['distances']` and
        `.obsp['connectivities']` respectively.
        If specified, the neighbors data is added to .uns[key_added],
        distances are stored in `.obsp[key_added+'_distances']` and
        connectivities in `.obsp[key_added+'_connectivities']`.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields:

    `adata.obsp['distances' | key_added+'_distances']` : :class:`scipy.sparse.csr_matrix` (dtype `float`)
        Distance matrix of the nearest neighbors search. Each row (cell) has `n_neighbors`-1 non-zero entries. These are the distances to their `n_neighbors`-1 nearest neighbors (excluding the cell itself).
    `adata.obsp['connectivities' | key_added+'_connectivities']` : :class:`scipy.sparse._csr.csr_matrix` (dtype `float`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    `adata.uns['neighbors' | key_added]` : :class:`dict`
        neighbors parameters.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> # Basic usage
    >>> sc.pp.neighbors(adata, 20, metric="cosine")
    >>> # Provide your own transformer for more control and flexibility
    >>> from sklearn.neighbors import KNeighborsTransformer
    >>> transformer = KNeighborsTransformer(
    ...     n_neighbors=10, metric="manhattan", algorithm="kd_tree"
    ... )
    >>> sc.pp.neighbors(adata, transformer=transformer)
    >>> # now you can e.g. access the index: `transformer._tree`

    See Also
    --------
    :doc:`/how-to/knn-transformers`

    """
    start = logg.info("computing neighbors")
    adata = adata.copy() if copy else adata
    if adata.is_view:  # we shouldn't need this here...
        adata._init_as_actual(adata.copy())
    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(
        n_neighbors,
        n_pcs=n_pcs,
        use_rep=use_rep,
        knn=knn,
        method=method,
        transformer=transformer,
        metric=metric,
        metric_kwds=metric_kwds,
        random_state=random_state,
    )

    if key_added is None:
        key_added = "neighbors"
        conns_key = "connectivities"
        dists_key = "distances"
    else:
        conns_key = f"{key_added}_connectivities"
        dists_key = f"{key_added}_distances"

    adata.uns[key_added] = {}

    neighbors_dict = adata.uns[key_added]

    neighbors_dict["connectivities_key"] = conns_key
    neighbors_dict["distances_key"] = dists_key

    neighbors_dict["params"] = NeighborsParams(
        n_neighbors=neighbors.n_neighbors,
        method=method,
        random_state=random_state,
        metric=metric,
    )
    if metric_kwds:
        neighbors_dict["params"]["metric_kwds"] = metric_kwds
    if use_rep is not None:
        neighbors_dict["params"]["use_rep"] = use_rep
    if n_pcs is not None:
        neighbors_dict["params"]["n_pcs"] = n_pcs

    adata.obsp[dists_key] = neighbors.distances
    adata.obsp[conns_key] = neighbors.connectivities

    if neighbors.rp_forest is not None:
        neighbors_dict["rp_forest"] = neighbors.rp_forest
    logg.info(
        "    finished",
        time=start,
        deep=(
            f"added to `.uns[{key_added!r}]`\n"
            f"    `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n"
            f"    `.obsp[{conns_key!r}]`, weighted adjacency matrix"
        ),
    )
    return adata if copy else None