Source code for uchrom.im.trace._polymer

"""Gaussian-chain polymer primitives for the spatial genome aligner.

Two-end distance distribution of an ideal Gaussian chain of contour
length ``L`` with Kuhn segment ``b = 2 * l_p`` (persistence length) is
isotropic Gaussian with variance ``Nb² / 3`` per axis — i.e.
``<R²> = 2 * l_p * L`` for the 3-D distance, and ``<R²> / 3`` per axis.

Adding independent Gaussian localisation noise with axis variance
``σ²`` at each end yields

    S²(i, j) = σ_i² + σ_j² + (2/3) · l_p · τ · L_ij

which is the expected **axis-wise** variance of the observed distance
between two FISH spots connected by a contour length ``L_ij`` bp under
a scale factor ``τ`` (nm/bp).  ``S²`` scales the Gaussian-chain bond
probability used by the aligner.

All lengths here are in **nm** and all genomic distances in **bp**.
"""

from __future__ import annotations

from typing import Union, Tuple

import numpy as np

# Physical constants ----------------------------------------------------
PERSISTENCE_LENGTH_BP_DEFAULT: float = 150.0
"""Persistence length of bare B-form dsDNA, in bp.

One persistence length ≈ 50 nm at the canonical 0.34 nm/bp rise, which
gives ~150 bp.  For chromatin the *effective* persistence length is
much larger, but is folded into the fitted scale factor ``τ`` so we
keep the bp value constant.
"""


# ----------------------------------------------------------------------
# Distance statistics
# ----------------------------------------------------------------------

[docs] def expected_distance_sq( L_bp: Union[np.ndarray, float], l_p_bp: float = PERSISTENCE_LENGTH_BP_DEFAULT, tau_nm_per_bp: float = 0.01, sigma_nm: Union[np.ndarray, float] = 0.0, ) -> np.ndarray: """Gaussian-chain axis variance of the end-to-end distance. ``S²(L) = 2·σ² + (2/3)·l_p·τ·L`` Parameters ---------- L_bp : ndarray or float Genomic distance(s) in bp between the two loci. l_p_bp : float Persistence length in bp. tau_nm_per_bp : float Genomic-to-spatial scale factor (nm / bp). sigma_nm : ndarray or float Axis-wise localisation uncertainty of each endpoint, in nm. A scalar is broadcast to both endpoints; an array of shape (2,) or matching ``L_bp`` is also accepted. Returns ------- S2 : ndarray or float Axis-wise variance of the observed distance, in nm². """ L = np.asarray(L_bp, dtype=np.float64) sig = np.asarray(sigma_nm, dtype=np.float64) chain_term = (2.0 / 3.0) * l_p_bp * tau_nm_per_bp * L if sig.ndim == 0: noise_term = 2.0 * sig ** 2 elif sig.shape == (2,): noise_term = sig[0] ** 2 + sig[1] ** 2 else: # assume caller already summed σ_i² + σ_j² noise_term = sig return chain_term + noise_term
[docs] def expected_distance( L_bp: Union[np.ndarray, float], l_p_bp: float = PERSISTENCE_LENGTH_BP_DEFAULT, tau_nm_per_bp: float = 0.01, ) -> np.ndarray: """Gaussian-chain mean 3-D end-to-end distance ``<R> ≈ √(2·l_p·τ·L)``. Strictly ``<R> = √(8/(3π)) · √(<R²>)`` for a 3-D Gaussian variable, but the scaling-law form ``√(<R²>)`` is what the literature calls the "Gaussian chain" expected distance and what jie uses. """ L = np.asarray(L_bp, dtype=np.float64) return np.sqrt(2.0 * l_p_bp * tau_nm_per_bp * np.maximum(L, 0.0))
[docs] def bond_log_probability( R_nm: Union[np.ndarray, float], S2_nm2: Union[np.ndarray, float], ) -> np.ndarray: """Log probability density of an observed distance ``R`` under an isotropic 3-D Gaussian with axis variance ``S²``. ``log p = -1.5 · log(2π·S²) - R² / (2·S²)`` """ R = np.asarray(R_nm, dtype=np.float64) S2 = np.asarray(S2_nm2, dtype=np.float64) S2_safe = np.where(S2 > 0, S2, np.nan) return -1.5 * np.log(2.0 * np.pi * S2_safe) - R * R / (2.0 * S2_safe)
[docs] def mahalanobis_cost( R_nm: Union[np.ndarray, float], S2_nm2: Union[np.ndarray, float], ) -> np.ndarray: """Mahalanobis-style squared cost ``R² / (2·S²)``. This is the **data-dependent** part of the negative log Gaussian density (the exponent, without the ``-1.5·log(2π·S²)`` normalisation constant). For shortest-path aligners this has a useful property that :func:`bond_log_probability` does not: the cost of a single edge is independent of how many loci lie between its endpoints, so paths with different numbers of skipped loci can be compared fairly. Together with a per-skip ``gap_penalty`` it gives well-calibrated edge weights:: w_ij = R²_ij / (2·S²_ij) + (c - 1) · gap_penalty """ R = np.asarray(R_nm, dtype=np.float64) S2 = np.asarray(S2_nm2, dtype=np.float64) S2_safe = np.where(S2 > 0, S2, np.nan) return R * R / (2.0 * S2_safe)
# ---------------------------------------------------------------------- # Scale factor fitting # ----------------------------------------------------------------------
[docs] def fit_scale_factor( observed_dist_nm: np.ndarray, genomic_dist_bp: np.ndarray, l_p_bp: float = PERSISTENCE_LENGTH_BP_DEFAULT, fix_exponent: bool = True, ) -> Tuple[float, float]: """Fit Gaussian-chain scale factor ``τ`` to observed trace distances. Performs a log-log regression of ``R`` on ``L``:: log R = 0.5 · log(2·l_p·τ) + α · log L with ``α = 0.5`` fixed for an ideal Gaussian chain (``fix_exponent= True``) or fitted jointly. Returns ``(τ, α)``. Parameters ---------- observed_dist_nm : ndarray Observed 3-D distances (nm). NaN and non-positive values are dropped. genomic_dist_bp : ndarray Genomic separations (bp) with the same shape as ``observed_dist_nm``. l_p_bp : float Persistence length (bp). fix_exponent : bool If True, force ``α = 0.5`` (ideal Gaussian chain). If False, fit it jointly — useful for chromatin where ``α`` often ≈ 0.33 (crumpled globule). Returns ------- tau_nm_per_bp : float scaling_exponent : float ``0.5`` when ``fix_exponent=True``; otherwise the fitted value. """ R = np.asarray(observed_dist_nm, dtype=np.float64).ravel() L = np.asarray(genomic_dist_bp, dtype=np.float64).ravel() mask = np.isfinite(R) & np.isfinite(L) & (R > 0) & (L > 0) if mask.sum() < 3: raise ValueError( "Need at least 3 finite, positive (R, L) samples to fit τ" ) R = R[mask] L = L[mask] logR = np.log(R) logL = np.log(L) if fix_exponent: alpha = 0.5 intercept = float(np.mean(logR - alpha * logL)) else: A = np.vstack([np.ones_like(logL), logL]).T intercept, alpha = np.linalg.lstsq(A, logR, rcond=None)[0] intercept = float(intercept) alpha = float(alpha) # intercept = 0.5 · log(2 · l_p · τ) when α = 0.5 # 通用: intercept = log(prefactor), prefactor = √(2 l_p τ) under α=0.5 # generalised via matching <R> ≈ prefactor · L^α if fix_exponent: tau = float(np.exp(2.0 * intercept) / (2.0 * l_p_bp)) else: # With free α we just report an effective τ under the same α. prefactor = float(np.exp(intercept)) tau = float(prefactor ** 2 / (2.0 * l_p_bp)) return tau, float(alpha)
[docs] def fit_scale_factor_from_matrix( dist_matrix_nm: np.ndarray, bin_genomic_bp: np.ndarray, l_p_bp: float = PERSISTENCE_LENGTH_BP_DEFAULT, aggregator: str = "median", fix_exponent: bool = True, ) -> Tuple[float, float]: """Fit ``τ`` from a (n_bins, n_bins) distance matrix. For each unique off-diagonal genomic separation ``L``, aggregates the observed distances (median or mean) across all bin pairs with that separation, then runs :func:`fit_scale_factor` on the result. Parameters ---------- dist_matrix_nm : ndarray (n_bins, n_bins) Pairwise distances in nm. NaNs are ignored. bin_genomic_bp : ndarray (n_bins,) Genomic midpoint of each bin, in bp. l_p_bp : float aggregator : 'median' | 'mean' fix_exponent : bool Returns ------- tau_nm_per_bp : float scaling_exponent : float """ M = np.asarray(dist_matrix_nm, dtype=np.float64) bp = np.asarray(bin_genomic_bp, dtype=np.float64).ravel() n = M.shape[0] if M.shape != (n, n) or bp.size != n: raise ValueError("dist_matrix must be square and match bin_genomic_bp") iu, ju = np.triu_indices(n, k=1) Rvals = M[iu, ju] Lvals = np.abs(bp[iu] - bp[ju]) mask = np.isfinite(Rvals) & np.isfinite(Lvals) & (Rvals > 0) & (Lvals > 0) Rvals = Rvals[mask] Lvals = Lvals[mask] if Rvals.size == 0: raise ValueError("No finite pairs in distance matrix") # Aggregate by unique L (stratified median / mean) unique_L, inverse = np.unique(Lvals, return_inverse=True) agg = np.full(unique_L.size, np.nan) fn = np.median if aggregator == "median" else np.mean for k in range(unique_L.size): sel = Rvals[inverse == k] if sel.size: agg[k] = float(fn(sel)) keep = np.isfinite(agg) return fit_scale_factor( agg[keep], unique_L[keep], l_p_bp=l_p_bp, fix_exponent=fix_exponent, )
__all__ = [ "PERSISTENCE_LENGTH_BP_DEFAULT", "expected_distance", "expected_distance_sq", "bond_log_probability", "mahalanobis_cost", "fit_scale_factor", "fit_scale_factor_from_matrix", ]