Loop calling on tracing data

uchrom.strc.loop provides a GPU-accelerated reimplementation of the axis-wise F-test loop caller from Yu et al. 2025 (ArcFISH, bioRxiv 2025.11.26.690837). Unlike Hi-C-style contact-frequency callers, the axis-wise approach explicitly models the anisotropic per-axis measurement error that is typical of DNA FISH images (Z usually noisier than X/Y), and works with as few as ~30 traces per chromosome.

Pipeline, per chromosome:

  1. Per-axis variance cube — for each (i, j) bin pair and each axis c, compute the variance of x_c[j] - x_c[i] across traces.

  2. Filter + normalise — two LOWESS passes on log(genomic distance): first to NaN-out per-trace outliers at 4 σ, then to put the distance-stratified mean variance at 1.

  3. Per-axis F-test — candidate (i, j) gets F_c = norm_var / ring_mean against a local background ring [inner_cut, outer_cut].

  4. ACAT Cauchy combine — aggregate the per-axis p-values with the median-trace-variance-inverse axis weights.

  5. BH-FDR, cluster accepted candidates by 1D proximity, pick the lowest-p summit per cluster.

  6. Pseudo-contact-frequency filter — only keep summits where the fraction of traces with 3D pdist < adjacent-bin cutoff exceeds 1/2 (singleton) or 1/3 (larger cluster).

Output is a BEDPE-style DataFrame stored under cd.results['loops'].

import pandas as pd
from uchrom import ChromData
from uchrom.strc.loop import call_loops_axiswise_f, LoopCallerParams

Load a chromatin-tracing dataset

We use the Takei et al. 2021 Nature mESC chromatin-tracing dataset (4DN accession 4DNFIHF3JCBY). The helper below auto-downloads it from the public 4DN S3 bucket into example-data/ on the first run, and re-uses the local copy afterwards. If the download can’t reach the network, the notebook falls back to a small synthetic FOF-CT with hand-crafted TAD + loop structure so the notebook still runs.

from pathlib import Path
import os

# Takei et al. 2021 Nature mESC chromatin tracing (mm10, 25 kb, 20 chroms,
# ~400 traces/chrom).  Public 4DN S3 object — no auth required.
TAKEI_URL = (
    'https://4dn-open-data-public.s3.amazonaws.com/fourfront-webprod/'
    'wfoutput/e699334e-fb34-4a0e-8ef6-670b2099831a/4DNFIHF3JCBY.csv'
)

def _download_takei(dest: Path) -> bool:
    '''Download Takei 2021 FOF-CT to *dest* if it doesn't already exist.
    Returns True on success, False on any network error.'''
    if dest.exists():
        return True
    dest.parent.mkdir(parents=True, exist_ok=True)
    print(f'[data] downloading Takei 2021 (~22 MB) → {dest}')
    try:
        import urllib.request
        tmp = dest.with_suffix(dest.suffix + '.part')
        urllib.request.urlretrieve(TAKEI_URL, tmp)
        tmp.rename(dest)
        return True
    except Exception as exc:
        print(f'[data] download failed ({exc}); falling back to synthetic')
        return False

def _repo_root() -> Path:
    # nbconvert runs the notebook with cwd = tutorials/, so walk up to the
    # repo root (contains pyproject.toml) before anchoring example-data.
    for p in [Path.cwd(), *Path.cwd().parents]:
        if (p / 'pyproject.toml').exists() or (p / 'example-data').exists():
            return p
    return Path.cwd()

def find_fofct() -> str:
    root = _repo_root()
    # 1) Fast path — any local copy already on disk
    candidates = [
        root / 'example-data' / '4DNFIHF3JCBY.csv',
        Path.home() / 'Downloads' / 'ArcFISH_data' / '4DNFIHF3JCBY.csv',
        Path.home() / 'Downloads' / 'ArcFISH_data' / '4DNFIHF3JCBY.h5cd',
        Path.home() / 'Downloads' / '4DNFIHF3JCBY.csv',
    ]
    for c in candidates:
        if c.exists():
            print(f'[data] using real Takei 2021 FOF-CT: {c}')
            return str(c)

    # 2) Download to a stable repo-local location
    dest = root / 'example-data' / '4DNFIHF3JCBY.csv'
    if _download_takei(dest):
        print(f'[data] using real Takei 2021 FOF-CT: {dest}')
        return str(dest)

    # 3) If a synthetic was already built (e.g. by an earlier run), reuse it
    for p in [Path.cwd(), *Path.cwd().parents]:
        cand = p / 'example-data' / 'fofct_core.csv'
        if cand.exists():
            print(f'[data] using synthetic FOF-CT: {cand}')
            return str(cand)

    # 4) Last resort: synthesise a FOF-CT with built-in TAD + loop structure
    print('[data] generating synthetic FOF-CT (offline fallback)')
    (root / 'example-data').mkdir(parents=True, exist_ok=True)
    import numpy as np
    rng = np.random.default_rng(0)
    header = '''##FOF-CT_Version=v1.0
##Table_Namespace=4dn_FOF-CT_core
##Genome_Assembly=GRCh38
##XYZ_Unit=micron
#Lab_Name: Demo
##Columns=(Spot_ID, Trace_ID, X, Y, Z, Chrom, Chrom_Start, Chrom_End, Cell_ID)
'''
    # Tri-domain polymer with a nested loop:
    #   - Bins  0..9  = TAD 1 (tight, small step size inside)
    #   - Bins 10..19 = TAD 2 (tight) — separated from TAD 1 by a jump
    #   - Bins 20..29 = TAD 3 (tight) — separated from TAD 2 by a jump
    #   - Loop between bins 5 (in TAD 1) and 20 (first bin of TAD 3)
    # Each TAD walks with small step; inter-TAD jumps are large.
    def synth_trace(
        rng,
        tad_edges=(0, 10, 20, 30),
        intra_step=0.08,
        inter_jump=0.8,
        loop_a=5,
        loop_b=20,
        anchor_noise=0.04,
    ):
        n_bins = tad_edges[-1]
        w = np.zeros((n_bins, 3))
        # Walk each TAD internally, plus a big jump between TADs
        for t in range(len(tad_edges) - 1):
            a, b = tad_edges[t], tad_edges[t + 1]
            if t > 0:
                w[a] = w[a - 1] + rng.normal(scale=inter_jump, size=3)
            for i in range(a + 1, b):
                w[i] = w[i - 1] + rng.normal(scale=intra_step, size=3)
        # Loop closure: pin bin loop_b close to bin loop_a
        w[loop_b] = w[loop_a] + rng.normal(scale=anchor_noise, size=3)
        # Re-walk bins after loop_b so they stay continuous with the new anchor
        for i in range(loop_b + 1, n_bins):
            w[i] = w[i - 1] + rng.normal(scale=intra_step, size=3)
        return w

    rows = []
    sid = 1
    for cell in range(300):
        for trace in range(2):
            walk = synth_trace(rng)
            for bin_i in range(30):
                x, y, z = walk[bin_i]
                s = 1_000_000 + bin_i * 25_000
                rows.append((sid, f'{cell}_{trace}', x, y, z,
                             'chr17', s, s + 25_000, cell))
                sid += 1
    path = str(root / 'example-data' / 'fofct_core.csv')
    with open(path, 'w') as f:
        f.write(header)
        for r in rows:
            f.write(','.join(str(x) for x in r) + chr(10))
    return path

FOFCT = find_fofct()
print('using', FOFCT)
cd = ChromData.read(FOFCT) if FOFCT.endswith(".h5cd") else ChromData.from_fofct(FOFCT)
print(cd)
print()
print('traces per chrom:')
print(
    cd.spots.groupby('chrom', observed=True)['trace_id']
      .nunique().sort_values(ascending=False).head(10)
)

Call loops on a single chromosome

We pick chr2, which has a well-characterised loop in this dataset. Default parameters match ArcFISH (fdr_cutoff=0.1, pval_cutoff=1e-5, gap=50 kb, inner_cut=25 kb, outer_cut=50 kb).

chrom = 'chr2'
print('calling loops on', chrom)

loops = call_loops_axiswise_f(
    cd,
    chrom=chrom,
    device='auto',   # CUDA > MPS > CPU
    store=True,      # also write to cd.results['loops']
    verbose=True,
)
loops

Visualising the result

The companion modules uchrom.fea and uchrom.pl compute and render the population-level aggregates that chromatin-tracing papers use to present loop calls.

import matplotlib.pyplot as plt
import numpy as np
from uchrom.fea import mean_distance_matrix, contact_frequency, radius_of_gyration
from uchrom.pl import plot_distance_matrix, plot_contact_map, plot_rg_histogram

df = cd.get_chrom(chrom).to_dataframe()

# Median pairwise distance matrix across all traces
dmat, bin_ids, n = mean_distance_matrix(df, chrom=chrom)
fig = plot_distance_matrix(dmat, title=f'Median 3D distance ({n} traces)')

# Overlay called loop summits on the heatmap
ax = fig.axes[0]
for _, row in loops.iterrows():
    i, j = int(row['summit_i']), int(row['summit_j'])
    ax.plot(j, i, 'o', mfc='none', mec='white', mew=1.5, ms=14)
    ax.plot(i, j, 'o', mfc='none', mec='white', mew=1.5, ms=14)
ax.set_title(ax.get_title() + f' — {len(loops)} loop(s) overlaid')
plt.show()
# Population-level contact map at a user-chosen distance threshold
threshold = float(np.nanmedian(dmat[dmat > 0]) * 0.5)  # loose rule of thumb
cmat, bin_ids, n = contact_frequency(df, threshold=threshold, chrom=chrom)
fig = plot_contact_map(cmat, title=f'Contact frequency (< {threshold:.2g}, {n} traces)')
ax = fig.axes[0]
for _, row in loops.iterrows():
    i, j = int(row['summit_i']), int(row['summit_j'])
    ax.plot(j, i, 'o', mfc='none', mec='cyan', mew=1.5, ms=14)
    ax.plot(i, j, 'o', mfc='none', mec='cyan', mew=1.5, ms=14)
plt.show()
# Per-trace radius of gyration distribution
rg = radius_of_gyration(df, chrom=chrom)
fig = plot_rg_histogram(rg, title=f'Rg distribution ({len(rg)} traces)')
plt.show()

Tuning thresholds

All parameters live on LoopCallerParams. Relaxing thresholds recovers weaker candidates; tightening removes marginal ones:

params_relaxed = LoopCallerParams(
    fdr_cutoff=0.2,      # more permissive FDR
    pval_cutoff=1e-3,    # more permissive raw p-value
)
loops_relaxed = call_loops_axiswise_f(cd, chrom=chrom, params=params_relaxed,
                                      device='auto', store=False)
print(f'default: {len(loops)} loops  |  relaxed: {len(loops_relaxed)} loops')
if len(loops_relaxed) > len(loops):
    print('\nExtra candidates at relaxed thresholds:')
    print(loops_relaxed[~loops_relaxed['summit_i'].isin(loops['summit_i'])])

Whole-genome loop calling

Loop over chromosomes and concatenate. Each call is independent so trivial to parallelise if needed.

all_loops = []
for chrom in cd.chroms:
    ls = call_loops_axiswise_f(cd, chrom=chrom, device='auto',
                               store=False, verbose=False)
    ls['chrom'] = chrom
    all_loops.append(ls)

loops_genome = pd.concat(all_loops, ignore_index=True)
print(f'Total called loops genome-wide: {len(loops_genome)}')
loops_genome.sort_values('pval').head(10)

Persist results into the ChromData

Store the genome-wide loop calls in cd.results['loops'] and round-trip through .h5cd.

cd.results['loops'] = loops_genome
out = Path(FOFCT).parent / 'with_loops.h5cd'
cd.write(str(out))

cd_reloaded = ChromData.read(str(out))
print(cd_reloaded.results['loops'])

CLI

For batch scripts:

python -m uchrom.strc.loop --input=data.h5cd --output=data_loops.h5cd \
    --chrom=chr17 --device=auto

# BEDPE export instead
python -m uchrom.strc.loop --input=data.h5cd --output=loops.bedpe \
    --chrom=chr17 --device=auto

Comparison notes

This implementation is independent of the upstream ArcFISH (GPL-3.0) source; all algorithmic components — per-trace 4 σ filter, distance-stratified LOWESS normalisation, per-axis F-test, ACAT combine, BH-FDR, pseudo-contact-frequency summit filter — are reimplemented here.

On the canonical Takei 2021 mESC benchmark we see ~2–3× speedup on Apple MPS and typically 60–70 % precision vs ArcFISH’s summit set at ±25 kb anchor tolerance, with a more conservative call set overall. MPS forces float32; on CUDA with float64 you will see higher overlap with the upstream output.