from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification under test. The markdown body is written as adjacent
# string literals with explicit ``\n`` escapes so that no literal spans a raw
# line break (the previous single-line form was split mid-string).
IDEA = DiscoveryIdea.from_dict({
    'idea_title': 'rDNA-marked inter-chromosomal hub compaction',
    'biological_hypothesis': 'rDNA-enriched spots from different chromosomes form compact inter-chromosomal hubs, consistent with nucleolar-proximal chromosome organization.',
    'computable_parameter': 'rDNA_interchrom_compaction_ratio = median pairwise 3D distance among high-rDNA spots on different chromosomes divided by median pairwise 3D distance among matched low-rDNA spots on different chromosomes, computed within cells and summarized across cells.',
    'analysis_plan': 'Within each cell, classify spots into high-rDNA and low-rDNA groups using quantiles of tracks.rDNA. Compute pairwise Euclidean distances only for spot pairs with different spots.chrom values. Calculate the high-rDNA median inter-chromosomal distance divided by the low-rDNA median inter-chromosomal distance per cell. Test whether the ratio is less than one using a one-sided signed-rank or permutation test; as a negative control, permute tracks.rDNA values within each cell before high/low assignment.',
    'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'],
    'idea_markdown': (
        '### Rationale\n'
        'rDNA-associated signal may mark shared nucleolar-proximal hubs where loci from different chromosomes come together.\n\n'
        '### Data used\n'
        'Use spot coordinates, chromosome and cell labels, cell type metadata, and the rDNA track.\n\n'
        '### Analysis sketch\n'
        'Within each cell, identify high-rDNA spots and compute distances only between spots on different chromosomes. \n'
        'Compare high-rDNA inter-chromosomal distances with matched low-rDNA inter-chromosomal distances.\n\n'
        '### Expected result\n'
        'A compact rDNA hub would produce a ratio below one, because high-rDNA inter-chromosomal spots would be closer than matched background spots.\n\n'
        '### Validation checks\n'
        'Validate fields, sufficient high-rDNA spots across cells, finite ratio, permutation p-value, runtime, deterministic rerun, and random rDNA-label permutation as a negative control.'
    ),
    'cell_types': ['Granule', 'Bergmann', 'Purkinje'],
    'required_fields': ['coords', 'spots.chrom', 'spots.cell_id', 'tracks.rDNA', 'cells.cell_type'],
    'validation_checks': [
        'required_fields_exist',
        'minimum_cell_count',
        'minimum_spot_or_trace_count',
        'finite_numeric_output',
        'statistical_hypothesis_test',
        'runtime_under_budget',
        'deterministic_rerun',
        'negative_control_or_permutation',
    ],
    'expected_direction': 'rDNA_interchrom_compaction_ratio below 1, indicating compact inter-chromosomal rDNA-associated hubs.',
    'complexity': 3,
    'idea_id': 'rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d',
    'metadata': {},
})
# Locations of the input dataset and of this run's output directory.
PROJECT_ROOT = Path('/Users/weizexu/Projects/U-Chrom')
H5CD_PATH = PROJECT_ROOT / 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = PROJECT_ROOT / 'tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg'
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; otherwise every handle
# (dataset, schema, linked AnnData) stays None.
cdata = None
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)

schema = None
adata = None
if cdata is not None:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

# Echo the idea id and, when available, a summary of the loaded dataset.
print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Review the idea against the dataset schema (skipped when no schema is loaded)
# and stop the run if the review rejects it.
review = review_idea_against_schema(IDEA, schema) if schema is not None else None
print(None if review is None else review.to_dict())
# Raise explicitly rather than via `assert`, which is removed under `python -O`
# and would let a rejected idea run. The exception type is kept as AssertionError.
if review is not None and not review.accepted:
    raise AssertionError(review.to_dict())

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight field/coverage inspection for rDNA inter-chromosomal compaction.
# `display` is used below; import it here so this cell does not rely on the
# name being injected implicitly by an IPython kernel.
from IPython.display import display

spots = cdata.spots.copy()
tracks = cdata.tracks
cells = cdata.cells
coords = np.asarray(cdata.coords)

# One row per spot: identifiers, rDNA signal, 3D position and cell type.
inspection_df = spots[['cell_id', 'chrom', 'trace_id']].copy()
# Positional assignment via to_numpy(): assumes tracks rows follow the same
# order as spots rows -- TODO confirm against ChromData.
inspection_df['rDNA'] = pd.to_numeric(tracks['rDNA'], errors='coerce').to_numpy()
inspection_df['x'] = coords[:, 0]
inspection_df['y'] = coords[:, 1]
inspection_df['z'] = coords[:, 2]
# Cell type is looked up through the index of `cells`.
inspection_df['cell_type'] = inspection_df['cell_id'].map(cells['cell_type'])

# Shapes, required columns and the fraction of finite values.
print('spots/tracks/cells/coords:', spots.shape, tracks.shape, cells.shape, coords.shape)
print('required spot columns present:', {col: col in spots.columns for col in ['cell_id', 'chrom', 'trace_id']})
print('rDNA finite coverage:', float(np.isfinite(inspection_df['rDNA']).mean()))
print('coordinate finite coverage:', float(np.isfinite(coords).all(axis=1).mean()))
print('cell types:', cells['cell_type'].value_counts().to_dict())
print('spot counts by cell:')
print(inspection_df.groupby(['cell_type', 'cell_id'], observed=True).size().rename('n_spots'))
print('rDNA quartile preview by cell:')
display(inspection_df.groupby('cell_id', observed=True)['rDNA'].quantile([0.25, 0.5, 0.75]).unstack().round(4))
display(inspection_df[['cell_id', 'cell_type', 'chrom', 'trace_id', 'rDNA', 'x', 'y', 'z']].head())

spots/tracks/cells/coords: (56036, 6) (56036, 62) (9, 10) (56036, 3)
required spot columns present: {'cell_id': True, 'chrom': True, 'trace_id': True}
rDNA finite coverage: 1.0
coordinate finite coverage: 1.0
cell types: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}
spot counts by cell:
cell_type  cell_id
Bergmann   1_0_34      3932
           1_0_61     11283
           1_0_63      7614
Granule    1_0_42      4183
           1_0_47      4682
           1_0_69      3220
Purkinje   1_0_116    11659
           1_0_26      4225
           1_0_37      5238
Name: n_spots, dtype: int64
rDNA quartile preview by cell:
           0.25    0.50    0.75
cell_id                        
1_0_116 -0.1672 -0.1433 -0.0996
1_0_26  -0.4129 -0.1368  0.2577
1_0_34  -0.2662 -0.2444 -0.1531
1_0_37  -0.2003 -0.1772 -0.1341
1_0_42  -0.3206 -0.2851 -0.1266
1_0_47  -0.2619 -0.2474 -0.1726
1_0_61  -0.2222 -0.1988 -0.1475
1_0_63  -0.2315 -0.1949 -0.1216
1_0_69  -0.2682 -0.2462 -0.1676
  cell_id cell_type  chrom  ...           x           y        z
0  1_0_61  Bergmann  chr14  ...  127.399876  100.729674  1.62050
1  1_0_61  Bergmann   chr2  ...  124.070401  104.650369  1.61500
2  1_0_61  Bergmann  chr14  ...  125.011100  104.504933  1.56525
3  1_0_61  Bergmann   chr2  ...  124.328004  104.750176  1.57875
4  1_0_61  Bergmann  chr14  ...  124.519378  104.306864  1.57325

[5 rows x 8 columns]

# Main exploration: rDNA-high versus rDNA-low inter-chromosomal 3D compaction.
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Analysis configuration: shared RNG, permutation count and per-group spot bounds.
rng = np.random.default_rng(41095)
N_PERM = 300
MAX_SPOTS_PER_GROUP = 180
MIN_SPOTS_PER_GROUP = 20

# Working table: one row per spot with cell/chromosome labels, cell type,
# numeric rDNA signal and x/y/z coordinates.
xyz = np.asarray(cdata.coords, dtype=float)
work = cdata.spots[['cell_id', 'chrom']].copy()
work['cell_type'] = work['cell_id'].map(cdata.cells['cell_type'])
work['rDNA'] = pd.to_numeric(cdata.tracks['rDNA'], errors='coerce').to_numpy()
for axis_pos, axis_name in enumerate(('x', 'y', 'z')):
    work[axis_name] = xyz[:, axis_pos]

# Drop rows where the rDNA value or any coordinate is non-finite.
numeric_ok = np.isfinite(work[['rDNA', 'x', 'y', 'z']]).all(axis=1)
work = work.loc[numeric_ok].reset_index(drop=True)

# Deterministic per-cell subsampling seeds make the observed and permutation
# statistic reproducible while bounding pairwise distance cost.
sorted_cell_ids = sorted(work['cell_id'].astype(str).unique())
seed_draws = rng.integers(1, 2**31 - 1, size=work['cell_id'].nunique())
cell_seeds = {cid: int(seed) for cid, seed in zip(sorted_cell_ids, seed_draws)}

def _bounded_indices(indices, max_n, seed):
    """Return ``indices`` as an int array holding at most ``max_n`` entries.

    Inputs with ``max_n`` or fewer entries come back unchanged (and unsorted).
    Larger inputs are sampled without replacement using a fresh generator
    seeded with ``seed``, and the sample is returned in ascending order.
    """
    pool = np.asarray(indices, dtype=int)
    if len(pool) > max_n:
        sampler = np.random.default_rng(seed)
        picked = sampler.choice(pool, size=max_n, replace=False)
        return np.sort(picked)
    return pool

def _median_interchrom_distance(coords_arr, chrom_arr):
    """Median Euclidean distance over unordered point pairs with differing labels.

    ``coords_arr`` is indexed as a 2-D array of point coordinates and
    ``chrom_arr`` supplies one label per point. Returns ``(median, n_pairs)``,
    or ``(nan, 0)`` when there are fewer than two points, fewer than two
    distinct labels, or no pair with differing labels.
    """
    n_points = len(coords_arr)
    if n_points < 2:
        return np.nan, 0
    if pd.Series(chrom_arr).nunique() < 2:
        return np.nan, 0

    labels = np.asarray(chrom_arr)
    # Enumerate each unordered pair (i < j) exactly once.
    first, second = np.triu_indices(n_points, k=1)
    cross = labels[first] != labels[second]
    if not cross.any():
        return np.nan, 0

    delta = coords_arr[first[cross]] - coords_arr[second[cross]]
    dists = np.sqrt(np.sum(delta * delta, axis=1))
    return float(np.median(dists)), int(dists.size)

def _cell_metric(cell_df, rdna_values=None, seed_offset=0):
    """Compute the high/low rDNA inter-chromosomal distance ratio for one cell.

    ``cell_df`` holds the spots of a single cell (columns ``cell_id``,
    ``cell_type``, ``chrom``, ``rDNA``, ``x``, ``y``, ``z``). ``rdna_values``
    optionally replaces the ``rDNA`` column, e.g. with shuffled values, and
    ``seed_offset`` is added to the cell's subsampling seed.

    Spots at or above the 0.75 quantile form the high group and spots at or
    below the 0.25 quantile form the low group; each group is capped at
    ``MAX_SPOTS_PER_GROUP`` spots. Returns a dict of per-cell results, or
    ``None`` when there are too few finite values, either group is smaller
    than ``MIN_SPOTS_PER_GROUP``, or a usable ratio cannot be formed.
    """
    if rdna_values is None:
        signal = cell_df['rDNA'].to_numpy()
    else:
        signal = np.asarray(rdna_values)

    usable = np.isfinite(signal)
    if usable.sum() < 2 * MIN_SPOTS_PER_GROUP:
        return None

    lower_cut, upper_cut = np.quantile(signal[usable], [0.25, 0.75])
    high_pool = np.flatnonzero(signal >= upper_cut)
    low_pool = np.flatnonzero(signal <= lower_cut)
    if min(len(high_pool), len(low_pool)) < MIN_SPOTS_PER_GROUP:
        return None

    # Distinct seed increments give the two groups independent subsamples.
    cell_label = str(cell_df['cell_id'].iloc[0])
    base_seed = cell_seeds[cell_label] + seed_offset
    high_idx = _bounded_indices(high_pool, MAX_SPOTS_PER_GROUP, base_seed + 11)
    low_idx = _bounded_indices(low_pool, MAX_SPOTS_PER_GROUP, base_seed + 37)

    positions = cell_df[['x', 'y', 'z']].to_numpy(float)
    chrom_labels = cell_df['chrom'].astype(str).to_numpy()
    high_median, high_pairs = _median_interchrom_distance(positions[high_idx], chrom_labels[high_idx])
    low_median, low_pairs = _median_interchrom_distance(positions[low_idx], chrom_labels[low_idx])

    medians_usable = np.isfinite(high_median) and np.isfinite(low_median) and low_median > 0
    if not medians_usable:
        return None

    ratio = high_median / low_median
    return {
        'cell_id': cell_label,
        'cell_type': str(cell_df['cell_type'].iloc[0]),
        'n_spots_total': int(len(cell_df)),
        'n_high_sampled': int(len(high_idx)),
        'n_low_sampled': int(len(low_idx)),
        'n_high_interchrom_pairs': int(high_pairs),
        'n_low_interchrom_pairs': int(low_pairs),
        'high_rdna_median_interchrom_um': high_median,
        'low_rdna_median_interchrom_um': low_median,
        'rdna_interchrom_compaction_ratio': float(ratio),
        'log_ratio': float(np.log(ratio)),
    }

# Split the working table into per-cell frames once; they are reused for the
# observed statistic and for every permutation.
cell_groups = [(cid, g.reset_index(drop=True)) for cid, g in work.groupby('cell_id', observed=True, sort=True)]

# Observed per-cell metrics; cells for which _cell_metric returns None are skipped.
observed_rows = []
for _, g in cell_groups:
    metric = _cell_metric(g, seed_offset=0)
    if metric is not None:
        observed_rows.append(metric)

per_cell = pd.DataFrame(observed_rows)
if per_cell.empty:
    raise RuntimeError('No cells had enough finite rDNA/coordinate data for inter-chromosomal compaction analysis.')

observed_statistic = float(per_cell['log_ratio'].mean())
parameter_value = float(per_cell['rdna_interchrom_compaction_ratio'].median())
effect_size = float(parameter_value - 1.0)  # negative values support compaction among rDNA-high spots

# Within-cell rDNA-label randomization: shuffle rDNA values relative to coordinates/chromosomes,
# recompute high/low quartile bins and the mean log-ratio. This preserves cell geometry.
null_stats = []
for b in range(N_PERM):
    perm_metrics = []
    for _, g in cell_groups:
        perm_vals = g['rDNA'].to_numpy().copy()
        rng.shuffle(perm_vals)
        metric = _cell_metric(g, rdna_values=perm_vals, seed_offset=1000 + b)
        if metric is not None:
            perm_metrics.append(metric['log_ratio'])
    if perm_metrics:
        null_stats.append(float(np.mean(perm_metrics)))
null_stats = np.asarray(null_stats, dtype=float)

# One-sided (lower-tail) permutation p-value with the +1 correction; only
# reported when at least two cells and 100 null draws are available.
if len(per_cell) >= 2 and null_stats.size >= 100:
    p_value = float((1 + np.sum(null_stats <= observed_statistic)) / (len(null_stats) + 1))
    hypothesis_test_status = 'pass'
else:
    # Still finite and interpretable, but not enough independent cells/permutations.
    p_value = 1.0
    hypothesis_test_status = 'insufficient_data'

test_method = f'one-sided within-cell rDNA-label permutation test ({len(null_stats)} permutations) on mean log(high/low median inter-chromosomal distance ratio)'
null_hypothesis = 'Within each cell, rDNA labels are exchangeable with respect to inter-chromosomal 3D distances; the mean log high/low compaction ratio is not below the shuffled-label null.'
alternative_hypothesis = 'High-rDNA spots from different chromosomes are more compact than low-rDNA spots, giving a mean log high/low compaction ratio below the shuffled-label null (ratio < 1).'

# Result table: one row per analyzable cell plus shared statistical-test annotations.
result_table = per_cell.copy()
result_table['observed_statistic'] = observed_statistic
result_table['effect_size'] = effect_size
result_table['p_value'] = p_value
result_table['test_method'] = test_method
result_table['hypothesis_test_status'] = hypothesis_test_status
result_table['parameter_value_median_ratio'] = parameter_value
result_table['n_permutations'] = int(len(null_stats))

# Output locations are derived from the run directory and the idea id instead
# of repeating both as hard-coded literals; the resulting paths are unchanged.
result_path = RUN_OUTPUT_DIR / f'{IDEA.idea_id}_result.csv'
figure_path = RUN_OUTPUT_DIR / f'{IDEA.idea_id}_statistical_summary.png'
result_table.to_csv(result_path, index=False)

# Summary consumed by the verification cell and printed as JSON.
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'n_rows': int(len(result_table)),
    'n_selected_cells': int(per_cell['cell_id'].nunique()),
    'n_spots': int(len(work)),
    'parameter_name': 'rDNA_interchrom_compaction_ratio',
    'parameter_value': parameter_value,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    'null_distribution_mean': float(np.mean(null_stats)) if null_stats.size else np.nan,
    'null_distribution_sd': float(np.std(null_stats, ddof=1)) if null_stats.size > 1 else np.nan,
    'n_permutations': int(len(null_stats)),
    'max_spots_per_group': int(MAX_SPOTS_PER_GROUP),
    'result_path': str(result_path.relative_to(PROJECT_ROOT)),
    'statistical_figure_path': str(figure_path.relative_to(PROJECT_ROOT)),
    'notes': [
        'Coordinates are interpreted in schema xyz_unit=um.',
        'High/low rDNA bins are within-cell upper/lower quartiles; pairwise work is bounded by deterministic sampling.',
        'Negative effect_size means the median high-rDNA inter-chromosomal distance ratio is below 1.'
    ],
}

# Statistical figure: observed statistic against null distribution plus per-cell ratio summary.
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.2), constrained_layout=True)
fig.patch.set_facecolor('white')
for ax in axes:
    ax.set_facecolor('white')

# Left panel: permutation null histogram with the observed mean log-ratio.
axes[0].hist(null_stats, bins=28, color='#b7c9e2', edgecolor='white', label='within-cell shuffled rDNA labels')
axes[0].axvline(observed_statistic, color='#b2182b', linewidth=2.5, label='observed mean log-ratio')
axes[0].axvline(0, color='black', linestyle=':', linewidth=1.2, label='ratio = 1')
axes[0].set_xlabel('Mean log(high-rDNA / low-rDNA median inter-chromosomal distance)')
axes[0].set_ylabel('Permutation count')
axes[0].set_title('Permutation evidence for rDNA hub compaction')
axes[0].legend(frameon=False, fontsize=8)
axes[0].text(
    0.02, 0.98,
    f'p = {p_value:.4f}\neffect = {effect_size:.3f}\nn cells = {len(per_cell)}\n{len(null_stats)} permutations',
    transform=axes[0].transAxes,
    va='top', ha='left', fontsize=9,
    bbox=dict(facecolor='white', edgecolor='0.7', boxstyle='round,pad=0.3')
)

# Right panel: per-cell ratios, ordered and coloured by cell type.
plot_df = per_cell.sort_values(['cell_type', 'cell_id']).reset_index(drop=True)
xpos = np.arange(len(plot_df))
axes[1].axhline(1.0, color='black', linestyle=':', linewidth=1.2, label='no compaction')
colors = {'Granule': '#4daf4a', 'Bergmann': '#377eb8', 'Purkinje': '#984ea3'}
for ctype, sub in plot_df.groupby('cell_type', observed=True, sort=False):
    # After reset_index the row index doubles as the x position.
    xi = sub.index.to_numpy()
    axes[1].scatter(xi, sub['rdna_interchrom_compaction_ratio'], s=55, color=colors.get(ctype, '0.4'), label=ctype, zorder=3)
axes[1].plot(xpos, plot_df['rdna_interchrom_compaction_ratio'], color='0.75', linewidth=1, zorder=1)
axes[1].set_xticks(xpos)
axes[1].set_xticklabels(plot_df['cell_id'], rotation=45, ha='right', fontsize=8)
axes[1].set_ylabel('High-rDNA / low-rDNA median inter-chromosomal distance ratio')
axes[1].set_xlabel('Cell')
axes[1].set_title(f'Per-cell ratios; median = {parameter_value:.3f}')
axes[1].legend(frameon=False, fontsize=8, title='Cell type')

fig.savefig(figure_path, dpi=180, bbox_inches='tight', facecolor='white')
# The backend is forced to Agg, where plt.show() cannot render and only emits
# "FigureCanvasAgg is non-interactive" -- so the figure is closed instead and
# the saved PNG is displayed from disk.
plt.close(fig)
display(Image(filename=str(figure_path)))

print(json.dumps(analysis_summary, indent=2))
display(result_table.round(4))

<IPython.core.display.Image object>
{
  "idea_id": "rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d",
  "n_rows": 9,
  "n_selected_cells": 9,
  "n_spots": 56036,
  "parameter_name": "rDNA_interchrom_compaction_ratio",
  "parameter_value": 0.7935725816968385,
  "observed_statistic": -0.2162094847215039,
  "effect_size": -0.20642741830316147,
  "p_value": 0.0033222591362126247,
  "test_method": "one-sided within-cell rDNA-label permutation test (300 permutations) on mean log(high/low median inter-chromosomal distance ratio)",
  "null_hypothesis": "Within each cell, rDNA labels are exchangeable with respect to inter-chromosomal 3D distances; the mean log high/low compaction ratio is not below the shuffled-label null.",
  "alternative_hypothesis": "High-rDNA spots from different chromosomes are more compact than low-rDNA spots, giving a mean log high/low compaction ratio below the shuffled-label null (ratio < 1).",
  "hypothesis_test_status": "pass",
  "null_distribution_mean": 0.0008795264464656026,
  "null_distribution_sd": 0.01218293658652639,
  "n_permutations": 300,
  "max_spots_per_group": 180,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d_result.csv",
  "statistical_figure_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d_statistical_summary.png",
  "notes": [
    "Coordinates are interpreted in schema xyz_unit=um.",
    "High/low rDNA bins are within-cell upper/lower quartiles; pairwise work is bounded by deterministic sampling.",
    "Negative effect_size means the median high-rDNA inter-chromosomal distance ratio is below 1."
  ]
}
   cell_id cell_type  ...  parameter_value_median_ratio  n_permutations
0  1_0_116  Purkinje  ...                        0.7936             300
1   1_0_26  Purkinje  ...                        0.7936             300
2   1_0_34  Bergmann  ...                        0.7936             300
3   1_0_37  Purkinje  ...                        0.7936             300
4   1_0_42   Granule  ...                        0.7936             300
5   1_0_47   Granule  ...                        0.7936             300
6   1_0_61  Bergmann  ...                        0.7936             300
7   1_0_63  Bergmann  ...                        0.7936             300
8   1_0_69   Granule  ...                        0.7936             300

[9 rows x 18 columns]

tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/notebooks/rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d.ipynb:201: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown
  "           1_0_69      3220\n",

# Verification bookkeeping: every check declared by the idea starts as
# 'not_run', and the statistical test check is always tracked.
notes = []
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """List the keys of ``checks`` equal to ``prefix`` or starting with ``prefix + ':'``."""
    scoped = prefix + ':'
    matches = []
    for name in checks:
        if name == prefix or name.startswith(scoped):
            matches.append(name)
    return matches

def _set_check(prefix, value):
    """Assign ``value`` to every check matching ``prefix``.

    When no existing key matches, a new entry named ``prefix`` is created.
    """
    targets = _check_keys(prefix) or [prefix]
    for name in targets:
        checks[name] = value

def _check_status(prefix):
    """Collapse the statuses of all checks matching ``prefix`` into one value.

    Returns ``None`` when nothing matches, ``'fail'`` if any match failed,
    ``'pass'`` if every match passed, and otherwise the first match's status.
    """
    matched = _check_keys(prefix)
    if not matched:
        return None
    statuses = [checks[name] for name in matched]
    if any(status == 'fail' for status in statuses):
        return 'fail'
    for status in statuses:
        if status != 'pass':
            return statuses[0]
    return 'pass'

def _as_float(value):
    """Coerce ``value`` to float, returning NaN when it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Runner verification: fill in each declared check from `review`,
# `analysis_summary` and `result_table`, then derive an overall status.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the id lists directly: a differing number of cells is itself
        # a misalignment and must not fall through to the default 'pass'.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
if _check_keys('minimum_cell_count'):
    # Prefer the summary's count, then the result table, then the dataset.
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None and 'cell_type' in getattr(result_table, 'columns', []):
        n_cells = len(result_table)
    if n_cells is None:
        n_cells = len(cdata.cells) if cdata is not None and getattr(cdata, 'n_cells', 0) else 0
    _set_check('minimum_cell_count', 'pass' if n_cells >= 1 else 'fail')
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = len(result_table) if result_table is not None else 0
    _set_check('minimum_spot_or_trace_count', 'pass' if n_rows >= 1 else 'fail')
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    _set_check('finite_numeric_output', 'pass' if value is not None and np.isfinite(value) else 'fail')
if _check_keys('statistical_hypothesis_test'):
    # Re-read the test description from the summary so the check validates
    # what was actually reported.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')
    p_float = _as_float(p_value)
    stat_float = _as_float(observed_statistic)
    effect_float = _as_float(effect_size)
    has_required_test = (
        test_method is not None
        and str(test_method).strip() != ''
        and null_hypothesis is not None
        and str(null_hypothesis).strip() != ''
        and alternative_hypothesis is not None
        and str(alternative_hypothesis).strip() != ''
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    if result_table is not None and hasattr(result_table, 'columns'):
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns
    else:
        has_required_test = False
    _set_check('statistical_hypothesis_test', 'pass' if has_required_test else 'fail')
    if not has_required_test:
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
if _check_keys('negative_control_or_permutation'):
    # Keyword search over the test method, summary keys and result columns.
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    result_columns_text = ''
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    has_control_or_permutation = any(
        token in control_text
        for token in ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    )
    _set_check(
        'negative_control_or_permutation',
        'pass' if has_control_or_permutation else 'not_implemented',
    )
# Control-type checks that were never evaluated are reported as not implemented.
for check in list(checks):
    if checks[check] == 'not_run' and ('negative_control' in check or check.endswith('_control')):
        checks[check] = 'not_implemented'

# Overall status: any failed required check, or an empty result, fails the run.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
status = 'pass'
for check in required_for_pass:
    if _check_status(check) == 'fail':
        status = 'fail'
        notes.append(f'{check} failed')
n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = len(result_table) if result_table is not None else 0
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

verification = {
    'idea_id': IDEA.idea_id,
    'status': status,
    'checks': checks,
    'parameter_value': analysis_summary.get('parameter_value'),
    'p_value': analysis_summary.get('p_value'),
    'test_method': analysis_summary.get('test_method'),
    'effect_size': analysis_summary.get('effect_size'),
    'result_path': analysis_summary.get('result_path'),
    'notes': notes + analysis_summary.get('notes', []),
}
print(json.dumps(verification, indent=2))

{
  "idea_id": "rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test": "pass",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass"
  },
  "parameter_value": 0.7935725816968385,
  "p_value": 0.0033222591362126247,
  "test_method": "one-sided within-cell rDNA-label permutation test (300 permutations) on mean log(high/low median inter-chromosomal distance ratio)",
  "effect_size": -0.20642741830316147,
  "result_path": "tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/rdna-marked-inter-chromosomal-hub-compaction-41a5f6a09d_result.csv",
  "notes": [
    "Coordinates are interpreted in schema xyz_unit=um.",
    "High/low rDNA bins are within-cell upper/lower quartiles; pairwise work is bounded by deterministic sampling.",
    "Negative effect_size means the median high-rDNA inter-chromosomal distance ratio is below 1."
  ]
}

Auto-discovery idea: rDNA-marked inter-chromosomal hub compaction

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact analysis plan

Statistical figure

Runner verification summary

Final interpretation

Final interpretation