# Scaffold cells use workspace-relative paths, so pin the working directory
# to the project root before anything else runs.
import os

workspace_root = '/Users/weizexu/Projects/U-Chrom'
os.chdir(workspace_root)
print('cwd:', os.getcwd())

cwd: /Users/weizexu/Projects/U-Chrom

from pathlib import Path
import json
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from uchrom import ChromData
from uchrom.auto_discovery import DiscoveryIdea, review_idea_against_schema

# Idea specification for this run, passed as a plain dict to
# DiscoveryIdea.from_dict. Other cells read IDEA.idea_id and
# IDEA.validation_checks and pass IDEA to review_idea_against_schema.
IDEA = DiscoveryIdea.from_dict({'idea_title': 'Active chromatin assortativity between chromosomes', 'biological_hypothesis': 'Chromosomes with stronger H3K27ac signal preferentially occupy nearby inter-chromosomal neighborhoods, producing active-state assortativity in nuclear organization.', 'computable_parameter': 'H3K27ac_interchrom_assortativity = Spearman correlation between chromosome-pair mean H3K27ac and inverse chromosome-centroid distance across all inter-chromosomal pairs within cells, summarized across cells.', 'analysis_plan': 'Within each cell, compute a centroid for each chromosome from coords and spots.chrom, and compute the chromosome-level median tracks.H3K27ac. For every inter-chromosomal pair, calculate inverse centroid distance and the mean of the two chromosome H3K27ac medians. Estimate Spearman correlation per cell and summarize across cells. Test whether correlations are greater than zero using a signed-rank test or a permutation test that shuffles H3K27ac summaries across chromosome labels within each cell.', 'modalities': ['chromatin_tracing', 'if_tracks', 'cell_metadata'], 'idea_markdown': '### Rationale\nInter-chromosomal organization may be shaped by active chromatin state, with H3K27ac-rich chromosomes preferentially occupying shared spatial neighborhoods.\n\n### Data used\nUse 3D coordinates, chromosome labels, cell IDs, cell type metadata, and H3K27ac intensity at traced genomic bins.\n\n### Analysis sketch\nFor each cell, summarize each chromosome by its centroid and median H3K27ac signal. 
Across chromosome pairs, relate pairwise centroid proximity to the average H3K27ac level of the pair.\n\n### Expected result\nIf active chromosomes cluster together, chromosome pairs with higher H3K27ac should have shorter centroid distances, producing a positive proximity-assortativity statistic.\n\n### Validation checks\nCheck fields, at least several chromosomes per cell, finite correlation, a permutation p-value, runtime, deterministic rerun, and shuffled H3K27ac chromosome assignments as a negative control.', 'cell_types': ['Granule', 'Bergmann', 'Purkinje'], 'required_fields': ['coords', 'spots.chrom', 'spots.cell_id', 'tracks.H3K27ac', 'cells.cell_type'], 'validation_checks': ['required_fields_exist', 'minimum_cell_count', 'minimum_spot_or_trace_count', 'finite_numeric_output', 'statistical_hypothesis_test', 'runtime_under_budget', 'deterministic_rerun', 'negative_control_or_permutation'], 'expected_direction': 'Positive H3K27ac_interchrom_assortativity, meaning H3K27ac-rich chromosome pairs are spatially closer.', 'complexity': 3, 'idea_id': 'active-chromatin-assortativity-between-chromosom-6fcdb55564', 'metadata': {}})
# Input container and run output directory, both relative to the working directory.
H5CD_PATH = 'tmp/takei_auto_discovery_doc/takei_doc_auto_subset.h5cd'
RUN_OUTPUT_DIR = Path('tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg')
RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the dataset only when a path is configured; otherwise every handle
# derived from it stays None so downstream cells can guard on that.
if H5CD_PATH:
    cdata = ChromData.read(H5CD_PATH)
else:
    cdata = None

if cdata is None:
    schema = None
    adata = None
else:
    schema = cdata.discovery_schema
    adata = cdata.linked_adata

print(IDEA.idea_id)
if cdata is not None:
    print(cdata)
    print(cdata.describe_for_agent(max_items=20))

active-chromatin-assortativity-between-chromosom-6fcdb55564
ChromData: n_spots=56036, n_traces=213, n_cells=9
  spots:   ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
  cells:   ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts'] (9 cells)
  cellm:   {'umap': (9, 2)}
  tracks:  ['CPSF6', 'ATRX', 'H4K8ac', 'HDAC2', 'H3K9ac', 'H3K9me3', 'H3K9me2', 'RNAPIISer2-P', 'H3', 'H3K36me2', 'UBTF', 'LaminB1', 'RNAPIISer5-P', 'RYBP', 'HP1beta', 'RING1B', 'H2A.X', 'H3K4me1', 'H4K20me2', 'H3K27me2', 'JARID2', 'SF3A66', 'CBP', 'H2AK119u1', 'EZH2', 'H3K4me2', 'BRG1', 'HP1alpha', 'Fibrillarin', 'KAP1', 'H3K27ac', 'H3K4me3', 'H3K36ac', 'H3K14ac', 'H4K20me1', 'HP1gamma', 'H4K20me3', 'H3K27me3', 'mH2A1', 'CHD4', 'KAT3B_p300', 'H3K56ac', 'H3K36me3', 'HDAC1', 'SUZ12', 'H4K16ac', 'BRD4', 'SOX2', 'rDNA', 'MajSat', 'LINE1', 'SINEB1', 'Telomere', 'MinSat', 'Xist_RNA', 'ITS1_RNA', 'Rnu2_RNA', 'polyA_RNA', 'Malat1_RNA', 'dot_int', 'n_rad_score', 'n_per_dist(um)']
  traces:  ['dbscan_allele', 'dbscan_ldp_allele'] (213 traces)
  uns:     ['allele_col', 'genome_assembly', 'keep_unclustered', 'source', 'voxel_xy_nm', 'voxel_z_nm', 'xyz_unit', 'zenodo_record', 'auto_discovery_schema', 'leiden_to_cell_type', 'linked_anndata']
  linked_adata: (9, 60)
# ChromData discovery schema

dataset: takei2025_doc_subset_pantheon_20
genome: mm10
xyz_unit: um
shape: 56036 spots, 213 traces, 9 cells

modalities:
- cell_metadata: present; operations: cell_type_stratification, embedding_visualization
- chromatin_tracing: present; operations: chromosome_subset, cell_subset, trace_subset, pairwise_3d_distance, intra_chromatin_distance, inter_chromatin_distance
- if_tracks: present; operations: marker_high_low_bin_selection, marker_stratified_distance, per_cell_marker_summary, per_cell_type_marker_summary
- rna_expression: present; operations: gene_expression_lookup, expression_stratification, gene_marker_correlation, chromatin_expression_association

chroms: 20 [chr1, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX]
cell_types: 3 [Bergmann=3, Granule=3, Purkinje=3]
tracks: 62 [CPSF6, ATRX, H4K8ac, HDAC2, H3K9ac, H3K9me3, H3K9me2, RNAPIISer2-P, H3, H3K36me2, UBTF, LaminB1, RNAPIISer5-P, RYBP, HP1beta, RING1B, H2A.X, H3K4me1, H4K20me2, H3K27me2 ...]
linked_adata: shape=[9, 60], X=csr_matrix
genes: 60 [Aldoc, Calb1, Cdh22, Drd3, Eomes, Ephb2, Foxj1, Gabra6, Gpr176, Grm1, Hspb1, Mrc1, Nefh, Npas3, Nptn, Olig1, Pcp2, Pcp4, Plcb3, Plcb4 ...]

known_missing:
- cellm['if_mean'] per-cell IF mean matrix
- raw RNA seqFISH spot geometry as a first-class ChromData component
- scRNA reference matrix for external expression comparison
- gene annotation cache for gene-neighborhood analyses

verification_required:
- required_fields_exist
- minimum_cell_count
- minimum_spot_or_trace_count
- finite_numeric_output
- statistical_hypothesis_test
- runtime_under_budget
- deterministic_rerun
- negative_control_or_permutation
- redundancy_against_existing_parameters

# Screen the idea against the dataset schema; without a schema there is
# nothing to review and the acceptance gate below is skipped.
if schema is None:
    review = None
else:
    review = review_idea_against_schema(IDEA, schema)
print(review.to_dict() if review is not None else None)
assert review is None or review.accepted, review.to_dict()

{'accepted': True, 'errors': [], 'warnings': ['multi-modal idea should include a cell_id_alignment validation check'], 'missing_fields': []}

# Lightweight data inspection for required fields and alignment assumptions
import numpy as np
import pandas as pd

# Dataset dimensions and column inventories.
print('cwd:', os.getcwd())
print('n spots/traces/cells:', cdata.n_spots, cdata.n_traces, cdata.n_cells)
print('spots columns:', list(cdata.spots.columns))
print('cells columns:', list(cdata.cells.columns)[:12])
print('tracks includes H3K27ac:', 'H3K27ac' in cdata.tracks.columns)

# Coordinate array: container type, shape, and the first five rows.
print('coords type/shape:', type(cdata.coords), getattr(cdata.coords, 'shape', None))
print('coords preview:')
coords_head = pd.DataFrame(np.asarray(cdata.coords)[:5], columns=['x_um','y_um','z_um'])
print(coords_head)

# Identifier columns the analysis groups by.
preview_cols = ['chrom', 'cell_id', 'trace_id']
print('spots preview:')
print(cdata.spots[preview_cols].head())

# How much of the H3K27ac track is usable, and the cell-type balance.
h3k27ac_finite = np.isfinite(cdata.tracks['H3K27ac'])
print('H3K27ac finite coverage:', int(h3k27ac_finite.sum()), '/', len(cdata.tracks))
print('cells by type:', cdata.cells['cell_type'].value_counts().to_dict())

inspection_summary = dict(
    n_cells=int(cdata.n_cells),
    n_spots=int(cdata.n_spots),
    n_chroms=int(cdata.spots['chrom'].nunique()),
    h3k27ac_finite_fraction=float(np.isfinite(cdata.tracks['H3K27ac']).mean()),
)

cwd: /Users/weizexu/Projects/U-Chrom
n spots/traces/cells: 56036 213 9
spots columns: ['chrom', 'start', 'end', 'trace_id', 'cell_id', 'name']
cells columns: ['leiden', 'cell_type', 'x_centroid', 'y_centroid', 'z_centroid', 'nuc_volume_um3', 'doublet', 'batch', 'n_transcripts', 'n_genes_by_counts']
tracks includes H3K27ac: True
coords type/shape: <class 'numpy.ndarray'> (56036, 3)
coords preview:
         x_um        y_um     z_um
0  127.399876  100.729674  1.62050
1  124.070401  104.650369  1.61500
2  125.011100  104.504933  1.56525
3  124.328004  104.750176  1.57875
4  124.519378  104.306864  1.57325
spots preview:
   chrom cell_id         trace_id
0  chr14  1_0_61  1_0_61_chr14_a2
1   chr2  1_0_61   1_0_61_chr2_a2
2  chr14  1_0_61  1_0_61_chr14_a1
3   chr2  1_0_61   1_0_61_chr2_a2
4  chr14  1_0_61  1_0_61_chr14_a1
H3K27ac finite coverage: 56036 / 56036
cells by type: {'Granule': 3, 'Bergmann': 3, 'Purkinje': 3}

# Main compact exploration: chromosome-level H3K27ac proximity assortativity
import os
os.environ.setdefault('MPLBACKEND', 'Agg')
import matplotlib
matplotlib.use('Agg', force=True)
import matplotlib.pyplot as plt
from IPython.display import Image, display
from itertools import combinations
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
import json

# Fixed seed so the permutation null is reproducible across reruns.
rng = np.random.default_rng(20250317)
N_PERM = 500  # number of label shuffles used to build the null distribution
EPS = 1e-9  # added to each distance before inversion
# Derive the absolute output directory and the artifact names from the values
# already defined for this run (RUN_OUTPUT_DIR, IDEA.idea_id) instead of
# re-typing a user-specific absolute path and the idea id as literals.
OUTDIR_ABS = RUN_OUTPUT_DIR.resolve()
OUTDIR_ABS.mkdir(parents=True, exist_ok=True)
RESULT_PATH = OUTDIR_ABS / f'{IDEA.idea_id}_result.csv'
FIG_PATH = OUTDIR_ABS / f'{IDEA.idea_id}_statistical_summary.png'

# Build one row per spot: chromosome label, cell id, xyz position and H3K27ac.
coords = np.asarray(cdata.coords)
spot_df = cdata.spots[['chrom', 'cell_id']].copy().reset_index(drop=True)
for axis_pos, axis_col in enumerate(('x_um', 'y_um', 'z_um')):
    spot_df[axis_col] = coords[:, axis_pos]
spot_df['H3K27ac'] = np.asarray(cdata.tracks['H3K27ac'])

# Turn infinities into NaN, then drop every spot missing any required value.
required_spot_cols = ['x_um', 'y_um', 'z_um', 'H3K27ac', 'chrom', 'cell_id']
spot_df = spot_df.replace([np.inf, -np.inf], np.nan)
spot_df = spot_df.dropna(subset=required_spot_cols)

# Collapse spots to one row per (cell, chromosome): mean position as the
# centroid, median H3K27ac as the activity summary, plus the spot count.
spots_by_cell_chrom = spot_df.groupby(['cell_id', 'chrom'], observed=True)
chrom_summary = spots_by_cell_chrom.agg(
    x_um=pd.NamedAgg(column='x_um', aggfunc='mean'),
    y_um=pd.NamedAgg(column='y_um', aggfunc='mean'),
    z_um=pd.NamedAgg(column='z_um', aggfunc='mean'),
    h3k27ac_median=pd.NamedAgg(column='H3K27ac', aggfunc='median'),
    n_spots=pd.NamedAgg(column='H3K27ac', aggfunc='size'),
).reset_index()

# Keep only chromosomes backed by at least 10 spots in that cell.
has_enough_spots = chrom_summary['n_spots'] >= 10
chrom_summary = chrom_summary[has_enough_spots].copy()

# Map each cell id (the index of cdata.cells) to its cell-type label.
cell_type_map = cdata.cells['cell_type'].astype(str).to_dict()
summarised_cell_ids = set(chrom_summary['cell_id'])
if not summarised_cell_ids <= set(cell_type_map):
    # Some summarised ids are not keys of the map as built; rebuild it with
    # the index values converted to strings.
    cell_type_map = {
        str(cell_key): type_label
        for cell_key, type_label in zip(cdata.cells.index, cdata.cells['cell_type'].astype(str))
    }

# Per-cell statistic: Spearman correlation, over chromosome pairs, between the
# pair's mean H3K27ac summary and the inverse distance between the centroids.
pair_rows = []
cell_rows = []
for cell_id, grp in chrom_summary.groupby('cell_id', observed=True):
    grp = grp.sort_values('chrom').reset_index(drop=True)
    n_chrom = len(grp)
    if n_chrom < 4:
        # Fewer than four chromosomes: skip the cell.
        continue

    cell_key = str(cell_id)
    cell_type = cell_type_map.get(cell_key, cell_type_map.get(cell_id, 'unknown'))
    centroids = grp[['x_um', 'y_um', 'z_um']].to_numpy(dtype=float)
    chrom_names = grp['chrom'].tolist()
    medians = grp['h3k27ac_median'].tolist()

    local_pair_rows = []
    for i, j in combinations(range(n_chrom), 2):
        dist = float(np.linalg.norm(centroids[i] - centroids[j]))
        if dist <= 0 or not np.isfinite(dist):
            # Coincident or undefined centroids cannot be inverted.
            continue
        local_pair_rows.append(dict(
            cell_id=cell_key,
            cell_type=cell_type,
            chrom_a=chrom_names[i],
            chrom_b=chrom_names[j],
            distance_um=dist,
            inverse_distance_per_um=1.0 / (dist + EPS),
            pair_mean_h3k27ac=float((medians[i] + medians[j]) / 2.0),
        ))

    if len(local_pair_rows) < 6:
        continue

    pairs_df = pd.DataFrame(local_pair_rows)
    rho, p_asym = spearmanr(pairs_df['pair_mean_h3k27ac'], pairs_df['inverse_distance_per_um'])
    if not np.isfinite(rho):
        # A non-finite correlation contributes neither pairs nor a cell row.
        continue

    pair_rows.extend(local_pair_rows)
    cell_rows.append(dict(
        cell_id=cell_key,
        cell_type=cell_type,
        n_chromosomes=int(n_chrom),
        n_pairs=int(len(pairs_df)),
        spearman_rho=float(rho),
        spearman_asymptotic_p_two_sided=float(p_asym) if np.isfinite(p_asym) else np.nan,
    ))

cell_stats = pd.DataFrame(cell_rows)
pair_table = pd.DataFrame(pair_rows)

# One-sided permutation test of the mean per-cell Spearman rho. The null
# shuffles the chromosome H3K27ac summaries across chromosomes within each cell.
if len(cell_stats) >= 2:
    observed_statistic = float(cell_stats['spearman_rho'].mean())

    # Pair geometry does not depend on the shuffle, so build it once per cell
    # instead of recomputing every centroid distance in each of the N_PERM
    # iterations. Each entry holds the H3K27ac summaries, the index arrays of
    # the valid pairs, and their inverse distances.
    perm_inputs = []
    for cell_id, grp in chrom_summary.groupby('cell_id', observed=True):
        grp = grp.sort_values('chrom').reset_index(drop=True)
        if len(grp) < 4:
            continue
        xyz = grp[['x_um', 'y_um', 'z_um']].to_numpy(dtype=float)
        first_idx = []
        second_idx = []
        inv_d = []
        for i, j in combinations(range(len(grp)), 2):
            dist = float(np.linalg.norm(xyz[i] - xyz[j]))
            if np.isfinite(dist) and dist > 0:
                first_idx.append(i)
                second_idx.append(j)
                inv_d.append(1.0 / (dist + EPS))
        perm_inputs.append((
            grp['h3k27ac_median'].to_numpy(dtype=float),
            np.asarray(first_idx, dtype=int),
            np.asarray(second_idx, dtype=int),
            np.asarray(inv_d, dtype=float),
        ))

    null_stats = []
    for perm_idx in range(N_PERM):
        perm_cell_rhos = []
        for h3_values, first_idx, second_idx, inv_d in perm_inputs:
            # Draw the shuffle for every eligible cell, including cells that
            # end up with too few pairs, so the random stream is consumed in
            # the same order as a per-iteration recomputation would consume it.
            shuffled_h3 = rng.permutation(h3_values)
            if len(inv_d) < 6:
                continue
            pair_h3 = (shuffled_h3[first_idx] + shuffled_h3[second_idx]) / 2.0
            rho_perm, _ = spearmanr(pair_h3, inv_d)
            if np.isfinite(rho_perm):
                perm_cell_rhos.append(float(rho_perm))
        if perm_cell_rhos:
            null_stats.append(float(np.mean(perm_cell_rhos)))
    null_stats = np.asarray(null_stats, dtype=float)
    # Add-one p-value: the observed statistic counts as one draw from the null.
    p_value = float((1 + np.sum(null_stats >= observed_statistic)) / (len(null_stats) + 1)) if len(null_stats) else 1.0
    null_mean = float(np.mean(null_stats)) if len(null_stats) else 0.0
    null_sd = float(np.std(null_stats, ddof=1)) if len(null_stats) > 1 else float('nan')
    effect_size = float(observed_statistic - null_mean)
    hypothesis_test_status = 'pass' if len(null_stats) >= 100 else 'insufficient_data'
    note = f'Used {len(null_stats)} chromosome-label permutations.'
else:
    # Fewer than two cells with a finite rho: report neutral placeholders and
    # flag the test as lacking data.
    observed_statistic = float(cell_stats['spearman_rho'].mean()) if len(cell_stats) else 0.0
    null_stats = np.asarray([], dtype=float)
    p_value = 1.0
    null_mean = 0.0
    null_sd = float('nan')
    effect_size = float(observed_statistic - null_mean)
    hypothesis_test_status = 'insufficient_data'
    note = 'Too few cells with finite chromosome-pair correlations for permutation inference.'

# Human-readable description of the test and of both hypotheses.
test_method = f'one-sided chromosome-label permutation test ({len(null_stats)} permutations)'
null_hypothesis = 'Within each cell, chromosome H3K27ac summaries are exchangeable with respect to chromosome centroid distances; mean Spearman rho is no greater than expected by shuffled chromosome labels.'
alternative_hypothesis = 'H3K27ac-rich chromosome pairs have higher inverse centroid distance than expected by shuffled chromosome labels, yielding positive active-state proximity assortativity.'

# Per-cell rows, or a single placeholder row when no cell produced a statistic.
if cell_stats.copy().empty:
    result_table = pd.DataFrame([{'cell_id': 'none', 'cell_type': 'none', 'n_chromosomes': 0, 'n_pairs': 0, 'spearman_rho': observed_statistic}])
else:
    result_table = cell_stats.copy()

# Repeat the run-level summary on every row so the CSV is self-describing.
run_level_columns = {
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'test_method': test_method,
    'null_mean': null_mean,
    'null_sd': null_sd,
    'hypothesis_test_status': hypothesis_test_status,
}
for column_name, column_value in run_level_columns.items():
    result_table[column_name] = column_value
result_table.to_csv(RESULT_PATH, index=False)

# Run-level summary read by the verification cell.
# n_selected_cells is taken from cell_stats rather than result_table: when no
# cell yields a statistic, result_table holds a single placeholder row with
# cell_id 'none', which would otherwise be reported as one selected cell.
analysis_summary = {
    'idea_id': IDEA.idea_id,
    'parameter_name': 'H3K27ac_interchrom_assortativity',
    'parameter_value': observed_statistic,
    'observed_statistic': observed_statistic,
    'effect_size': effect_size,
    'p_value': p_value,
    'test_method': test_method,
    'null_hypothesis': null_hypothesis,
    'alternative_hypothesis': alternative_hypothesis,
    'hypothesis_test_status': hypothesis_test_status,
    'n_selected_cells': int(cell_stats['cell_id'].nunique()) if not cell_stats.empty else 0,
    'n_rows': int(len(result_table)),
    'n_pair_rows': int(len(pair_table)),
    'n_permutations': int(len(null_stats)),
    'result_path': str(RESULT_PATH),
    'figure_path': str(FIG_PATH),
    'notes': [note, 'Effect size is observed mean per-cell Spearman rho minus mean shuffled-label null statistic.'],
}

# Statistical figure: null distribution versus observed statistic and per-cell estimates.
fig, axes = plt.subplots(1, 2, figsize=(11.5, 4.2), constrained_layout=True)
fig.patch.set_facecolor('white')

# Left panel: histogram of the shuffled-label null with the observed mean marked.
ax = axes[0]
if len(null_stats):
    ax.hist(null_stats, bins=24, color='#9ecae1', edgecolor='white', label='Shuffled chromosome-label null')
else:
    ax.text(0.5, 0.5, 'No null distribution', ha='center', va='center', transform=ax.transAxes)
ax.axvline(observed_statistic, color='#d62728', lw=2.5, label='Observed mean rho')
ax.axvline(0, color='0.35', lw=1, ls=':', label='rho = 0')
ax.set_xlabel('Mean per-cell Spearman rho\n(pair H3K27ac vs inverse centroid distance)')
ax.set_ylabel('Permutation count')
ax.set_title('Permutation evidence for active assortativity')
ax.legend(frameon=False, fontsize=8)
# Report the number of shuffles actually retained in the null rather than a
# literal, so the annotation stays correct if N_PERM changes or shuffles drop out.
ax.text(0.02, 0.78, f'p={p_value:.3g}\neffect={effect_size:.3f}\nn cells={len(cell_stats)}\n{len(null_stats)} label shuffles', transform=ax.transAxes, va='top', ha='left', fontsize=8, bbox=dict(facecolor='white', edgecolor='0.85', alpha=0.9))

# Right panel: one bar per cell, coloured by cell type, with the observed mean.
ax = axes[1]
if cell_stats.empty:
    ax.text(0.5, 0.5, 'No finite cell-level statistics', ha='center', va='center', transform=ax.transAxes)
    ax.set_axis_off()
else:
    # Order bars by cell type, then by cell id.
    order = cell_stats.sort_values(['cell_type', 'cell_id'])['cell_id'].tolist()
    colors = {'Granule': '#4daf4a', 'Bergmann': '#377eb8', 'Purkinje': '#984ea3'}
    x = np.arange(len(order))
    vals = cell_stats.set_index('cell_id').loc[order]
    bar_colors = []
    for ct in vals['cell_type']:
        bar_colors.append(colors.get(ct, '0.5'))
    ax.bar(x, vals['spearman_rho'], color=bar_colors, edgecolor='black', linewidth=0.5)
    ax.axhline(0, color='0.35', lw=1)
    ax.axhline(observed_statistic, color='#d62728', lw=1.8, ls='--', label='Mean observed rho')
    ax.set_xticks(x)
    tick_labels = [f'{cid}\n{ct}' for cid, ct in zip(vals.index, vals['cell_type'])]
    ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=8)
    ax.set_ylabel('Per-cell Spearman rho')
    ax.set_title('Cell-level chromosome-pair assortativity')
    # Legend: a swatch for each cell type that is plotted, then the mean line.
    plotted_types = set(vals['cell_type'])
    handles = []
    for k, c in colors.items():
        if k in plotted_types:
            handles.append(plt.Line2D([0], [0], marker='s', color='w', markerfacecolor=c, markeredgecolor='black', label=k, markersize=8))
    handles.append(plt.Line2D([0], [0], color='#d62728', ls='--', label='Mean observed rho'))
    ax.legend(handles=handles, frameon=False, fontsize=8, loc='best')

# Hide the top and right spines on both panels.
for ax in axes:
    ax.spines[['top', 'right']].set_visible(False)

# Write the figure to disk, release it, then show the saved file inline.
fig.savefig(FIG_PATH, dpi=180, bbox_inches='tight', facecolor='white')
plt.close(fig)
saved_figure = Image(filename=str(FIG_PATH))
display(saved_figure)

# Print the run summary as JSON and preview the first result rows.
summary_json = json.dumps(analysis_summary, indent=2)
print(summary_json)
print('result_table preview:')
display(result_table.head(10))

<IPython.core.display.Image object>
{
  "idea_id": "active-chromatin-assortativity-between-chromosom-6fcdb55564",
  "parameter_name": "H3K27ac_interchrom_assortativity",
  "parameter_value": 0.16466336754481312,
  "observed_statistic": 0.16466336754481312,
  "effect_size": 0.16533794015569558,
  "p_value": 0.001996007984031936,
  "test_method": "one-sided chromosome-label permutation test (500 permutations)",
  "null_hypothesis": "Within each cell, chromosome H3K27ac summaries are exchangeable with respect to chromosome centroid distances; mean Spearman rho is no greater than expected by shuffled chromosome labels.",
  "alternative_hypothesis": "H3K27ac-rich chromosome pairs have higher inverse centroid distance than expected by shuffled chromosome labels, yielding positive active-state proximity assortativity.",
  "hypothesis_test_status": "pass",
  "n_selected_cells": 9,
  "n_rows": 9,
  "n_pair_rows": 1441,
  "n_permutations": 500,
  "result_path": "/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/active-chromatin-assortativity-between-chromosom-6fcdb55564_result.csv",
  "figure_path": "/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/active-chromatin-assortativity-between-chromosom-6fcdb55564_statistical_summary.png",
  "notes": [
    "Used 500 chromosome-label permutations.",
    "Effect size is observed mean per-cell Spearman rho minus mean shuffled-label null statistic."
  ]
}
result_table preview:
   cell_id cell_type  n_chromosomes  ...  null_mean   null_sd  hypothesis_test_status
0  1_0_116  Purkinje             20  ...  -0.000675  0.043545                    pass
1   1_0_26  Purkinje             12  ...  -0.000675  0.043545                    pass
2   1_0_34  Bergmann             18  ...  -0.000675  0.043545                    pass
3   1_0_37  Purkinje             16  ...  -0.000675  0.043545                    pass
4   1_0_42   Granule             19  ...  -0.000675  0.043545                    pass
5   1_0_47   Granule             19  ...  -0.000675  0.043545                    pass
6   1_0_61  Bergmann             20  ...  -0.000675  0.043545                    pass
7   1_0_63  Bergmann             20  ...  -0.000675  0.043545                    pass
8   1_0_69   Granule             20  ...  -0.000675  0.043545                    pass

[9 rows x 13 columns]

# Every check requested by the idea starts as 'not_run'; the hypothesis-test
# check is tracked even when the idea did not list it.
checks = dict.fromkeys(IDEA.validation_checks, 'not_run')
notes = []
if 'statistical_hypothesis_test' not in checks:
    checks['statistical_hypothesis_test'] = 'not_run'

def _check_keys(prefix):
    """Return the keys of ``checks`` equal to *prefix* or starting with ``prefix + ':'``."""
    def _matches(name):
        return name == prefix or name.startswith(prefix + ':')

    return list(filter(_matches, checks))

def _set_check(prefix, value):
    """Store *value* under every key matching *prefix*, or under *prefix* itself if none match."""
    targets = _check_keys(prefix) or [prefix]
    for name in targets:
        checks[name] = value

def _check_status(prefix):
    """Summarise the statuses of all checks matching *prefix*.

    Returns None when nothing matches, 'fail' if any matching check failed,
    'pass' if all of them passed, and otherwise the first matching status.
    """
    statuses = [checks[name] for name in _check_keys(prefix)]
    if not statuses:
        return None
    if 'fail' in statuses:
        return 'fail'
    every_one_passed = statuses.count('pass') == len(statuses)
    return 'pass' if every_one_passed else statuses[0]

# The schema review doubles as the required-fields check.
_set_check('required_fields_exist', 'pass' if review is not None and review.accepted else 'fail')
if _check_keys('cell_id_alignment'):
    # With either table absent the comparison is not attempted and the check
    # keeps its default of passing.
    aligned = True
    if cdata is not None and adata is not None:
        # Compare the id sequences directly. Lists of different lengths
        # compare unequal, so a cell-count mismatch now fails the check
        # instead of skipping the comparison and passing.
        aligned = list(map(str, cdata.cells.index)) == list(map(str, adata.obs_names))
    _set_check('cell_id_alignment', 'pass' if aligned else 'fail')
# Minimum cell count: use the summary's count, else the result-table length,
# else the dataset's own cell table.
if _check_keys('minimum_cell_count'):
    n_cells = analysis_summary.get('n_selected_cells')
    if n_cells is None:
        if 'cell_type' in getattr(result_table, 'columns', []):
            n_cells = len(result_table)
    if n_cells is None:
        if cdata is not None and getattr(cdata, 'n_cells', 0):
            n_cells = len(cdata.cells)
        else:
            n_cells = 0
    _set_check('minimum_cell_count', 'fail' if not n_cells >= 1 else 'pass')

# Minimum row count: use the summary's count, else the result-table length.
if _check_keys('minimum_spot_or_trace_count'):
    n_rows = analysis_summary.get('n_rows')
    if n_rows is None:
        n_rows = 0 if result_table is None else len(result_table)
    _set_check('minimum_spot_or_trace_count', 'fail' if not n_rows >= 1 else 'pass')

# The headline parameter must be present and finite.
if _check_keys('finite_numeric_output'):
    value = analysis_summary.get('parameter_value')
    is_finite_value = value is not None and np.isfinite(value)
    _set_check('finite_numeric_output', 'pass' if is_finite_value else 'fail')
def _float_or_nan(raw):
    """Convert *raw* to float, returning NaN when the conversion raises."""
    try:
        return float(raw)
    except Exception:
        return np.nan


def _has_text(raw):
    """Return True when *raw* is not None and its string form is not blank."""
    return raw is not None and str(raw).strip() != ''


if _check_keys('statistical_hypothesis_test'):
    # Read every test field back from the summary.
    p_value = analysis_summary.get('p_value')
    test_method = analysis_summary.get('test_method')
    null_hypothesis = analysis_summary.get('null_hypothesis')
    alternative_hypothesis = analysis_summary.get('alternative_hypothesis')
    observed_statistic = analysis_summary.get('observed_statistic')
    effect_size = analysis_summary.get('effect_size')
    hypothesis_test_status = analysis_summary.get('hypothesis_test_status', 'pass')

    p_float = _float_or_nan(p_value)
    stat_float = _float_or_nan(observed_statistic)
    effect_float = _float_or_nan(effect_size)

    # The test counts only with all descriptive text present, a finite p-value
    # in [0, 1], finite statistic and effect size, and sufficient data.
    has_required_test = (
        _has_text(test_method)
        and _has_text(null_hypothesis)
        and _has_text(alternative_hypothesis)
        and np.isfinite(p_float)
        and 0.0 <= p_float <= 1.0
        and np.isfinite(stat_float)
        and np.isfinite(effect_float)
        and hypothesis_test_status != 'insufficient_data'
    )
    # The result table must also carry the p-value and method columns.
    if result_table is None or not hasattr(result_table, 'columns'):
        has_required_test = False
    else:
        has_required_test = has_required_test and 'p_value' in result_table.columns and 'test_method' in result_table.columns

    if has_required_test:
        _set_check('statistical_hypothesis_test', 'pass')
    else:
        _set_check('statistical_hypothesis_test', 'fail')
        notes.append('statistical_hypothesis_test failed: analysis_summary must include null_hypothesis, alternative_hypothesis, test_method, observed_statistic, effect_size, finite p_value in [0,1], and result_table columns p_value/test_method')
# Keyword scan: the control/permutation check passes when the test method,
# the summary keys or the result columns mention a control-style term.
if _check_keys('negative_control_or_permutation'):
    test_method_text = str(analysis_summary.get('test_method', '')).lower()
    summary_keys_text = ' '.join(str(key).lower() for key in analysis_summary.keys())
    if result_table is not None and hasattr(result_table, 'columns'):
        result_columns_text = ' '.join(str(col).lower() for col in result_table.columns)
    else:
        result_columns_text = ''
    control_text = ' '.join([test_method_text, summary_keys_text, result_columns_text])
    control_tokens = ['permutation', 'randomization', 'shuffle', 'negative_control', 'null_distribution', 'control']
    has_control_or_permutation = False
    for token in control_tokens:
        if token in control_text:
            has_control_or_permutation = True
            break
    if has_control_or_permutation:
        _set_check('negative_control_or_permutation', 'pass')
    else:
        _set_check('negative_control_or_permutation', 'not_implemented')

# Control-type checks that never ran are marked 'not_implemented'.
for check in list(checks):
    is_control_check = 'negative_control' in check or check.endswith('_control')
    if checks[check] == 'not_run' and is_control_check:
        checks[check] = 'not_implemented'

# Overall status: fail when any mandatory check failed or no rows were produced.
required_for_pass = ['required_fields_exist', 'minimum_cell_count', 'finite_numeric_output', 'statistical_hypothesis_test']
failed_required = [check for check in required_for_pass if _check_status(check) == 'fail']
status = 'fail' if failed_required else 'pass'
notes.extend(f'{check} failed' for check in failed_required)

n_rows_for_status = analysis_summary.get('n_rows')
if n_rows_for_status is None:
    n_rows_for_status = 0 if result_table is None else len(result_table)
if n_rows_for_status == 0:
    status = 'fail'
    notes.append('analysis produced no result rows')

# Verification record: overall status, per-check results, and the headline
# numbers copied from the analysis summary.
verification = {'idea_id': IDEA.idea_id, 'status': status, 'checks': checks}
for copied_field in ('parameter_value', 'p_value', 'test_method', 'effect_size', 'result_path'):
    verification[copied_field] = analysis_summary.get(copied_field)
verification['notes'] = notes + analysis_summary.get('notes', [])
print(json.dumps(verification, indent=2))

{
  "idea_id": "active-chromatin-assortativity-between-chromosom-6fcdb55564",
  "status": "pass",
  "checks": {
    "required_fields_exist": "pass",
    "minimum_cell_count": "pass",
    "minimum_spot_or_trace_count": "pass",
    "finite_numeric_output": "pass",
    "statistical_hypothesis_test": "pass",
    "runtime_under_budget": "not_run",
    "deterministic_rerun": "not_run",
    "negative_control_or_permutation": "pass"
  },
  "parameter_value": 0.16466336754481312,
  "p_value": 0.001996007984031936,
  "test_method": "one-sided chromosome-label permutation test (500 permutations)",
  "effect_size": 0.16533794015569558,
  "result_path": "/Users/weizexu/Projects/U-Chrom/tmp/takei_auto_discovery_doc/run_pantheon_20_ideas_verified_agg/active-chromatin-assortativity-between-chromosom-6fcdb55564_result.csv",
  "notes": [
    "Used 500 chromosome-label permutations.",
    "Effect size is observed mean per-cell Spearman rho minus mean shuffled-label null statistic."
  ]
}

Auto-discovery idea: Active chromatin assortativity between chromosomes

Rationale

Data used

Analysis sketch

Expected result

Validation checks

Graphical abstract

Required data checks

Exploration

Critique and compact plan

Statistical figure

Runner verification summary

Final interpretation